def to_json(self):
    """
    Return a json from a Spark Dataframe
    :param self:
    :return:
    """
    return collect_as_dict(self.collect())
def count_na(columns):
    """
    Return the NaN and null value count for a column
    :param columns: '*', list of column names or a single column name
    :return:
    """
    columns = parse_columns(self, columns)
    df = self
    expr = []
    for col_name in columns:
        # If the column type is Struct or Boolean, cast to string. isnan/isNull can not handle StructType
        if is_(df.cols.schema_dtypes(col_name), (StructType, BooleanType)):
            df = df.cols.cast(col_name, "string")
        expr.append(F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)).alias(col_name))

    result = format_dict(collect_as_dict(df.select(*expr).collect()))
    return result
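# Usage sketch (not part of the library): a minimal, standalone version of the
# null/NaN counting expression built in count_na(), written in plain PySpark.
# The SparkSession, the sample data and the column name "price" are hypothetical.
def _example_count_na():
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1.0,), (float("nan"),), (None,)], ["price"])

    # Count rows that are either NaN or null, exactly like the expression above
    expr = F.count(F.when(F.isnan("price") | F.col("price").isNull(), "price")).alias("price")
    return sdf.select(expr).first()["price"]  # expected: 2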
def glom(self):
    """
    Return the rows of the first partition as a list of dicts
    :param self: Dataframe
    :return:
    """
    return collect_as_dict(self.rdd.glom().collect()[0])
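# Usage sketch (not part of the library): rdd.glom() turns each partition into a
# list of its elements, so collect()[0] above is the content of the first partition.
# The SparkContext "sc" is hypothetical.
def _example_glom(sc):
    rdd = sc.parallelize(range(6), numSlices=2)
    return rdd.glom().collect()  # expected: [[0, 1, 2], [3, 4, 5]]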
def to_dict(self):
    """
    Return a Python object from a Spark Dataframe
    :param self:
    :return:
    """
    return collect_as_dict(self)
def hist(columns, min_value, max_value, buckets=10):
    """
    Get the histogram of a column in json format
    :param columns: Columns to be processed
    :param min_value: Min value used to calculate the buckets
    :param max_value: Max value used to calculate the buckets
    :param buckets: Number of buckets
    :return:
    """
    columns = parse_columns(self, columns)
    for col_name in columns:
        # Create splits
        splits = create_buckets(min_value, max_value, buckets)

        # Create buckets in the dataFrame
        df = bucketizer(self, col_name, splits=splits)

        counts = collect_as_dict(df.groupBy(col_name + "_buckets")
                                 .agg(F.count(col_name + "_buckets").alias("count"))
                                 .cols.rename(col_name + "_buckets", "value")
                                 .sort(F.asc("value"))
                                 .collect())

        hist = []
        for x, y in zip(counts, splits):
            # if x["value"] is not None and x["count"] != 0:
            hist.append({"lower": y["lower"], "upper": y["upper"], "value": x["count"]})

    return hist
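# Usage sketch (not part of the library): the bucketing step above can be reproduced
# with pyspark.ml.feature.Bucketizer directly. The splits, the sample data and the
# column name "price" are hypothetical.
def _example_hist_buckets():
    from pyspark.sql import SparkSession, functions as F
    from pyspark.ml.feature import Bucketizer

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1.0,), (3.0,), (7.0,), (9.0,)], ["price"])

    # Four buckets between 0 and 10, analogous to create_buckets(0, 10, 4)
    splits = [0.0, 2.5, 5.0, 7.5, 10.0]
    bucketed = Bucketizer(splits=splits, inputCol="price", outputCol="price_buckets").transform(sdf)
    return bucketed.groupBy("price_buckets").agg(F.count("price_buckets").alias("count")).collect()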
def to_json(self):
    """
    Return a json string from a Spark Dataframe
    :param self:
    :return:
    """
    return json.dumps(collect_as_dict(self), ensure_ascii=False, default=json_converter)
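# Usage sketch (not part of the library): json.dumps needs a `default` callable for
# values it can not serialize natively (for example datetime values coming from Spark
# date/timestamp columns). The converter below is a hypothetical stand-in for json_converter.
def _example_json_default(value):
    import datetime
    if isinstance(value, (datetime.date, datetime.datetime)):
        return value.isoformat()
    raise TypeError("Type %s is not JSON serializable" % type(value))
# e.g. json.dumps({"d": datetime.date(2019, 1, 1)}, default=_example_json_default)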
def table(self, limit=100, columns=None):
    """
    Return a HTML table with the dataframe cols, data types and values
    :param self:
    :param columns: Columns to be printed
    :param limit: How many rows will be printed
    :return:
    """
    columns = parse_columns(self, columns)

    data = collect_as_dict(self.select(columns).limit(limit).collect())

    # Load the Jinja template
    path = os.path.dirname(os.path.abspath(__file__))
    template_loader = jinja2.FileSystemLoader(searchpath=path + "//../templates")
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)
    template = template_env.get_template("table.html")

    # Keep only the column and data type info that is needed
    dtypes = list(filter(lambda x: x[0] in columns, self.dtypes))

    total = self.count()
    if total < limit:
        limit = total

    # Render and print the table
    output = template.render(cols=dtypes, data=data, limit=limit, total=total)
    display(HTML(output))
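# Usage sketch (not part of the library): the minimal jinja2 plumbing used above.
# The template directory and the template file name are hypothetical.
def _example_render_table(cols, data):
    import os
    import jinja2

    path = os.path.dirname(os.path.abspath(__file__))
    env = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath=path), autoescape=True)
    template = env.get_template("table.html")
    return template.render(cols=cols, data=data)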
def _exprs(funcs, columns):
    """
    Helper function to apply multiple column expressions to multiple columns
    :param funcs: Aggregation functions from Apache Spark
    :param columns: List or string of column names
    :return:
    """

    def parse_col_names_funcs_to_keys(data):
        """
        Helper function that returns a formatted json with function:value pairs inside each column.
        Transform from
        {'max_antiguedad_anos': 15, 'max_m2_superficie_construida': 1800000,
         'min_antiguedad_anos': 2, 'min_m2_superficie_construida': 20}
        to
        {'m2_superficie_construida': {'min': 20, 'max': 1800000},
         'antiguedad_anos': {'min': 2, 'max': 15}}
        :param data: json data
        :return: json
        """
        functions_array = ["min", "max", "stddev", "kurtosis", "mean", "skewness", "sum", "variance",
                           "approx_count_distinct", "na", "zeros", "percentile"]
        result = {}
        if is_dict(data):
            for k, v in data.items():
                for f in functions_array:
                    temp_func_name = f + "_"
                    if k.startswith(temp_func_name):
                        _col_name = k[len(temp_func_name):]
                        result.setdefault(_col_name, {})[f] = v
            return result
        else:
            return data

    columns = parse_columns(self, columns)

    # Ensure that funcs is a list
    funcs = val_to_list(funcs)

    df = self

    # Parse the columns to float. It seems that Spark can handle some aggregations
    # on string columns, giving unexpected results
    # df = df.cols.cast(columns, "float")

    # Create a column expression for every column
    exprs = []
    for col_name in columns:
        for func in funcs:
            exprs.append(func(col_name).alias(func.__name__ + "_" + col_name))

    return parse_col_names_funcs_to_keys(format_dict(collect_as_dict(df.agg(*exprs).collect())))
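# Usage sketch (not part of the library): how the generated aggregation expressions
# look when applied with plain PySpark. The sample data and the column name "price"
# are hypothetical.
def _example_exprs():
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1.0,), (5.0,), (9.0,)], ["price"])

    # One aliased expression per (function, column) pair, e.g. "min_price", "max_price"
    exprs = [f("price").alias(f.__name__ + "_price") for f in (F.min, F.max)]
    return sdf.agg(*exprs).first().asDict()  # expected: {'min_price': 1.0, 'max_price': 9.0}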
def count_zeros(columns):
    """
    Return the count of zero values in a column
    :param columns: '*', list of column names or a single column name
    :return:
    """
    columns = parse_columns(self, columns)
    df = self
    return format_dict(collect_as_dict(
        df.select([F.count(F.when(F.col(c) == 0, c)).alias(c) for c in columns]).collect()))
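# Usage sketch (not part of the library): the same zero-counting expression in plain
# PySpark. The sample data and the column name "qty" are hypothetical.
def _example_count_zeros():
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(0,), (3,), (0,)], ["qty"])
    return sdf.select(F.count(F.when(F.col("qty") == 0, "qty")).alias("qty")).first()["qty"]  # expected: 2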
def frequency(columns, buckets=10):
    """
    Output values frequency in json format
    :param columns: Columns to be processed
    :param buckets: Number of buckets
    :return:
    """
    columns = parse_columns(self, columns)
    df = self
    for col_name in columns:
        df = (df.groupBy(col_name).count()
              .rows.sort([("count", "desc"), (col_name, "desc")])
              .limit(buckets)
              .cols.rename(col_name, "value"))
    return collect_as_dict(df.collect())
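# Usage sketch (not part of the library): the same frequency computation in plain
# PySpark, without the .rows/.cols helpers. The sample data and the column name
# "color" are hypothetical.
def _example_frequency(buckets=10):
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([("red",), ("red",), ("blue",)], ["color"])
    return (sdf.groupBy("color").count()
            .orderBy(F.desc("count"), F.desc("color"))
            .limit(buckets)
            .withColumnRenamed("color", "value")
            .collect())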
def columns(df, columns, buckets=10):
    """
    Return statistical information about specific columns in json format
    :param df: Dataframe to be processed
    :param columns: Columns that you want to profile
    :param buckets: Number of buckets used to calculate the histogram
    :return: json object with the profiling info
    """
    columns = parse_columns(df, columns)

    # Get just a sample to infer the column data type
    # sample_size_number = sample_size(rows_count, 95.0, 2.0)
    # fraction = sample_size_number / rows_count
    # sample = df.sample(False, fraction, seed=1)

    # Initialize objects
    column_info = {}
    column_info['columns'] = {}

    rows_count = df.count()
    column_info['rows_count'] = rows_count

    count_dtypes = Profiler.count_data_types(df, columns)
    column_info["count_types"] = count_dtypes["count_types"]
    column_info['size'] = human_readable_bytes(df.size())

    def na(col_name):
        return F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name))

    def zeros(col_name):
        return F.count(F.when(F.col(col_name) == 0, col_name))

    # Cast every column to a specific type to ensure the correct profiling.
    # For example, if we calculate the min or max of a string column with numeric values the result will be incorrect
    for col_name in columns:
        dtype = count_dtypes["columns"][col_name]['dtype']
        # Do not force date type conversion; we can not trust that it is going to be representative
        if dtype in ["string", "float", "int", "bool"]:
            df = df.cols.cast(col_name, dtype)

    stats = df.cols._exprs(
        [F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, F.approx_count_distinct,
         na, zeros],
        columns)

    for col_name in columns:
        logging.info("Processing column '" + col_name + "'...")

        col_info = {}
        col_info["stats"] = {}
        column_info['columns'][col_name] = {}

        column_type = count_dtypes["columns"][col_name]['type']
        col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype']

        na = stats[col_name]["na"]
        max_value = stats[col_name]["max"]
        min_value = stats[col_name]["min"]

        col_info['name'] = col_name
        col_info['column_type'] = column_type

        # Numeric column
        if column_type == "numeric" or column_type == "date":
            # Merge
            col_info["stats"] = stats[col_name]

            # Missing
            col_info['stats']['missing_count'] = round(na, 2)
            col_info['stats']['p_missing'] = round(na / rows_count * 100, 2)

        col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details']

        if column_type == "categorical" or column_type == "numeric" or column_type == "date" or column_type == "bool":
            # Frequency
            col_info['frequency'] = collect_as_dict(df.groupBy(col_name)
                                                    .count()
                                                    .rows.sort([("count", "desc"), (col_name, "desc")])
                                                    .limit(10)
                                                    .withColumn("percentage",
                                                                F.round((F.col("count") / rows_count) * 100, 3))
                                                    .cols.rename(col_name, "value")
                                                    .collect())

            # Uniques
            uniques = stats[col_name].pop("approx_count_distinct")
            col_info['stats']["uniques_count"] = uniques
            col_info['stats']["p_uniques"] = round(uniques / rows_count * 100, 3)

        if column_type == "numeric":
            # Additional stats.
            # Percentile can not be used as a normal sql.functions; approxQuantile needs an extra pass over the data.
            # https://stackoverflow.com/questions/45287832/pyspark-approxquantile-function
            max_value = fast_float(max_value)
            min_value = fast_float(min_value)

            col_info['stats']['quantile'] = df.cols.percentile(col_name, [0.05, 0.25, 0.5, 0.75, 0.95])
            col_info['stats']['range'] = max_value - min_value
            col_info['stats']['median'] = col_info['stats']['quantile'][0.5]
            col_info['stats']['interquartile_range'] = col_info['stats']['quantile'][0.75] - \
                                                       col_info['stats']['quantile'][0.25]
            col_info['stats']['coef_variation'] = round((col_info['stats']['stddev'] / col_info['stats']['mean']), 5)
            col_info['stats']['mad'] = round(df.cols.mad(col_name), 5)
            col_info["hist"] = df.cols.hist(col_name, min_value, max_value, buckets)

        column_info['columns'][col_name] = col_info

    return column_info
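# Usage sketch (not part of the library): the extra pass mentioned above can be done
# with DataFrame.approxQuantile, which takes the column, the probabilities and a
# relative error. The sample data and the column name "price" are hypothetical.
def _example_quantiles():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(float(i),) for i in range(1, 101)], ["price"])
    # Returns a list of approximate quantile values in the same order as the probabilities
    return sdf.approxQuantile("price", [0.05, 0.25, 0.5, 0.75, 0.95], 0.01)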
def table_html(self, limit=10, columns=None, title=None, full=False, truncate=True):
    """
    Return a HTML table with the dataframe cols, data types and values
    :param self:
    :param columns: Columns to be printed
    :param limit: How many rows will be printed
    :param title: Table title
    :param full: Include html header and footer
    :param truncate: Truncate the row information
    :return:
    """
    columns = parse_columns(self, columns)

    if limit is None:
        limit = 10

    if limit == "all":
        data = collect_as_dict(self.cols.select(columns))
    else:
        data = collect_as_dict(self.cols.select(columns).limit(limit))

    # Load the Jinja template
    template_loader = jinja2.FileSystemLoader(searchpath=absolute_path("/templates/out"))
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)
    template = template_env.get_template("table.html")

    # Keep only the column and data type info that is needed
    dtypes = []
    for i, j in zip(self.dtypes, self.schema):
        if i[1].startswith("array<struct"):
            dtype = "array<struct>"
        elif i[1].startswith("struct"):
            dtype = "struct"
        else:
            dtype = i[1]
        dtypes.append((i[0], dtype, j.nullable))

    # Remove not selected columns
    final_columns = []
    for i in dtypes:
        for j in columns:
            if i[0] == j:
                final_columns.append(i)

    total_rows = self.rows.approx_count()

    if limit == "all":
        limit = total_rows
    elif total_rows < limit:
        limit = total_rows

    total_rows = humanize.intword(total_rows)
    total_cols = self.cols.count()
    total_partitions = self.partitions()

    output = template.render(cols=final_columns, data=data, limit=limit, total_rows=total_rows,
                             total_cols=total_cols, partitions=total_partitions, title=title, truncate=truncate)

    if full is True:
        output = HEADER + output + FOOTER
    return output
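# Usage sketch (not part of the library): humanize.intword turns large row counts into
# readable strings, which is how total_rows is shown in the rendered table.
def _example_humanize_rows():
    import humanize
    return humanize.intword(1200000)  # expected: '1.2 million'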
def to_json(self):
    return collect_as_dict(self.collect())
def _count_data_types(col_name):
    """
    Determine whether the values in a column are float, int or string.
    :param col_name:
    :return:
    """
    # If the column is a string, process the data to try to infer which data type is inside.
    # This is a kind of optimization: we do not need to analyze the data if the column data type
    # is integer, boolean, etc.
    temp = col_name + "_type"
    col_data_type = df.cols.dtypes(col_name)

    # Parse dtype
    if col_data_type == "smallint" or col_data_type == "tinyint":
        col_data_type = "int"
    elif col_data_type == "float" or col_data_type == "double":
        col_data_type = "decimal"
    elif col_data_type.find("array") >= 0:
        col_data_type = "array"

    count_by_data_type = {}
    count_empty_strings = 0

    if infer is True and col_data_type == "string":
        logger.print("Processing column '" + col_name + "'...")
        types = collect_as_dict(df
                                .h_repartition(col_name=col_name)
                                .withColumn(temp, fbdt(col_name, get_type=True))
                                .groupBy(temp).count())
        for row in types:
            count_by_data_type[row[temp]] = row["count"]

        count_empty_strings = df.where(F.col(col_name) == '').count()
    else:
        # Boolean columns do not support count_na
        if "count_na" in stats[col_name]:
            nulls = stats[col_name]["count_na"]
            count_by_data_type[col_data_type] = int(df_count) - nulls
            count_by_data_type["null"] = nulls

    count_by_data_type = fill_missing_var_types(count_by_data_type)

    # Subtract white spaces from the total string count
    null_missed_count = {"null": count_by_data_type['null'],
                         "missing": count_empty_strings,
                         }

    # Get the greatest count by column data type
    greatest_data_type_count = max(count_by_data_type, key=count_by_data_type.get)

    if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
        cat = "categorical"
    elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
        cat = "numeric"
    elif greatest_data_type_count == "date":
        cat = "date"
    elif greatest_data_type_count == "array":
        cat = "array"
    elif greatest_data_type_count == "binary":
        cat = "binary"
    elif greatest_data_type_count == "null":
        cat = "null"
    else:
        cat = None

    col = {}
    col['dtype'] = greatest_data_type_count
    col['type'] = cat
    col['details'] = {**count_by_data_type, **null_missed_count}

    return col
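# Usage sketch (not part of the library): a minimal, hypothetical stand-in for the
# fbdt() type-inference step above. It classifies each string value and counts values
# per inferred type, like the groupBy(temp).count() call. The sample data is hypothetical.
def _example_infer_string_types():
    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.types import StringType

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([("1",), ("2.5",), ("hello",)], ["value"])

    @F.udf(returnType=StringType())
    def infer_type(v):
        try:
            int(v)
            return "int"
        except ValueError:
            try:
                float(v)
                return "decimal"
            except ValueError:
                return "string"

    return sdf.withColumn("value_type", infer_type("value")).groupBy("value_type").count().collect()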