def describe(df, bins, corr_reject, config, **kwargs):
    if not isinstance(df, SparkDataFrame):
        raise TypeError("df must be of type pyspark.sql.DataFrame")

    # Number of rows:
    table_stats = {"n": df.count()}
    if table_stats["n"] == 0:
        raise ValueError("df cannot be empty")

    try:
        # Reset the matplotlib style before use.
        # Fails in matplotlib 1.4.x, so the plot might look bad.
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(resource_filename(__name__, "spark_df_profiling.mplstyle"))

    # Function to "pretty name" floats:
    def pretty_name(x):
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    # Function to compute the correlation matrix:
    def corr_matrix(df, columns=None):
        if columns is None:
            columns = df.columns
        combinations = list(product(columns, columns))

        def separate(l, n):
            for i in range(0, len(l), n):
                yield l[i:i + n]

        grouped = list(separate(combinations, len(columns)))
        df_cleaned = df.select(*columns).na.drop(how="any")

        for i in grouped:
            for j in enumerate(i):
                i[j[0]] = i[j[0]] + (df_cleaned.corr(str(j[1][0]), str(j[1][1])),)

        df_pandas = pd.DataFrame(grouped).applymap(lambda x: x[2])
        df_pandas.columns = columns
        df_pandas.index = columns
        return df_pandas

    # Compute the histogram (not as easy as it looks):
    def create_hist_data(df, column, minim, maxim, bins=10):

        def create_all_conditions(current_col, column, left_edges, count=1):
            """
            Recursive function that exploits the ability to call the
            Spark SQL Column method .when() in a recursive way.
            """
            left_edges = left_edges[:]
            if len(left_edges) == 0:
                return current_col
            if len(left_edges) == 1:
                next_col = current_col.when(col(column) >= float(left_edges[0]), count)
                left_edges.pop(0)
                return create_all_conditions(next_col, column, left_edges[:], count + 1)
            next_col = current_col.when(
                (float(left_edges[0]) <= col(column)) & (col(column) < float(left_edges[1])), count)
            left_edges.pop(0)
            return create_all_conditions(next_col, column, left_edges[:], count + 1)

        num_range = maxim - minim
        bin_width = num_range / float(bins)
        left_edges = [minim]
        for _bin in range(bins):
            left_edges = left_edges + [left_edges[-1] + bin_width]
        left_edges.pop()

        expression_col = when(
            (float(left_edges[0]) <= col(column)) & (col(column) < float(left_edges[1])), 0)
        left_edges_copy = left_edges[:]
        left_edges_copy.pop(0)
        bin_data = (df.select(col(column))
                      .na.drop()
                      .select(col(column),
                              create_all_conditions(expression_col,
                                                    column,
                                                    left_edges_copy).alias("bin_id"))
                      .groupBy("bin_id").count()
                    ).toPandas()

        # If no data goes into one bin, it won't appear in
        # bin_data, so we should fill in the blanks:
        bin_data.index = bin_data["bin_id"]
        new_index = list(range(bins))
        bin_data = bin_data.reindex(new_index)
        bin_data["bin_id"] = bin_data.index
        bin_data = bin_data.fillna(0)

        # We add the left edges and bin width:
        bin_data["left_edge"] = left_edges
        bin_data["width"] = bin_width

        return bin_data

    def mini_histogram(histogram_data):
        # Small histogram
        imgdata = BytesIO()
        hist_data = histogram_data
        figure = plt.figure(figsize=(2, 0.75))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"], hist_data["count"],
                width=hist_data["width"], facecolor='#337ab7')
        plot.axes.get_yaxis().set_visible(False)
        plot.set_facecolor("w")
        xticks = plot.xaxis.get_major_ticks()
        for tick in xticks[1:-1]:
            tick.set_visible(False)
            tick.label.set_visible(False)
        for tick in (xticks[0], xticks[-1]):
            tick.label.set_fontsize(8)
        plot.figure.subplots_adjust(left=0.15, right=0.85, top=1, bottom=0.35, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        plt.close(plot.figure)
        return result_string

    def describe_integer_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                variance(col(column)).alias("variance"),
                kurtosis(col(column)).alias("kurtosis"),
                stddev(col(column)).alias("std"),
                skewness(col(column)).alias("skewness"),
                df_sum(col(column)).alias("sum")
            ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                df_sum(col(column)).alias("sum")
            ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(
                variance_custom(col(column), stats_df["mean"].ix[0],
                                current_result["count"])).toPandas().ix[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(
                skewness_custom(col(column), stats_df["mean"].ix[0],
                                current_result["count"])).toPandas().ix[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(
                kurtosis_custom(col(column), stats_df["mean"].ix[0],
                                current_result["count"])).toPandas().ix[0][0]

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column)
                                          .na.drop()
                                          .selectExpr("percentile(`{col}`,CAST({n} AS DOUBLE))"
                                                      .format(col=column, n=x))
                                          .toPandas().ix[:, 0])

        stats = stats_df.ix[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                          .na.drop()
                          .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                          .agg(df_sum(col("delta"))).toPandas().ix[0, 0]
                        / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"], hist_data["count"],
                width=hist_data["width"], facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        # TODO: think about writing this to disk instead of caching it in strings
        plt.close(plot.figure)
        stats['mini_histogram'] = mini_histogram(hist_data)
        return stats

    def describe_float_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                variance(col(column)).alias("variance"),
                kurtosis(col(column)).alias("kurtosis"),
                stddev(col(column)).alias("std"),
                skewness(col(column)).alias("skewness"),
                df_sum(col(column)).alias("sum")
            ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                df_sum(col(column)).alias("sum")
            ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(
                variance_custom(col(column), stats_df["mean"].ix[0],
                                current_result["count"])).toPandas().ix[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(
                skewness_custom(col(column), stats_df["mean"].ix[0],
                                current_result["count"])).toPandas().ix[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(
                kurtosis_custom(col(column), stats_df["mean"].ix[0],
                                current_result["count"])).toPandas().ix[0][0]

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column)
                                          .na.drop()
                                          .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                      .format(col=column, n=x))
                                          .toPandas().ix[:, 0])

        stats = stats_df.ix[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                          .na.drop()
                          .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                          .agg(df_sum(col("delta"))).toPandas().ix[0, 0]
                        / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"], hist_data["count"],
                width=hist_data["width"], facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        # TODO: think about writing this to disk instead of caching it in strings
        plt.close(plot.figure)
        stats['mini_histogram'] = mini_histogram(hist_data)
        return stats

    def describe_date_1d(df, column):
        stats_df = df.select(column).na.drop().agg(
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max")
        ).toPandas()
        stats = stats_df.ix[0].copy()
        stats.name = column

        # Convert the Pandas timestamp object to a regular datetime:
        if isinstance(stats["max"], pd.tslib.Timestamp):
            stats = stats.astype(object)
            stats["max"] = str(stats["max"].to_pydatetime())
            stats["min"] = str(stats["min"].to_pydatetime())
        # The range is only computed when the type is date:
        else:
            stats["range"] = stats["max"] - stats["min"]
        stats["type"] = "DATE"
        return stats

    def guess_json_type(string_value):
        try:
            obj = json.loads(string_value)
        except:
            return None
        return type(obj)

    def describe_categorical_1d(df, column):
        value_counts = (df.select(column).na.drop()
                          .groupBy(column)
                          .agg(count(col(column)))
                          .orderBy("count({c})".format(c=column), ascending=False)
                        ).cache()

        # Get the most frequent class:
        stats = (value_counts
                 .limit(1)
                 .withColumnRenamed(column, "top")
                 .withColumnRenamed("count({c})".format(c=column), "freq")
                 ).toPandas().ix[0]

        # Get the top 50 classes by value count, and put the rest
        # of them grouped at the end of the Series:
        top_50 = value_counts.limit(50).toPandas().sort_values("count({c})".format(c=column),
                                                               ascending=False)
        top_50_categories = top_50[column].values.tolist()

        others_count = pd.Series([df.select(column).na.drop()
                                    .where(~(col(column).isin(*top_50_categories)))
                                    .count()
                                  ], index=["***Other Values***"])
        others_distinct_count = pd.Series([value_counts
                                           .where(~(col(column).isin(*top_50_categories)))
                                           .count()
                                           ], index=["***Other Values Distinct Count***"])

        top = top_50.set_index(column)["count({c})".format(c=column)]
        top = top.append(others_count)
        top = top.append(others_distinct_count)
        stats["value_counts"] = top
        stats["type"] = "CAT"
        value_counts.unpersist()

        unparsed_valid_jsons = df.select(column).na.drop().rdd.map(
            lambda x: guess_json_type(x[column])).filter(
            lambda x: x).distinct().collect()
        stats["unparsed_json_types"] = unparsed_valid_jsons
        return stats

    def describe_constant_1d(df, column):
        stats = pd.Series(['CONST'], index=['type'], name=column)
        stats["value_counts"] = (df.select(column)
                                   .na.drop()
                                   .limit(1)).toPandas().ix[:, 0].value_counts()
        return stats

    def describe_unique_1d(df, column):
        stats = pd.Series(['UNIQUE'], index=['type'], name=column)
        stats["value_counts"] = (df.select(column)
                                   .na.drop()
                                   .limit(50)).toPandas().ix[:, 0].value_counts()
        return stats

    def describe_1d(df, column, nrows, lookup_config=None):
        column_type = df.select(column).dtypes[0][1]
        # TODO: think about implementing analysis for complex data types:
        if ("array" in column_type) or ("struct" in column_type) or ("map" in column_type):
            raise NotImplementedError("Column {c} is of type {t} and cannot be analyzed"
                                      .format(c=column, t=column_type))

        distinct_count = df.select(column).agg(
            countDistinct(col(column)).alias("distinct_count")).toPandas()
        non_nan_count = df.select(column).na.drop().select(
            count(col(column)).alias("count")).toPandas()
        results_data = pd.concat([distinct_count, non_nan_count], axis=1)
        results_data["p_unique"] = results_data["distinct_count"] / float(results_data["count"])
        results_data["is_unique"] = results_data["distinct_count"] == nrows
        results_data["n_missing"] = nrows - results_data["count"]
        results_data["p_missing"] = results_data["n_missing"] / float(nrows)
        results_data["p_infinite"] = 0
        results_data["n_infinite"] = 0
        result = results_data.ix[0].copy()
        result["memorysize"] = 0
        result.name = column

        if result["distinct_count"] <= 1:
            result = result.append(describe_constant_1d(df, column))
        elif column_type in {"tinyint", "smallint", "int", "bigint"}:
            result = result.append(describe_integer_1d(df, column, result, nrows))
        elif column_type in {"float", "double", "decimal"}:
            result = result.append(describe_float_1d(df, column, result, nrows))
        elif column_type in {"date", "timestamp"}:
            result = result.append(describe_date_1d(df, column))
        elif result["is_unique"] == True:
            result = result.append(describe_unique_1d(df, column))
        else:
            result = result.append(describe_categorical_1d(df, column))

        # Fix to also count the MISSING value in the distinct_count field:
        if result["n_missing"] > 0:
            result["distinct_count"] = result["distinct_count"] + 1

        # TODO: check whether it is worth it to implement the "real" mode:
        if (result["count"] > result["distinct_count"] > 1):
            try:
                result["mode"] = result["top"]
            except KeyError:
                result["mode"] = 0
        else:
            try:
                result["mode"] = result["value_counts"].index[0]
            except KeyError:
                result["mode"] = 0
            # If an IndexError happens, it is because the column contains only NULLs:
            except IndexError:
                result["mode"] = "MISSING"

        if lookup_config:
            lookup_object = lookup_config['object']
            col_name_in_db = (lookup_config['col_name_in_db']
                              if 'col_name_in_db' in lookup_config else None)
            try:
                matched, unmatched = lookup_object.lookup(df.select(column), col_name_in_db)
                result['lookedup_values'] = str(matched.count()) + "/" + str(df.select(column).count())
            except:
                result['lookedup_values'] = 'FAILED'
        else:
            result['lookedup_values'] = ''

        return result

    # Do the thing:
    ldesc = {}
    for colum in df.columns:
        if colum in config:
            if 'lookup' in config[colum]:
                lookup_config = config[colum]['lookup']
                desc = describe_1d(df, colum, table_stats["n"], lookup_config=lookup_config)
            else:
                desc = describe_1d(df, colum, table_stats["n"])
        else:
            desc = describe_1d(df, colum, table_stats["n"])
        ldesc.update({colum: desc})

    # Compute the correlation matrix
    if corr_reject is not None:
        computable_corrs = [colum for colum in ldesc if ldesc[colum]["type"] in {"NUM"}]

        if len(computable_corrs) > 0:
            corr = corr_matrix(df, columns=computable_corrs)
            for x, corr_x in corr.iterrows():
                for y, corr in corr_x.iteritems():
                    if x == y:
                        break
                    if corr >= corr_reject:
                        ldesc[x] = pd.Series(['CORR', y, corr],
                                             index=['type', 'correlation_var', 'correlation'],
                                             name=x)

    # Convert ldesc to a DataFrame
    variable_stats = pd.DataFrame(ldesc)

    # General statistics
    table_stats["nvar"] = len(df.columns)
    table_stats["total_missing"] = (float(variable_stats.ix["n_missing"].sum())
                                    / (table_stats["n"] * table_stats["nvar"]))
    memsize = 0
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    freq_dict = {}
    for var in variable_stats:
        if "value_counts" not in variable_stats[var]:
            pass
        elif not (variable_stats[var]["value_counts"] is np.nan):
            freq_dict[var] = variable_stats[var]["value_counts"]
        else:
            pass
    try:
        variable_stats = variable_stats.drop("value_counts")
    except (ValueError, KeyError):
        pass

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': freq_dict}
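
# Usage sketch (illustrative only, kept as a comment so it does not run on import).
# It assumes an active SparkSession named `spark`; the sample column names and the
# empty `config` dict are made up for the example.
#
#     from pyspark.sql import Row
#
#     sample_df = spark.createDataFrame([
#         Row(age=34, income=55000.0, city="Madrid"),
#         Row(age=28, income=48000.0, city="Barcelona"),
#         Row(age=45, income=None, city="Madrid"),
#     ])
#     report = describe(sample_df, bins=10, corr_reject=0.9, config={})
#     report["table"]["n"]     # number of rows
#     report["variables"]      # per-column statistics (one row per column)
#     report["freq"]           # value counts for categorical columns
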
def describe(df, bins, corr_reject, config, **kwargs):
    if not isinstance(df, SparkDataFrame):
        raise TypeError("df must be of type pyspark.sql.DataFrame")

    # Number of rows:
    table_stats = {"n": df.count()}
    if table_stats["n"] == 0:
        raise ValueError("df cannot be empty")

    try:
        # Reset the matplotlib style before use.
        # Fails in matplotlib 1.4.x, so the plot might look bad.
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(resource_filename(__name__, "spark_df_profiling.mplstyle"))

    # Function to "pretty name" floats:
    def pretty_name(x):
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    # Function to compute the correlation matrix:
    def corr_matrix(df, columns=None):
        if columns is None:
            columns = df.columns
        col_combinations = combinations(columns, 2)
        df_cleaned = df.select(*columns).na.drop(how="any")

        corr_result = pd.DataFrame(np.eye(len(columns)))
        corr_result.columns = columns
        corr_result.index = columns

        for i, j in col_combinations:
            corr_result[i][j] = corr_result[j][i] = df_cleaned.corr(str(i), str(j))
        return corr_result

    # Compute the histogram (not as easy as it looks):
    def create_hist_data(df, column, minim, maxim, bins=10):

        def create_all_conditions(current_col, column, left_edges, count=1):
            """
            Recursive function that exploits the ability to call the
            Spark SQL Column method .when() in a recursive way.
            """
            left_edges = left_edges[:]
            if len(left_edges) == 0:
                return current_col
            if len(left_edges) == 1:
                next_col = current_col.when(col(column) >= float(left_edges[0]), count)
                left_edges.pop(0)
                return create_all_conditions(next_col, column, left_edges[:], count + 1)
            next_col = current_col.when(
                (float(left_edges[0]) <= col(column)) & (col(column) < float(left_edges[1])), count)
            left_edges.pop(0)
            return create_all_conditions(next_col, column, left_edges[:], count + 1)

        num_range = maxim - minim
        bin_width = num_range / float(bins)
        left_edges = [minim]
        for _bin in range(bins):
            left_edges = left_edges + [left_edges[-1] + bin_width]
        left_edges.pop()

        expression_col = when(
            (float(left_edges[0]) <= col(column)) & (col(column) < float(left_edges[1])), 0)
        left_edges_copy = left_edges[:]
        left_edges_copy.pop(0)
        bin_data = (df.select(col(column))
                      .na.drop()
                      .select(col(column),
                              create_all_conditions(expression_col,
                                                    column,
                                                    left_edges_copy).alias("bin_id"))
                      .groupBy("bin_id").count()
                    ).toPandas()

        # If no data goes into one bin, it won't appear in
        # bin_data, so we should fill in the blanks:
        bin_data.index = bin_data["bin_id"]
        new_index = list(range(bins))
        bin_data = bin_data.reindex(new_index)
        bin_data["bin_id"] = bin_data.index
        bin_data = bin_data.fillna(0)

        # We add the left edges and bin width:
        bin_data["left_edge"] = left_edges
        bin_data["width"] = bin_width

        return bin_data

    def mini_histogram(histogram_data):
        # Small histogram
        imgdata = BytesIO()
        hist_data = histogram_data
        figure = plt.figure(figsize=(2, 0.75))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"], hist_data["count"],
                width=hist_data["width"], facecolor='#337ab7')
        plot.axes.get_yaxis().set_visible(False)
        plot.set_facecolor("w")
        xticks = plot.xaxis.get_major_ticks()
        for tick in xticks[1:-1]:
            tick.set_visible(False)
            tick.label.set_visible(False)
        for tick in (xticks[0], xticks[-1]):
            tick.label.set_fontsize(8)
        plot.figure.subplots_adjust(left=0.15, right=0.85, top=1, bottom=0.35, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        plt.close(plot.figure)
        return result_string

    def describe_integer_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                variance(col(column)).alias("variance"),
                kurtosis(col(column)).alias("kurtosis"),
                stddev(col(column)).alias("std"),
                skewness(col(column)).alias("skewness"),
                df_sum(col(column)).alias("sum"),
                count(when(col(column) == 0.0, column)).alias('n_zeros')
            ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                df_sum(col(column)).alias("sum"),
                count(when(col(column) == 0.0, column)).alias('n_zeros')
            ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(
                variance_custom(col(column), stats_df["mean"].iloc[0],
                                current_result["count"])).toPandas().iloc[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(
                skewness_custom(col(column), stats_df["mean"].iloc[0],
                                current_result["count"])).toPandas().iloc[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(
                kurtosis_custom(col(column), stats_df["mean"].iloc[0],
                                current_result["count"])).toPandas().iloc[0][0]

        for x in [0.05, 0.25, 0.5, 0.75, 0.95]:
            stats_df[pretty_name(x)] = (df.select(column)
                                          .na.drop()
                                          .selectExpr("percentile(`{col}`,CAST({n} AS DOUBLE))"
                                                      .format(col=column, n=x))
                                          .toPandas().iloc[:, 0])

        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                          .na.drop()
                          .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                          .agg(df_sum(col("delta"))).toPandas().iloc[0, 0]
                        / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"], hist_data["count"],
                width=hist_data["width"], facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        # TODO: think about writing this to disk instead of caching it in strings
        plt.close(plot.figure)
        stats['mini_histogram'] = mini_histogram(hist_data)
        return stats

    def describe_float_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                variance(col(column)).alias("variance"),
                kurtosis(col(column)).alias("kurtosis"),
                stddev(col(column)).alias("std"),
                skewness(col(column)).alias("skewness"),
                df_sum(col(column)).alias("sum"),
                count(when(col(column) == 0.0, column)).alias('n_zeros')
            ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(
                mean(col(column)).alias("mean"),
                df_min(col(column)).alias("min"),
                df_max(col(column)).alias("max"),
                df_sum(col(column)).alias("sum"),
                count(when(col(column) == 0.0, column)).alias('n_zeros')
            ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(
                variance_custom(col(column), stats_df["mean"].iloc[0],
                                current_result["count"])).toPandas().iloc[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(
                skewness_custom(col(column), stats_df["mean"].iloc[0],
                                current_result["count"])).toPandas().iloc[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(
                kurtosis_custom(col(column), stats_df["mean"].iloc[0],
                                current_result["count"])).toPandas().iloc[0][0]

        for x in [0.05, 0.25, 0.5, 0.75, 0.95]:
            stats_df[pretty_name(x)] = (df.select(column)
                                          .na.drop()
                                          .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                      .format(col=column, n=x))
                                          .toPandas().iloc[:, 0])

        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                          .na.drop()
                          .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                          .agg(df_sum(col("delta"))).toPandas().iloc[0, 0]
                        / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"], hist_data["count"],
                width=hist_data["width"], facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        # TODO: think about writing this to disk instead of caching it in strings
        plt.close(plot.figure)
        stats['mini_histogram'] = mini_histogram(hist_data)
        return stats

    def describe_date_1d(df, column):
        stats_df = df.select(column).na.drop().agg(
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max")
        ).toPandas()
        stats = stats_df.iloc[0].copy()
        stats.name = column

        # Convert the Pandas timestamp object to a regular datetime:
        if isinstance(stats["max"], pd.Timestamp):
            stats = stats.astype(object)
            stats["max"] = str(stats["max"].to_pydatetime())
            stats["min"] = str(stats["min"].to_pydatetime())
        # The range is only computed when the type is date:
        else:
            stats["range"] = stats["max"] - stats["min"]
        stats["type"] = "DATE"
        return stats

    def guess_json_type(string_value):
        try:
            obj = json.loads(string_value)
        except:
            return None
        return type(obj)

    def describe_categorical_1d(df, column):
        count_column_name = "count({c})".format(c=column)
        value_counts = (df.select(column).na.drop()
                          .groupBy(column)
                          .agg(count(col(column)))
                          .orderBy(count_column_name, ascending=False)
                        ).cache()

        # Get the top 50 classes by value count, and put the rest
        # of them grouped at the end of the Series:
        top_50 = value_counts.limit(50).toPandas().sort_values(count_column_name, ascending=False)
        stats = top_50.take([0]).rename(columns={column: 'top', count_column_name: 'freq'}).iloc[0]

        others_count = 0
        others_distinct_count = 0
        unique_categories_count = value_counts.count()
        if unique_categories_count > 50:
            others_count = (value_counts.select(df_sum(count_column_name)).toPandas().iloc[0, 0]
                            - top_50[count_column_name].sum())
            others_distinct_count = unique_categories_count - 50
        value_counts.unpersist()

        top = top_50.set_index(column)[count_column_name]
        top["***Other Values***"] = others_count
        top["***Other Values Distinct Count***"] = others_distinct_count
        stats["value_counts"] = top
        stats["type"] = "CAT"

        unparsed_valid_jsons = df.select(column).na.drop().rdd.map(
            lambda x: guess_json_type(x[column])).filter(
            lambda x: x).distinct().collect()
        stats["unparsed_json_types"] = unparsed_valid_jsons
        return stats

    def describe_constant_1d(df, column):
        stats = pd.Series(['CONST'], index=['type'], name=column)
        stats["value_counts"] = (df.select(column)
                                   .na.drop()
                                   .limit(1)).toPandas().iloc[:, 0].value_counts()
        return stats

    def describe_unique_1d(df, column):
        stats = pd.Series(['UNIQUE'], index=['type'], name=column)
        stats["value_counts"] = (df.select(column)
                                   .na.drop()
                                   .limit(50)).toPandas().iloc[:, 0].value_counts()
        return stats

    def describe_1d(df, column, nrows, lookup_config=None):
        column_type = df.select(column).dtypes[0][1]
        # TODO: think about implementing analysis for complex data types:
        if ("array" in column_type) or ("struct" in column_type) or ("map" in column_type):
            raise NotImplementedError("Column {c} is of type {t} and cannot be analyzed"
                                      .format(c=column, t=column_type))

        results_data = df.select(countDistinct(col(column)).alias("distinct_count"),
                                 count(col(column)).alias("count")).toPandas()
        results_data["p_unique"] = results_data["distinct_count"] / float(results_data["count"])
        results_data["is_unique"] = results_data["distinct_count"] == nrows
        results_data["n_missing"] = nrows - results_data["count"]
        results_data["p_missing"] = results_data["n_missing"] / float(nrows)
        results_data["p_infinite"] = 0
        results_data["n_infinite"] = 0
        result = results_data.iloc[0].copy()
        result["memorysize"] = 0
        result.name = column

        if result["distinct_count"] <= 1:
            result = result.append(describe_constant_1d(df, column))
        elif column_type in {"tinyint", "smallint", "int", "bigint"}:
            result = result.append(describe_integer_1d(df, column, result, nrows))
        elif column_type in {"float", "double", "decimal"}:
            result = result.append(describe_float_1d(df, column, result, nrows))
        elif column_type in {"date", "timestamp"}:
            result = result.append(describe_date_1d(df, column))
        elif result["is_unique"] == True:
            result = result.append(describe_unique_1d(df, column))
        else:
            result = result.append(describe_categorical_1d(df, column))

        # Fix to also count the MISSING value in the distinct_count field:
        if result["n_missing"] > 0:
            result["distinct_count"] = result["distinct_count"] + 1

        # TODO: check whether it is worth it to implement the "real" mode:
        if (result["count"] > result["distinct_count"] > 1):
            try:
                result["mode"] = result["top"]
            except KeyError:
                result["mode"] = 0
        else:
            try:
                result["mode"] = result["value_counts"].index[0]
            except KeyError:
                result["mode"] = 0
            # If an IndexError happens, it is because the column contains only NULLs:
            except IndexError:
                result["mode"] = "MISSING"

        if lookup_config:
            lookup_object = lookup_config['object']
            col_name_in_db = (lookup_config['col_name_in_db']
                              if 'col_name_in_db' in lookup_config else None)
            try:
                matched, unmatched = lookup_object.lookup(df.select(column), col_name_in_db)
                result['lookedup_values'] = str(matched.count()) + "/" + str(df.select(column).count())
            except:
                result['lookedup_values'] = 'FAILED'
        else:
            result['lookedup_values'] = ''

        return result

    # Do the thing:
    ldesc = {}
    for colum in df.columns:
        if colum in config:
            if 'lookup' in config[colum]:
                lookup_config = config[colum]['lookup']
                desc = describe_1d(df, colum, table_stats["n"], lookup_config=lookup_config)
            else:
                desc = describe_1d(df, colum, table_stats["n"])
        else:
            desc = describe_1d(df, colum, table_stats["n"])
        ldesc.update({colum: desc})

    # Compute the correlation matrix
    if corr_reject is not None:
        computable_corrs = [colum for colum in ldesc if ldesc[colum]["type"] in {"NUM"}]

        if len(computable_corrs) > 0:
            corr = corr_matrix(df, columns=computable_corrs)
            for x, corr_x in corr.iterrows():
                for y, corr in corr_x.iteritems():
                    if x == y:
                        break
                    if corr >= corr_reject:
                        ldesc[x] = pd.Series(['CORR', y, corr],
                                             index=['type', 'correlation_var', 'correlation'],
                                             name=x)

    # Convert ldesc to a DataFrame
    variable_stats = pd.DataFrame(ldesc)

    # General statistics
    table_stats["nvar"] = len(df.columns)
    table_stats["total_missing"] = (float(variable_stats.loc["n_missing"].sum())
                                    / (table_stats["n"] * table_stats["nvar"]))
    memsize = 0
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    freq_dict = {}
    for var in variable_stats:
        if "value_counts" not in variable_stats[var]:
            pass
        elif not (variable_stats[var]["value_counts"] is np.nan):
            freq_dict[var] = variable_stats[var]["value_counts"]
        else:
            pass
    try:
        variable_stats = variable_stats.drop("value_counts")
    except (ValueError, KeyError):
        pass

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': freq_dict}
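
# Sketch of the per-column `config` structure consumed by the loop above
# (illustrative comment only; the names below are hypothetical). A column may
# carry a 'lookup' entry whose 'object' exposes a
# .lookup(single_column_df, col_name_in_db) method returning a
# (matched, unmatched) pair of DataFrames, as used inside describe_1d;
# 'col_name_in_db' is optional and defaults to None.
#
#     config = {
#         "country_code": {
#             "lookup": {
#                 "object": reference_table_lookup,   # hypothetical lookup object
#                 "col_name_in_db": "iso_alpha2",     # hypothetical column name
#             }
#         }
#     }
#     profile = describe(df, bins=10, corr_reject=0.9, config=config)
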
def describe(df, bins=10, corr_reject=0.9, **kwargs):
    if not isinstance(df, SparkDataFrame):
        raise TypeError('df must be of type pyspark.sql.DataFrame')

    # Number of rows:
    table_stats = {'n': df.count()}
    if table_stats['n'] == 0:
        raise ValueError('df cannot be empty')

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use('default')
    except:
        pass

    matplotlib.style.use(
        resource_filename(__name__, 'spark_df_profiling.mplstyle'))

    # Data profiling
    k_vals, t_freq = kwargs.get('k_vals') or {}, kwargs.get('t_freq') or {}
    ldesc = {
        column: describe_1d(df, bins, column, table_stats['n'],
                            k_vals.get(column, 2), t_freq.get(column, 'D'))
        for column in df.columns
    }

    # Compute correlation matrix
    if corr_reject is not None:
        computable_corrs = [
            column for column in ldesc if ldesc[column]['type'] == 'NUM'
        ]

        if len(computable_corrs) > 0:
            corr = corr_matrix(df, columns=computable_corrs)
            for x, corr_x in corr.iterrows():
                for y, corr in corr_x.iteritems():
                    if x == y:
                        break
                    if corr >= corr_reject:
                        ldesc[x] = pd.Series(
                            ['CORR', y, corr],
                            index=['type', 'correlation_var', 'correlation'],
                            name=x)

    # Convert ldesc to a DataFrame
    variable_stats = pd.DataFrame(ldesc)

    # General statistics
    table_stats['nvar'] = len(df.columns)
    table_stats['total_missing'] = float(
        variable_stats.ix['n_missing'].sum()) / (table_stats['n'] *
                                                 table_stats['nvar'])
    table_stats['accuracy_idx'] = 1 - (
        (variable_stats.ix['high_idx'] + variable_stats.ix['low_idx']) /
        variable_stats.ix['count']).mean(skipna=True)
    memsize = 0
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])
    table_stats.update(
        {k: 0 for k in ('NUM', 'DATE', 'CONST', 'CAT', 'UNIQUE', 'CORR')})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    freq_dict = {}
    for var in variable_stats:
        if 'value_counts' not in variable_stats[var]:
            pass
        elif variable_stats[var]['value_counts'] is not np.nan:
            freq_dict[var] = variable_stats[var]['value_counts']
        else:
            pass
    try:
        variable_stats = variable_stats.drop('value_counts')
    except ValueError:
        pass

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': freq_dict
    }
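
# Usage sketch for this variant (illustrative comment only). `k_vals` and `t_freq`
# are optional per-column dictionaries forwarded to describe_1d, which falls back
# to 2 and 'D' respectively when a column is absent; the column names and override
# values below are hypothetical.
#
#     results = describe(
#         df,
#         bins=10,
#         corr_reject=0.9,
#         k_vals={"income": 3},          # per-column override of the default 2
#         t_freq={"created_at": "H"},    # per-column override of the default 'D'
#     )
#     results["table"]["accuracy_idx"]
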