def generate_html_detail_text(feature_dict: dict, compare_dict: dict, dataframe_report):
    """Render the detail panel of a text feature.

    Computes the column layout, stores the first ``detail_max_text_rows``
    entries of ``feature_dict["detail"]["full_count"]`` under
    ``feature_dict["detail"]["detail_count"]``, possibly appends an aggregated
    "others" row, and renders ``feature_detail_text.html``.

    Args:
        feature_dict: Feature data. ``detail.full_count`` and
            ``base_stats.num_values`` are read; ``detail.detail_count`` is
            written and the ``name`` of every displayed row is clipped in
            place.
        compare_dict: Data of the comparison dataset, or ``None`` when there
            is no comparison.
        dataframe_report: Not used by this function.

    Returns:
        The output of the template's ``render`` call.
    """
    template = jinja2_env.get_template('feature_detail_text.html')
    layout = config["Layout"]
    has_compare = compare_dict is not None

    # Column layout
    # ------------------------------------
    # The text column moves right when a comparison pair is displayed.
    cols = {}
    x_pos = layout.getint("pair_spacing")
    col_spacing = layout.getint("col_spacing")
    if has_compare:
        cols["compare"] = x_pos
        x_pos += layout.getint("pair_spacing")
    x_pos += col_spacing
    cols["text"] = x_pos
    cols["text_width"] = layout.getint("detail_text_max_width") - x_pos
    cols["full_text_width"] = layout.getint("detail_text_max_width")

    # Rows to display
    # ------------------------------------
    row_limit = config["Detail_Stats"].getint("detail_max_text_rows")
    shown_rows = feature_dict["detail"]["full_count"][:row_limit]
    feature_dict["detail"]["detail_count"] = shown_rows

    # Names are clipped for memory purposes only; the browser handles the
    # actual display. The row dicts are shared with "full_count", so the
    # clipping is visible there as well.
    name_limit = config["Summary_Stats"].getint("text_max_string_len")
    for shown in shown_rows:
        shown["name"] = shown["name"][:name_limit]

    # "Others" row: only considered when the list was filled up to the limit.
    if len(shown_rows) == row_limit:
        source_total = feature_dict["base_stats"]["num_values"].number
        source_shown = sum(entry["count"].number for entry in shown_rows)
        others_row = {
            "name": OTHERS_GROUPED.strip(),
            "count": NumWithPercent(source_total - source_shown, source_total),
            "target_stats": None,
            "target_stats_compare": None,
        }
        if has_compare:
            compare_total = compare_dict["base_stats"]["num_values"].number
            # Rows without a comparison count contribute 0.
            compare_shown = sum(
                (entry["count_compare"].number if entry.get("count_compare") else 0)
                for entry in shown_rows)
            others_row["count_compare"] = NumWithPercent(
                compare_total - compare_shown, compare_total)
        else:
            others_row["count_compare"] = None

        # Keep the row only if something is left over on either side.
        compare_part = others_row.get("count_compare")
        if others_row["count"].number > 0 or (compare_part and compare_part.number > 0):
            shown_rows.append(others_row)

    return template.render(feature_dict=feature_dict, compare_dict=compare_dict,
                           cols=cols)
def do_detail_numeric(series: pd.Series, counts: dict, counts_compare: dict, updated_dict: dict):
    """Fill ``updated_dict["detail"]`` with value lists for a numeric feature.

    Three lists are written: ``frequent_values`` (the first entries of
    ``counts["value_counts_without_nan"]``), ``min_values`` (the entries with
    the lowest index values) and ``max_values`` (the entries with the highest
    index values). Every entry is a tuple
    ``(value, NumWithPercent(count, total), comparison)`` where ``comparison``
    is ``None`` when ``counts_compare`` is ``None``.

    Args:
        series: Not used by this function.
        counts: Must provide ``"value_counts_without_nan"``.
        counts_compare: Same structure for the comparison dataset, or ``None``.
        updated_dict: Must provide ``base_stats.num_values`` (and
            ``compare.base_stats.num_values`` when comparing); receives the
            ``"detail"`` entry.
    """
    detail = updated_dict["detail"] = {}
    source_total = float(updated_dict["base_stats"]["num_values"])
    top_n = config["Detail_Stats"].getint("max_num_numeric_top_values")
    for list_name in ("frequent_values", "min_values", "max_values"):
        detail[list_name] = []

    most_frequent = pd.DataFrame(counts["value_counts_without_nan"].head(top_n))
    smallest = pd.DataFrame(
        counts["value_counts_without_nan"].sort_index(ascending=True).head(top_n))
    largest = pd.DataFrame(
        counts["value_counts_without_nan"].sort_index(ascending=False)).head(top_n)

    compare_counts = None
    if counts_compare is not None:
        compare_counts = counts_compare["value_counts_without_nan"]
        compare_total = float(updated_dict["compare"]["base_stats"]["num_values"])

    def compare_entry(value):
        # No comparison dataset: "None" marks the absence of a value.
        if compare_counts is None:
            return None
        try:
            found = compare_counts.get(value)
        except TypeError:
            # Happens when the source dataset has ints only but the compare
            # dataset has floats. An earlier workaround cast the key to int64,
            # which could create false matches, so the lookup is simply
            # treated as "not found".
            found = None
        if found is None:
            # There is a comparison array but no matching value: insert 0.
            return NumWithPercent(0, compare_total)
        return NumWithPercent(found, compare_total)

    row_triples = zip(most_frequent.itertuples(), smallest.itertuples(),
                      largest.itertuples())
    for frequent_row, min_row, max_row in row_triples:
        # Each tuple row is (index value, count).
        for list_name, row in (("frequent_values", frequent_row),
                               ("min_values", min_row),
                               ("max_values", max_row)):
            detail[list_name].append(
                (row[0], NumWithPercent(row[1], source_total), compare_entry(row[0])))
def get_comparison_num(feature_name):
    """Return the comparison dataset's count for ``feature_name``.

    ``this_compare_count`` and ``compare_total_num`` are free variables that
    must be defined in the surrounding scope.

    Args:
        feature_name: Value to look up in ``this_compare_count``.

    Returns:
        ``None`` when ``this_compare_count`` is ``None`` (no comparison);
        otherwise a ``NumWithPercent`` built from the matching count, or from
        0 when the value has no match.
    """
    if this_compare_count is None:
        return None
    try:
        match = this_compare_count.get(feature_name)
    except TypeError:
        # ``get`` only turns "key not found" into None; a key whose type does
        # not fit the index (e.g. ints in the source, floats in the compare
        # data) can raise instead. Treat that as "no matching value".
        match = None
    if match is None:
        # There is a comparison array but no matching value: report 0
        # ("None" is reserved for the absence of a comparison).
        match = 0
    return NumWithPercent(match, compare_total_num)
def add_series_base_stats_to_dict(series: pd.Series, counts: dict, updated_dict: dict) -> dict:
    """Store the basic row statistics of ``series`` in ``updated_dict``.

    Resets ``updated_dict["stats"]`` to an empty dict and fills
    ``updated_dict["base_stats"]`` with ``total_rows``, ``num_values``,
    ``num_missing``, ``num_zeroes`` and ``num_distinct``. Every entry except
    ``total_rows`` is a ``NumWithPercent`` relative to the total row count.

    Args:
        series: The feature's values; used to count the entries equal to 0.
        counts: Must provide ``"num_rows_total"``, ``"num_rows_with_data"``
            and ``"distinct_count_without_nan"``.
        updated_dict: Dictionary to update in place.

    Returns:
        ``updated_dict``, as announced by the signature's return annotation.
    """
    updated_dict["stats"] = dict()
    base_stats = updated_dict["base_stats"] = dict()

    num_total = counts["num_rows_total"]
    non_nan = counts["num_rows_with_data"]
    # Count the rows equal to zero. Summing the matching *values* would
    # always give 0, so the boolean mask itself is summed (True counts as 1).
    num_zeros = int((series == 0).sum())

    base_stats["total_rows"] = num_total
    base_stats["num_values"] = NumWithPercent(non_nan, num_total)
    base_stats["num_missing"] = NumWithPercent(num_total - non_nan, num_total)
    base_stats["num_zeroes"] = NumWithPercent(num_zeros, num_total)
    base_stats["num_distinct"] = NumWithPercent(counts["distinct_count_without_nan"], num_total)
    return updated_dict
def do_detail_numeric(series: pd.Series, counts: dict, counts_compare: dict, updated_dict: dict):
    """Collect the most frequent, smallest and largest values of a numeric feature.

    Writes ``updated_dict["detail"]`` with the lists ``frequent_values``,
    ``min_values`` and ``max_values``. Each list holds up to
    ``max_num_numeric_top_values`` tuples of the form
    ``(value, NumWithPercent(count, total), comparison)``; ``comparison`` is
    ``None`` when no comparison dataset is given.

    Args:
        series: Not used by this function.
        counts: Must provide ``"value_counts_without_nan"``.
        counts_compare: Same structure for the comparison dataset, or ``None``.
        updated_dict: Must provide ``base_stats.num_values`` (and
            ``compare.base_stats.num_values`` when comparing); receives the
            ``"detail"`` entry.
    """
    updated_dict["detail"] = dict()
    detail = updated_dict["detail"]
    total_num = float(updated_dict["base_stats"]["num_values"])
    num_to_show = config["Detail_Stats"].getint("max_num_numeric_top_values")
    detail["frequent_values"] = list()
    detail["min_values"] = list()
    detail["max_values"] = list()

    value_counts = counts["value_counts_without_nan"]
    frequent_values = pd.DataFrame(value_counts.head(num_to_show))
    min_values = pd.DataFrame(value_counts.sort_index(ascending=True).head(num_to_show))
    max_values = pd.DataFrame(value_counts.sort_index(ascending=False)).head(num_to_show)

    if counts_compare is not None:
        this_compare_count = counts_compare["value_counts_without_nan"]
        compare_total_num = float(updated_dict["compare"]["base_stats"]["num_values"])
    else:
        this_compare_count = None

    # Defined once, outside the loop: it only depends on values fixed above.
    def get_comparison_num(feature_name):
        if this_compare_count is None:
            # "None" is the absence of a comparison.
            return None
        try:
            this_comparison = this_compare_count.get(feature_name)
        except TypeError:
            # ``get`` can raise instead of returning None when the key type
            # does not fit the index (e.g. ints in the source, floats in the
            # compare data). Treat that as "no matching value".
            this_comparison = None
        if this_comparison is None:
            # There is a comparison array but no matching value: insert 0.
            this_comparison = 0
        return NumWithPercent(this_comparison, compare_total_num)

    for frequent, min_value, max_value in zip(frequent_values.itertuples(),
                                              min_values.itertuples(),
                                              max_values.itertuples()):
        # Each tuple row is (index value, count).
        detail["frequent_values"].append(
            (frequent[0], NumWithPercent(frequent[1], total_num),
             get_comparison_num(frequent[0])))
        detail["min_values"].append(
            (min_value[0], NumWithPercent(min_value[1], total_num),
             get_comparison_num(min_value[0])))
        detail["max_values"].append(
            (max_value[0], NumWithPercent(max_value[1], total_num),
             get_comparison_num(max_value[0])))
def summarize_dataframe(self, source: pd.DataFrame, name: str, target_dict: dict, skip: List[str]):
    """Write summary figures about ``source`` into ``target_dict``.

    Keys written: ``name``, ``num_rows``, ``num_columns``,
    ``num_skipped_columns``, ``memory_total``, ``memory_single_row`` and
    ``duplicates``.

    Args:
        source: DataFrame to summarize.
        name: Stored as-is under ``"name"``.
        target_dict: Dictionary updated in place.
        skip: Column names counted as skipped when present in ``source``.
    """
    num_rows = len(source)
    memory_total = source.memory_usage(index=True, deep=True).sum()

    target_dict["name"] = name
    target_dict["num_rows"] = num_rows
    target_dict["num_columns"] = len(source.columns)
    # Columns of ``source`` that appear in ``skip``.
    target_dict["num_skipped_columns"] = sum(
        1 for column in source.columns if column in skip)
    target_dict["memory_total"] = memory_total
    # An empty frame has no rows to average over; report 0.0 rather than
    # raising ZeroDivisionError.
    target_dict["memory_single_row"] = \
        float(memory_total) / num_rows if num_rows else 0.0
    # Vectorized count of duplicated rows, converted to a plain int.
    # NOTE(review): for an empty frame NumWithPercent receives a total of 0 --
    # confirm it tolerates that.
    target_dict["duplicates"] = NumWithPercent(int(source.duplicated().sum()), num_rows)
def get_comparison_num(feature_name):
    """Look up ``feature_name`` in the comparison value counts.

    ``this_compare_count`` and ``compare_total_num`` are free variables that
    must be defined in the surrounding scope.

    Returns:
        ``None`` when ``this_compare_count`` is ``None``; otherwise a
        ``NumWithPercent`` of the matching count, or of 0 when nothing matches.
    """
    if this_compare_count is None:
        # "None" is the absence of a comparison value.
        return None

    matched = None
    try:
        matched = this_compare_count.get(feature_name)
    except TypeError:
        # Workaround for cases where the source dataset has ints only but the
        # compare dataset has floats. Casting the key to int64 was tried
        # before and dropped because it could create false matches, so the
        # value simply stays unmatched.
        pass

    # A comparison array without a matching value is reported as 0.
    count = 0 if matched is None else matched
    return NumWithPercent(count, compare_total_num)
def do_detail_text(to_process: FeatureToProcess, updated_dict: dict):
    """Build the per-value count rows of a text feature.

    Writes ``updated_dict["detail"]["full_count"]``: one dict per entry of
    ``to_process.source_counts["value_counts_without_nan"]`` with the keys
    ``name`` (HTML-escaped string form of the value), ``count``,
    ``count_compare``, ``target_stats`` and ``target_stats_compare``. The two
    target entries are always ``None``; ``count_compare`` is set only when a
    comparison dataset exists and contains the value.

    Args:
        to_process: Provides ``source_counts`` and ``compare_counts`` (the
            latter may be ``None``).
        updated_dict: Must provide ``base_stats.num_values`` (and
            ``compare.base_stats.num_values`` when comparing); receives the
            ``"detail"`` entry.
    """
    updated_dict["detail"] = dict()
    detail = updated_dict["detail"]

    # Compute COUNT stats (i.e. below graph)
    # ----------------------------------------------------------------------------------------------
    detail["full_count"] = []

    num_values = updated_dict["base_stats"]["num_values"].number
    compare_value_counts = None
    if to_process.compare_counts is not None:
        num_values_compare = updated_dict["compare"]["base_stats"]["num_values"].number
        compare_value_counts = to_process.compare_counts["value_counts_without_nan"]

    # Iterate through ALL VALUES and get stats.
    # ``items()`` replaces ``iteritems()``, which pandas 2.0 removed.
    for value, count in to_process.source_counts["value_counts_without_nan"].items():
        row = {
            # Escaped for display only.
            "name": html.escape(str(value)),
            "count": NumWithPercent(count, num_values),
            # Defaults to no comparison or target
            "count_compare": None,
            "target_stats": None,
            "target_stats_compare": None,
        }

        # HAS COMPARE and the value exists in COMPARE. The lookup uses the raw
        # value: the escaped display name differs from it as soon as the text
        # contains characters such as "&" or "<", and would never match.
        if compare_value_counts is not None and value in compare_value_counts.index:
            row["count_compare"] = NumWithPercent(compare_value_counts.loc[value],
                                                  num_values_compare)
        detail["full_count"].append(row)
def do_detail_categorical(to_process: FeatureToProcess, updated_dict: dict):
    """Build the per-category detail rows of a categorical feature.

    Writes ``updated_dict["detail"]`` with:

    * ``full_count``: one row dict per entry returned by
      ``utils.get_clamped_value_counts`` plus a final ``"ALL"`` row. Each row
      has the keys ``name``, ``count``, ``count_compare``, ``target_stats``,
      ``target_stats_compare`` and ``is_total`` (``True`` only on the "ALL"
      row).
    * ``max_range``: the largest numeric-target mean found among the category
      rows, starting from 0.

    Args:
        to_process: Provides ``source``, ``source_counts``, ``source_target``,
            ``compare``, ``compare_counts``, ``compare_target`` and
            ``predetermined_type_target``; the target/compare members may be
            ``None``.
        updated_dict: Must provide ``base_stats.num_values`` (and
            ``compare.base_stats.num_values`` when comparing); receives the
            ``"detail"`` entry.
    """
    updated_dict["detail"] = dict()
    detail = updated_dict["detail"]

    # Compute COUNT stats (i.e. below graph)
    # ----------------------------------------------------------------------------------------------
    detail["full_count"] = []

    # To get percentages
    num_values = updated_dict["base_stats"]["num_values"].number
    if to_process.compare_counts is not None:
        num_values_compare = updated_dict["compare"]["base_stats"]["num_values"].number

    # Presumably limits the number of categories and groups the remainder
    # under OTHERS_GROUPED -- confirm against utils.get_clamped_value_counts.
    category_counts = utils.get_clamped_value_counts(
        to_process.source_counts["value_counts_without_nan"],
        config["Graphs"].getint("detail_graph_max_categories"))

    # Iterate through ALL VALUES and get stats
    # NOTE(review): despite its name, no abs() is applied to the values fed
    # into max_abs_value, so negative means never raise "max_range" -- confirm.
    max_abs_value = 0
    # ``items()`` replaces ``iteritems()``, which pandas 2.0 removed.
    for item in category_counts.items():
        row = dict()
        row["name"] = item[0]
        row["count"] = NumWithPercent(item[1], num_values)
        # Defaults to no comparison or target
        row["count_compare"] = None
        row["target_stats"] = None
        row["target_stats_compare"] = None
        row["is_total"] = None

        if to_process.source_target is not None:
            # HAS TARGET
            # TODO: OPTIMIZE: CACHE FROM GRAPH?
            if row["name"] == OTHERS_GROUPED:
                # Target values of every row whose category is not listed.
                this_value_target_only = to_process.source_target[
                    ~to_process.source.isin(category_counts.keys())]
            else:
                this_value_target_only = to_process.source_target[
                    to_process.source == row["name"]]
            if to_process.predetermined_type_target == FeatureType.TYPE_BOOL:
                # Boolean target: sum of the target values over their count.
                count_this_value_target_only = float(this_value_target_only.count())
                count_true = this_value_target_only.sum()
                row["target_stats"] = NumWithPercent(count_true,
                                                     count_this_value_target_only)
            elif to_process.predetermined_type_target == FeatureType.TYPE_NUM:
                # Numeric target: mean of the target for this category.
                row["target_stats"] = NumWithPercent(this_value_target_only.mean(), 1.0)
                max_abs_value = max(max_abs_value, row["target_stats"].number)

        if to_process.compare_counts is not None:
            # HAS COMPARE...
            if row["name"] in to_process.compare_counts["value_counts_without_nan"].index:
                # ...and value exists in COMPARE
                matching = to_process.compare_counts["value_counts_without_nan"][row["name"]]
                row["count_compare"] = NumWithPercent(matching, num_values_compare)
                # NOTE(review): the compare-target statistics are computed only
                # for values present in the compare counts; with this nesting
                # the OTHERS_GROUPED branch below is only reached if that label
                # is itself in the compare index -- confirm the intended nesting.
                if to_process.compare_target is not None:
                    # TODO: OPTIMIZE: CACHE FROM GRAPH?
                    if row["name"] == OTHERS_GROUPED:
                        this_value_target_only = to_process.compare_target[
                            ~to_process.compare.isin(category_counts.keys())]
                    else:
                        this_value_target_only = to_process.compare_target[
                            to_process.compare == row["name"]]
                    # HAS COMPARE-TARGET
                    if to_process.predetermined_type_target == FeatureType.TYPE_BOOL:
                        count_this_value_target_only = float(
                            this_value_target_only.count())
                        count_true = this_value_target_only.sum()
                        row["target_stats_compare"] = NumWithPercent(
                            count_true, count_this_value_target_only)
                    elif to_process.predetermined_type_target == FeatureType.TYPE_NUM:
                        row["target_stats_compare"] = NumWithPercent(
                            this_value_target_only.mean(), 1.0)
                        max_abs_value = max(max_abs_value,
                                            row["target_stats_compare"].number)
        detail["full_count"].append(row)
    detail["max_range"] = max_abs_value

    # "ALL" row
    # -----------------------------------------------
    row = dict()
    row["name"] = "ALL"
    row["count"] = NumWithPercent(num_values, num_values)
    # Defaults to no comparison or target
    row["count_compare"] = None
    row["target_stats"] = None
    row["target_stats_compare"] = None
    row["is_total"] = True

    if to_process.source_target is not None:
        # HAS TARGET
        if to_process.predetermined_type_target == FeatureType.TYPE_BOOL:
            # TODO: OPTIMIZE: CACHE FROM GRAPH?
            count_this_value_target_only = float(to_process.source_target.count())
            count_true = to_process.source_target.sum()
            row["target_stats"] = NumWithPercent(count_true,
                                                 count_this_value_target_only)
        elif to_process.predetermined_type_target == FeatureType.TYPE_NUM:
            # TODO: OPTIMIZE: CACHE FROM GRAPH?
            row["target_stats"] = NumWithPercent(to_process.source_target.mean(), 1.0)

    if to_process.compare_counts is not None:
        row["count_compare"] = NumWithPercent(num_values_compare, num_values_compare)
        if to_process.compare_target is not None:
            # HAS COMPARE-TARGET
            if to_process.predetermined_type_target == FeatureType.TYPE_BOOL:
                # TODO: OPTIMIZE: CACHE FROM GRAPH?
                count_this_value_target_only = float(to_process.compare_target.count())
                count_true = to_process.compare_target.sum()
                row["target_stats_compare"] = NumWithPercent(
                    count_true, count_this_value_target_only)
            elif to_process.predetermined_type_target == FeatureType.TYPE_NUM:
                # TODO: OPTIMIZE: CACHE FROM GRAPH?
                row["target_stats_compare"] = NumWithPercent(
                    to_process.compare_target.mean(), 1.0)
    detail["full_count"].append(row)