def analyze_feature_to_dictionary(to_process: FeatureToProcess) -> dict:
    """Fully analyze one feature (and its optional compare/target series).

    Builds and returns the feature dictionary used by the report:
    name/order/is_target metadata, detected type, base stats, and the
    type-specific analysis produced by the matching series analyzer.

    :param to_process: bundle of source series, optional compare series,
        optional targets and a possibly predetermined type.
    :return: the populated feature dictionary.
    :raises ValueError: if a target length mismatches its series, or the
        detected feature type has no analyzer.
    """
    # Validation: targets must align 1:1 with the series they describe
    if to_process.source_target is not None and to_process.source is not None:
        if len(to_process.source_target) != len(to_process.source):
            raise ValueError("Source target length does not match source series length.")
    if to_process.compare_target is not None and to_process.compare is not None:
        if len(to_process.compare_target) != len(to_process.compare):
            raise ValueError("Compare target length does not match compare series length.")

    # Initialize some dictionary values
    returned_feature_dict = dict()
    returned_feature_dict["name"] = to_process.source.name
    returned_feature_dict["order_index"] = to_process.order
    # Order -1 is the sentinel used for the target feature
    returned_feature_dict["is_target"] = True if to_process.order == -1 else False

    # Determine SOURCE feature type
    to_process.source_counts = get_counts(to_process.source)
    returned_feature_dict["type"] = determine_feature_type(
        to_process.source, to_process.source_counts,
        to_process.predetermined_type, "SOURCE")

    # Determine COMPARED feature type & initialize
    compare_dict = None
    if to_process.compare is not None:
        to_process.compare_counts = get_counts(to_process.compare)
        # The compared side must resolve to the same type as the source side
        compare_type = determine_feature_type(
            to_process.compare, to_process.compare_counts,
            returned_feature_dict["type"], "COMPARED")
        # Explicitly show missing categories on each set
        if compare_type == FeatureType.TYPE_CAT or compare_type == FeatureType.TYPE_BOOL:
            fill_out_missing_counts_in_other_series(
                to_process.compare_counts, to_process.source_counts)
            fill_out_missing_counts_in_other_series(
                to_process.source_counts, to_process.compare_counts)
        returned_feature_dict["compare"] = dict()
        compare_dict = returned_feature_dict["compare"]
        compare_dict["type"] = compare_type

    # Establish base stats
    add_series_base_stats_to_dict(to_process.source, to_process.source_counts,
                                  returned_feature_dict)
    if to_process.compare is not None:
        add_series_base_stats_to_dict(to_process.compare, to_process.compare_counts,
                                      compare_dict)

    # Perform full analysis on source/compare/target
    feature_type = returned_feature_dict["type"]
    if feature_type == FeatureType.TYPE_NUM:
        sweetviz.series_analyzer_numeric.analyze(to_process, returned_feature_dict)
    elif feature_type in (FeatureType.TYPE_CAT, FeatureType.TYPE_BOOL):
        # CAT and BOOL share the categorical analyzer (branches merged)
        sweetviz.series_analyzer_cat.analyze(to_process, returned_feature_dict)
    elif feature_type == FeatureType.TYPE_TEXT:
        sweetviz.series_analyzer_text.analyze(to_process, returned_feature_dict)
    else:
        raise ValueError(f"Unknown feature type for '{to_process.source.name}': {feature_type}")
    return returned_feature_dict
def analyze(to_process: FeatureToProcess, feature_dict: dict):
    """Analyze a TEXT feature: compute detail stats and render its HTML summary.

    Mutates ``feature_dict`` in place, adding "stats" (and "compare" stats
    when a comparison is present) plus "html_summary".

    :raises ValueError: if the feature is the report target — TEXT features
        cannot serve as targets.
    """
    compare_dict = feature_dict.get("compare")
    feature_dict["stats"] = dict()
    if compare_dict:
        compare_dict["stats"] = dict()

    do_detail_text(to_process, feature_dict)

    if to_process.is_target():
        # Fix: was a bare `raise ValueError` with no explanation
        raise ValueError("TEXT features cannot be used as the report target.")
    feature_dict["html_summary"] = sv_html.generate_html_summary_text(
        feature_dict, compare_dict)
def analyze(to_process: FeatureToProcess, feature_dict: dict):
    """Analyze a CATEGORICAL/BOOL feature and attach graphs + HTML summary.

    Mutates ``feature_dict`` in place: adds "stats" dicts (source, and
    compare when present), a "minigraph", one detail graph, and the
    rendered "html_summary" (target-specific template when applicable).
    """
    comparison = feature_dict.get("compare")
    feature_dict["stats"] = {}
    if comparison:
        comparison["stats"] = {}

    do_detail_categorical(to_process, feature_dict)

    # Summary (mini) graph plus the single detail graph
    feature_dict["minigraph"] = GraphCat("mini", to_process)
    feature_dict["detail_graphs"] = [GraphCat("detail", to_process)]

    # Targets get their own summary template
    render = (sv_html.generate_html_summary_target_cat
              if to_process.is_target()
              else sv_html.generate_html_summary_cat)
    feature_dict["html_summary"] = render(feature_dict, comparison)
def analyze(to_process: FeatureToProcess, feature_dict: dict):
    """Analyze a NUMERIC feature and attach stats, graphs and HTML summary.

    Mutates ``feature_dict`` in place: numeric stats for source (and
    compare when present), the detail breakdown, a "minigraph", one
    detail graph per bin setting, and the rendered "html_summary".
    """
    do_stats_numeric(to_process.source, feature_dict)
    comparison = feature_dict.get("compare")
    if comparison:
        do_stats_numeric(to_process.compare, comparison)

    do_detail_numeric(to_process.source, to_process.source_counts,
                      to_process.compare_counts, feature_dict)

    feature_dict["minigraph"] = GraphNumeric("mini", to_process)

    # One detail graph per bin count; 0 means "auto" binning
    feature_dict["detail_graphs"] = []
    for bin_count in (0, 5, 15, 30):
        graph = GraphNumeric(f"detail-{bin_count}", to_process)
        if graph:
            feature_dict["detail_graphs"].append(graph)

    if to_process.is_target():
        feature_dict["html_summary"] = sv_html.generate_html_summary_target_numeric(
            feature_dict, comparison)
    else:
        feature_dict["html_summary"] = sv_html.generate_html_summary_numeric(
            feature_dict, comparison)
def __init__(self, source: Union[pd.DataFrame, Tuple[pd.DataFrame, str]],
             target_feature_name: str = None,
             compare: Union[pd.DataFrame, Tuple[pd.DataFrame, str]] = None,
             pairwise_analysis: str = 'auto',
             fc: FeatureConfig = None):
    """Build a full report from a dataframe (and optional compare dataframe).

    :param source: dataframe, or ``[dataframe, "Name"]`` pair.
    :param target_feature_name: column to treat as the analysis target
        (must not contain NaN, must not be skipped).
    :param compare: optional comparison dataframe, or ``[dataframe, "Name"]``.
    :param pairwise_analysis: "on", "auto" or "off" — whether to run the
        (quadratic-cost) pairwise association pass.
    :param fc: optional FeatureConfig (skips, forced types).
    :raises ValueError: on bad parameters, duplicate columns, NaN targets.
    :raises KeyError: if the target column is not found.

    Fix applied: ``DataFrame.iteritems()`` (removed in pandas 2.0) replaced
    with the drop-in ``DataFrame.items()``.
    """
    pairwise_analysis = pairwise_analysis.lower()
    if pairwise_analysis not in ["on", "auto", "off"]:
        raise ValueError('"pairwise_analysis" parameter should be one of: "on", "auto", "off"')

    sv_html.load_layout_globals_from_config()

    self._jupyter_html = ""
    self._page_html = ""
    self._features = dict()
    self.compare_name = None
    self._target = None
    self.test_mode = False
    if fc is None:
        fc = FeatureConfig()

    # Associations: _associations[FEATURE][GIVES INFORMATION ABOUT THIS FEATURE]
    self._associations = dict()
    self._associations_compare = dict()
    self._association_graphs = dict()
    self._association_graphs_compare = dict()

    # Handle source and compare dataframes and names
    if type(source) == pd.DataFrame:
        source_df = source
        self.source_name = "DataFrame"
    elif type(source) == list or type(source) == tuple:
        if len(source) != 2:
            raise ValueError('"source" parameter should either be a string or a list of 2 elements: [dataframe, "Name"].')
        source_df = source[0]
        self.source_name = source[1]
    else:
        raise ValueError('"source" parameter should either be a string or a list of 2 elements: [dataframe, "Name"].')
    if len(su.get_duplicate_cols(source_df)) > 0:
        raise ValueError('Duplicate column names detected in "source"; this is not supported.')

    # Rename columns that use the reserved name "index" (idea from
    # pandas-profiling): a column literally named `index` produces errors,
    # so it becomes `df_index` (and the target name follows it).
    if 'index' in source_df.columns:
        source_df = source_df.rename(columns={"index": "df_index"})
        if target_feature_name == 'index':
            target_feature_name = 'df_index'

    # pandas 2.0 removed iteritems(); items() is the identical replacement
    all_source_names = [cur_name for cur_name, cur_series in source_df.items()]

    if compare is None:
        compare_df = None
        self.compare_name = None
        all_compare_names = list()
    elif type(compare) == pd.DataFrame:
        compare_df = compare
        if 'index' in compare_df.columns:
            compare_df = compare_df.rename(columns={"index": "df_index"})
        self.compare_name = "Compared"
        all_compare_names = [cur_name for cur_name, cur_series in compare_df.items()]
    elif type(compare) == list or type(compare) == tuple:
        if len(compare) != 2:
            raise ValueError('"compare" parameter should either be a string or a list of 2 elements: [dataframe, "Name"].')
        compare_df = compare[0]
        if 'index' in compare_df.columns:
            compare_df = compare_df.rename(columns={"index": "df_index"})
        self.compare_name = compare[1]
        all_compare_names = [cur_name for cur_name, cur_series in compare_df.items()]
    else:
        raise ValueError('"compare" parameter should either be a string or a list of 2 elements: [dataframe, "Name"].')

    # Validate some params
    if compare_df is not None and len(su.get_duplicate_cols(compare_df)) > 0:
        raise ValueError('Duplicate column names detected in "compare"; this is not supported.')
    if target_feature_name in fc.skip:
        raise ValueError(f'"{target_feature_name}" was also specified as "skip". Target cannot be skipped.')
    for key in fc.get_all_mentioned_features():
        if key not in all_source_names:
            raise ValueError(f'"{key}" was specified in "feature_config" but is not found in source dataframe (watch case-sensitivity?).')

    # Find Features and Target (FILTER SKIPPED)
    filtered_series_names_in_source = [cur_name for cur_name, cur_series
                                       in source_df.items() if cur_name not in fc.skip]
    for skipped in fc.skip:
        if skipped not in all_source_names and skipped not in all_compare_names:
            raise ValueError(f'"{skipped}" was marked as "skip" but is not in any provided dataframe (watch case-sensitivity?).')

    # Progress bar setup.
    # NOTE: the original also added `(0 if target_feature_name is not None
    # else 0)` — a no-op either way — and computed an unused
    # `exponential_checks` local; both removed with no behavior change.
    ratio_progress_of_df_summary_vs_feature = 1.0
    number_features = len(filtered_series_names_in_source)
    progress_chunks = ratio_progress_of_df_summary_vs_feature + number_features
    self.progress_bar = tqdm(
        total=progress_chunks,
        bar_format='{desc:45}|{bar}| [{percentage:3.0f}%] {elapsed} -> ({remaining} left)',
        ascii=False, dynamic_ncols=True, position=0, leave=True)

    # Summarize dataframe
    self.progress_bar.set_description_str("[Summarizing dataframe]")
    self.summary_source = dict()
    self.summarize_dataframe(source_df, self.source_name, self.summary_source, fc.skip)

    if compare_df is not None:
        self.summary_compare = dict()
        self.summarize_dataframe(compare_df, self.compare_name, self.summary_compare, fc.skip)
        cmp_not_in_src = [name for name in all_compare_names if name not in all_source_names]
        self.summary_compare["num_cmp_not_in_source"] = len(cmp_not_in_src)
    else:
        self.summary_compare = None
    self.progress_bar.update(ratio_progress_of_df_summary_vs_feature)
    self.num_summaries = number_features

    # Association check: in 'auto' mode, refuse to silently run a quadratic
    # pairwise pass on large dataframes — warn and bail instead.
    if pairwise_analysis == 'auto' and \
            number_features > config["Processing"].getint("association_auto_threshold"):
        print(f"PAIRWISE CALCULATION LENGTH WARNING: There are {number_features} features in "
              f"this dataframe and the "
              f"'pairwise_analysis' parameter is set to 'auto'.\nPairwise analysis is exponential in "
              f"length: {number_features} features will cause ~"
              f"{number_features * number_features} pairs to be "
              f"evaluated, which could take a long time.\n\nYou must call the function with the "
              f"parameter pairwise_analysis='on' or 'off' to explicitly select desired behavior."
              )
        self.progress_bar.close()
        return

    # Validate and process TARGET
    target_to_process = None
    target_type = None
    if target_feature_name:
        # Make sure target exists
        self.progress_bar.set_description_str(f"Feature: {target_feature_name} (TARGET)")
        targets_found = [item for item in filtered_series_names_in_source
                         if item == target_feature_name]
        if len(targets_found) == 0:
            self.progress_bar.close()
            raise KeyError(f"Feature '{target_feature_name}' was "
                           f"specified as TARGET, but is NOT FOUND in "
                           f"the dataframe (watch case-sensitivity?).")
        # Make sure target has no nan's
        if source_df[targets_found[0]].isnull().values.any():
            self.progress_bar.close()
            raise ValueError(f"\nTarget feature '{targets_found[0]}' contains NaN (missing) values.\n"
                             f"To avoid confusion in interpreting target distribution,\n"
                             f"target features MUST NOT have any missing values at this time.\n")
        # Find Target in compared, if present
        compare_target_series = None
        if compare_df is not None:
            if target_feature_name in compare_df.columns:
                if compare_df[target_feature_name].isnull().values.any():
                    self.progress_bar.close()
                    raise ValueError(f"\nTarget feature '{target_feature_name}' in COMPARED data contains NaN (missing) values.\n"
                                     f"To avoid confusion in interpreting target distribution,\n"
                                     f"target features MUST NOT have any missing values at this time.\n")
                compare_target_series = compare_df[target_feature_name]
        # TARGET processed HERE with COMPARE if present (order -1 = target)
        target_to_process = FeatureToProcess(-1, source_df[targets_found[0]],
                                             compare_target_series, None, None,
                                             fc.get_predetermined_type(targets_found[0]))
        self._target = sa.analyze_feature_to_dictionary(target_to_process)
        filtered_series_names_in_source.remove(targets_found[0])
        target_type = self._target["type"]
        self.progress_bar.update(1)

    # Set final target series and sanitize targets (e.g. bool->truly bool)
    source_target_series = None
    compare_target_series = None
    if target_feature_name:
        if target_feature_name not in source_df.columns:
            # Internal consistency check — the target was validated above
            raise ValueError(f"Internal error: target '{target_feature_name}' missing from source columns.")
        if self._target["type"] == sa.FeatureType.TYPE_BOOL:
            source_target_series = self.get_sanitized_bool_series(source_df[target_feature_name])
        else:
            source_target_series = source_df[target_feature_name]
        if compare_df is not None:
            if target_feature_name in compare_df.columns:
                if self._target["type"] == sa.FeatureType.TYPE_BOOL:
                    compare_target_series = self.get_sanitized_bool_series(
                        compare_df[target_feature_name])
                else:
                    compare_target_series = compare_df[target_feature_name]

    # Create list of features to process
    features_to_process = []
    for cur_series_name, cur_order_index in zip(
            filtered_series_names_in_source,
            range(0, len(filtered_series_names_in_source))):
        # TODO: BETTER HANDLING OF DIFFERENT COLUMNS IN SOURCE/COMPARE
        if compare_df is not None and cur_series_name in compare_df.columns:
            this_feat = FeatureToProcess(cur_order_index,
                                         source_df[cur_series_name],
                                         compare_df[cur_series_name],
                                         source_target_series,
                                         compare_target_series,
                                         fc.get_predetermined_type(cur_series_name),
                                         target_type)
        else:
            this_feat = FeatureToProcess(cur_order_index,
                                         source_df[cur_series_name],
                                         None,
                                         source_target_series,
                                         None,
                                         fc.get_predetermined_type(cur_series_name),
                                         target_type)
        features_to_process.append(this_feat)

    # Process columns -> features
    self.run_id = hex(int(time.time()))[2:] + "_"  # removes the decimals
    for f in features_to_process:
        self.progress_bar.set_description_str(f"Feature: {f.source.name}")
        self._features[f.source.name] = sa.analyze_feature_to_dictionary(f)
        self.progress_bar.update(1)

    # Wrap up summary
    self.summarize_category_types(source_df, self.summary_source, fc.skip, self._target)
    if compare is not None:
        self.summarize_category_types(compare_df, self.summary_compare, fc.skip, self._target)
    self.dataframe_summary_html = sv_html.generate_html_dataframe_summary(self)
    self.graph_legend = GraphLegend(self)

    # Process all associations
    # ----------------------------------------------------
    # Put target first so it is processed before other pairs
    if target_to_process is not None:
        features_to_process.insert(0, target_to_process)

    if pairwise_analysis.lower() != 'off':
        self.progress_bar.reset(total=len(features_to_process))
        self.progress_bar.set_description_str("[Step 2/3] Processing Pairwise Features")
        self.process_associations(features_to_process, source_target_series,
                                  compare_target_series)

        self.progress_bar.reset(total=1)
        self.progress_bar.set_description_str("[Step 3/3] Generating associations graph")
        self.associations_html_source = True  # Generated later in the process
        self.associations_html_compare = True  # Generated later in the process
        self._association_graphs["all"] = GraphAssoc(self, "all", self._associations)
        self._association_graphs_compare["all"] = GraphAssoc(self, "all", self._associations_compare)
        self.progress_bar.set_description_str("Done! Use 'show' commands to display/save. ")
        self.progress_bar.update(1)
    else:
        self._associations = None
        self._associations_compare = None
        self.associations_html_source = None
        self.associations_html_compare = None
    self.progress_bar.close()
    return
def __init__(self, which_graph: str, to_process: FeatureToProcess):
    """Render a categorical bar graph ("mini" summary or "detail-*" view).

    Draws horizontal percentage bars for the source (and compare, when
    present) category counts, overlays target averages/fractions when a
    target exists, and stores the rendered PNG as ``self.graph_base64``.

    :param which_graph: "mini" for the summary graph, or a name containing
        "detail" for the detail view; anything else raises ValueError.
    :param to_process: the feature bundle (source/compare series, counts,
        targets). Assumes ``source_counts["value_counts_without_nan"]`` is
        populated by the earlier analysis pass.
    """
    # Targets get an extra style layered on the base style for the mini graph
    if to_process.is_target() and which_graph == "mini":
        styles = ["graph_base.mplstyle", "graph_target.mplstyle"]
    else:
        styles = ["graph_base.mplstyle"]
    self.set_style(styles)

    is_detail = which_graph.find("detail") != -1
    cycle_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

    # Category cap differs between summary and detail views
    if which_graph == "mini":
        max_categories = config["Graphs"].getint(
            "summary_graph_max_categories")
    elif is_detail:
        max_categories = config["Graphs"].getint(
            "detail_graph_max_categories")
    else:
        raise ValueError

    # Clamp to max_categories; overflow is grouped (becomes OTHERS_GROUPED)
    plot_data_series = utils.get_clamped_value_counts( \
        to_process.source_counts["value_counts_without_nan"], max_categories)

    # Figure creation: size, gap and tick styling depend on graph kind
    if which_graph == "mini":
        f, axs = plt.subplots(1, 1, \
            figsize=(config["Graphs"].getfloat("cat_summary_graph_width"),
                     config["Graphs"].getfloat("summary_graph_height")))
        gap_percent = config["Graphs"].getfloat(
            "summary_graph_categorical_gap")
        axs.tick_params(axis='x', direction='out', pad=0, labelsize=8, length=2)
        axs.tick_params(axis='y', direction='out', pad=2, labelsize=8, length=2)
        axs.xaxis.tick_top()
    elif is_detail:
        # Detail height grows with category count, up to a configured max
        height = config["Graphs"].getfloat("detail_graph_height_base") \
                 + config["Graphs"].getfloat("detail_graph_height_per_elem") * len(plot_data_series)
        if height > config["Graphs"].getfloat(
                "detail_graph_categorical_max_height"):
            # Shrink height to fit, past a certain number
            height = config["Graphs"].getfloat(
                "detail_graph_categorical_max_height")
        f, axs = plt.subplots(1, 1, \
            figsize=(config["Graphs"].getfloat("detail_graph_width"), height))
        gap_percent = config["Graphs"].getfloat(
            "detail_graph_categorical_gap")
        axs.tick_params(axis='x', direction='out', pad=0, labelsize=8, length=2)
        axs.tick_params(axis='y', direction='out', pad=2, labelsize=8, length=2)
        axs.xaxis.tick_top()
    self.size_in_inches = f.get_size_inches()
    tick_names = list(plot_data_series.index)

    # To show percentages: normalize counts (guard against an all-zero sum)
    sum_source = sum(plot_data_series)
    plot_data_series = plot_data_series / sum_source if sum_source != 0.0 \
        else plot_data_series * 0.0
    axs.xaxis.set_major_formatter(
        mtick.PercentFormatter(xmax=1.0, decimals=0))

    # MAIN DATA (renders "under" target plots)
    # -----------------------------------------------------------
    if to_process.compare is not None:
        # COMPARE: match the compared counts to the source's category layout
        matched_data_series = utils.get_matched_value_counts( \
            to_process.compare_counts["value_counts_without_nan"], plot_data_series)
        # Show percentages
        sum_compared = sum(matched_data_series)
        matched_data_series = matched_data_series / sum_compared if sum_compared != 0.0 else \
            matched_data_series * 0.0
        height_lists = [list(plot_data_series.values), list(matched_data_series)]
    else:
        height_lists = [list(plot_data_series.values)]

    # Reorder so it plots with max values on top, "Others" at bottom
    # Plot: index 0 at BOTTOM
    # Need to change TICK NAMES and all elements in height_lists
    # ---------------------------------------------
    reversed_height_lists = list()
    for height_list in height_lists:
        reversed_height_lists.append(list(reversed(height_list)))
    tick_names = list(reversed(tick_names))
    height_lists = reversed_height_lists
    try:
        # Move the grouped "Others" bucket to the bottom (index 0)
        others_index = tick_names.index(OTHERS_GROUPED)
        tick_names.insert(0, tick_names.pop(others_index))
        for height_list in height_lists:
            height_list.insert(0, height_list.pop(others_index))
    except:
        # No OTHERS_GROUPED bucket present — nothing to move
        pass

    category_centers, bar_width = \
        plot_grouped_bars(tick_names, height_lists, cycle_colors, gap_percent,
                          orientation='horizontal', axis_obj=axs)

    # TARGET overlay
    # -----------------------------------------------------------
    if to_process.source_target is not None:
        if to_process.predetermined_type_target == FeatureType.TYPE_NUM:
            # TARGET: IS NUMERIC — plot mean target value per category
            target_values_source = list()
            names_excluding_others = [key for key in tick_names
                                      if key != OTHERS_GROUPED]
            for name in tick_names:
                if name == OTHERS_GROUPED:
                    # "Others" = everything not in a named category
                    tick_average = to_process.source_target[ \
                        ~to_process.source.isin(names_excluding_others)].mean()
                else:
                    tick_average = to_process.source_target[ \
                        to_process.source == name].mean()
                target_values_source.append(tick_average)
            ax2 = axs.twiny()
            ax2.xaxis.set_major_formatter(
                mtick.FuncFormatter(self.format_smart))
            ax2.xaxis.tick_bottom()
            # Need to redo this for some reason after twinning:
            axs.xaxis.tick_top()
            ax2.tick_params(axis='x', direction='out', pad=2, labelsize=8, length=2)
            ax2.plot(target_values_source, category_centers, marker='o',
                     color=sweetviz.graph.COLOR_TARGET_SOURCE)
            if to_process.compare is not None and \
                    to_process.compare_target is not None:
                # TARGET NUMERIC: with compare TARGET
                target_values_compare = list()
                for name in tick_names:
                    if name == OTHERS_GROUPED:
                        tick_average = to_process.compare_target[ \
                            ~to_process.compare.isin(names_excluding_others)].mean()
                    else:
                        tick_average = to_process.compare_target[ \
                            to_process.compare == name].mean()
                    target_values_compare.append(tick_average)
                ax2.plot(target_values_compare, category_centers, marker='o',
                         color=sweetviz.graph.COLOR_TARGET_COMPARE)
        elif to_process.predetermined_type_target == FeatureType.TYPE_BOOL:
            # TARGET: IS BOOL — plot fraction-of-true per category
            # ------------------------------------
            target_values_source = list()
            names_excluding_others = [key for key in tick_names
                                      if key != OTHERS_GROUPED]
            for name in tick_names:
                if name == OTHERS_GROUPED:
                    tick_num = sv_math.count_fraction_of_true(to_process.source_target[ \
                        ~to_process.source.isin(names_excluding_others)])[0]
                else:
                    tick_num = sv_math.count_fraction_of_true(to_process.source_target[ \
                        to_process.source == name])[0]
                target_values_source.append(tick_num)
            # Bool targets share the main % axis (no twin axis needed)
            axs.plot(target_values_source, category_centers, marker='o',
                     color=sweetviz.graph.COLOR_TARGET_SOURCE)
            target_values_compare = list()
            if to_process.compare is not None and \
                    to_process.compare_target is not None:
                # TARGET BOOL: with compare TARGET
                for name in tick_names:
                    if name == OTHERS_GROUPED:
                        tick_num = sv_math.count_fraction_of_true(to_process.compare_target[ \
                            ~to_process.compare.isin(names_excluding_others)])[0]
                    else:
                        tick_num = sv_math.count_fraction_of_true(to_process.compare_target[ \
                            to_process.compare == name])[0]
                    target_values_compare.append(tick_num)
                axs.plot(target_values_compare, category_centers, marker='o',
                         color=sweetviz.graph.COLOR_TARGET_COMPARE)

    # Finalize Graph: convert pixel padding to fractional subplot margins
    # -----------------------------
    # Needs only ~5 on right, but want to match num
    if which_graph == "mini":
        needed_pixels_padding = np.array([14.0, (300 + 32), 14, 45])  # TOP-LEFT-BOTTOM-RIGHT
    else:
        needed_pixels_padding = np.array([14.0, 140, 16, 45])  # TOP-LEFT-BOTTOM-RIGHT
    padding_fraction = needed_pixels_padding
    # Indices: 0=top, 1=left, 2=bottom, 3=right; divide by pixel dimension
    padding_fraction[0] = padding_fraction[0] / (self.size_in_inches[1] * f.dpi)
    padding_fraction[2] = padding_fraction[2] / (self.size_in_inches[1] * f.dpi)
    padding_fraction[3] = padding_fraction[3] / (self.size_in_inches[0] * f.dpi)
    padding_fraction[1] = padding_fraction[1] / (self.size_in_inches[0] * f.dpi)
    plt.subplots_adjust(top=(1.0 - padding_fraction[0]), left=padding_fraction[1], \
                        bottom=padding_fraction[2], right=(1.0 - padding_fraction[3]))
    self.graph_base64 = self.get_encoded_base64(f)
    plt.close('all')
def __init__(self, which_graph: str, to_process: FeatureToProcess):
    """Render a numeric histogram graph ("mini" summary or "detail-N" view).

    Plots a normalized histogram of the source (and compare, when present)
    values, overlays per-bin target means/fractions when a target exists,
    and stores the rendered PNG as ``self.graph_base64``.

    :param which_graph: "mini", or "detail-N" where N is the bin count
        (N == 0 means automatic binning); anything else raises ValueError.
    :param to_process: the feature bundle. Assumes ``source`` (and
        ``compare``) are numeric series — NaNs are filtered out here.
    """
    # Targets get an extra style layered on the base style for the mini graph
    if to_process.is_target() and which_graph == "mini":
        styles = ["graph_base.mplstyle", "graph_target.mplstyle"]
    else:
        styles = ["graph_base.mplstyle"]
    self.set_style(styles)

    is_detail = which_graph.find("detail") != -1
    if which_graph == "mini":
        f, axs = plt.subplots(1, 1, \
            figsize=(config["Graphs"].getfloat("num_summary_graph_width"),
                     config["Graphs"].getfloat("summary_graph_height")))
        self.num_bins = None
    elif is_detail:
        f, axs = plt.subplots(1, 1, \
            figsize=(config["Graphs"].getfloat("detail_graph_width"),
                     config["Graphs"].getfloat("detail_graph_height_numeric")))
        # Parse the bin count out of "detail-N"
        split = which_graph.split("-")
        self.index_for_css = split[1]
        self.num_bins = int(split[1])
        self.button_name = self.index_for_css
        # 0 is "auto"
        if self.num_bins == 0:
            self.num_bins = None
            self.button_name = "Auto"
    else:
        raise ValueError
    axs.tick_params(axis='x', direction='out', pad=2, labelsize=8, length=2)
    axs.tick_params(axis='y', direction='out', pad=2, labelsize=8, length=2)
    axs.xaxis.set_major_formatter(mtick.FuncFormatter(self.format_smart))
    axs.yaxis.set_major_formatter(
        mtick.PercentFormatter(xmax=1.0, decimals=0))

    # MAIN DATA ("Under" target)
    # ---------------------------------------------
    np.seterr(all='raise')  # WORKAROUND histogram warnings
    # Drop NaN before histogramming
    cleaned_source = to_process.source[~np.isnan(to_process.source)]
    # Per-sample weights of 1/len make the histogram show fractions
    if len(cleaned_source):
        norm_source = np.full(len(cleaned_source), 1.0 / len(cleaned_source))
    else:
        norm_source = []
    if to_process.compare is not None:
        # COMPARE: hist both series side by side
        cleaned_compare = to_process.compare[~np.isnan(to_process.compare)]
        plot_data = (cleaned_source, cleaned_compare)
        if len(cleaned_compare):
            norm_compare = np.full(len(cleaned_compare), 1.0 / len(cleaned_compare))
        else:
            norm_compare = []
        normalizing_weights = (norm_source, norm_compare)
    else:
        plot_data = cleaned_source
        normalizing_weights = norm_source

    gap_percent = config["Graphs"].getfloat(
        "summary_graph_categorical_gap")
    # hist_specs = (counts, bin_limits, patches) per matplotlib Axes.hist
    self.hist_specs = axs.hist(plot_data, weights=normalizing_weights,
                               bins=self.num_bins, \
                               rwidth=(100.0 - gap_percent) / 100.0)
    bin_limits = self.hist_specs[1]
    num_bins = len(bin_limits) - 1
    bin_counts = self.hist_specs[0]

    # Format x ticks with compact "smart range" labels
    x_ticks = plt.xticks()
    new_labels = [sv_html_formatters.fmt_smart_range_tight(val, max(x_ticks[0]))
                  for val in x_ticks[0]]
    plt.xticks(x_ticks[0], new_labels)

    # TARGET overlay
    # ---------------------------------------------
    if to_process.source_target is not None:
        if to_process.predetermined_type_target == FeatureType.TYPE_NUM:
            # TARGET: IS NUMERIC — plot the mean target value per bin
            # Create a series where each item indicates its bin
            # TODO: possible 1-off bug in counts from cut in lower bin
            source_bins_series = pd.cut(to_process.source, bins=bin_limits,
                                        labels=False)
            # Create empty bin_averages, then fill in with values
            bin_averages = [None] * num_bins
            for b in range(0, num_bins):
                bin_averages[b] = \
                    to_process.source_target[source_bins_series == b].mean()
            # TODO: verify number of bins
            # Center the marker horizontally within each bin
            bin_offset_x = (bin_limits[1] - bin_limits[0]) / 2.0
            ax2 = axs.twinx()
            ax2.yaxis.set_major_formatter(
                mtick.FuncFormatter(self.format_smart))
            ax2.plot(bin_limits[:-1] + bin_offset_x, bin_averages, \
                     marker='o', color=sweetviz.graph.COLOR_TARGET_SOURCE)
            if to_process.compare is not None and \
                    to_process.compare_target is not None:
                # TARGET NUMERIC: with compare TARGET
                compare_bins_series = pd.cut(to_process.compare, bins=bin_limits,
                                             labels=False)
                bin_averages = [None] * num_bins
                for b in range(0, num_bins):
                    bin_averages[b] = \
                        to_process.compare_target[compare_bins_series == b].mean()
                ax2.plot(bin_limits[:-1] + bin_offset_x, bin_averages, \
                         marker='o', color=sweetviz.graph.COLOR_TARGET_COMPARE)
        elif to_process.predetermined_type_target == FeatureType.TYPE_BOOL:
            # TARGET: IS BOOL — plot the fraction of true targets per bin
            source_true = to_process.source[to_process.source_target == 1]
            source_bins_series = pd.cut(source_true, bins=bin_limits,
                                        labels=False)
            # With a compare, hist counts come back as a pair; index 0 = source
            total_counts_source = bin_counts[0] if to_process.compare is not None else bin_counts
            # Undo the 1/len normalization to recover absolute bin counts
            total_counts_source = total_counts_source * len(cleaned_source)
            bin_true_counts_source = [None] * num_bins
            for b in range(0, num_bins):
                if total_counts_source[b] > 0:
                    bin_true_counts_source[b] = \
                        source_true[source_bins_series == b].count() \
                        / total_counts_source[b]
                else:
                    # Empty bin: no fraction to show
                    bin_true_counts_source[b] = None
            # TODO: verify number of bins
            bin_offset_x = (bin_limits[1] - bin_limits[0]) / 2.0
            # Share % axis
            ax2 = axs
            ax2.yaxis.set_major_formatter(
                mtick.PercentFormatter(xmax=1.0, decimals=0))
            ax2.plot(bin_limits[:-1] + bin_offset_x, bin_true_counts_source, \
                     marker='o', color=sweetviz.graph.COLOR_TARGET_SOURCE)
            if to_process.compare is not None and \
                    to_process.compare_target is not None:
                # TARGET BOOL: with compare TARGET
                compare_true = to_process.compare[to_process.compare_target == 1]
                # Create a series where each item indicates its bin
                # TODO: possible 1-off bug in counts from cut in lower bin
                compare_bins_series = pd.cut(compare_true, bins=bin_limits,
                                             labels=False)
                total_counts_compare = bin_counts[1] * len(cleaned_compare)
                bin_true_counts_compare = [None] * num_bins
                for b in range(0, num_bins):
                    if total_counts_compare[b] > 0:
                        bin_true_counts_compare[b] = \
                            compare_true[compare_bins_series == b].count() \
                            / total_counts_compare[b]
                    else:
                        bin_true_counts_compare[b] = None
                ax2.plot(bin_limits[:-1] + bin_offset_x, bin_true_counts_compare, \
                         marker='o', color=sweetviz.graph.COLOR_TARGET_COMPARE)
                # NOTE(review): placement assumed inside the compare branch
                # based on the flattened source text — confirm against upstream
                ax2.set_ylim([0, None])
        else:
            raise ValueError

    # Finalize Graph: convert pixel padding to fractional subplot margins
    # -----------------------------
    self.size_in_inches = f.get_size_inches()
    if which_graph == "mini":
        needed_pixels_padding = np.array([4.0, 32, 15, 45])  # TOP-LEFT-BOTTOM-RIGHT
    else:
        needed_pixels_padding = np.array([5.0, 32, 15, 45])  # TOP-LEFT-BOTTOM-RIGHT
    padding_fraction = needed_pixels_padding
    # Indices: 0=top, 1=left, 2=bottom, 3=right; divide by pixel dimension
    padding_fraction[0] = padding_fraction[0] / (self.size_in_inches[1] * f.dpi)
    padding_fraction[2] = padding_fraction[2] / (self.size_in_inches[1] * f.dpi)
    padding_fraction[3] = padding_fraction[3] / (self.size_in_inches[0] * f.dpi)
    padding_fraction[1] = padding_fraction[1] / (self.size_in_inches[0] * f.dpi)
    plt.subplots_adjust(top=(1.0 - padding_fraction[0]), left=padding_fraction[1], \
                        bottom=padding_fraction[2], right=(1.0 - padding_fraction[3]))
    self.graph_base64 = self.get_encoded_base64(f)
    plt.close('all')
    return