Esempio n. 1
0
def analyze_feature_to_dictionary(to_process: FeatureToProcess) -> dict:
    """Run the full analysis pipeline for a single feature.

    Determines the feature's type, computes base statistics, and dispatches
    to the type-specific analyzer (numeric/categorical/boolean/text), for
    both the source series and the optional compare series.

    :param to_process: bundle holding the source series, optional compare
        series, optional targets and any predetermined type.
    :return: dictionary of analysis results ("name", "type", "stats",
        graphs, html, and an optional nested "compare" dictionary).
    :raises ValueError: if a target series length does not match its data
        series, or if the determined feature type is unknown.
    """
    # start = time.perf_counter()

    # Validation: Make sure the targets are the same length as the series
    if to_process.source_target is not None and to_process.source is not None:
        if len(to_process.source_target) != len(to_process.source):
            raise ValueError("Source target series must be the same length as the source series.")
    if to_process.compare_target is not None and to_process.compare is not None:
        if len(to_process.compare_target) != len(to_process.compare):
            raise ValueError("Compare target series must be the same length as the compare series.")

    # Initialize some dictionary values
    returned_feature_dict = dict()
    returned_feature_dict["name"] = to_process.source.name
    returned_feature_dict["order_index"] = to_process.order
    # Order -1 is the sentinel used to mark the target feature
    returned_feature_dict["is_target"] = to_process.order == -1

    # Determine SOURCE feature type
    to_process.source_counts = get_counts(to_process.source)
    returned_feature_dict["type"] = determine_feature_type(to_process.source, to_process.source_counts,
                                                           to_process.predetermined_type, "SOURCE")

    # Determine COMPARED feature type & initialize
    compare_dict = None
    if to_process.compare is not None:
        to_process.compare_counts = get_counts(to_process.compare)
        compare_type = determine_feature_type(to_process.compare,
                                              to_process.compare_counts,
                                              returned_feature_dict["type"], "COMPARED")
        # Explicitly show missing categories on each set
        if compare_type in (FeatureType.TYPE_CAT, FeatureType.TYPE_BOOL):
            fill_out_missing_counts_in_other_series(to_process.compare_counts, to_process.source_counts)
            fill_out_missing_counts_in_other_series(to_process.source_counts, to_process.compare_counts)
        returned_feature_dict["compare"] = dict()
        compare_dict = returned_feature_dict["compare"]
        compare_dict["type"] = compare_type

    # Establish base stats
    add_series_base_stats_to_dict(to_process.source, to_process.source_counts, returned_feature_dict)
    if to_process.compare is not None:
        add_series_base_stats_to_dict(to_process.compare, to_process.compare_counts, compare_dict)

    # Perform full analysis on source/compare/target
    # (CAT and BOOL features share the categorical analyzer)
    feature_type = returned_feature_dict["type"]
    if feature_type == FeatureType.TYPE_NUM:
        sweetviz.series_analyzer_numeric.analyze(to_process, returned_feature_dict)
    elif feature_type in (FeatureType.TYPE_CAT, FeatureType.TYPE_BOOL):
        sweetviz.series_analyzer_cat.analyze(to_process, returned_feature_dict)
    elif feature_type == FeatureType.TYPE_TEXT:
        sweetviz.series_analyzer_text.analyze(to_process, returned_feature_dict)
    else:
        raise ValueError(f"Unknown feature type: {feature_type}")

    # print(f"{to_process.source.name} PROCESSED ------> "
    #       f" {time.perf_counter() - start}")

    return returned_feature_dict
def analyze(to_process: FeatureToProcess, feature_dict: dict):
    """Analyze a TEXT-type feature and fill in its result dictionary.

    Initializes the "stats" sub-dictionaries, runs the detailed text
    analysis, and generates the HTML summary.

    :param to_process: the feature (and optional compare series) to analyze.
    :param feature_dict: result dictionary for this feature; mutated in place.
    :raises ValueError: if the feature is a target — text features are not
        supported as analysis targets.
    """
    compare_dict = feature_dict.get("compare")
    feature_dict["stats"] = dict()
    if compare_dict:
        compare_dict["stats"] = dict()

    do_detail_text(to_process, feature_dict)

    # Guard clause: there is no target-flavored summary for text features
    if to_process.is_target():
        raise ValueError("TEXT features are not supported as the target feature.")
    feature_dict["html_summary"] = sv_html.generate_html_summary_text(
        feature_dict, compare_dict)
Esempio n. 3
0
def analyze(to_process: FeatureToProcess, feature_dict: dict):
    """Analyze a CATEGORICAL/BOOL feature and fill in its result dictionary.

    Initializes the "stats" sub-dictionaries, runs the detailed categorical
    analysis, builds the mini and detail graphs, and generates the HTML
    summary (target or regular flavor).

    :param to_process: the feature (and optional compare series) to analyze.
    :param feature_dict: result dictionary for this feature; mutated in place.
    """
    compare_dict = feature_dict.get("compare")

    # Fresh stats containers for the source (and compare, when present)
    feature_dict["stats"] = {}
    if compare_dict:
        compare_dict["stats"] = {}

    do_detail_categorical(to_process, feature_dict)

    # One mini summary graph plus a single detail graph
    feature_dict["minigraph"] = GraphCat("mini", to_process)
    feature_dict["detail_graphs"] = [GraphCat("detail", to_process)]

    # Targets get their own summary template
    if to_process.is_target():
        generate_summary = sv_html.generate_html_summary_target_cat
    else:
        generate_summary = sv_html.generate_html_summary_cat
    feature_dict["html_summary"] = generate_summary(feature_dict, compare_dict)
    return
Esempio n. 4
0
def analyze(to_process: FeatureToProcess, feature_dict: dict):
    """Analyze a NUMERIC feature and fill in its result dictionary.

    Computes numeric statistics for the source (and compare, when present),
    runs the detailed numeric analysis, builds the mini graph plus a set of
    detail graphs at several bin counts, and generates the HTML summary
    (target or regular flavor).

    :param to_process: the feature (and optional compare series) to analyze.
    :param feature_dict: result dictionary for this feature; mutated in place.
    """
    do_stats_numeric(to_process.source, feature_dict)
    compare_dict = feature_dict.get("compare")
    if compare_dict:
        do_stats_numeric(to_process.compare, compare_dict)

    do_detail_numeric(to_process.source, to_process.source_counts,
                      to_process.compare_counts, feature_dict)

    feature_dict["minigraph"] = GraphNumeric("mini", to_process)

    # Detail graphs at each bin setting; 0 means "auto" binning
    detail_graphs = []
    for bin_count in (0, 5, 15, 30):
        graph = GraphNumeric("detail-" + str(bin_count), to_process)
        if graph:
            detail_graphs.append(graph)
    feature_dict["detail_graphs"] = detail_graphs

    # Targets get their own summary template
    if to_process.is_target():
        generate_summary = sv_html.generate_html_summary_target_numeric
    else:
        generate_summary = sv_html.generate_html_summary_numeric
    feature_dict["html_summary"] = generate_summary(feature_dict, compare_dict)
Esempio n. 5
0
    def __init__(self,
                 source: Union[pd.DataFrame, Tuple[pd.DataFrame, str]],
                 target_feature_name: str = None,
                 compare: Union[pd.DataFrame, Tuple[pd.DataFrame, str]] = None,
                 pairwise_analysis: str = 'auto',
                 fc: FeatureConfig = None):
        """Build a full analysis report from a source dataframe.

        :param source: either a DataFrame, or a 2-element [DataFrame, "Name"]
            list/tuple giving the dataframe a display name.
        :param target_feature_name: optional name of the target column; the
            target must exist in ``source`` and contain no NaN values.
        :param compare: optional second dataframe (same forms as ``source``)
            to compare against.
        :param pairwise_analysis: "on", "auto" or "off"; controls whether the
            (quadratic-cost) pairwise association pass runs.
        :param fc: optional FeatureConfig with skipped features and
            predetermined types; a default one is created if omitted.
        :raises ValueError: on invalid parameters, duplicate column names,
            skipped target, or NaN values in a target column.
        :raises KeyError: if the target column is not found in ``source``.
        """
        pairwise_analysis = pairwise_analysis.lower()
        if pairwise_analysis not in ["on", "auto", "off"]:
            raise ValueError('"pairwise_analysis" parameter should be one of: "on", "auto", "off"')

        sv_html.load_layout_globals_from_config()

        # Rendered HTML caches and per-feature analysis results
        self._jupyter_html = ""
        self._page_html = ""
        self._features = dict()
        self.compare_name = None
        self._target = None
        self.test_mode = False
        if fc is None:
            fc = FeatureConfig()

        # Associations: _associations[FEATURE][GIVES INFORMATION ABOUT THIS FEATURE]
        self._associations = dict()
        self._associations_compare = dict()
        self._association_graphs = dict()
        self._association_graphs_compare = dict()

        # Handle source and compare dataframes and names
        if type(source) == pd.DataFrame:
            source_df = source
            self.source_name = "DataFrame"
        elif type(source) == list or type(source) == tuple:
            if len(source) != 2:
                raise ValueError('"source" parameter should either be a string or a list of 2 elements: [dataframe, "Name"].')
            source_df = source[0]
            self.source_name = source[1]
        else:
            raise ValueError('"source" parameter should either be a string or a list of 2 elements: [dataframe, "Name"].')
        if len(su.get_duplicate_cols(source_df)) > 0:
            raise ValueError('Duplicate column names detected in "source"; this is not supported.')

        # NEW (12-14-2020): Rename indices that use the reserved name "index"
        # From pandas-profiling:
        # If the DataFrame contains a column or index named `index`, this will produce errors. We rename the {index,column} to be `df_index`.
        if 'index' in source_df.columns:
            source_df = source_df.rename(columns={"index": "df_index"})
            if target_feature_name == 'index':
                target_feature_name = 'df_index'

        # NOTE(review): DataFrame.iteritems() is deprecated/removed in recent
        # pandas — items() is the modern equivalent (applies throughout).
        all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
        if compare is None:
            compare_df = None
            self.compare_name = None
            all_compare_names = list()
        elif type(compare) == pd.DataFrame:
            compare_df = compare
            if 'index' in compare_df.columns:
                compare_df = compare_df.rename(columns={"index": "df_index"})
            self.compare_name = "Compared"
            all_compare_names = [cur_name for cur_name, cur_series in compare_df.iteritems()]
        elif type(compare) == list or type(compare) == tuple:
            if len(compare) != 2:
                raise ValueError('"compare" parameter should either be a string or a list of 2 elements: [dataframe, "Name"].')
            compare_df = compare[0]
            if 'index' in compare_df.columns:
                compare_df = compare_df.rename(columns={"index": "df_index"})
            self.compare_name = compare[1]
            all_compare_names = [cur_name for cur_name, cur_series in compare_df.iteritems()]
        else:
            raise ValueError('"compare" parameter should either be a string or a list of 2 elements: [dataframe, "Name"].')

        # Validate some params
        if compare_df is not None and len(su.get_duplicate_cols(compare_df)) > 0:
            raise ValueError('Duplicate column names detected in "compare"; this is not supported.')


        if target_feature_name in fc.skip:
            raise ValueError(f'"{target_feature_name}" was also specified as "skip". Target cannot be skipped.')

        # Every feature mentioned in the config must exist in the source
        for key in fc.get_all_mentioned_features():
            if key not in all_source_names:
                raise ValueError(f'"{key}" was specified in "feature_config" but is not found in source dataframe (watch case-sensitivity?).')

        # Find Features and Target (FILTER SKIPPED)
        filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()
                                           if cur_name not in fc.skip]
        for skipped in fc.skip:
            if skipped not in all_source_names and skipped not in all_compare_names:
                raise ValueError(f'"{skipped}" was marked as "skip" but is not in any provided dataframe (watch case-sensitivity?).')

        # Progress bar setup
        ratio_progress_of_df_summary_vs_feature = 1.0
        number_features = len(filtered_series_names_in_source)
        # NOTE(review): exponential_checks is computed but never used below;
        # the same n*n value is recomputed inline in the warning message.
        exponential_checks = number_features * number_features
        # NOTE(review): the trailing term is 0 in both branches, so the target
        # never adds a progress chunk — looks like it was meant to be
        # "(1 if target_feature_name is not None else 0)"; confirm intent.
        progress_chunks = ratio_progress_of_df_summary_vs_feature \
                            + number_features + (0 if target_feature_name is not None else 0)

        self.progress_bar = tqdm(total=progress_chunks, bar_format= \
                '{desc:45}|{bar}| [{percentage:3.0f}%]   {elapsed} -> ({remaining} left)', \
                ascii=False, dynamic_ncols=True, position=0, leave= True)

        # Summarize dataframe
        self.progress_bar.set_description_str("[Summarizing dataframe]")
        self.summary_source = dict()
        self.summarize_dataframe(source_df, self.source_name, self.summary_source, fc.skip)
        # UPDATE 2021-02-05: Count the target has an actual feature!!! It is!!!
        # if target_feature_name:
        #     self.summary_source["num_columns"] = self.summary_source["num_columns"] - 1
        if compare_df is not None:
            self.summary_compare = dict()
            self.summarize_dataframe(compare_df, self.compare_name, self.summary_compare, fc.skip)
            # Track compare-only columns so the summary can report them
            cmp_not_in_src = \
                [name for name in all_compare_names if name not in all_source_names]
            self.summary_compare["num_cmp_not_in_source"] = len(cmp_not_in_src)
            # UPDATE 2021-02-05: Count the target has an actual feature!!! It is!!!
            # if target_feature_name:
            #     if target_feature_name in compare_df.columns:
            #         self.summary_compare["num_columns"] = self.summary_compare["num_columns"] - 1
        else:
            self.summary_compare = None
        self.progress_bar.update(ratio_progress_of_df_summary_vs_feature)

        self.num_summaries = number_features

        # Association check: in "auto" mode, bail out (leaving the report
        # unfinished) when the quadratic pairwise pass would be too expensive
        if pairwise_analysis == 'auto' and \
                number_features > config["Processing"].getint("association_auto_threshold"):
            print(f"PAIRWISE CALCULATION LENGTH WARNING: There are {number_features} features in "
                  f"this dataframe and the "
                  f"'pairwise_analysis' parameter is set to 'auto'.\nPairwise analysis is exponential in "
                  f"length: {number_features} features will cause ~"
                  f"{number_features * number_features} pairs to be "
                  f"evaluated, which could take a long time.\n\nYou must call the function with the "
                  f"parameter pairwise_analysis='on' or 'off' to explicitly select desired behavior."
                  )
            self.progress_bar.close()
            return

        # Validate and process TARGET
        target_to_process = None
        target_type = None
        if target_feature_name:
            # Make sure target exists
            self.progress_bar.set_description_str(f"Feature: {target_feature_name} (TARGET)")
            targets_found = [item for item in filtered_series_names_in_source
                             if item == target_feature_name]
            if len(targets_found) == 0:
                self.progress_bar.close()
                raise KeyError(f"Feature '{target_feature_name}' was "
                               f"specified as TARGET, but is NOT FOUND in "
                               f"the dataframe (watch case-sensitivity?).")

            # Make sure target has no nan's
            if source_df[targets_found[0]].isnull().values.any():
                self.progress_bar.close()
                raise ValueError(f"\nTarget feature '{targets_found[0]}' contains NaN (missing) values.\n"
                               f"To avoid confusion in interpreting target distribution,\n"
                               f"target features MUST NOT have any missing values at this time.\n")

            # Find Target in compared, if present
            compare_target_series = None
            if compare_df is not None:
                if target_feature_name in compare_df.columns:
                    if compare_df[target_feature_name].isnull().values.any():
                        self.progress_bar.close()
                        raise ValueError(
                            f"\nTarget feature '{target_feature_name}' in COMPARED data contains NaN (missing) values.\n"
                            f"To avoid confusion in interpreting target distribution,\n"
                            f"target features MUST NOT have any missing values at this time.\n")
                    compare_target_series = compare_df[target_feature_name]

            # TARGET processed HERE with COMPARE if present
            # (order -1 marks it as the target feature)
            target_to_process = FeatureToProcess(-1, source_df[targets_found[0]], compare_target_series,
                                                 None, None, fc.get_predetermined_type(targets_found[0]))
            self._target = sa.analyze_feature_to_dictionary(target_to_process)
            filtered_series_names_in_source.remove(targets_found[0])
            target_type = self._target["type"]
            self.progress_bar.update(1)

        # Set final target series and sanitize targets (e.g. bool->truly bool)
        source_target_series = None
        compare_target_series = None
        if target_feature_name:
            if target_feature_name not in source_df.columns:
                raise ValueError
            if self._target["type"] == sa.FeatureType.TYPE_BOOL:
                source_target_series = self.get_sanitized_bool_series(source_df[target_feature_name])
            else:
                source_target_series = source_df[target_feature_name]

            if compare_df is not None:
                if target_feature_name in compare_df.columns:
                    if self._target["type"] == sa.FeatureType.TYPE_BOOL:
                        compare_target_series = self.get_sanitized_bool_series(compare_df[
                                                                                   target_feature_name])
                    else:
                        compare_target_series = compare_df[target_feature_name]

        # Create list of features to process
        features_to_process = []
        for cur_series_name, cur_order_index in zip(filtered_series_names_in_source,
                                                 range(0, len(filtered_series_names_in_source))):
            # TODO: BETTER HANDLING OF DIFFERENT COLUMNS IN SOURCE/COMPARE
            if compare_df is not None and cur_series_name in \
                    compare_df.columns:
                this_feat = FeatureToProcess(cur_order_index,
                                             source_df[cur_series_name],
                                             compare_df[cur_series_name],
                                             source_target_series,
                                             compare_target_series,
                                             fc.get_predetermined_type(cur_series_name),
                                             target_type)
            else:
                # Feature exists only in source: no compare series/target
                this_feat = FeatureToProcess(cur_order_index,
                                             source_df[cur_series_name],
                                             None,
                                             source_target_series,
                                             None,
                                             fc.get_predetermined_type(cur_series_name),
                                             target_type)
            features_to_process.append(this_feat)


        # Process columns -> features
        self.run_id = hex(int(time.time()))[2:] + "_" # removes the decimals
        # self.temp_folder = config["Files"].get("temp_folder")
        # os.makedirs(os.path.normpath(self.temp_folder), exist_ok=True)

        for f in features_to_process:
            # start = time.perf_counter()
            self.progress_bar.set_description_str(f"Feature: {f.source.name}")
            self._features[f.source.name] = sa.analyze_feature_to_dictionary(f)
            self.progress_bar.update(1)
            # print(f"DONE FEATURE------> {f.source.name}"
            #       f" {(time.perf_counter() - start):.2f}   {self._features[f.source.name]['type']}")
        # self.progress_bar.set_description_str('[FEATURES DONE]')
        # self.progress_bar.close()

        # Wrap up summary
        self.summarize_category_types(source_df, self.summary_source, fc.skip, self._target)
        if compare is not None:
            self.summarize_category_types(compare_df, self.summary_compare, fc.skip, self._target)
        self.dataframe_summary_html = sv_html.generate_html_dataframe_summary(self)

        self.graph_legend = GraphLegend(self)

        # Process all associations
        # ----------------------------------------------------
        # Put target first
        if target_to_process is not None:
            features_to_process.insert(0,target_to_process)

        if pairwise_analysis.lower() != 'off':
            self.progress_bar.reset(total=len(features_to_process))
            self.progress_bar.set_description_str("[Step 2/3] Processing Pairwise Features")
            self.process_associations(features_to_process, source_target_series, compare_target_series)

            self.progress_bar.reset(total=1)
            self.progress_bar.set_description_str("[Step 3/3] Generating associations graph")
            self.associations_html_source = True # Generated later in the process
            self.associations_html_compare = True # Generated later in the process
            self._association_graphs["all"] = GraphAssoc(self, "all", self._associations)
            self._association_graphs_compare["all"] = GraphAssoc(self, "all", self._associations_compare)
            self.progress_bar.set_description_str("Done! Use 'show' commands to display/save. ")
            self.progress_bar.update(1)
        else:
            # Pairwise analysis explicitly disabled: leave associations unset
            self._associations = None
            self._associations_compare = None
            self.associations_html_source = None
            self.associations_html_compare = None
        self.progress_bar.close()
        return
Esempio n. 6
0
    def __init__(self, which_graph: str, to_process: FeatureToProcess):
        """Render a categorical feature's bar graph to a base64 image.

        Builds either the "mini" summary graph or a "detail" graph: horizontal
        bars of category frequencies (source, plus compare when present),
        optionally overlaid with per-category target averages (numeric target)
        or fraction-of-true lines (boolean target). The encoded PNG is stored
        in ``self.graph_base64``.

        :param which_graph: "mini" for the summary graph, or a string
            containing "detail" for the detail graph.
        :param to_process: feature bundle with source/compare series, counts
            and optional targets.
        :raises ValueError: if ``which_graph`` is neither "mini" nor a
            "detail" variant.
        """
        # Target mini-graphs get an extra style layer on top of the base style
        if to_process.is_target() and which_graph == "mini":
            styles = ["graph_base.mplstyle", "graph_target.mplstyle"]
        else:
            styles = ["graph_base.mplstyle"]
        self.set_style(styles)

        is_detail = which_graph.find("detail") != -1
        cycle_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

        # Category cap differs between mini and detail graphs
        if which_graph == "mini":
            max_categories = config["Graphs"].getint(
                "summary_graph_max_categories")
        elif is_detail:
            max_categories = config["Graphs"].getint(
                "detail_graph_max_categories")
        else:
            raise ValueError
        # Clamp to max_categories; overflow is grouped into an "Others" bucket
        plot_data_series = utils.get_clamped_value_counts( \
            to_process.source_counts["value_counts_without_nan"], max_categories)

        # Figure/axes setup: fixed size for mini, height scaled by category
        # count (up to a configured max) for detail
        if which_graph == "mini":
            f, axs = plt.subplots(1, 1, \
                                  figsize=(config["Graphs"].getfloat("cat_summary_graph_width"),
                                           config["Graphs"].getfloat("summary_graph_height")))
            gap_percent = config["Graphs"].getfloat(
                "summary_graph_categorical_gap")
            axs.tick_params(axis='x',
                            direction='out',
                            pad=0,
                            labelsize=8,
                            length=2)
            axs.tick_params(axis='y',
                            direction='out',
                            pad=2,
                            labelsize=8,
                            length=2)
            axs.xaxis.tick_top()
        elif is_detail:
            height = config["Graphs"].getfloat("detail_graph_height_base") \
                + config["Graphs"].getfloat("detail_graph_height_per_elem") * len(plot_data_series)
            if height > config["Graphs"].getfloat(
                    "detail_graph_categorical_max_height"):
                # Shrink height to fit, past a certain number
                height = config["Graphs"].getfloat(
                    "detail_graph_categorical_max_height")
            f, axs = plt.subplots(1, 1, \
                                  figsize=(config["Graphs"].getfloat("detail_graph_width"), height))
            gap_percent = config["Graphs"].getfloat(
                "detail_graph_categorical_gap")
            axs.tick_params(axis='x',
                            direction='out',
                            pad=0,
                            labelsize=8,
                            length=2)
            axs.tick_params(axis='y',
                            direction='out',
                            pad=2,
                            labelsize=8,
                            length=2)
            axs.xaxis.tick_top()

        self.size_in_inches = f.get_size_inches()
        tick_names = list(plot_data_series.index)

        # To show percentages: normalize counts to fractions (guard against
        # an all-zero series to avoid division by zero)
        sum_source = sum(plot_data_series)
        plot_data_series = plot_data_series / sum_source if sum_source != 0.0 else plot_data_series * 0.0
        axs.xaxis.set_major_formatter(
            mtick.PercentFormatter(xmax=1.0, decimals=0))

        # MAIN DATA (renders "under" target plots)
        # -----------------------------------------------------------
        if to_process.compare is not None:
            # COMPARE: align compare counts to the source's category order
            matched_data_series = utils.get_matched_value_counts( \
                to_process.compare_counts["value_counts_without_nan"],plot_data_series)
            # Show percentages
            sum_compared = sum(matched_data_series)
            matched_data_series = matched_data_series / sum_compared if sum_compared != 0.0 else \
                matched_data_series * 0.0

            height_lists = [
                list(plot_data_series.values),
                list(matched_data_series)
            ]
        else:
            height_lists = [list(plot_data_series.values)]

        # Reorder so it plots with max values on top, "Others" at bottom
        # Plot: index 0 at BOTTOM
        # Need to change TICK NAMES and all elements in height_lists
        # ---------------------------------------------
        reversed_height_lists = list()
        for height_list in height_lists:
            reversed_height_lists.append(list(reversed(height_list)))
        tick_names = list(reversed(tick_names))
        height_lists = reversed_height_lists

        # Move the "Others" bucket to the bottom slot if present.
        # NOTE(review): bare except silences any error here, not just a
        # missing OTHERS_GROUPED — "except ValueError" would be safer.
        try:
            others_index = tick_names.index(OTHERS_GROUPED)
            tick_names.insert(0, tick_names.pop(others_index))
            for height_list in height_lists:
                height_list.insert(0, height_list.pop(others_index))
        except:
            pass

        # colors = ("r", "b")
        category_centers, bar_width = \
            plot_grouped_bars(tick_names, height_lists, cycle_colors, gap_percent,
                              orientation = 'horizontal', axis_obj = axs)

        # TARGET
        # -----------------------------------------------------------
        if to_process.source_target is not None:
            if to_process.predetermined_type_target == FeatureType.TYPE_NUM:
                # TARGET: IS NUMERIC — plot per-category target means on a
                # twinned bottom axis
                target_values_source = list()
                names_excluding_others = [
                    key for key in tick_names if key != OTHERS_GROUPED
                ]
                for name in tick_names:
                    if name == OTHERS_GROUPED:
                        # "Others" mean = mean over rows NOT in the named categories
                        tick_average = to_process.source_target[ \
                            ~to_process.source.isin(names_excluding_others)].mean()
                    else:
                        tick_average = to_process.source_target[ \
                            to_process.source == name].mean()
                    target_values_source.append(tick_average)
                ax2 = axs.twiny()
                ax2.xaxis.set_major_formatter(
                    mtick.FuncFormatter(self.format_smart))
                ax2.xaxis.tick_bottom()
                # Need to redo this for some reason after twinning:
                axs.xaxis.tick_top()
                ax2.tick_params(axis='x',
                                direction='out',
                                pad=2,
                                labelsize=8,
                                length=2)
                ax2.plot(target_values_source,
                         category_centers,
                         marker='o',
                         color=sweetviz.graph.COLOR_TARGET_SOURCE)

                if to_process.compare is not None and \
                        to_process.compare_target is not None:
                    # TARGET NUMERIC: with compare TARGET
                    target_values_compare = list()
                    for name in tick_names:
                        if name == OTHERS_GROUPED:
                            tick_average = to_process.compare_target[ \
                                ~to_process.compare.isin(names_excluding_others)].mean()
                        else:
                            tick_average = to_process.compare_target[ \
                                to_process.compare == name].mean()
                        target_values_compare.append(tick_average)
                    ax2.plot(target_values_compare,
                             category_centers,
                             marker='o',
                             color=sweetviz.graph.COLOR_TARGET_COMPARE)

            elif to_process.predetermined_type_target == FeatureType.TYPE_BOOL:
                # TARGET: IS BOOL — plot per-category fraction-of-true on the
                # main (percentage) axis
                # ------------------------------------
                target_values_source = list()
                names_excluding_others = [
                    key for key in tick_names if key != OTHERS_GROUPED
                ]
                for name in tick_names:
                    if name == OTHERS_GROUPED:
                        tick_num = sv_math.count_fraction_of_true(to_process.source_target[ \
                            ~to_process.source.isin(names_excluding_others)])[0]
                    else:
                        tick_num = sv_math.count_fraction_of_true(to_process.source_target[ \
                            to_process.source == name])[0]
                    target_values_source.append(tick_num)
                    # target_values_source.append(tick_num * plot_data_series[name])

                # ax2 = axs.twiny()
                # ax2.xaxis.set_major_formatter(mtick.FuncFormatter(self.format_smart))
                # ax2.xaxis.tick_bottom()
                # # Need to redo this for some reason after twinning:
                # axs.xaxis.tick_top()
                # ax2.tick_params(axis='x', direction='out', pad=2, labelsize=8, length=2)
                axs.plot(target_values_source,
                         category_centers,
                         marker='o',
                         color=sweetviz.graph.COLOR_TARGET_SOURCE)

                target_values_compare = list()
                if to_process.compare is not None and \
                        to_process.compare_target is not None:
                    # TARGET BOOL: with compare TARGET
                    for name in tick_names:
                        if name == OTHERS_GROUPED:
                            tick_num = sv_math.count_fraction_of_true(to_process.compare_target[ \
                                ~to_process.compare.isin(names_excluding_others)])[0]
                        else:
                            tick_num = sv_math.count_fraction_of_true(to_process.compare_target[ \
                                to_process.compare == name])[0]
                        target_values_compare.append(tick_num)
                        # target_values_compare.append(tick_num * matched_data_series[name])
                    axs.plot(target_values_compare,
                             category_centers,
                             marker='o',
                             color=sweetviz.graph.COLOR_TARGET_COMPARE)
                # else:
                #     # TARGET BOOL: NO compare TARGET -> Just fill with zeros so alignment is still good
                #     for name in tick_names:
                #         target_values_compare.append(0.0)
                # target_plot_series = [target_values_source, target_values_compare]
                # plot_grouped_bars(tick_names, target_plot_series, ('k','k'), gap_percent,
                #                   orientation='horizontal', axis_obj=axs, alpha=0.6)

        # Finalize Graph
        # -----------------------------
        # Needs only ~5 on right, but want to match num
        if which_graph == "mini":
            needed_pixels_padding = np.array([14.0, (300 + 32), 14,
                                              45])  # TOP-LEFT-BOTTOM-RIGHT
        else:
            needed_pixels_padding = np.array([14.0, 140, 16,
                                              45])  # TOP-LEFT-BOTTOM-RIGHT

        # Convert pixel padding to figure-fraction padding.
        # NOTE(review): this mutates needed_pixels_padding in place
        # (padding_fraction is the same array, not a copy).
        padding_fraction = needed_pixels_padding
        padding_fraction[0] = padding_fraction[0] / (self.size_in_inches[1] *
                                                     f.dpi)
        padding_fraction[2] = padding_fraction[2] / (self.size_in_inches[1] *
                                                     f.dpi)
        padding_fraction[3] = padding_fraction[3] / (self.size_in_inches[0] *
                                                     f.dpi)
        padding_fraction[1] = padding_fraction[1] / (self.size_in_inches[0] *
                                                     f.dpi)
        plt.subplots_adjust(top=(1.0 - padding_fraction[0]), left=padding_fraction[1], \
                bottom=padding_fraction[2], right=(1.0 - padding_fraction[3]))

        self.graph_base64 = self.get_encoded_base64(f)
        # Close all figures to avoid leaking matplotlib state across graphs
        plt.close('all')
Esempio n. 7
0
    def __init__(self, which_graph: str, to_process: FeatureToProcess) -> None:
        """Render the histogram graph for a numeric feature and store it as base64.

        Builds either the small summary histogram (``which_graph == "mini"``) or a
        detailed histogram (``which_graph == "detail-N"``, where N is the requested
        bin count and 0 means "auto") for ``to_process.source`` — and, when present,
        the compared series side by side. If a target is associated, overlays the
        per-bin target mean (numeric target) or per-bin rate of target==1 (boolean
        target) as a line plot. The finished matplotlib figure is encoded into
        ``self.graph_base64`` and all figures are closed.

        :param which_graph: "mini" or "detail-N"; any other value raises ValueError.
        :param to_process: bundle holding the source series and, optionally, the
            compare series and their targets.
        :raises ValueError: if ``which_graph`` is neither "mini" nor a detail graph,
            or if a present target has a type other than TYPE_NUM/TYPE_BOOL.
        """
        # Target features shown in the summary area get an extra style layer.
        if to_process.is_target() and which_graph == "mini":
            styles = ["graph_base.mplstyle", "graph_target.mplstyle"]
        else:
            styles = ["graph_base.mplstyle"]
        self.set_style(styles)

        # Detail graphs are named "detail-N"; anything containing "detail" counts.
        is_detail = which_graph.find("detail") != -1
        if which_graph == "mini":
            f, axs = plt.subplots(1, 1, \
                                  figsize=(config["Graphs"].getfloat("num_summary_graph_width"),
                                           config["Graphs"].getfloat("summary_graph_height")))
            # None lets matplotlib's hist() pick the bin count automatically.
            self.num_bins = None
        elif is_detail:
            f, axs = plt.subplots(1, 1, \
                                  figsize=(config["Graphs"].getfloat("detail_graph_width"),
                                           config["Graphs"].getfloat("detail_graph_height_numeric")))
            # "detail-N": N drives both the CSS/button index and the bin count.
            split = which_graph.split("-")
            self.index_for_css = split[1]
            self.num_bins = int(split[1])
            self.button_name = self.index_for_css
            # 0 is "auto"
            if self.num_bins == 0:
                self.num_bins = None
                self.button_name = "Auto"
        else:
            raise ValueError

        # Compact tick styling to fit the small graph footprint.
        axs.tick_params(axis='x',
                        direction='out',
                        pad=2,
                        labelsize=8,
                        length=2)
        axs.tick_params(axis='y',
                        direction='out',
                        pad=2,
                        labelsize=8,
                        length=2)
        axs.xaxis.set_major_formatter(mtick.FuncFormatter(self.format_smart))
        # Y axis shows each bin's fraction of rows, rendered as a percentage.
        axs.yaxis.set_major_formatter(
            mtick.PercentFormatter(xmax=1.0, decimals=0))

        # MAIN DATA ("Under" target)
        # ---------------------------------------------
        # NOTE(review): this changes numpy's error state process-wide and never
        # restores it — every later numpy operation in the process will raise on
        # warnings. Confirm this is intentional.
        np.seterr(all='raise')
        # WORKAROUND histogram warnings
        # Drop NaN rows so hist()/np.histogram do not warn or raise.
        cleaned_source = to_process.source[~np.isnan(to_process.source)]
        if len(cleaned_source):
            # Per-row weight 1/N so bin heights are fractions summing to 1.0.
            norm_source = np.full(len(cleaned_source),
                                  1.0 / len(cleaned_source))
        else:
            norm_source = []
        if to_process.compare is not None:
            # COMPARE
            cleaned_compare = to_process.compare[~np.isnan(to_process.compare)]
            # Passing a tuple makes hist() draw the two datasets as grouped bars.
            plot_data = (cleaned_source, cleaned_compare)
            if len(cleaned_compare):
                norm_compare = np.full(len(cleaned_compare),
                                       1.0 / len(cleaned_compare))
            else:
                norm_compare = []
            normalizing_weights = (norm_source, norm_compare)

        else:
            plot_data = cleaned_source
            normalizing_weights = norm_source

        gap_percent = config["Graphs"].getfloat(
            "summary_graph_categorical_gap")

        # hist() returns (counts, bin_edges, patches); with two datasets,
        # counts is a pair of arrays — see bin_counts[0]/[1] accesses below.
        self.hist_specs = axs.hist(plot_data, weights = normalizing_weights, bins=self.num_bins, \
                                   rwidth = (100.0 - gap_percent) / 100.0)
        bin_limits = self.hist_specs[1]
        num_bins = len(bin_limits) - 1
        bin_counts = self.hist_specs[0]

        # Format x ticks
        x_ticks = plt.xticks()
        # tick_range = max(x_ticks[0]) - min(x_ticks[0])
        new_labels = [
            sv_html_formatters.fmt_smart_range_tight(val, max(x_ticks[0]))
            for val in x_ticks[0]
        ]
        plt.xticks(x_ticks[0], new_labels)

        # TARGET
        # ---------------------------------------------
        if to_process.source_target is not None:
            if to_process.predetermined_type_target == FeatureType.TYPE_NUM:
                # TARGET: IS NUMERIC
                # Overlay: mean target value per histogram bin, on a twin axis.
                # Create a series where each item indicates its bin
                # TODO: possible 1-off bug in counts from cut in lower bin
                source_bins_series = pd.cut(to_process.source,
                                            bins=bin_limits,
                                            labels=False)
                # Create empty bin_averages, then fill in with values
                bin_averages = [None] * num_bins
                for b in range(0, num_bins):
                    bin_averages[b] = \
                        to_process.source_target[source_bins_series == b].mean()

                # TODO: verify number of bins
                # hist() bins are equal-width, so half the first bin centers markers.
                bin_offset_x = (bin_limits[1] - bin_limits[0]) / 2.0
                ax2 = axs.twinx()
                ax2.yaxis.set_major_formatter(
                    mtick.FuncFormatter(self.format_smart))
                ax2.plot(bin_limits[:-1] + bin_offset_x, bin_averages, \
                         marker='o', color=sweetviz.graph.COLOR_TARGET_SOURCE)

                if to_process.compare is not None and \
                        to_process.compare_target is not None:
                    # TARGET NUMERIC: with compare TARGET
                    compare_bins_series = pd.cut(to_process.compare,
                                                 bins=bin_limits,
                                                 labels=False)
                    bin_averages = [None] * num_bins
                    for b in range(0, num_bins):
                        bin_averages[b] = \
                            to_process.compare_target[compare_bins_series == b].mean()
                    ax2.plot(bin_limits[:-1] + bin_offset_x, bin_averages, \
                             marker='o', color=sweetviz.graph.COLOR_TARGET_COMPARE)
            elif to_process.predetermined_type_target == FeatureType.TYPE_BOOL:
                # TARGET: IS BOOL
                # Overlay: fraction of target==1 rows per bin, sharing the % axis.
                source_true = to_process.source[to_process.source_target == 1]
                source_bins_series = pd.cut(source_true,
                                            bins=bin_limits,
                                            labels=False)
                # bin_counts holds normalized fractions (weights above), so
                # multiply by N to recover raw per-bin row counts. With a
                # compare present, bin_counts[0] is the source dataset's array.
                total_counts_source = bin_counts[
                    0] if to_process.compare is not None else bin_counts
                total_counts_source = total_counts_source * len(cleaned_source)
                bin_true_counts_source = [None] * num_bins
                for b in range(0, num_bins):
                    if total_counts_source[b] > 0:
                        bin_true_counts_source[b] = \
                            source_true[source_bins_series == b].count() \
                            / total_counts_source[b]
                    else:
                        # Empty bin: leave the point undefined (gap in the line).
                        bin_true_counts_source[b] = None
                # TODO: verify number of bins
                bin_offset_x = (bin_limits[1] - bin_limits[0]) / 2.0
                # bin_offset_x = 0

                # Share % axis
                # ax2 = axs.twinx()
                ax2 = axs
                ax2.yaxis.set_major_formatter(
                    mtick.PercentFormatter(xmax=1.0, decimals=0))
                ax2.plot(bin_limits[:-1] + bin_offset_x, bin_true_counts_source, \
                         marker='o', color=sweetviz.graph.COLOR_TARGET_SOURCE)

                if to_process.compare is not None and \
                        to_process.compare_target is not None:
                    # TARGET BOOL: with compare TARGET
                    compare_true = to_process.compare[to_process.compare_target
                                                      == 1]

                    # Create a series where each item indicates its bin
                    # TODO: possible 1-off bug in counts from cut in lower bin
                    compare_bins_series = pd.cut(compare_true,
                                                 bins=bin_limits,
                                                 labels=False)
                    # bin_counts[1] is the compare dataset's normalized counts.
                    total_counts_compare = bin_counts[1] * len(cleaned_compare)
                    bin_true_counts_compare = [None] * num_bins
                    for b in range(0, num_bins):
                        if total_counts_compare[b] > 0:
                            bin_true_counts_compare[b] = \
                                compare_true[compare_bins_series == b].count() \
                                    / total_counts_compare[b]
                        else:
                            bin_true_counts_compare[b] = None

                    ax2.plot(bin_limits[:-1] + bin_offset_x, bin_true_counts_compare, \
                             marker='o', color=sweetviz.graph.COLOR_TARGET_COMPARE)
                # Anchor the rate axis at 0; the upper limit stays automatic.
                ax2.set_ylim([0, None])

                # elif to_process.compare is not None:
                #     # TARGET BOOL: only on source, but there's a compare
                #     source_true = to_process.source[to_process.source_target == 1]
                #     normalizing_weights = np.full(len(source_true),
                #                                    1.0 / len(to_process.source))
                #     b, x, patches = axs.hist(to_process.source[to_process.source_target == 1],
                #              bins = bin_limits, color = ("k"), alpha = 0.8,
                #              weights = normalizing_weights, rwidth = 0.4)
                #
                #     # Make positions of target patches match original patches
                #     for target_patch, source_patch in zip(patches, self.hist_specs[2][0]):
                #         target_patch.set_x(source_patch.get_x())
                #
                #         # Values
                #         if is_detail:
                #             axs.annotate(f'{int(source_patch.get_height())}', xy=(source_patch.get_x() +
                #                                                                   source_patch.get_width() / 2, source_patch.get_height()),
                #                     xytext=(0, 5), textcoords='offset points', ha='center', va='bottom')
                # else:
                #     # TARGET BOOL: with only a source
                #     source_true = to_process.source[to_process.source_target == 1]
                #     normalizing_weights = np.full(len(source_true),
                #                                    1.0 / len(to_process.source))
                #     axs.hist(source_true, bins = bin_limits,
                #              color = 'k', alpha = 0.8, weights = normalizing_weights)
            else:
                # Targets must be numeric or boolean; anything else is a bug upstream.
                raise ValueError

        # Finalize Graph
        # -----------------------------
        # Convert the absolute pixel padding into figure-fraction values that
        # subplots_adjust() expects (divide by figure dimension in pixels).
        self.size_in_inches = f.get_size_inches()
        if which_graph == "mini":
            needed_pixels_padding = np.array([4.0, 32, 15,
                                              45])  # TOP-LEFT-BOTTOM-RIGHT
        else:
            needed_pixels_padding = np.array([5.0, 32, 15,
                                              45])  # TOP-LEFT-BOTTOM-RIGHT
        # NOTE(review): this is an alias, not a copy — the in-place divisions
        # below also mutate needed_pixels_padding. Harmless here (it is not
        # reused), but worth confirming if the code is ever refactored.
        padding_fraction = needed_pixels_padding
        # Top/bottom are fractions of figure height; left/right of figure width.
        padding_fraction[0] = padding_fraction[0] / (self.size_in_inches[1] *
                                                     f.dpi)
        padding_fraction[2] = padding_fraction[2] / (self.size_in_inches[1] *
                                                     f.dpi)
        padding_fraction[3] = padding_fraction[3] / (self.size_in_inches[0] *
                                                     f.dpi)
        padding_fraction[1] = padding_fraction[1] / (self.size_in_inches[0] *
                                                     f.dpi)
        plt.subplots_adjust(top=(1.0 - padding_fraction[0]), left=padding_fraction[1], \
                bottom=padding_fraction[2], right=(1.0 - padding_fraction[3]))
        # Serialize the finished figure for embedding in the HTML report.
        self.graph_base64 = self.get_encoded_base64(f)
        # Close every open figure so repeated report generation does not leak memory.
        plt.close('all')
        #plt.close(f)
        # print(matplotlib.rcParams)
        return