Example #1
    def __init__(self,
                 source: Union[pd.DataFrame, Tuple[pd.DataFrame, str]],
                 target_feature_name: str = None,
                 compare: Union[pd.DataFrame, Tuple[pd.DataFrame, str]] = None,
                 pairwise_analysis: str = 'auto',
                 fc: FeatureConfig = None):
        pairwise_analysis = pairwise_analysis.lower()
        if pairwise_analysis not in ["on", "auto", "off"]:
            raise ValueError('"pairwise_analysis" parameter should be one of: "on", "auto", "off"')

        sv_html.load_layout_globals_from_config()

        self._jupyter_html = ""
        self._page_html = ""
        self._features = dict()
        self.compare_name = None
        self._target = None
        self.test_mode = False
        if fc is None:
            fc = FeatureConfig()

        # Associations: _associations[FEATURE][GIVES INFORMATION ABOUT THIS FEATURE]
        self._associations = dict()
        self._associations_compare = dict()
        self._association_graphs = dict()
        self._association_graphs_compare = dict()

        # Handle source and compare dataframes and names
        if isinstance(source, pd.DataFrame):
            source_df = source
            self.source_name = "DataFrame"
        elif isinstance(source, (list, tuple)):
            if len(source) != 2:
                raise ValueError('"source" parameter should either be a DataFrame or a list/tuple of 2 elements: [dataframe, "Name"].')
            source_df = source[0]
            self.source_name = source[1]
        else:
            raise ValueError('"source" parameter should either be a DataFrame or a list/tuple of 2 elements: [dataframe, "Name"].')
        if len(su.get_duplicate_cols(source_df)) > 0:
            raise ValueError('Duplicate column names detected in "source"; this is not supported.')

        # NEW (12-14-2020): Rename indices that use the reserved name "index"
        # From pandas-profiling:
        # If the DataFrame contains a column or index named `index`, this will produce errors. We rename the {index,column} to be `df_index`.
        if 'index' in source_df.columns:
            source_df = source_df.rename(columns={"index": "df_index"})
            if target_feature_name == 'index':
                target_feature_name = 'df_index'

        all_source_names = [cur_name for cur_name, cur_series in source_df.items()]
        if compare is None:
            compare_df = None
            self.compare_name = None
            all_compare_names = list()
        elif isinstance(compare, pd.DataFrame):
            compare_df = compare
            if 'index' in compare_df.columns:
                compare_df = compare_df.rename(columns={"index": "df_index"})
            self.compare_name = "Compared"
            all_compare_names = [cur_name for cur_name, cur_series in compare_df.items()]
        elif isinstance(compare, (list, tuple)):
            if len(compare) != 2:
                raise ValueError('"compare" parameter should either be a DataFrame or a list/tuple of 2 elements: [dataframe, "Name"].')
            compare_df = compare[0]
            if 'index' in compare_df.columns:
                compare_df = compare_df.rename(columns={"index": "df_index"})
            self.compare_name = compare[1]
            all_compare_names = [cur_name for cur_name, cur_series in compare_df.items()]
        else:
            raise ValueError('"compare" parameter should either be a DataFrame or a list/tuple of 2 elements: [dataframe, "Name"].')

        # Validate some params
        if compare_df is not None and len(su.get_duplicate_cols(compare_df)) > 0:
            raise ValueError('Duplicate column names detected in "compare"; this is not supported.')


        if target_feature_name in fc.skip:
            raise ValueError(f'"{target_feature_name}" was also specified as "skip". Target cannot be skipped.')

        for key in fc.get_all_mentioned_features():
            if key not in all_source_names:
                raise ValueError(f'"{key}" was specified in "feature_config" but is not found in source dataframe (watch case-sensitivity?).')

        # Find Features and Target (FILTER SKIPPED)
        filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.items()
                                           if cur_name not in fc.skip]
        for skipped in fc.skip:
            if skipped not in all_source_names and skipped not in all_compare_names:
                raise ValueError(f'"{skipped}" was marked as "skip" but is not in any provided dataframe (watch case-sensitivity?).')

        # Progress bar setup
        ratio_progress_of_df_summary_vs_feature = 1.0
        number_features = len(filtered_series_names_in_source)
        exponential_checks = number_features * number_features
        progress_chunks = ratio_progress_of_df_summary_vs_feature \
                            + number_features + (0 if target_feature_name is not None else 0)

        self.progress_bar = tqdm(total=progress_chunks,
                                 bar_format='{desc:45}|{bar}| [{percentage:3.0f}%]   {elapsed} -> ({remaining} left)',
                                 ascii=False, dynamic_ncols=True, position=0, leave=True)

        # Summarize dataframe
        self.progress_bar.set_description_str("[Summarizing dataframe]")
        self.summary_source = dict()
        self.summarize_dataframe(source_df, self.source_name, self.summary_source, fc.skip)
        # UPDATE 2021-02-05: Count the target as an actual feature (it is one!)
        # if target_feature_name:
        #     self.summary_source["num_columns"] = self.summary_source["num_columns"] - 1
        if compare_df is not None:
            self.summary_compare = dict()
            self.summarize_dataframe(compare_df, self.compare_name, self.summary_compare, fc.skip)
            cmp_not_in_src = \
                [name for name in all_compare_names if name not in all_source_names]
            self.summary_compare["num_cmp_not_in_source"] = len(cmp_not_in_src)
            # UPDATE 2021-02-05: Count the target as an actual feature (it is one!)
            # if target_feature_name:
            #     if target_feature_name in compare_df.columns:
            #         self.summary_compare["num_columns"] = self.summary_compare["num_columns"] - 1
        else:
            self.summary_compare = None
        self.progress_bar.update(ratio_progress_of_df_summary_vs_feature)

        self.num_summaries = number_features

        # Association check
        if pairwise_analysis == 'auto' and \
                number_features > config["Processing"].getint("association_auto_threshold"):
            print(f"PAIRWISE CALCULATION LENGTH WARNING: There are {number_features} features in "
                  f"this dataframe and the "
                  f"'pairwise_analysis' parameter is set to 'auto'.\nPairwise analysis is exponential in "
                  f"length: {number_features} features will cause ~"
                  f"{number_features * number_features} pairs to be "
                  f"evaluated, which could take a long time.\n\nYou must call the function with the "
                  f"parameter pairwise_analysis='on' or 'off' to explicitly select desired behavior."
                  )
            self.progress_bar.close()
            return

        # Validate and process TARGET
        target_to_process = None
        target_type = None
        if target_feature_name:
            # Make sure target exists
            self.progress_bar.set_description_str(f"Feature: {target_feature_name} (TARGET)")
            targets_found = [item for item in filtered_series_names_in_source
                             if item == target_feature_name]
            if len(targets_found) == 0:
                self.progress_bar.close()
                raise KeyError(f"Feature '{target_feature_name}' was "
                               f"specified as TARGET, but is NOT FOUND in "
                               f"the dataframe (watch case-sensitivity?).")

            # Make sure target has no nan's
            if source_df[targets_found[0]].isnull().values.any():
                self.progress_bar.close()
                raise ValueError(f"\nTarget feature '{targets_found[0]}' contains NaN (missing) values.\n"
                               f"To avoid confusion in interpreting target distribution,\n"
                               f"target features MUST NOT have any missing values at this time.\n")

            # Find Target in compared, if present
            compare_target_series = None
            if compare_df is not None:
                if target_feature_name in compare_df.columns:
                    if compare_df[target_feature_name].isnull().values.any():
                        self.progress_bar.close()
                        raise ValueError(
                            f"\nTarget feature '{target_feature_name}' in COMPARED data contains NaN (missing) values.\n"
                            f"To avoid confusion in interpreting target distribution,\n"
                            f"target features MUST NOT have any missing values at this time.\n")
                    compare_target_series = compare_df[target_feature_name]

            # TARGET processed HERE with COMPARE if present
            target_to_process = FeatureToProcess(-1, source_df[targets_found[0]], compare_target_series,
                                                 None, None, fc.get_predetermined_type(targets_found[0]))
            self._target = sa.analyze_feature_to_dictionary(target_to_process)
            filtered_series_names_in_source.remove(targets_found[0])
            target_type = self._target["type"]
            self.progress_bar.update(1)

        # Set final target series and sanitize targets (e.g. bool->truly bool)
        source_target_series = None
        compare_target_series = None
        if target_feature_name:
            if target_feature_name not in source_df.columns:
                raise ValueError(f'Target feature "{target_feature_name}" not found in source dataframe.')
            if self._target["type"] == sa.FeatureType.TYPE_BOOL:
                source_target_series = self.get_sanitized_bool_series(source_df[target_feature_name])
            else:
                source_target_series = source_df[target_feature_name]

            if compare_df is not None:
                if target_feature_name in compare_df.columns:
                    if self._target["type"] == sa.FeatureType.TYPE_BOOL:
                        compare_target_series = self.get_sanitized_bool_series(
                            compare_df[target_feature_name])
                    else:
                        compare_target_series = compare_df[target_feature_name]

        # Create list of features to process
        features_to_process = []
        for cur_order_index, cur_series_name in enumerate(filtered_series_names_in_source):
            # TODO: BETTER HANDLING OF DIFFERENT COLUMNS IN SOURCE/COMPARE
            if compare_df is not None and cur_series_name in compare_df.columns:
                this_feat = FeatureToProcess(cur_order_index,
                                             source_df[cur_series_name],
                                             compare_df[cur_series_name],
                                             source_target_series,
                                             compare_target_series,
                                             fc.get_predetermined_type(cur_series_name),
                                             target_type)
            else:
                this_feat = FeatureToProcess(cur_order_index,
                                             source_df[cur_series_name],
                                             None,
                                             source_target_series,
                                             None,
                                             fc.get_predetermined_type(cur_series_name),
                                             target_type)
            features_to_process.append(this_feat)


        # Process columns -> features
        self.run_id = hex(int(time.time()))[2:] + "_"  # int() drops the decimals; [2:] strips the "0x" prefix
        # self.temp_folder = config["Files"].get("temp_folder")
        # os.makedirs(os.path.normpath(self.temp_folder), exist_ok=True)

        for f in features_to_process:
            # start = time.perf_counter()
            self.progress_bar.set_description_str(f"Feature: {f.source.name}")
            self._features[f.source.name] = sa.analyze_feature_to_dictionary(f)
            self.progress_bar.update(1)
            # print(f"DONE FEATURE------> {f.source.name}"
            #       f" {(time.perf_counter() - start):.2f}   {self._features[f.source.name]['type']}")
        # self.progress_bar.set_description_str('[FEATURES DONE]')
        # self.progress_bar.close()

        # Wrap up summary
        self.summarize_category_types(source_df, self.summary_source, fc.skip, self._target)
        if compare is not None:
            self.summarize_category_types(compare_df, self.summary_compare, fc.skip, self._target)
        self.dataframe_summary_html = sv_html.generate_html_dataframe_summary(self)

        self.graph_legend = GraphLegend(self)

        # Process all associations
        # ----------------------------------------------------
        # Put target first
        if target_to_process is not None:
            features_to_process.insert(0, target_to_process)

        if pairwise_analysis.lower() != 'off':
            self.progress_bar.reset(total=len(features_to_process))
            self.progress_bar.set_description_str("[Step 2/3] Processing Pairwise Features")
            self.process_associations(features_to_process, source_target_series, compare_target_series)

            self.progress_bar.reset(total=1)
            self.progress_bar.set_description_str("[Step 3/3] Generating associations graph")
            self.associations_html_source = True # Generated later in the process
            self.associations_html_compare = True # Generated later in the process
            self._association_graphs["all"] = GraphAssoc(self, "all", self._associations)
            self._association_graphs_compare["all"] = GraphAssoc(self, "all", self._associations_compare)
            self.progress_bar.set_description_str("Done! Use 'show' commands to display/save. ")
            self.progress_bar.update(1)
        else:
            self._associations = None
            self._associations_compare = None
            self.associations_html_source = None
            self.associations_html_compare = None
        self.progress_bar.close()
        return
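
The constructor above is what sweetviz runs when it builds a report. As a rough usage sketch of the public API that ends up calling this __init__ (assuming sweetviz is installed, and using a hypothetical "my_data.csv" file with a "target" column):

# Rough usage sketch; "my_data.csv" and its "target" column are hypothetical.
import pandas as pd
import sweetviz as sv

df = pd.read_csv("my_data.csv")
# analyze() constructs the report object (the __init__ shown above does the work)
report = sv.analyze([df, "Train"], target_feat="target", pairwise_analysis="auto")
report.show_html("report.html")  # write the interactive HTML report to disk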
Example #2
import os
import pickle
import sys
import time

import sweetviz.series_analyzer as sa
#from sweetviz.config import config

#temp_folder = config["Files"].get("temp_folder")

# full_path_to_pickled = "../sweetviz-temp/5e52a452__click_id.pkl"
full_path_to_pickled = sys.argv[1]
with open(full_path_to_pickled, 'rb') as handle:
    feature_to_process = pickle.load(handle)
# start = time.perf_counter()

# print("OHHHHH:" + str(feature_to_process))
#print("OHHHHH:")
analysis_dictionary = sa.analyze_feature_to_dictionary(feature_to_process)
#analysis_dictionary = dict()
#print(analysis_dictionary)


split_source_path = os.path.split(full_path_to_pickled)
full_path_to_pickled_out = os.path.join(
    split_source_path[0],
    os.path.splitext(split_source_path[1])[0] + "_out.pkl")

with open(full_path_to_pickled_out, 'wb') as handle:
    pickle.dump(analysis_dictionary, handle)
# print(f"PROCESS------> {feature_to_process.source.name}"
#       f" {time.perf_counter() - start}")