Example 1
    def __init__(
        self,
        clf,
        scoring="roc_auc",
        test_prc=0.25,
        n_jobs=1,
        stats_tests_to_apply=None,
        verbose=0,
        random_state=None,
    ):
        """
        Initializes the class.

        Args:
            clf (model object):
                Binary classification model or pipeline.

            scoring (string, list of strings, probatus.utils.Scorer or list of probatus.utils.Scorers, optional):
                Metrics for which the score is calculated. It can be either a metric name or a list of metric names,
                which need to be aligned with the predefined classification scorer names in sklearn
                ([link](https://scikit-learn.org/stable/modules/model_evaluation.html)).
                Another option is using probatus.utils.Scorer to define a custom metric.

            test_prc (float, optional):
                Fraction of the input data used as the test set. By default 0.25.

            n_jobs (int, optional):
                Number of parallel executions. If -1, all available cores are used. By default 1.

            stats_tests_to_apply (None, string or list of strings, optional):
                List of statistical tests to apply. If None, no tests are applied. Available options:

                - `'ES'`: Epps-Singleton,
                - `'KS'`: Kolmogorov-Smirnov statistic,
                - `'PSI'`: Population Stability Index,
                - `'SW'`: Shapiro-Wilk based difference statistic,
                - `'AD'`: Anderson-Darling test statistic.

            verbose (int, optional):
                Controls verbosity of the output:

                - 0 - neither prints nor warnings are shown,
                - 1 - 50 - only the most important warnings and an indication of fitting progress are shown,
                - 51 - 100 - other warnings and prints are also shown,
                - above 100 - all prints and all warnings (including SHAP warnings) are presented.

            random_state (int, optional):
                Random state set at each round of feature elimination. If None, the results will not be
                reproducible, and in random search different hyperparameters might be tested at each iteration. For
                reproducible results, set it to an integer.
        """
        self.clf = clf
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.test_prc = test_prc
        self.iterations_results = None
        self.report = None
        self.verbose = verbose
        self.allowed_stats_tests = list(DistributionStatistics.statistical_test_dict.keys())

        # TODO: set a reasonable default value for this parameter, so that the statistical test is chosen for the
        #  user depending on how volatility is computed
        if stats_tests_to_apply is not None:
            self.stats_tests_to_apply = assure_list_of_strings(stats_tests_to_apply, "stats_tests_to_apply")
            assure_list_values_allowed(
                variable=self.stats_tests_to_apply,
                variable_name="stats_tests_to_apply",
                allowed_values=self.allowed_stats_tests,
            )
        else:
            self.stats_tests_to_apply = []

        self.stats_tests_objects = []
        if len(self.stats_tests_to_apply) > 0:
            if self.verbose > 0:
                warnings.warn(
                    "Computing statistics for distributions is an experimental feature. While using it, keep "
                    "in mind that the samples of metrics might be correlated."
                )
            for test_name in self.stats_tests_to_apply:
                self.stats_tests_objects.append(DistributionStatistics(statistical_test=test_name))

        self.scorers = get_scorers(scoring)
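
    # A minimal usage sketch for the constructor above. The enclosing class name is not shown in this
    # snippet, so `VolatilityEstimator` below is a hypothetical placeholder; the keyword arguments follow
    # the signature documented above.
    #
    #     from sklearn.ensemble import RandomForestClassifier
    #
    #     clf = RandomForestClassifier(random_state=42)
    #     volatility = VolatilityEstimator(
    #         clf,
    #         scoring="roc_auc",
    #         test_prc=0.25,
    #         stats_tests_to_apply=["KS", "PSI"],  # Kolmogorov-Smirnov and Population Stability Index
    #         random_state=42,
    #     )
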
    def plot(
        self,
        plot_type,
        target_set="test",
        target_columns=None,
        samples_index=None,
        show=True,
        **plot_kwargs,
    ):
        """
        Plots the appropriate SHAP plot.

        Args:
            plot_type (str):
                One of the following:

                - `'importance'`: Feature importance plot, SHAP bar summary plot
                - `'summary'`: SHAP Summary plot
                - `'dependence'`: Dependence plot for each feature
                - `'sample'`: Explanation of a given sample in the test data

            target_set (str, optional):
                The set for which the plot should be generated, either `train` or `test`. We recommend using the test
                set, because it is not biased by model training. The train set plots are mainly used for comparison
                with the test set plots; significant differences between them indicate a shift in the data
                distribution.

            target_columns (None, str or list of str, optional):
                List of feature names for which the plots should be generated. If None, all features are plotted.

            samples_index (None, int, list or pd.Index, optional):
                Index of the samples to be explained when `plot_type='sample'`.

            show (bool, optional):
                If True, the plots are shown to the user; otherwise they are not. Not showing a plot can be useful
                when you want to edit the returned axis before displaying it.

            **plot_kwargs:
                Keyword arguments passed to the plot method. For the 'importance' and 'summary' plot types, the
                kwargs are passed to shap.summary_plot; for the 'dependence' plot type, they are passed to the
                probatus.interpret.TreeDependencePlotter.feature_plot method.

        Returns:
            (matplotlib.axes or list(matplotlib.axes)):
                An Axes object with the plot, or a list of Axes objects when multiple plots are returned.
        """
        # Choose correct columns
        if target_columns is None:
            target_columns = self.column_names

        target_columns = assure_list_of_strings(target_columns, "target_columns")
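        # Map the selected feature names to their positional indices in the SHAP values array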
        target_columns_indices = [
            self.column_names.index(target_column) for target_column in target_columns
        ]

        # Choose the correct dataset
        if target_set == "test":
            target_X = self.X_test
            target_shap_values = self.shap_values_test
            target_tdp = self.tdp_test
            target_expected_value = self.expected_value_test
        elif target_set == "train":
            target_X = self.X_train
            target_shap_values = self.shap_values_train
            target_tdp = self.tdp_train
            target_expected_value = self.expected_value_train
        else:
            raise ValueError('The target_set parameter can be either "train" or "test".')

        if plot_type in ["importance", "summary"]:
            target_X = target_X[target_columns]
            target_shap_values = target_shap_values[:, target_columns_indices]
            # Set summary plot settings
            if plot_type == "importance":
                plot_type = "bar"
                plot_title = f"SHAP Feature Importance for {target_set} set"
            else:
                plot_type = "dot"
                plot_title = f"SHAP Summary plot for {target_set} set"

            shap.summary_plot(
                target_shap_values,
                target_X,
                plot_type=plot_type,
                class_names=self.class_names,
                show=False,
                **plot_kwargs,
            )
            ax = plt.gca()
            ax.set_title(plot_title)

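            # Add the precomputed results text as an annotation placed below the plot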
            ax.annotate(
                self.results_text,
                (0, 0),
                (0, -50),
                fontsize=12,
                xycoords="axes fraction",
                textcoords="offset points",
                va="top",
            )
            if show:
                plt.show()
            else:
                plt.close()
        elif plot_type == "dependence":
            ax = []
            for feature_name in target_columns:
                ax.append(
                    target_tdp.plot(feature=feature_name, figsize=(10, 7), show=show)
                )

        elif plot_type == "sample":
            # Ensure the correct samples_index type
            if samples_index is None:
                raise (
                    ValueError(
                        "For sample plot, you need to specify the samples_index be plotted plot"
                    )
                )
            elif isinstance(samples_index, int) or isinstance(samples_index, str):
                samples_index = [samples_index]
            elif not (
                isinstance(samples_index, list) or isinstance(samples_index, pd.Index)
            ):
                raise (
                    TypeError(
                        "sample_index must be one of the following: int, str, list or pd.Index"
                    )
                )

            ax = []
            for sample_index in samples_index:
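                # Translate the sample's index label into its positional row within the target set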
                sample_loc = target_X.index.get_loc(sample_index)

                shap.plots._waterfall.waterfall_legacy(
                    target_expected_value,
                    target_shap_values[sample_loc, :],
                    target_X.loc[sample_index],
                    show=False,
                    **plot_kwargs,
                )

                plot_title = f"SHAP Sample Explanation of {target_set} sample for index={sample_index}"
                current_ax = plt.gca()
                current_ax.set_title(plot_title)
                ax.append(current_ax)
                if show:
                    plt.show()
                else:
                    plt.close()
        else:
            raise ValueError(
                "Wrong plot type, select from 'importance', 'summary', 'dependence', or 'sample'"
            )

        if isinstance(ax, list) and len(ax) == 1:
            ax = ax[0]
        return ax
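
    # A minimal usage sketch for the plot method above, assuming the object (here called `interpreter`,
    # a hypothetical name) has already been fitted so that X_test, shap_values_test, etc. are populated;
    # 'age' is an assumed feature name.
    #
    #     ax = interpreter.plot("importance")                            # SHAP bar summary for the test set
    #     axes = interpreter.plot("dependence", target_columns=["age"])  # dependence plot per feature
    #     axes = interpreter.plot("sample", samples_index=[1, 5])        # waterfall plot per sample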