def __init__(
    self,
    clf,
    scoring="roc_auc",
    test_prc=0.25,
    n_jobs=1,
    stats_tests_to_apply=None,
    verbose=0,
    random_state=None,
):
    """
    Initializes the class.

    Args:
        clf (model object): Binary classification model or pipeline.

        scoring (string, list of strings, probatus.utils.Scorer or list of probatus.utils.Scorers, optional):
            Metrics for which the score is calculated. It can be either a metric name or a list of metric
            names, and needs to be aligned with the predefined classification scorer names in sklearn
            ([link](https://scikit-learn.org/stable/modules/model_evaluation.html)). Another option is to use
            probatus.utils.Scorer to define a custom metric.

        test_prc (float, optional): Percentage of the input data used as the test set. By default 0.25.

        n_jobs (int, optional): Number of parallel executions. If -1, all available cores are used. By
            default 1.

        stats_tests_to_apply (None, string or list of strings, optional): List of tests to apply. Available
            options:

            - `'ES'`: Epps-Singleton,
            - `'KS'`: Kolmogorov-Smirnov statistic,
            - `'PSI'`: Population Stability Index,
            - `'SW'`: Shapiro-Wilk based difference statistic,
            - `'AD'`: Anderson-Darling TS.

        verbose (int, optional): Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown,
            - 1 - 50 - only the most important warnings and an indication of fitting progress are shown,
            - 51 - 100 - shows other warnings and prints,
            - above 100 - presents all prints and all warnings (including SHAP warnings).

        random_state (int, optional): Random state set at each iteration. If it is None, the results will not
            be reproducible, and in random search different hyperparameters might be tested at each iteration.
            For reproducible results set it to an integer.
    """
    self.clf = clf
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.test_prc = test_prc
    self.iterations_results = None
    self.report = None
    self.verbose = verbose
    self.allowed_stats_tests = list(DistributionStatistics.statistical_test_dict.keys())

    # TODO: set a reasonable default value for the parameter, so that a statistical test is chosen for the
    #  user for the different ways of computing volatility.
    if stats_tests_to_apply is not None:
        self.stats_tests_to_apply = assure_list_of_strings(stats_tests_to_apply, "stats_tests_to_apply")
        assure_list_values_allowed(
            variable=self.stats_tests_to_apply,
            variable_name="stats_tests_to_apply",
            allowed_values=self.allowed_stats_tests,
        )
    else:
        self.stats_tests_to_apply = []

    self.stats_tests_objects = []
    if len(self.stats_tests_to_apply) > 0:
        if self.verbose > 0:
            warnings.warn(
                "Computing statistics for distributions is an experimental feature. While using it, keep "
                "in mind that the samples of metrics might be correlated."
            )
        for test_name in self.stats_tests_to_apply:
            self.stats_tests_objects.append(DistributionStatistics(statistical_test=test_name))

    self.scorers = get_scorers(scoring)
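# A minimal usage sketch (illustrative only): the subclass name `TrainTestVolatility`, the
# `fit_compute` call and the data variables below are assumptions, not defined in this module.
#
#   from sklearn.ensemble import RandomForestClassifier
#
#   volatility = TrainTestVolatility(
#       clf=RandomForestClassifier(),
#       scoring="roc_auc",
#       test_prc=0.25,
#       stats_tests_to_apply=["KS", "PSI"],
#       random_state=42,
#   )
#   report = volatility.fit_compute(X, y)  # hypothetical: run the iterations and return the report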
def plot(
    self,
    plot_type,
    target_set="test",
    target_columns=None,
    samples_index=None,
    show=True,
    **plot_kwargs,
):
    """
    Plots the appropriate SHAP plot.

    Args:
        plot_type (str): One of the following:

            - `'importance'`: Feature importance plot, SHAP bar summary plot
            - `'summary'`: SHAP Summary plot
            - `'dependence'`: Dependence plot for each feature
            - `'sample'`: Explanation of a given sample in the test data

        target_set (str, optional): The set for which the plot should be generated, either `train` or `test`.
            We recommend using the test set, because it is not biased by model training. The train set plots
            are mainly used for comparison with the test set plots: significant differences between them
            indicate a shift in the data distribution.

        target_columns (None, str or list of str, optional): List of feature names for which the plots should
            be generated. If None, all features will be plotted.

        samples_index (None, int, list or pd.Index, optional): Index of samples to be explained if
            `plot_type=sample`.

        show (bool, optional): If True, the plots are shown to the user, otherwise they are not shown. Not
            showing the plot can be useful when you want to edit the returned axis before showing it.

        **plot_kwargs: Keyword arguments passed to the underlying plotting function. For the 'importance' and
            'summary' plot types, they are passed to shap.summary_plot; for the 'sample' plot type, to shap's
            waterfall plot; for the 'dependence' plot type, to the
            probatus.interpret.TreeDependencePlotter.feature_plot method.

    Returns:
        (matplotlib.axes or list(matplotlib.axes)): An Axes with the plot, or a list of axes when multiple
            plots are returned.
    """
    # Choose the correct columns
    if target_columns is None:
        target_columns = self.column_names

    target_columns = assure_list_of_strings(target_columns, "target_columns")
    target_columns_indices = [self.column_names.index(target_column) for target_column in target_columns]

    # Choose the dataset, SHAP values, dependence plotter and expected value matching the target set
    if target_set == "test":
        target_X = self.X_test
        target_shap_values = self.shap_values_test
        target_tdp = self.tdp_test
        target_expected_value = self.expected_value_test
    elif target_set == "train":
        target_X = self.X_train
        target_shap_values = self.shap_values_train
        target_tdp = self.tdp_train
        target_expected_value = self.expected_value_train
    else:
        raise ValueError('The target_set parameter can be either "train" or "test".')

    if plot_type in ["importance", "summary"]:
        target_X = target_X[target_columns]
        target_shap_values = target_shap_values[:, target_columns_indices]

        # Set summary plot settings
        if plot_type == "importance":
            plot_type = "bar"
            plot_title = f"SHAP Feature Importance for {target_set} set"
        else:
            plot_type = "dot"
            plot_title = f"SHAP Summary plot for {target_set} set"

        shap.summary_plot(
            target_shap_values,
            target_X,
            plot_type=plot_type,
            class_names=self.class_names,
            show=False,
            **plot_kwargs,
        )
        ax = plt.gca()
        ax.set_title(plot_title)

        ax.annotate(
            self.results_text,
            (0, 0),
            (0, -50),
            fontsize=12,
            xycoords="axes fraction",
            textcoords="offset points",
            va="top",
        )
        if show:
            plt.show()
        else:
            plt.close()
    elif plot_type == "dependence":
        ax = []
        for feature_name in target_columns:
            ax.append(target_tdp.plot(feature=feature_name, figsize=(10, 7), show=show))
    elif plot_type == "sample":
        # Ensure the correct samples_index type
        if samples_index is None:
            raise ValueError("For the sample plot, you need to specify the samples_index to be plotted.")
        elif isinstance(samples_index, (int, str)):
            samples_index = [samples_index]
        elif not isinstance(samples_index, (list, pd.Index)):
            raise TypeError("samples_index must be one of the following: int, str, list or pd.Index")

        ax = []
        for sample_index in samples_index:
            sample_loc = target_X.index.get_loc(sample_index)

            shap.plots._waterfall.waterfall_legacy(
                target_expected_value,
                target_shap_values[sample_loc, :],
                target_X.loc[sample_index],
                show=False,
                **plot_kwargs,
            )

            plot_title = f"SHAP Sample Explanation of {target_set} sample for index={sample_index}"
            current_ax = plt.gca()
            current_ax.set_title(plot_title)
            ax.append(current_ax)
            if show:
                plt.show()
            else:
                plt.close()
    else:
        raise ValueError("Wrong plot type, select from 'importance', 'summary', 'dependence' or 'sample'")

    if isinstance(ax, list) and len(ax) == 1:
        ax = ax[0]
    return ax
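# A minimal usage sketch (illustrative only): assumes `plot` belongs to a SHAP-based
# interpreter (e.g. probatus' ShapModelInterpreter) that has already been fitted; the
# `fit` signature and feature names below are assumptions.
#
#   shap_interpreter = ShapModelInterpreter(clf)
#   shap_interpreter.fit(X_train, X_test, y_train, y_test)
#
#   ax = shap_interpreter.plot("importance")  # SHAP bar summary on the test set
#   axes = shap_interpreter.plot("dependence", target_columns=["feature_1", "feature_2"])
#   ax = shap_interpreter.plot("sample", samples_index=[1, 5])  # one waterfall plot per sample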