def compute(self, return_scores=False):
    """
    Builds the feature importance report from the stored train and test SHAP values.

    The train and test importance tables are computed separately, joined side by side,
    sorted in descending order of `mean_abs_shap_value_test` and reduced to the four
    mean / mean-absolute SHAP columns. The intermediate tables and the final report are
    stored on the object as `importance_df_train`, `importance_df_test` and `importance_df`.

    Args:
        return_scores (bool, optional):
            When True, the train and test scores stored on the object are returned
            together with the report, i.e. the output is a tuple of
            (DataFrame, train score, test score).

    Returns:
        (pd.DataFrame or tuple(pd.DataFrame, float, float)):
            The importance report, or a tuple of the report, `self.train_score` and
            `self.test_score` when `return_scores` is True.
    """
    self._check_if_fitted()

    # Importance tables for each split; the suffix keeps their column names distinct.
    self.importance_df_train = calculate_shap_importance(
        self.shap_values_train, self.column_names, output_columns_suffix="_train"
    )
    self.importance_df_test = calculate_shap_importance(
        self.shap_values_test, self.column_names, output_columns_suffix="_test"
    )

    # Final column layout of the report: test columns first, then their train counterparts.
    report_columns = [
        "mean_abs_shap_value_test",
        "mean_abs_shap_value_train",
        "mean_shap_value_test",
        "mean_shap_value_train",
    ]

    side_by_side = pd.concat(
        [self.importance_df_train, self.importance_df_test], axis=1
    )
    ranked = side_by_side.sort_values("mean_abs_shap_value_test", ascending=False)
    self.importance_df = ranked[report_columns]

    if not return_scores:
        return self.importance_df
    return self.importance_df, self.train_score, self.test_score
def fit(self, X, y, column_names=None):
    """
    Fits the object with the provided data.

    The algorithm starts with the entire dataset, and then sequentially eliminates features. If
    [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
    or [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
    object assigned as clf, the hyperparameter optimization is applied first. Then, the SHAP feature
    importance is calculated using Cross-Validation, and `step` lowest importance features are removed.

    The order of the remaining features is kept stable between rounds (it follows the order of
    the preprocessed column names), so the column order of the data handed to the model does
    not depend on string hashing.

    Args:
        X (pd.DataFrame):
            Provided dataset.

        y (pd.Series):
            Binary labels for X.

        column_names (list of str, optional):
            List of feature names of the provided samples. If provided it will be used to overwrite
            the existing feature names. If not provided the existing feature names are used or
            default feature names are generated.

    Returns:
        (ShapRFECV): Fitted object.
    """
    # Set seed for results reproducibility
    if self.random_state is not None:
        np.random.seed(self.random_state)

    self.X, self.column_names = preprocess_data(
        X, X_name='X', column_names=column_names, verbose=self.verbose
    )
    self.y = preprocess_labels(
        y, y_name='y', index=self.X.index, verbose=self.verbose
    )
    self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

    remaining_features = current_features_set = self.column_names
    round_number = 0

    # The condition inspects the feature set used in the *previous* round, so one more
    # round is run (and reported) on the final feature set.
    while len(current_features_set) > self.min_features_to_select:
        round_number += 1

        # Get current dataset info
        current_features_set = remaining_features
        current_X = self.X[current_features_set]

        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # Optimize parameters
        if self.search_clf:
            current_search_clf = clone(self.clf).fit(current_X, self.y)
            current_clf = current_search_clf.estimator.set_params(
                **current_search_clf.best_params_
            )
        else:
            current_clf = clone(self.clf)

        # Perform CV to estimate feature importance with SHAP
        results_per_fold = Parallel(n_jobs=self.n_jobs)(
            delayed(self._get_feature_shap_values_per_fold)(
                X=current_X,
                y=self.y,
                clf=current_clf,
                train_index=train_index,
                val_index=val_index,
                scorer=self.scorer.scorer,
                verbose=self.verbose,
            )
            for train_index, val_index in self.cv.split(current_X, self.y)
        )

        # Each fold result is indexed as (shap values, train score, validation score).
        shap_values = np.vstack(
            [current_result[0] for current_result in results_per_fold]
        )
        scores_train = [current_result[1] for current_result in results_per_fold]
        scores_val = [current_result[2] for current_result in results_per_fold]

        # The SHAP columns follow the columns of current_X, i.e. current_features_set.
        shap_importance_df = calculate_shap_importance(
            shap_values, current_features_set
        )

        # Get features to remove
        features_to_remove = self._get_current_features_to_remove(
            shap_importance_df
        )

        # Order-preserving difference. A plain set difference would order the surviving
        # features by string hash, which changes between interpreter runs and therefore
        # changes the column order of current_X in the next round. dict.fromkeys keeps
        # the deduplication the set-based version performed.
        removed = set(features_to_remove)
        remaining_features = [
            feature
            for feature in dict.fromkeys(current_features_set)
            if feature not in removed
        ]

        # Report results
        self._report_current_results(
            round_number=round_number,
            current_features_set=current_features_set,
            features_to_remove=features_to_remove,
            train_metric_mean=np.round(np.mean(scores_train), 3),
            train_metric_std=np.round(np.std(scores_train), 3),
            val_metric_mean=np.round(np.mean(scores_val), 3),
            val_metric_std=np.round(np.std(scores_val), 3),
        )

        if self.verbose > 50:
            print(
                f'Round: {round_number}, Current number of features: {len(current_features_set)}, '
                f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                f'Num of features left: {len(remaining_features)}. '
                f'Removed features at the end of the round: {features_to_remove}'
            )

    self.fitted = True
    return self
def fit(self, X, y, sample_weight=None, columns_to_keep=None, column_names=None, **shap_kwargs):
    """
    Fits the object with the provided data.

    The algorithm starts with the entire dataset, and then sequentially eliminates features. If sklearn
    compatible search CV is passed as clf e.g.
    [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html),
    [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
    or [BayesSearchCV](https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html),
    the hyperparameter optimization is applied at each step of the elimination.
    Then, the SHAP feature importance is calculated using Cross-Validation,
    and `step` lowest importance features are removed.

    The order of the features is kept stable between rounds, so the column order of the data
    handed to the model does not depend on string hashing.

    Args:
        X (pd.DataFrame):
            Provided dataset.

        y (pd.Series):
            Binary labels for X.

        sample_weight (pd.Series, np.ndarray, list, optional):
            array-like of shape (n_samples,) - only use if the model you're using supports
            sample weighting (check the corresponding scikit-learn documentation).
            Array of weights that are assigned to individual samples.
            Note that they're only used for fitting of the model, not during evaluation of metrics.
            If not provided, then each sample is given unit weight.

        columns_to_keep (list of str, optional):
            List of column names to keep. If given, these columns will not be eliminated
            by the feature elimination process. However, these features will be used for the
            calculation of the SHAP values.

        column_names (list of str, optional):
            List of feature names of the provided samples. If provided it will be used to overwrite
            the existing feature names. If not provided the existing feature names are used or
            default feature names are generated.

        **shap_kwargs:
            keyword arguments passed to
            [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
            It also enables `approximate` and `check_additivity` parameters, passed while calculating
            SHAP values. The `approximate=True` causes less accurate, but faster SHAP values
            calculation, while `check_additivity=False` disables the additivity check inside SHAP.

    Returns:
        (ShapRFECV): Fitted object.

    Raises:
        ValueError: If `columns_to_keep` contains non-string elements, if the column name check
            fails, or if `min_features_to_select` plus the number of `columns_to_keep` exceeds
            the number of provided `column_names`.
    """
    # Set seed for results reproducibility
    if self.random_state is not None:
        np.random.seed(self.random_state)

    # If columns_to_keep is not provided, treat it as empty.
    # If provided, check that all the elements in columns_to_keep are of type string.
    if columns_to_keep is None:
        len_columns_to_keep = 0
    elif all(isinstance(x, str) for x in columns_to_keep):
        len_columns_to_keep = len(columns_to_keep)
    else:
        raise ValueError(
            "The current values of columns_to_keep are not allowed.All the elements should be strings."
        )

    if column_names is not None:
        # NOTE(review): despite the error message, this compares the existing labels of X
        # with column_names and never looks at columns_to_keep; it also requires X to expose
        # `.columns`. Kept as-is to preserve behaviour - confirm the intended check.
        if not all(x in column_names for x in list(X.columns)):
            raise ValueError(
                "The column names in parameter columns_to_keep and column_names are not macthing."
            )

    # Check that the total number of columns to select is less than total number of columns
    # in the data, only when both parameters are provided.
    # The count is taken from the `column_names` argument: `self.column_names` is only
    # assigned further below, so reading it here fails on a fresh object and is stale on a refit.
    if column_names is not None and columns_to_keep is not None:
        if (self.min_features_to_select + len_columns_to_keep) > len(column_names):
            raise ValueError(
                "Minimum features to select is greater than number of features."
                "Lower the value for min_features_to_select or number of columns in columns_to_keep"
            )

    self.X, self.column_names = preprocess_data(
        X, X_name="X", column_names=column_names, verbose=self.verbose
    )
    self.y = preprocess_labels(
        y, y_name="y", index=self.X.index, verbose=self.verbose
    )
    if sample_weight is not None:
        if self.verbose > 0:
            warnings.warn(
                "sample_weight is passed only to the fit method of the model, not the evaluation metrics."
            )
        sample_weight = assure_pandas_series(sample_weight, index=self.X.index)
    self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

    remaining_features = current_features_set = self.column_names
    round_number = 0

    # Stop when stopping criteria is met.
    stopping_criteria = np.max([self.min_features_to_select, len_columns_to_keep])

    # Setting up the min_features_to_select parameter.
    if columns_to_keep is not None:
        # This ensures that, if columns_to_keep is provided,
        # the last features remaining are only the columns_to_keep.
        # NOTE(review): this overwrites the configured value on the object and it is not
        # restored after fitting; presumably _get_current_features_to_remove reads it - verify
        # before changing.
        self.min_features_to_select = 0
        if self.verbose > 50:
            warnings.warn(f"Minimum features to select : {stopping_criteria}")

    # The condition inspects the feature set used in the *previous* round, so one more
    # round is run (and reported) on the final feature set.
    while len(current_features_set) > stopping_criteria:
        round_number += 1

        # Get current dataset info
        current_features_set = remaining_features

        # Order-preserving, de-duplicated union of the current features and the features to
        # keep. A set union would order the columns by string hash, which differs between
        # interpreter runs and would change the column order of current_X.
        if columns_to_keep is None:
            remaining_removeable_features = list(dict.fromkeys(current_features_set))
        else:
            remaining_removeable_features = list(
                dict.fromkeys([*current_features_set, *columns_to_keep])
            )
        current_X = self.X[remaining_removeable_features]

        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # Optimize parameters
        if self.search_clf:
            current_search_clf = clone(self.clf).fit(current_X, self.y)
            current_clf = current_search_clf.estimator.set_params(
                **current_search_clf.best_params_
            )
        else:
            current_clf = clone(self.clf)

        # Perform CV to estimate feature importance with SHAP
        results_per_fold = Parallel(n_jobs=self.n_jobs)(
            delayed(self._get_feature_shap_values_per_fold)(
                X=current_X,
                y=self.y,
                clf=current_clf,
                train_index=train_index,
                val_index=val_index,
                sample_weight=sample_weight,
                **shap_kwargs,
            )
            for train_index, val_index in self.cv.split(current_X, self.y)
        )

        # Each fold result is indexed as (shap values, train score, validation score).
        shap_values = np.vstack(
            [current_result[0] for current_result in results_per_fold]
        )
        scores_train = [current_result[1] for current_result in results_per_fold]
        scores_val = [current_result[2] for current_result in results_per_fold]

        # Calculate the shap features with remaining features and features to keep.
        shap_importance_df = calculate_shap_importance(
            shap_values, remaining_removeable_features
        )

        # Get features to remove
        features_to_remove = self._get_current_features_to_remove(
            shap_importance_df, columns_to_keep=columns_to_keep
        )

        # Order-preserving difference (same reasoning as for the union above).
        removed = set(features_to_remove)
        remaining_features = [
            feature
            for feature in dict.fromkeys(current_features_set)
            if feature not in removed
        ]

        # Report results
        self._report_current_results(
            round_number=round_number,
            current_features_set=current_features_set,
            features_to_remove=features_to_remove,
            train_metric_mean=np.round(np.mean(scores_train), 3),
            train_metric_std=np.round(np.std(scores_train), 3),
            val_metric_mean=np.round(np.mean(scores_val), 3),
            val_metric_std=np.round(np.std(scores_val), 3),
        )

        if self.verbose > 50:
            print(
                f"Round: {round_number}, Current number of features: {len(current_features_set)}, "
                f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                f"Features left: {remaining_features}. "
                f"Removed features at the end of the round: {features_to_remove}"
            )

    self.fitted = True
    return self
def fit(self, X, y, columns_to_keep=None, column_names=None):
    """
    Fits the object with the provided data.

    The algorithm starts with the entire dataset, and then sequentially eliminates features. If
    [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
    or [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
    object assigned as clf, the hyperparameter optimization is applied first. Then, the SHAP feature
    importance is calculated using Cross-Validation, and `step` lowest importance features are removed.

    The order of the features is kept stable between rounds, so the column order of the data
    handed to the model does not depend on string hashing.

    Args:
        X (pd.DataFrame):
            Provided dataset.

        y (pd.Series):
            Binary labels for X.

        columns_to_keep (list of str, optional):
            List of column names to keep. If given, these columns will not be eliminated
            by the feature elimination process. However, these features will be used for the
            calculation of the SHAP values.

        column_names (list of str, optional):
            List of feature names of the provided samples. If provided it will be used to overwrite
            the existing feature names. If not provided the existing feature names are used or
            default feature names are generated.

    Returns:
        (ShapRFECV): Fitted object.

    Raises:
        ValueError: If `columns_to_keep` contains non-string elements, if the column name check
            fails, or if `min_features_to_select` plus the number of `columns_to_keep` exceeds
            the number of provided `column_names`.
    """
    # Set seed for results reproducibility
    if self.random_state is not None:
        np.random.seed(self.random_state)

    # If columns_to_keep is not provided, treat it as empty.
    # If provided, check that all the elements in columns_to_keep are of type string.
    if columns_to_keep is None:
        len_columns_to_keep = 0
    elif all(isinstance(x, str) for x in columns_to_keep):
        len_columns_to_keep = len(columns_to_keep)
    else:
        raise ValueError(
            'The current values of columns_to_keep are not allowed.All the elements should be strings.'
        )

    if column_names is not None:
        # NOTE(review): despite the error message, this compares the existing labels of X
        # with column_names and never looks at columns_to_keep; it also requires X to expose
        # `.columns`. Kept as-is to preserve behaviour - confirm the intended check.
        if not all(x in column_names for x in list(X.columns)):
            raise ValueError(
                'The column names in parameter columns_to_keep and column_names are not macthing.'
            )

    # Check that the total number of columns to select is less than total number of columns
    # in the data, only when both parameters are provided.
    # The count is taken from the `column_names` argument: `self.column_names` is only
    # assigned further below, so reading it here fails on a fresh object and is stale on a refit.
    if column_names is not None and columns_to_keep is not None:
        if (self.min_features_to_select + len_columns_to_keep) > len(column_names):
            raise ValueError(
                'Minimum features to select is greater than number of features.'
                'Lower the value for min_features_to_select or number of columns in columns_to_keep'
            )

    self.X, self.column_names = preprocess_data(
        X, X_name='X', column_names=column_names, verbose=self.verbose
    )
    self.y = preprocess_labels(
        y, y_name='y', index=self.X.index, verbose=self.verbose
    )
    self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

    remaining_features = current_features_set = self.column_names
    round_number = 0

    # Stop when stopping criteria is met.
    stopping_criteria = np.max([self.min_features_to_select, len_columns_to_keep])

    # Setting up the min_features_to_select parameter.
    if columns_to_keep is not None:
        # This ensures that, if columns_to_keep is provided,
        # the last features remaining are only the columns_to_keep.
        # NOTE(review): this overwrites the configured value on the object and it is not
        # restored after fitting; presumably _get_current_features_to_remove reads it - verify
        # before changing.
        self.min_features_to_select = 0
        if self.verbose > 50:
            warnings.warn(f'Minimum features to select : {stopping_criteria}')

    # The condition inspects the feature set used in the *previous* round, so one more
    # round is run (and reported) on the final feature set.
    while len(current_features_set) > stopping_criteria:
        round_number += 1

        # Get current dataset info
        current_features_set = remaining_features

        # Order-preserving, de-duplicated union of the current features and the features to
        # keep. A set union would order the columns by string hash, which differs between
        # interpreter runs and would change the column order of current_X.
        if columns_to_keep is None:
            remaining_removeable_features = list(dict.fromkeys(current_features_set))
        else:
            remaining_removeable_features = list(
                dict.fromkeys([*current_features_set, *columns_to_keep])
            )
        current_X = self.X[remaining_removeable_features]

        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # Optimize parameters
        if self.search_clf:
            current_search_clf = clone(self.clf).fit(current_X, self.y)
            current_clf = current_search_clf.estimator.set_params(
                **current_search_clf.best_params_
            )
        else:
            current_clf = clone(self.clf)

        # Perform CV to estimate feature importance with SHAP
        results_per_fold = Parallel(n_jobs=self.n_jobs)(
            delayed(self._get_feature_shap_values_per_fold)(
                X=current_X,
                y=self.y,
                clf=current_clf,
                train_index=train_index,
                val_index=val_index,
                scorer=self.scorer.scorer,
                verbose=self.verbose,
            )
            for train_index, val_index in self.cv.split(current_X, self.y)
        )

        # Each fold result is indexed as (shap values, train score, validation score).
        shap_values = np.vstack(
            [current_result[0] for current_result in results_per_fold]
        )
        scores_train = [current_result[1] for current_result in results_per_fold]
        scores_val = [current_result[2] for current_result in results_per_fold]

        # Calculate the shap features with remaining features and features to keep.
        shap_importance_df = calculate_shap_importance(
            shap_values, remaining_removeable_features
        )

        # Get features to remove
        features_to_remove = self._get_current_features_to_remove(
            shap_importance_df, columns_to_keep=columns_to_keep
        )

        # Order-preserving difference (same reasoning as for the union above).
        removed = set(features_to_remove)
        remaining_features = [
            feature
            for feature in dict.fromkeys(current_features_set)
            if feature not in removed
        ]

        # Report results
        self._report_current_results(
            round_number=round_number,
            current_features_set=current_features_set,
            features_to_remove=features_to_remove,
            train_metric_mean=np.round(np.mean(scores_train), 3),
            train_metric_std=np.round(np.std(scores_train), 3),
            val_metric_mean=np.round(np.mean(scores_val), 3),
            val_metric_std=np.round(np.std(scores_val), 3),
        )

        if self.verbose > 50:
            print(
                f'Round: {round_number}, Current number of features: {len(current_features_set)}, '
                f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                f'Features left: {remaining_features}. '
                f'Removed features at the end of the round: {features_to_remove}'
            )

    self.fitted = True
    return self