def perm_importances(model, X, y, features=None, n_examples=None, n_mc_samples=100):
    """
    Calculate permutation importances for a BNN or its mimic. Also returns the time
    taken, so the result is a 2-tuple (array of importance values, time).

    Args:
        model: a BNN_Classifier, RandomForestClassifier or GradientBoostingClassifier
        X, y: examples and labels. The permutation importances are computed by
            shuffling columns of X and seeing how the prediction accuracy for y
            is affected.
        features: which features to compute importances for. Default (None) computes
            them for every feature; otherwise pass a list of integers.
        n_examples: how many examples to use in the computation. Default (None) uses
            all the examples; otherwise choose a positive integer less than the
            number of rows of X/y.
        n_mc_samples: number of MC samples (BNN only)

    Returns a 1D array of permutation importance values in the same order as the
    columns of X, plus the time taken.
    """
    X_df, y_df = pd.DataFrame(X), pd.DataFrame(y)
    X_df.columns = X_df.columns.map(str)  # rfpimp doesn't like integer column names
    if n_examples is None:
        n_examples = -1
    start_time = time.time()
    if isinstance(model, BNN_Classifier):
        imp_vals = np.squeeze(
            rfp.importances(model, X_df, y_df,
                            metric=lambda model, X, y, sw: model.score(
                                X, y, n_mc_samples, sample_weight=sw),
                            n_samples=n_examples, sort=False).values)
    elif isinstance(model, (RandomForestClassifier, GradientBoostingClassifier)):
        imp_vals = np.squeeze(
            rfp.importances(model, X_df, y_df,
                            n_samples=n_examples, sort=False).values)
    time_taken = time.time() - start_time
    return imp_vals, time_taken

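# The docstring above describes the permutation mechanism itself: shuffle one
# column of X and measure how much the prediction score for y drops. A minimal
# sketch of that idea without rfpimp (assuming a fitted scikit-learn estimator;
# clf, X_valid and y_valid are placeholder names) might look like this:
import numpy as np
import pandas as pd

def manual_perm_importance(clf, X_valid, y_valid, random_state=0):
    """Drop in score when each column of X_valid is shuffled in turn."""
    rng = np.random.default_rng(random_state)
    baseline = clf.score(X_valid, y_valid)
    drops = {}
    for col in X_valid.columns:
        X_perm = X_valid.copy()
        # break the link between this feature and the target
        X_perm[col] = rng.permutation(X_perm[col].values)
        drops[col] = baseline - clf.score(X_perm, y_valid)
    return pd.Series(drops, name="Importance")
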
def feature_importance(x, y, **kwargs):
    """Calculate and display feature importance.

    :param x: feature set
    :param y: target
    :keyword n_best: number of displayed features; show all if None
    :keyword n_jobs: number of parallel jobs
    :return: list of the n_best most important features
    """
    n_best = kwargs.get('n_best', None)
    n_jobs = kwargs.get('n_jobs', 1)
    best_features = []
    model = RandomForestRegressor(n_estimators=50, n_jobs=n_jobs)
    model.fit(x, y)
    logger.info('%5s | %s' % ('Imp', 'Feature'))
    # use permutation importance instead of the mean-decrease-in-impurity mechanism
    feat_imp = rfpimp.importances(model, x, y)
    i = 0
    for index, row in feat_imp.iterrows():
        logger.info('%5.2f | %s' % (row['Importance'], index))
        i += 1
        if n_best is not None:
            best_features += [index]
            if i >= n_best:
                break
    return best_features

def fn_imp(model, X_vl, y_vl):
    imp = importances(model, X_vl, y_vl,
                      metric=make_scorer(roc_auc_score), sort=False)
    return imp['Importance']

def feature_importance(x, y, n_best=None, n_jobs=1):
    """Calculate and display feature importance."""
    best_features = []
    model = RandomForestRegressor(n_estimators=50, n_jobs=n_jobs)
    model.fit(x, y)
    logger.info('%5s | %s' % ('Imp', 'Feature'))
    # use permutation importance instead of the mean-decrease-in-impurity mechanism
    feat_imp = rfpimp.importances(model, x, y)
    i = 0
    for index, row in feat_imp.iterrows():
        logger.info('%5.2f | %s' % (row['Importance'], index))
        i += 1
        if n_best is not None:
            best_features += [index]
            if i >= n_best:
                break
    return best_features

def show_rf_feature_importance(clf, x: pd.DataFrame, y: pd.DataFrame):
    # rfpimp calls a custom metric as metric(model, X, y, sample_weights),
    # so accept (and ignore) the weights argument here
    def fbeta2(clf, x, y, sample_weights=None):
        return fbeta_score(y, clf.predict(x), beta=2)

    # pass the metric by keyword; the fourth positional argument of
    # importances() is `features`, not `metric`
    imp = importances(clf, x, y, metric=fbeta2)
    viz = plot_importances(imp)
    viz.view()

def get_feature_imp(model, X_train, y_train, X_test, y_test, return_n_top_features=10):
    # X_train, X_test, Y_train, Y_test = get_train_test_split(X, Y)
    model.fit(X_train, y_train)
    imp = importances(model, X_test, y_test)
    return imp.head(n=return_n_top_features), imp

def get_feature_imp(model, X_train, y_train, X_test, y_test, return_n_top_features=75):
    model.fit(X_train, y_train)
    imp = importances(model, X_test, y_test)
    # print(imp)
    return imp.head(n=return_n_top_features), imp

def permutation_importance(model, X_test, y_test):
    imp = importances(model, X_test, y_test)
    viz = plot_importances(imp[0:9],
                           yrot=0,
                           label_fontsize=12,
                           width=12,
                           minheight=1.5,
                           vscale=2.0,
                           imp_range=(0, imp['Importance'].max() + .03),
                           color='#484c51',
                           bgcolor='#F1F8FE',  # seaborn uses '#F1F8FE'
                           xtick_precision=2,
                           title='Permutation Importances')
    return viz

def get_rf_feat_importance(rf: ForestRegressor, inputs: pd.DataFrame, targets: np.ndarray,
                           weights: Optional[np.ndarray] = None) -> pd.DataFrame:
    r'''
    Compute feature importance for a Random Forest model using rfpimp.

    Arguments:
        rf: trained Random Forest model
        inputs: input data as Pandas DataFrame
        targets: target data as Numpy array
        weights: optional data weights as Numpy array
    '''
    return importances(rf, inputs, targets,
                       features=inputs.columns,
                       sample_weights=weights).reset_index()

def _permutationimportance(self):
    """
    Finds the permutation importance and saves the importance.csv file in the
    Artefacts/exp_num folder
    """
    X_valid = self.df_holdout.drop(columns=eval(self.params['ignorecols']))
    y_valid = self.df_holdout[self.params['targetcol']]
    imp = importances(self.pipeline, X_valid, y_valid).reset_index()
    # normalise so the importances sum to 1
    imp['Importance'] = imp['Importance'] / imp['Importance'].sum()
    return imp

def feature_import(self):
    '''
    Determine relative feature importance in the model using permutation importance.
    '''
    # rfpimp.importances returns a df with feature, importance columns;
    # name the columns so they match the feature names passed to importances()
    X_test_df = pd.DataFrame(self.X_test, columns=self.colnames)
    y_test_df = pd.DataFrame(self.y_test)
    imp = rfpimp.importances(self.model, X_test_df, y_test_df, self.colnames)
    viz = rfpimp.plot_importances(imp)
    viz.view()
    # Compute permutation feature importances for scikit-learn models using
    # k-fold cross-validation (default k=3).
    if self.cv is not None:
        cv_imp = rfpimp.cv_importances(self.model, self.X_train, self.y_train, k=self.cv)

def feature_extraction_method(self, method=Names.ELI5_PERMUTATION):
    print("Starting Feature Extraction...")
    start_time = time.time()
    if method is True:
        method = Names.ELI5_PERMUTATION

    if method == Names.ELI5_PERMUTATION:
        pi_object = PermutationImportance(self.base_run_instance.test_harness_model.model)
        pi_object.fit(self.base_run_instance.testing_data[self.base_run_instance.feature_cols_to_use],
                      self.base_run_instance.testing_data[self.base_run_instance.col_to_predict])
        feature_importances_df = pd.DataFrame()
        feature_importances_df["Feature"] = self.base_run_instance.feature_cols_to_use
        feature_importances_df["Importance"] = pi_object.feature_importances_
        feature_importances_df["Importance_Std"] = pi_object.feature_importances_std_
        feature_importances_df.sort_values(by='Importance', inplace=True, ascending=False)
        self.feature_importances = feature_importances_df.copy()
    elif method == Names.RFPIMP_PERMUTATION:
        pis = rfpimp.importances(self.base_run_instance.test_harness_model.model,
                                 self.base_run_instance.testing_data[self.base_run_instance.feature_cols_to_use],
                                 self.base_run_instance.testing_data[self.base_run_instance.col_to_predict])
        pis['Feature'] = pis.index
        pis.reset_index(inplace=True, drop=True)
        pis = pis[['Feature', 'Importance']]
        pis.sort_values(by='Importance', inplace=True, ascending=False)
        self.feature_importances = pis.copy()
    elif method == "sklearn_rf_default":
        pass  # TODO
    elif method == Names.BBA_AUDIT:
        self.bba_plots_dict = {}
        data = self.perform_bba_audit(training_data=self.base_run_instance.training_data.copy(),
                                      testing_data=self.base_run_instance.testing_data.copy(),
                                      features=self.base_run_instance.feature_cols_to_use,
                                      classifier=self.base_run_instance.test_harness_model.model,
                                      col_to_predict=self.base_run_instance.col_to_predict)
        feature_importances_df = pd.DataFrame(data, columns=["Feature", "Importance"])
        self.feature_importances = feature_importances_df.copy()
    elif method == Names.SHAP_AUDIT:
        self.shap_plots_dict = {}
        data = self.perform_shap_audit()
        feature_importances_df = pd.DataFrame(data, columns=["Feature", "Importance"])
        self.feature_importances = feature_importances_df.copy()

    print("Feature Extraction time with method {0} was: {1:.2f} seconds".format(
        method, time.time() - start_time))

def feature_import(model, X_test, y_test):
    '''
    Determine relative feature importance in the model using permutation importance.

    Dependency: import rfpimp

    Input:
        model = sklearn model (ex. model = sklearn.regressor())
        X_test = pandas DataFrame containing the features (not the target)
        y_test = the target only
    '''
    # rfpimp.importances returns a df with feature, importance columns
    colnames = X_test.columns
    X_test_df = pd.DataFrame(X_test)
    y_test_df = pd.DataFrame(y_test)
    imp = rfpimp.importances(model, X_test_df, y_test_df, colnames)
    viz = rfpimp.plot_importances(imp)
    print("Permutation Feature Importance")
    viz.view()
    return imp

def score(self, X_test, y_test, use_rfpimp=True, use_sklearn=True):
    print("The following are the results of a random forest fit to the original "
          "features appended to the Weights Matrix:")
    print("\naccuracy:", round(self.model.score(X_test, y_test), 3))
    print("precision:", round(precision_score(y_test, self.y_pred), 3))
    print("recall:", round(recall_score(y_test, self.y_pred), 3))

    if use_rfpimp:
        pimp_imps = rfpimp.importances(self.model, self.X, self.y)
        rfpimp.plot_importances(pimp_imps[0:9],
                                yrot=0,
                                label_fontsize=12,
                                width=12,
                                minheight=1.5,
                                vscale=2.0,
                                imp_range=(0, pimp_imps['Importance'].max() + .03),
                                color='#484c51',
                                bgcolor='#F1F8FE',  # seaborn uses '#F1F8FE'
                                xtick_precision=2,
                                title='Permutation Importances')

    if use_sklearn:
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        print("\nFeature ranking:")
        for feat in range(0, 10):
            print("%d. %s (%f)" % (feat + 1, self.X.columns[indices[feat]],
                                   importances[indices[feat]]))
        # plot feature importances
        plt.figure(figsize=(12, 6))
        plt.ylabel('Feature Name', size=12)
        plt.xlabel('Relative Feature Importance', size=12)
        plt.title('Sklearn Feature Importances', size=18)
        feat_importances = pd.Series(importances, index=self.X.columns)
        feat_importances.nlargest(10).plot(kind='barh')
        plt.grid(color='grey', ls=':')
        plt.show()

def permutation_importances(clf, val_x, val_y, viz=False, log=True):
    out_dict = {}

    # Get feature importances via permutation
    if log:
        sys.stderr.write("o Feature permutation importances:\n\n")
    imp = importances(clf, val_x, val_y)
    for i in range(len(imp["Importance"])):
        key, val = imp.index[i], imp["Importance"].values[i]
        out_dict[key] = val
        if log:
            sys.stderr.write(key + "=" + str(val) + "\n")
    if viz:
        viz = plot_importances(imp)
        viz.view()
        viz = plot_corr_heatmap(val_x, figsize=(7, 5))
        viz.view()
    if log:
        sys.stderr.write("\n")
    return out_dict

         metrics.precision_score(VAL_Y, PRD_Y, average='weighted'),
         metrics.recall_score(VAL_Y, PRD_Y, average='weighted'),
         metrics.jaccard_score(VAL_Y, PRD_Y, average='weighted'))
report = metrics.classification_report(VAL_Y, PRD_Y)
confusionMat = metrics.plot_confusion_matrix(
    rf, VAL_X, VAL_Y,
    display_labels=list(range(len(set(outputs[outputs.columns[0]])))),
    cmap=cm.Blues, normalize=None)
plt.savefig(strMod + '_RF.jpg', dpi=300)
featImportance = list(rf.feature_importances_)
impDC = rfp.oob_dropcol_importances(rf, TRN_X, TRN_Y.values.ravel())
impDCD = impDC.to_dict()['Importance']
impPM = rfp.importances(rf, TRN_X, TRN_Y)
impPMD = impPM.to_dict()['Importance']
###############################################################################
# Interpretability Plots
###############################################################################
feat = FEATS[4]
for feat in FEATS:
    isolate = pdp.pdp_isolate(model=rf, dataset=TRN_X,
                              model_features=FEATS, feature=feat)
    fracPlot = 2500
    (fig, axes) = pdp.pdp_plot(pdp_isolate_out=isolate, feature_name=feat,
                               center=False, x_quantile=True,

def get_importance(args: argparse.Namespace) -> None:
    """
    Source: https://explained.ai/rf-importance/index.html
    :param args: path/to/data/file.xlsx
    :return: feature importance, fit model, gridsearch results, and data transform mask.
    """
    select_from_model = False
    transform_first = False
    input_ = args.input
    p = Path(input_)
    p = p.parent
    p = p.parent
    importance = p / 'importance'
    model_checkpoints = p / 'model_checkpoints'
    rf_best_params = p / 'rf_best_params'
    transform_mask = p / 'transform_mask'
    if not importance.exists():
        importance.mkdir()
    if not model_checkpoints.exists():
        model_checkpoints.mkdir()
    if not rf_best_params.exists():
        rf_best_params.mkdir()
    if not transform_mask.exists():
        transform_mask.mkdir()

    df_orig = pd.read_excel(input_)
    orig = df_orig.values[:, 1:]  # DataFrame.as_matrix() was removed in recent pandas
    feature_names = list(df_orig.columns)[1:-1]
    whereNan = np.isnan(list(orig[:, -1]))
    olds = orig[np.logical_not(whereNan)]
    news = orig[whereNan]
    y_train = olds[:, -1]
    X_train = olds[:, :-1]
    X_test = news[:, :-1]
    Xdf = pd.DataFrame(X_train, columns=feature_names)
    ydf = pd.Series(y_train)

    # Initial feature elimination if you have a predetermined mask
    if transform_first is True:
        transform_mask_init = pd.read_csv(
            '../transform_mask/Transform_FILENAME_HERE.csv')
        X_train = X_train[:, transform_mask_init['0'].values]
        print("The initially masked Xdf is shape: ")
        print(X_train.shape)
        truth_series = pd.Series(transform_mask_init['0'], name='bools')
        Xdf = pd.DataFrame(Xdf.iloc[:, truth_series.values])
        # save_new_df = pd.DataFrame(X_train)
        # Xdf.to_excel("test_new_cols_1.xlsx")
        # save_new_df.to_excel("test_1.xlsx")

    # Feature elimination based on importance and the SelectFromModel method
    if select_from_model is True:
        print("Selecting the best features in your dataset.")
        rf = sklearn.ensemble.RandomForestRegressor(n_jobs=-1,
                                                    random_state=42,
                                                    bootstrap=True,
                                                    n_estimators=2000,
                                                    max_features=0.5)
        print("The original Xdf is shape: ")
        print(X_train.shape)
        select_fm = sklearn.feature_selection.SelectFromModel(
            estimator=rf, threshold=-np.inf, max_features=8)
        select_fm.fit_transform(X_train, y_train)
        feature_conds = select_fm.get_support()
        transform_df = pd.DataFrame(feature_conds)
        transform_df.to_csv(
            str(transform_mask) + "/Transform_FILENAME_HERE" +
            str(time.strftime("%Y-%m-%d-%I-%M")) + ".csv")
        X_train = select_fm.transform(X_train)
        print("Finished transforming the data; new xdf shape is: ")
        print(X_train.shape)
        Xdf = Xdf[Xdf.columns[feature_conds]]

    rf = sklearn.ensemble.RandomForestRegressor(n_jobs=-1, random_state=42,
                                                bootstrap=True)
    gs = sklearn.model_selection.GridSearchCV(
        rf,
        param_grid={
            'n_estimators': [i for i in range(10, 110, 10)],
            'criterion': ['mse', 'mae'],
            'max_features': [i for i in range(1, X_train.shape[1])]
        },
        scoring='neg_mean_absolute_error',
        cv=5, n_jobs=-1, refit=True, verbose=1)
    print("Optimizing the Hyperparameters. Please be patient.")
    yay = gs.fit(X_train, y_train)
    grid_search_df = pd.DataFrame(gs.cv_results_)
    grid_search_df.to_csv(
        str(rf_best_params) + '/gridsearch_FILENAME_HERE_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.csv')
    best_results_df = pd.DataFrame(gs.best_params_, index=[0])
    best_results_df.to_csv(
        str(rf_best_params) + '/gridsearch_Calphad_FILENAME_HERE_best_params_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.csv')
    rf = sklearn.ensemble.RandomForestRegressor(**yay.best_params_,
                                                random_state=42, n_jobs=-1,
                                                bootstrap=True, verbose=0)
    print("Optimal Hyperparameters located. Fitting model to these parameters now.")
    rf.fit(X_train, y_train)
    imp = rfpimp.importances(rf, Xdf, ydf)
    viz = rfpimp.plot_importances(imp)
    viz.save(str(importance) +
             f'/importances_FILENAME_HERE_-{int(time.time())}.png')
    viz.view()
    dump(rf,
         str(model_checkpoints) + '/model_checkpoint_FILENAME_HERE_' +
         str(time.strftime("%Y-%m-%d-%I-%M")) + '.joblib')

# drop players who don't have a variance of past seasons:
df = df[~df['var_ppg_y'].isna()]

# Simple model:
X = df[['ppg_y', 'MP_y', 'Age_x', 'FG%_y', 'FGA_y', 'eFG%_y', 'FT%_y', 'FTA_y',
        '3P%_y', '3PA_y', 'PF_y', 'mean_ppg_y', 'var_ppg_y']]
y = df[['ppg_x']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42)
rf_reg = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=200)
rf_reg.fit(X_train, y_train)

# cross validation:
scores = cross_validate(rf_reg, X=X_train, y=y_train,
                        scoring='neg_mean_squared_error')
# error
-scores['test_score'].mean()

# feature importances:
imp = importances(rf_reg, X_test, y_test)  # permutation
viz = plot_importances(imp, width=6, vscale=2)
viz.view()

# plotting residuals:
y_pred = rf_reg.predict(X_test)
plt.scatter(y_test.values, y_test.values - y_pred.reshape(-1, 1), alpha=0.3, c='orange')
plt.title('y_test vs residuals')
plt.xlabel('y_test')
plt.ylabel('residuals')

def get_multiple_imps(dataset, X, y, X_train, y_train, X_test, y_test,
                      n_shap=300,
                      drop_high_variance_features=True,
                      sortby='Importance',
                      stratpd_min_samples_leaf=15,
                      stratpd_cat_min_samples_leaf=5,
                      imp_n_trials=1,
                      imp_pvalues_n_trials=0,
                      n_stratpd_trees=1,
                      rf_bootstrap=False,
                      bootstrap=True,
                      catcolnames=set(),
                      min_slopes_per_x=5,
                      supervised=True,
                      # include=['Spearman', 'PCA', 'OLS', 'OLS SHAP', 'RF SHAP', "RF perm", 'StratImpact'],
                      normalize=True):
    spear_I = pca_I = ols_I = ols_shap_I = rf_I = perm_I = ours_I = None

    # Do everything for now
    include = ['Spearman', 'PCA', 'OLS', 'OLS SHAP', 'RF SHAP', "RF perm", 'StratImpact']
    # include = ['StratImpact']
    if dataset == 'bulldozer':
        include.remove('OLS')
        include.remove('OLS SHAP')

    if 'Spearman' in include:
        spear_I = spearmans_importances(X, y)

    if 'PCA' in include:
        pca_I = pca_importances(X)

    if "OLS" in include:
        # since we use coefficients, look at all data
        X_ = StandardScaler().fit_transform(X)
        X_ = pd.DataFrame(X_, columns=X.columns)
        lm = LinearRegression()
        lm.fit(X_, y)
        ols_I, score = linear_model_importance(lm, X_, y)
        print("OLS\n", ols_I)

    if "OLS SHAP" in include:
        # since we use coefficients, look at all data, explain n_shap
        X_ = StandardScaler().fit_transform(X)
        X_ = pd.DataFrame(X_, columns=X.columns)
        lm = LinearRegression()
        lm.fit(X_, y)
        ols_shap_I = shap_importances(lm, X_, X_, n_shap=n_shap)

    if "RF SHAP" in include:
        tuned_params = models[(dataset, "RF")]
        rf = RandomForestRegressor(**tuned_params, n_jobs=-1)
        rf.fit(X_train, y_train)
        rf_I = shap_importances(rf, X_train, X_test, n_shap, normalize=normalize)
        print("RF SHAP\n", rf_I)

    if "RF perm" in include:
        tuned_params = models[(dataset, "RF")]
        rf = RandomForestRegressor(**tuned_params, n_jobs=-1)
        rf.fit(X_train, y_train)
        perm_I = rfpimp.importances(rf, X_test, y_test)  # permutation; drop in test accuracy
        print("RF perm\n", perm_I)

    if "StratImpact" in include:
        # RF SHAP and RF perm get to look at the test data to decide which features
        # are more predictive and useful, for generality's sake.
        # So, we get to look at all data as well, not just training data.
        # Actually we use just training again after fixing featimp measure. (May 17, 2020)
        ours_I = featimp.importances(X_train, y_train, verbose=False,
                                     sortby=sortby,
                                     min_samples_leaf=stratpd_min_samples_leaf,
                                     cat_min_samples_leaf=stratpd_cat_min_samples_leaf,
                                     n_trials=imp_n_trials,
                                     pvalues=imp_pvalues_n_trials > 0,
                                     pvalues_n_trials=imp_pvalues_n_trials,
                                     n_trees=n_stratpd_trees,
                                     bootstrap=bootstrap,
                                     rf_bootstrap=rf_bootstrap,
                                     catcolnames=catcolnames,
                                     min_slopes_per_x=min_slopes_per_x,
                                     supervised=supervised,
                                     normalize=normalize,
                                     drop_high_stddev=2.0 if drop_high_variance_features else 9999)
        print("OURS\n", ours_I)

    if "PDP" in include:
        tuned_params = models[(dataset, "RF")]
        rf = RandomForestRegressor(**tuned_params, n_jobs=-1)
        rf.fit(X, y)
        pdpy = featimp.friedman_partial_dependences(rf, X, mean_centered=True)
        pdp_I = pd.DataFrame(data={'Feature': X.columns})
        pdp_I = pdp_I.set_index('Feature')
        pdp_I['Importance'] = np.mean(np.mean(np.abs(pdpy)), axis=1)

    d = OrderedDict()
    d['Spearman'] = spear_I
    d['PCA'] = pca_I
    d['OLS'] = ols_I
    d['OLS SHAP'] = ols_shap_I
    d['RF SHAP'] = rf_I
    d["RF perm"] = perm_I
    d['Strat'] = ours_I

    # Put both orderings for the Strat approach into the same imps dictionary
    I = featimp.Isortby(ours_I, 'Importance')
    d['StratImport'] = pd.DataFrame(I['Importance'])
    I = featimp.Isortby(ours_I, 'Impact')
    d['StratImpact'] = pd.DataFrame(I['Impact'])
    print(d['StratImport'])
    print(d['StratImpact'])
    return d

def eval_importance(
    self,
    groups: Optional[Dict[str, Sequence[str]]] = None,
    n_times: int = 10,
    ignore: Optional[Union[str, Sequence[str]]] = None,
    n_jobs: int = 1,
) -> pd.DataFrame:
    """
    Evaluate permutation feature importances.

    Args:
        groups: Groups of related features. One feature can appear in several
            groups at the same time.
        n_times: Number of times to calculate importances. Uses the mean of the results.
        ignore: Features to ignore during dropping.
        n_jobs: Number of CPUs to use. -1 to use all available.

    Returns:
        DataFrame
    """
    # Prepare features list (or nested list).
    features = self.features
    if ignore:
        features = [feat for feat in features if feat not in ignore]
    if groups:
        features = self._manage_groups(groups, features)

    # Split dataset.
    n_samples = 5000
    ratio = 0.2
    datasets = autolearn.split(x=self._x, y=self._y, test_samples=n_samples,
                               test_ratio=ratio)
    x_train, x_test, y_train, y_test = datasets
    model = autolearn.Model(task=self.task)
    model.tune(x_train, y_train, test_ratio=ratio, n_jobs=n_jobs)
    model.fit(x_train, y_train)

    kwargs = {
        "model": model,
        "X_valid": x_test,
        "y_valid": y_test,
        "features": features,
        "n_samples": -1,
    }

    # Get importances.
    imps = [rfpimp.importances(**kwargs) for _ in range(n_times)]
    imp = pd.concat(imps).groupby(level=0).mean()
    imp = imp.sort_values("Importance", ascending=False)

    # Create new columns.
    # Handle negative values by adding the absolute value of the minimum to all values.
    non_negatives = imp["Importance"].add(np.abs(imp["Importance"].min()))
    imp["Normalised Importance"] = non_negatives / non_negatives.sum()
    imp["Cumulative Importance"] = imp["Normalised Importance"].cumsum()

    self._importances = imp
    return imp

skplt.metrics.plot_roc(y, test_probs)
clf_names = ['SVM']
skplt.metrics.plot_calibration_curve(y, [test_probs], clf_names)
plot_learning_curve(svm, "SVM", Xenc, Y)

if args.importances:
    rf = RandomForestClassifier(n_jobs=-1)
    Xnum = X.drop(['cc_type', 'diff_addresses'], axis=1)
    xnum = x.drop(['cc_type', 'diff_addresses'], axis=1)
    Xpoly = poly.fit_transform(Xnum)
    xpoly = poly.transform(xnum)
    Xpoly = pd.DataFrame(Xpoly, columns=poly.get_feature_names(Xnum.columns))
    xpoly = pd.DataFrame(xpoly, columns=poly.get_feature_names(xnum.columns))
    rf.fit(Xnum, Y)  # here we assume there are no categorical variables
    imp = importances(rf, xnum, y)  # permutation
    plot_importances(imp, figsize=(8, 12))
    plot_corr_heatmap(Xnum, figsize=(11, 11))

if args.randomgrid:
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start=1000, stop=2500, num=4)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]

random_sel = [10 * i + 10 for i in range(0, 10)]
ax.plot(cumulative_sum_clients, random_sel, label="Baseline")
plt.legend(loc="upper left")
# plt.savefig('Cumulative_gain.png', bbox_inches='tight')
# pickle.dump(fig, open("Cumulative_gain.pickle", "wb"))
# endregion

# region Group permutation
# To get output that is easier to read (importance not split across correlated
# features), one should either use conditional permutation importance (not
# available in Python) or resort to the rfpimp library to compute group
# permutations.
np.random.seed(123)
group_imp = importances(best_rf.named_steps["classifier"], X_test, y_test,
                        features=list(cluster_feature), metric=custom_scorer)
fig, ax = plt.subplots()
ax.set(xlabel="Drop in $F_2$ score when the variable is perturbed")
plot_importances(group_imp, ax=ax)
plt.xticks(np.arange(min(group_imp.values), max(group_imp.values) + 0.03, 0.01))
fig.set_size_inches(10, 10)
ax.set_xlim([0, 0.10])
fig.tight_layout()
# plt.savefig('Feature_importance_group.png', bbox_inches='tight')
# pickle.dump(fig, open("Feature_importance_group.pickle", "wb"))
# endregion

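# The group-permutation convention used above: each entry in rfpimp's `features`
# argument may be a single column name or a list of column names that are permuted
# together, so correlated features share one importance value. A minimal sketch
# with hypothetical column names (assuming a fitted estimator `clf` and a DataFrame
# X_test that actually has these columns):
grouped = importances(clf, X_test, y_test,
                      features=['income', ['age', 'tenure'], 'region'])
print(grouped)  # one row per entry in `features`, including the ['age', 'tenure'] group
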
def importance(self):
    # permutation feature importance using the rfpimp library
    imp = importances(self.rf_reg, self.X_test, self.y_test)
    viz = plot_importances(imp, width=6, vscale=2)
    viz.view()
    print(imp)