def show_rf_feature_importance(clf, x: pd.DataFrame, y: pd.DataFrame):
    def fbeta2(clf, x, y):
        return fbeta_score(y, clf.predict(x), beta=2)

    # Permutation importances scored with F-beta (beta=2); pass the custom
    # metric by keyword so it is not mistaken for the `features` argument.
    imp = importances(clf, x, y, metric=fbeta2)
    viz = plot_importances(imp)
    viz.view()
def score(self, X_test, y_test, use_rfpimp=True, use_sklearn=True):
    print("The following are the results of a random forest fit to the "
          "original features appended to the Weights Matrix:")
    print("\naccuracy:", round(self.model.score(X_test, y_test), 3))
    print("precision:", round(precision_score(y_test, self.y_pred), 3))
    print("recall:", round(recall_score(y_test, self.y_pred), 3))

    if use_rfpimp:
        # Permutation importances (rfpimp), plotted for the top features
        pimp_imps = rfpimp.importances(self.model, self.X, self.y)
        rfpimp.plot_importances(
            pimp_imps[0:9],
            yrot=0,
            label_fontsize=12,
            width=12,
            minheight=1.5,
            vscale=2.0,
            imp_range=(0, pimp_imps['Importance'].max() + .03),
            color='#484c51',
            bgcolor='#F1F8FE',  # seaborn uses '#F1F8FE'
            xtick_precision=2,
            title='Permutation Importances')

    if use_sklearn:
        # Impurity-based importances from the fitted sklearn model
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        print("\nFeature ranking:")
        for feat in range(10):
            print("%d. %s (%f)" % (feat + 1,
                                   self.X.columns[indices[feat]],
                                   importances[indices[feat]]))

        # Plot the top 10 impurity-based feature importances
        plt.figure(figsize=(12, 6))
        plt.ylabel('Feature Name', size=12)
        plt.xlabel('Relative Feature Importance', size=12)
        plt.title('Sklearn Feature Importances', size=18)
        feat_importances = pd.Series(importances, index=self.X.columns)
        feat_importances.nlargest(10).plot(kind='barh')
        plt.grid(color='grey', ls=':')
        plt.show()
def feature_import(self):
    '''
    Determine relative feature importance in the model using permutation importance.
    '''
    # rfpimp.importances returns a DataFrame with Feature and Importance columns
    X_test_df = pd.DataFrame(self.X_test)
    y_test_df = pd.DataFrame(self.y_test)
    imp = rfpimp.importances(self.model, X_test_df, y_test_df, self.colnames)
    viz = rfpimp.plot_importances(imp)
    viz.view()

    # Compute permutation feature importances for scikit-learn models using
    # k-fold cross-validation (default k=3).
    if self.cv is not None:
        cv_imp = rfpimp.cv_importances(self.model, self.X_train,
                                       self.y_train, k=self.cv)
def feature_import(model, X_test, y_test):
    '''
    Determine relative feature importance in the model using permutation importance.

    dependency: import rfpimp
    input:
        model  = sklearn model (e.g. model = RandomForestRegressor())
        X_test = pandas DataFrame containing the features (not the target);
                 a DataFrame is required because the column names are read below
        y_test = array-like containing the target only
    '''
    # permutation importances returns a DataFrame with Feature, Importance columns
    colnames = X_test.columns
    X_test_df = pd.DataFrame(X_test)
    y_test_df = pd.DataFrame(y_test)
    imp = rfpimp.importances(model, X_test_df, y_test_df, colnames)
    viz = rfpimp.plot_importances(imp)
    print("Permutation Feature Importance")
    viz.view()
    return imp
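# --- Minimal usage sketch for feature_import above. The synthetic data, column
# --- names, and model settings here are illustrative assumptions, not taken
# --- from the original source.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(42)
X = pd.DataFrame(rng.normal(size=(500, 4)), columns=['a', 'b', 'c', 'd'])
y = 2 * X['a'] + X['b'] + rng.normal(scale=0.1, size=500)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

imp = feature_import(model, X_test, y_test)  # plots and returns the importance DataFrame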
def permutation_importances(clf, val_x, val_y, viz=False, log=True):
    out_dict = {}
    # Get feature importances via permutation
    if log:
        sys.stderr.write("o Feature permutation importances:\n\n")
    imp = importances(clf, val_x, val_y)
    for i in range(len(imp["Importance"])):
        key, val = imp.index[i], imp["Importance"].values[i]
        out_dict[key] = val
        if log:
            sys.stderr.write(key + "=" + str(val) + "\n")
    if viz:
        plot = plot_importances(imp)
        plot.view()
        plot = plot_corr_heatmap(val_x, figsize=(7, 5))
        plot.view()
    if log:
        sys.stderr.write("\n")
    return out_dict
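# --- Hypothetical call of the permutation_importances wrapper above; the toy
# --- dataset and classifier are stand-ins for illustration, not from the source.
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)

clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_tr, y_tr)
imp_dict = permutation_importances(clf, X_val, y_val, viz=False, log=True)
# imp_dict maps each feature name to its drop in score when that feature is permuted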
def get_importance(args: argparse.Namespace) -> None:
    """
    Source: https://explained.ai/rf-importance/index.html
    :param args: path/to/data/file.xlsx
    :return: feature importance, fit model, gridsearch results, and data transform mask.
    """
    select_from_model = False
    transform_first = False
    input_ = args.input
    p = Path(input_).parent.parent

    importance = p / 'importance'
    model_checkpoints = p / 'model_checkpoints'
    rf_best_params = p / 'rf_best_params'
    transform_mask = p / 'transform_mask'
    if not importance.exists():
        importance.mkdir()
    if not model_checkpoints.exists():
        model_checkpoints.mkdir()
    if not rf_best_params.exists():
        rf_best_params.mkdir()
    if not transform_mask.exists():
        transform_mask.mkdir()

    df_orig = pd.read_excel(input_)
    orig = df_orig.to_numpy()[:, 1:]  # .as_matrix() was removed in recent pandas
    feature_names = list(df_orig.columns)[1:-1]
    whereNan = np.isnan(list(orig[:, -1]))
    olds = orig[np.logical_not(whereNan)]
    news = orig[whereNan]
    y_train = olds[:, -1]
    X_train = olds[:, :-1]
    X_test = news[:, :-1]
    Xdf = pd.DataFrame(X_train, columns=feature_names)
    ydf = pd.Series(y_train)

    # Initial feature elimination if you have a predetermined mask
    if transform_first is True:
        transform_mask_init = pd.read_csv(
            '../transform_mask/Transform_FILENAME_HERE.csv')
        X_train = X_train[:, transform_mask_init['0'].to_numpy()]
        print("The initially masked Xdf is shape: ")
        print(X_train.shape)
        truth_series = pd.Series(transform_mask_init['0'], name='bools')
        Xdf = pd.DataFrame(Xdf.iloc[:, truth_series.values])
        # save_new_df = pd.DataFrame(X_train)
        # Xdf.to_excel("test_new_cols_1.xlsx")
        # save_new_df.to_excel("test_1.xlsx")

    # Feature elimination based on importance and the SelectFromModel method
    if select_from_model is True:
        print("Selecting the best features in your dataset.")
        rf = sklearn.ensemble.RandomForestRegressor(n_jobs=-1,
                                                    random_state=42,
                                                    bootstrap=True,
                                                    n_estimators=2000,
                                                    max_features=0.5)
        print("The original Xdf is shape: ")
        print(X_train.shape)
        select_fm = sklearn.feature_selection.SelectFromModel(
            estimator=rf, threshold=-np.inf, max_features=8)
        select_fm.fit_transform(X_train, y_train)
        feature_conds = select_fm.get_support()
        transform_df = pd.DataFrame(feature_conds)
        transform_df.to_csv(
            str(transform_mask) + "/Transform_FILENAME_HERE" +
            str(time.strftime("%Y-%m-%d-%I-%M")) + ".csv")
        X_train = select_fm.transform(X_train)
        print("Finished transforming the data; new xdf shape is: ")
        print(X_train.shape)
        Xdf = Xdf[Xdf.columns[feature_conds]]

    rf = sklearn.ensemble.RandomForestRegressor(n_jobs=-1,
                                                random_state=42,
                                                bootstrap=True)
    gs = sklearn.model_selection.GridSearchCV(
        rf,
        param_grid={
            'n_estimators': [i for i in range(10, 110, 10)],
            # 'mse'/'mae' were renamed to 'squared_error'/'absolute_error' in scikit-learn >= 1.0
            'criterion': ['mse', 'mae'],
            'max_features': [i for i in range(1, X_train.shape[1])]
        },
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
        refit=True,
        verbose=1)
    print("Optimizing the Hyperparameters. Please be patient.")
    gs_fitted = gs.fit(X_train, y_train)
    grid_search_df = pd.DataFrame(gs.cv_results_)
    grid_search_df.to_csv(
        str(rf_best_params) + '/gridsearch_FILENAME_HERE_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.csv')
    best_results_df = pd.DataFrame(gs.best_params_, index=[0])
    best_results_df.to_csv(
        str(rf_best_params) + '/gridsearch_Calphad_FILENAME_HERE_best_params_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.csv')

    rf = sklearn.ensemble.RandomForestRegressor(**gs_fitted.best_params_,
                                                random_state=42,
                                                n_jobs=-1,
                                                bootstrap=True,
                                                verbose=0)
    print("Optimal Hyperparameters located. Fitting model to these parameters now.")
    rf.fit(X_train, y_train)

    # Permutation importances on the fitted model, then save plot and checkpoint
    imp = rfpimp.importances(rf, Xdf, ydf)
    viz = rfpimp.plot_importances(imp)
    viz.save(
        str(importance) +
        f'/importances_FILENAME_HERE_-{int(time.time())}.png')
    viz.view()
    dump(
        rf,
        str(model_checkpoints) + '/model_checkpoint_FILENAME_HERE_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.joblib')
rmse_val = []
from rfpimp import importances, plot_importances


def get_feature_imp(model, X_train, y_train, X_test, y_test, return_n_top_features=15):
    model.fit(X_train, y_train)
    imp = importances(model, X_test, y_test)  # permutation importances
    # print(imp)
    return imp.head(n=return_n_top_features), imp


dropdata = fm_bd_model
K = 13
top_10_concat_features, all_f_imp_concat = get_feature_imp(
    KNeighborsClassifier(n_neighbors=K), X_train, y_train, X_test, y_test)
plot_importances(top_10_concat_features)
top_pos = top_10_concat_features.index.values

# Add some features to improve accuracy
add_pos = ['F12_Height', 'F12_Age', 'F12_Open', 'F12_Close_Best']
for item in add_pos:
    if item not in top_pos:
        top_pos = np.append(top_pos, item)

X_train_pos = X_train[top_pos]
X_test_pos = X_test[top_pos]
knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(X_train_pos, y_train)
# Dropping players who don't have variance of past season:
df = df[~df['var_ppg_y'].isna()]

# Simple model:
X = df[['ppg_y', 'MP_y', 'Age_x', 'FG%_y', 'FGA_y', 'eFG%_y', 'FT%_y', 'FTA_y',
        '3P%_y', '3PA_y', 'PF_y', 'mean_ppg_y', 'var_ppg_y']]
y = df[['ppg_x']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
rf_reg = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=200)
rf_reg.fit(X_train, y_train)

# Cross validation:
scores = cross_validate(rf_reg, X=X_train, y=y_train, scoring='neg_mean_squared_error')
# Mean squared error (sign flipped back to positive)
-scores['test_score'].mean()

# Feature importances (permutation, via rfpimp):
imp = importances(rf_reg, X_test, y_test)
viz = plot_importances(imp, width=6, vscale=2)
viz.view()

# Plotting residuals:
y_pred = rf_reg.predict(X_test)
plt.scatter(y_test.values, y_test.values - y_pred.reshape(-1, 1), alpha=0.3, c='orange')
plt.title('y_test vs residuals')
plt.xlabel('y_test')
plt.ylabel('residuals')
clf_names = ['SVM']
skplt.metrics.plot_calibration_curve(y, [test_probs], clf_names)
plot_learning_curve(svm, "SVM", Xenc, Y)

if args.importances:
    rf = RandomForestClassifier(n_jobs=-1)
    Xnum = X.drop(['cc_type', 'diff_addresses'], axis=1)
    xnum = x.drop(['cc_type', 'diff_addresses'], axis=1)
    Xpoly = poly.fit_transform(Xnum)
    xpoly = poly.transform(xnum)
    Xpoly = pd.DataFrame(Xpoly, columns=poly.get_feature_names(Xnum.columns))
    xpoly = pd.DataFrame(xpoly, columns=poly.get_feature_names(xnum.columns))
    rf.fit(Xnum, Y)  # here we assume there are no categorical variables
    imp = importances(rf, xnum, y)  # permutation importances on the held-out set
    plot_importances(imp, figsize=(8, 12))
    plot_corr_heatmap(Xnum, figsize=(11, 11))

if args.randomgrid:
    # Number of trees in random forest
    n_estimators = [int(n) for n in np.linspace(start=1000, stop=2500, num=4)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(d) for d in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
def importance(self):
    # Permutation feature importance using the rfpimp library
    imp = importances(self.rf_reg, self.X_test, self.y_test)
    viz = plot_importances(imp, width=6, vscale=2)
    viz.view()
    print(imp)
# plt.savefig('Cumulative_gain.png', bbox_inches='tight')
# pickle.dump(fig, open("Cumulative_gain.pickle", "wb"))
# endregion

# region Group permutation
# To get an output that is easier to read (importance not split across correlated
# features), one should use either conditional permutation importance (not available
# in Python) or resort to the rfpimp library to compute group permutations.
# (A self-contained sketch of the grouped-features call follows this snippet.)
np.random.seed(123)
group_imp = importances(best_rf.named_steps["classifier"], X_test, y_test,
                        features=list(cluster_feature), metric=custom_scorer)
fig, ax = plt.subplots()
ax.set(xlabel="Drop in $F_2$ score when the variable is perturbed")
plot_importances(group_imp, ax=ax)
plt.xticks(np.arange(min(group_imp.values), max(group_imp.values) + 0.03, 0.01))
fig.set_size_inches(10, 10)
ax.set_xlim([0, 0.10])
fig.tight_layout()
# plt.savefig('Feature_importance_group.png', bbox_inches='tight')
# pickle.dump(fig, open("Feature_importance_group.pickle", "wb"))
# endregion

# region Predictions boxplots
# Variables with high importance in the prediction:
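# --- Hedged sketch of group permutation importance with rfpimp, referenced in
# --- the "Group permutation" region above. The synthetic columns, the grouping,
# --- and the F2 scorer are illustrative assumptions standing in for the original
# --- cluster_feature and custom_scorer; rfpimp accepts a features list whose
# --- elements are either single column names or lists of columns permuted together.
import numpy as np
import pandas as pd
from rfpimp import importances
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(123)
X = pd.DataFrame(rng.normal(size=(600, 4)), columns=['a1', 'a2', 'b', 'c'])
X['a2'] = X['a1'] + rng.normal(scale=0.1, size=600)   # a1 and a2 are strongly correlated
y = pd.Series((X['a1'] + X['b'] > 0).astype(int))
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=123)

rf = RandomForestClassifier(n_estimators=200, random_state=123).fit(X_tr, y_tr)

def f2_scorer(model, X_valid, y_valid, sample_weights=None):
    # Custom metric: F2 score on the validation set
    return fbeta_score(y_valid, model.predict(X_valid), beta=2)

# Correlated columns are grouped so their importance is not split between them
groups = [['a1', 'a2'], 'b', 'c']
group_imp = importances(rf, X_te, y_te, features=groups, metric=f2_scorer)
print(group_imp)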
rf = RandomForestClassifier(n_estimators=150, oob_score=True, n_jobs=-1)
rf.fit(X_train, y_train)
print(rf.oob_score_)
print(rf.score(X_test, y_test))

oob = oob_classifier_accuracy(rf, X_train, y_train)
print("oob accuracy", oob)

imp = permutation_importances(rf, X_train, y_train, oob_classifier_accuracy)
plot_importances(imp)
stemplot_importances(imp, vscale=.7)

# Using dropcol_importances
imp = dropcol_importances(rf, X_train, y_train)
plot_importances(imp)

from rfpimp import oob_dropcol_importances
imp_oob_drop = oob_dropcol_importances(rf, X_train, y_train)
plot_importances(imp_oob_drop)
eli5.show_weights(perm)

# rfpimp
from rfpimp import importances, plot_importances


def mkdf(columns, imp_values):
    # Build a Feature/Importance DataFrame sorted by importance (descending)
    I = pd.DataFrame(data={'Feature': columns, 'Importance': imp_values})
    I = I.set_index('Feature')
    I = I.sort_values('Importance', ascending=False)
    return I


imp = importances(rf, X_test, y_test)  # permutation
viz = plot_importances(imp)
viz.view()

I = mkdf(X.columns, rf.feature_importances_)
I.head()
viz = plot_importances(
    I[0:10],
    imp_range=(0, .4),
    title="Feature importance via avg drop in variance (sklearn)")

############## PARTIAL DEPENDENCE PLOTS ##############
from sklearn.ensemble import GradientBoostingRegressor
# Note: plot_partial_dependence moved to sklearn.inspection in newer scikit-learn;
# the import below matches older versions (< 0.22).
from sklearn.ensemble.partial_dependence import plot_partial_dependence