from rfpimp import importances, plot_importances
from sklearn.metrics import fbeta_score
import pandas as pd


def show_rf_feature_importance(clf, x: pd.DataFrame, y: pd.DataFrame):
    # Custom metric; accept an optional sample_weights argument in case rfpimp
    # passes one to custom metrics.
    def fbeta2(clf, x, y, sample_weights=None):
        return fbeta_score(y, clf.predict(x), beta=2)

    # Pass the metric by keyword: the fourth positional argument of
    # importances() is `features`, not `metric`. Also avoid shadowing the
    # imported importances() function.
    imp = importances(clf, x, y, metric=fbeta2)
    viz = plot_importances(imp)
    viz.view()
Example #2
    def score(self, X_test, y_test, use_rfpimp=True, use_sklearn=True):

        print(
            "The following are the results of a random forest fit to the original features appended to the Weights Matrix:"
        )
        print("\naccuracy:", round(self.model.score(X_test, y_test), 3))
        print("precision:", round(precision_score(y_test, self.y_pred), 3))
        print("recall:", round(recall_score(y_test, self.y_pred), 3))

        if use_rfpimp:
            pimp_imps = rfpimp.importances(self.model, self.X, self.y)
            rfpimp.plot_importances(
                pimp_imps[0:9],
                yrot=0,
                label_fontsize=12,
                width=12,
                minheight=1.5,
                vscale=2.0,
                imp_range=(0, pimp_imps['Importance'].max() + .03),
                color='#484c51',
                bgcolor='#F1F8FE',  # seaborn-style light background
                xtick_precision=2,
                title='Permutation Importances')

        if use_sklearn:
            importances = self.model.feature_importances_
            indices = np.argsort(importances)[::-1]

            print("\nFeature ranking:")
            for feat in range(0, 10):
                print("%d. %s (%f)" % (feat + 1, self.X.columns[indices[feat]],
                                       importances[indices[feat]]))

            # plot the top-10 sklearn impurity-based importances
            plt.figure(figsize=(12, 6))
            feat_importances = pd.Series(importances, index=self.X.columns)
            feat_importances.nlargest(10).plot(kind='barh')
            plt.ylabel('Feature Name', size=12)
            plt.xlabel('Relative Feature Importance', size=12)
            plt.title('Sklearn Feature Importances', size=18)
            plt.grid(color='grey', ls=':')
            plt.show()
Example #3
    def feature_import(self):
        '''
        determine relative feature importance in the model using permutation importance
        '''
        # permutation importances returns a df with feature, importance columns
        X_test_df = pd.DataFrame(self.X_test)
        y_test_df = pd.DataFrame(self.y_test)
        imp = rfpimp.importances(self.model, X_test_df, y_test_df, self.colnames)
        viz = rfpimp.plot_importances(imp)
        viz.view()

        # Compute permutation feature importances for scikit-learn models using
        # k-fold cross-validation (default k=3).
        if self.cv is not None:
            cv_imp = rfpimp.cv_importances(self.model, self.X_train, self.y_train, k=self.cv)
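            # Added sketch (not in the original snippet): assuming cv_importances()
            # returns the same Feature/Importance frame as importances(), the
            # cross-validated result can be plotted the same way.
            rfpimp.plot_importances(cv_imp).view()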
Example #4
def feature_import(model, X_test, y_test):
    '''
    determine relative feature importance in the model using permutation importance.
    dependency: import rfpimp
    input:
        model = fitted sklearn estimator
        X_test = pandas DataFrame containing the features (not the target);
                 its column names are used as feature labels
        y_test = pandas Series or single-column DataFrame containing the target only
    '''
    # permutation importances returns a df with feature, importance columns
    colnames = X_test.columns
    X_test_df = pd.DataFrame(X_test)
    y_test_df = pd.DataFrame(y_test)
    imp = rfpimp.importances(model, X_test_df, y_test_df, colnames)
    viz = rfpimp.plot_importances(imp)
    print("Permutation Feature Importance")
    viz.view()
    return imp
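# Example usage (added for illustration; variable names are placeholders, not from the original):
# rf = sklearn.ensemble.RandomForestRegressor().fit(X_train_df, y_train_df)
# imp_df = feature_import(rf, X_test_df, y_test_df)   # X_test_df must be a pandas DataFrame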
Example #5
def permutation_importances(clf, val_x, val_y, viz=False, log=True):
	out_dict = {}

	# Get feature importances via permutation
	if log:
		sys.stderr.write("o Feature permutation importances:\n\n")
	imp = importances(clf, val_x, val_y)
	for i in range(len(imp["Importance"])):
		key, val = imp.index[i], imp["Importance"].values[i]
		out_dict[key] = val
		if log:
			sys.stderr.write(key + "=" + str(val) + "\n")
	if viz:
		viz = plot_importances(imp)
		viz.view()

		viz = plot_corr_heatmap(val_x, figsize=(7,5))
		viz.view()
	if log:
		sys.stderr.write("\n")

	return out_dict
Example #6
import argparse
import time
from pathlib import Path

import numpy as np
import pandas as pd
import rfpimp
import sklearn.ensemble
import sklearn.feature_selection
import sklearn.model_selection
from joblib import dump


def get_importance(args: argparse.Namespace) -> None:
    """
    Fit a random-forest regressor and report permutation feature importances.

    Source: https://explained.ai/rf-importance/index.html

    :param args: namespace whose ``input`` attribute is path/to/data/file.xlsx
    :return: None; feature importances, the fitted model, grid-search results,
        and the data-transform mask are written to disk.
    """

    select_from_model = False
    transform_first = False

    input_ = args.input

    p = Path(input_).parent.parent  # output folders live two levels above the data file

    importance = p / 'importance'
    model_checkpoints = p / 'model_checkpoints'
    rf_best_params = p / 'rf_best_params'
    transform_mask = p / 'transform_mask'

    for folder in (importance, model_checkpoints, rf_best_params, transform_mask):
        folder.mkdir(exist_ok=True)

    df_orig = pd.read_excel(input_)

    # DataFrame.as_matrix() was removed from pandas; use to_numpy() instead
    orig = df_orig.to_numpy()[:, 1:]

    feature_names = list(df_orig.columns)[1:-1]

    # rows where the target is missing are the "new" samples to predict
    whereNan = np.isnan(orig[:, -1].astype(float))

    olds = orig[np.logical_not(whereNan)]

    news = orig[whereNan]

    y_train = olds[:, -1]
    X_train = olds[:, :-1]

    X_test = news[:, :-1]

    Xdf = pd.DataFrame(X_train, columns=feature_names)
    ydf = pd.Series(y_train)

    # Initial feature elimination if you have a predetermined mask
    if transform_first:
        transform_mask_init = pd.read_csv(
            '../transform_mask/Transform_FILENAME_HERE.csv')
        X_train = X_train[:, transform_mask_init['0'].to_numpy()]

        print("The initially masked Xdf is shape: ")
        print(X_train.shape)

        truth_series = pd.Series(transform_mask_init['0'], name='bools')
        Xdf = pd.DataFrame(Xdf.iloc[:, truth_series.values])

        # save_new_df = pd.DataFrame(X_train)
        # Xdf.to_excel("test_new_cols_1.xlsx")
        # save_new_df.to_excel("test_1.xlsx")

    # Feature elimination based on importance and Select From Model method
    if select_from_model:
        print("Selecting the best features in your dataset.")
        rf = sklearn.ensemble.RandomForestRegressor(n_jobs=-1,
                                                    random_state=42,
                                                    bootstrap=True,
                                                    n_estimators=2000,
                                                    max_features=0.5)

        print("The original Xdf is shape: ")
        print(X_train.shape)

        select_fm = sklearn.feature_selection.SelectFromModel(
            estimator=rf, threshold=-np.inf, max_features=8)

        select_fm.fit(X_train, y_train)  # fit only; the transform happens below

        feature_conds = select_fm.get_support()
        transform_df = pd.DataFrame(feature_conds)
        transform_df.to_csv(
            str(transform_mask) + "/Transform_FILENAME_HERE" +
            str(time.strftime("%Y-%m-%d-%I-%M")) + ".csv")
        X_train = select_fm.transform(X_train)

        print("Finished transforming the data; new xdf shape is: ")
        print(X_train.shape)

        Xdf = Xdf[Xdf.columns[feature_conds]]

    rf = sklearn.ensemble.RandomForestRegressor(n_jobs=-1,
                                                random_state=42,
                                                bootstrap=True)

    gs = sklearn.model_selection.GridSearchCV(
        rf,
        param_grid={
            'n_estimators': [i for i in range(10, 110, 10)],
            # note: scikit-learn >= 1.0 renames these to 'squared_error' and 'absolute_error'
            'criterion': ['mse', 'mae'],
            'max_features': [i for i in range(1, X_train.shape[1])]
        },
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
        refit=True,
        verbose=1)

    print("Optimizing the Hyperparameters. Please be patient.")
    gs.fit(X_train, y_train)

    grid_search_df = pd.DataFrame(gs.cv_results_)
    grid_search_df.to_csv(
        str(rf_best_params) + '/gridsearch_FILENAME_HERE_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.csv')
    best_results_df = pd.DataFrame(gs.best_params_, index=[0])
    best_results_df.to_csv(
        str(rf_best_params) +
        '/gridsearch_Calphad_FILENAME_HERE_best_params_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.csv')

    rf = sklearn.ensemble.RandomForestRegressor(**gs.best_params_,
                                                random_state=42,
                                                n_jobs=-1,
                                                bootstrap=True,
                                                verbose=0)

    print(
        "Optimal Hyperparameters located. Fitting model to these parameters now."
    )
    rf.fit(X_train, y_train)

    imp = rfpimp.importances(rf, Xdf, ydf)

    viz = rfpimp.plot_importances(imp)
    viz.save(
        str(importance) +
        f'/importances_FILENAME_HERE_-{int(time.time())}.png')
    viz.view()

    dump(
        rf,
        str(model_checkpoints) + '/model_checkpoint_FILENAME_HERE_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.joblib')
Example #7
rmse_val = []

import numpy as np
from rfpimp import importances, plot_importances
from sklearn.neighbors import KNeighborsClassifier

def get_feature_imp(model, X_train, y_train, X_test, y_test, return_n_top_features=15):

    model.fit(X_train, y_train)
    imp = importances(model, X_test, y_test)
    # print(imp)
    return imp.head(n=return_n_top_features), imp

dropdata=fm_bd_model

K = 13
top_10_concat_features, all_f_imp_concat = get_feature_imp(KNeighborsClassifier(n_neighbors=K), X_train, y_train, X_test, y_test)
plot_importances(top_10_concat_features)
top_pos = top_10_concat_features.index.values

"""
Add some item to get more accuracy
"""
add_pos = ['F12_Height', 'F12_Age', 'F12_Open', 'F12_Close_Best']
for item in add_pos:
    if item not in top_pos:
        top_pos = np.append(top_pos, item)

X_train_pos = X_train[top_pos]
X_test_pos = X_test[top_pos]

knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(X_train_pos, y_train)
Example #8
# drop players who don't have a variance for the past season:
df = df[~df['var_ppg_y'].isna()]

# Simple model:
X = df[['ppg_y', 'MP_y', 'Age_x', 'FG%_y', 'FGA_y', 'eFG%_y', 'FT%_y', 'FTA_y', '3P%_y', '3PA_y', 'PF_y', 'mean_ppg_y', 'var_ppg_y']]
y = df[['ppg_x']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
rf_reg = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=200)
rf_reg.fit(X_train, y_train)

# cross validation:
scores = cross_validate(rf_reg, X=X_train, y=y_train, scoring='neg_mean_squared_error')

# mean cross-validated MSE (flip the sign of the neg_mean_squared_error scores)
print(-scores['test_score'].mean())


# feature importances:
imp = importances(rf_reg, X_test, y_test)  # permutation
viz = plot_importances(imp, width=6, vscale=2)
viz.view()

# plotting residuals:
y_pred = rf_reg.predict(X_test)  # predictions were missing before the residual plot
plt.scatter(y_test.values, y_test.values - y_pred.reshape(-1, 1), alpha=0.3, c='orange')
plt.title('y_test vs residuals')
plt.xlabel('y_test')
plt.ylabel('residuals')
Example #9
    clf_names = ['SVM']
    skplt.metrics.plot_calibration_curve(y, [test_probs], clf_names)
    plot_learning_curve(svm, "SVM", Xenc, Y)

if args.importances:
    rf = RandomForestClassifier(n_jobs=-1)
    Xnum = X.drop(['cc_type', 'diff_addresses'], axis=1)
    xnum = x.drop(['cc_type', 'diff_addresses'], axis=1)
    Xpoly = poly.fit_transform(Xnum)
    xpoly = poly.transform(xnum)
    # note: newer scikit-learn versions rename get_feature_names to get_feature_names_out
    Xpoly = pd.DataFrame(Xpoly, columns=poly.get_feature_names(Xnum.columns))
    xpoly = pd.DataFrame(xpoly, columns=poly.get_feature_names(xnum.columns))
    rf.fit(Xnum, Y)
    # here we assume there are no categorical variables
    imp = importances(rf, xnum, y)  # permutation
    plot_importances(imp, figsize=(8, 12))
    plot_corr_heatmap(Xnum, figsize=(11, 11))
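    # Added sketch (an illustrative assumption, not the author's approach): if the
    # categorical columns were kept rather than dropped, one option is to one-hot
    # encode them, align the train/test dummy columns, and compute permutation
    # importances on the encoded frames.
    Xcat = pd.get_dummies(X)  # one-hot encode any object/categorical columns
    xcat = pd.get_dummies(x).reindex(columns=Xcat.columns, fill_value=0)  # keep columns consistent
    rf_cat = RandomForestClassifier(n_jobs=-1)
    rf_cat.fit(Xcat, Y)
    imp_cat = importances(rf_cat, xcat, y)
    plot_importances(imp_cat, figsize=(8, 12))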

if args.randomgrid:
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start=1000, stop=2500, num=4)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
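    # Added sketch (the original snippet is cut off after the comments above):
    # one common way to finish the grid and run the randomized search. The names
    # random_grid and rf_random, the 100-iteration budget, and fitting on Xenc/Y
    # are illustrative assumptions, not the author's code.
    from sklearn.model_selection import RandomizedSearchCV
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1),
                                   param_distributions=random_grid,
                                   n_iter=100, cv=3, random_state=42, n_jobs=-1)
    rf_random.fit(Xenc, Y)
    print(rf_random.best_params_)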
Example #10
    def importance(self):
        # permutation feature importance using the rfpimp library
        imp = importances(self.rf_reg, self.X_test, self.y_test)
        viz = plot_importances(imp, width=6, vscale=2)
        viz.view()
        print(imp)
#plt.savefig('Cumulative_gain.png', bbox_inches='tight')
#pickle.dump(fig,open("Cumulative_gain.pickle","wb"))

# endregion


# region Group permutation
# To get output that is easier to read (importance not split across correlated features), one should use either
# conditional permutation importance (not available in Python) or resort to the rfpimp library to compute group permutations.

np.random.seed(123)
group_imp = importances(best_rf.named_steps["classifier"], X_test, y_test,
                        features=list(cluster_feature), metric=custom_scorer)
fig, ax = plt.subplots()
ax.set(xlabel="Drop in $F_2$ score when the variable is perturbed")
plot_importances(group_imp,ax=ax)
plt.xticks(np.arange(min(group_imp.values), max(group_imp.values)+0.03, 0.01))
fig.set_size_inches(10,10)
ax.set_xlim([0, 0.10])
fig.tight_layout()




#plt.savefig('Feature_importance_group.png', bbox_inches='tight')
#pickle.dump(fig,open("Feature_importance_group.pickle","wb"))
# endregion


# region Predictions boxplots
# Variables with high importance in the prediction:
rf = RandomForestClassifier(n_estimators=150,
                            oob_score=True,
                            n_jobs=-1)


rf.fit(X_train, y_train)
print(rf.oob_score_)
print( rf.score(X_test, y_test) )


oob = oob_classifier_accuracy(rf, X_train, y_train)
print("oob accuracy",oob)

imp = permutation_importances(rf, X_train, y_train,
                              oob_classifier_accuracy)
plot_importances(imp)
stemplot_importances(imp, vscale=.7)

# Using dropcol_importances

imp = dropcol_importances(rf, X_train, y_train)
plot_importances(imp)


from rfpimp import oob_dropcol_importances

imp_oob_drop = oob_dropcol_importances(rf, X_train, y_train)

plot_importances(imp_oob_drop)

Example #13
eli5.show_weights(perm)

# rfpimp

from rfpimp import importances, plot_importances


def mkdf(columns, importances):
    I = pd.DataFrame(data={'Feature': columns, 'Importance': importances})
    I = I.set_index('Feature')
    I = I.sort_values('Importance', ascending=False)
    return I


imp = importances(rf, X_test, y_test)  # permutation
viz = plot_importances(imp)
viz.view()

I = mkdf(X.columns, rf.feature_importances_)
I.head()

viz = plot_importances(
    I[0:10],
    imp_range=(0, .4),
    title="Feature importance via avg drop in variance (sklearn)")

############## PARTIAL DEPENDENCE PLOTS ##############

from sklearn.ensemble import GradientBoostingRegressor
# note: sklearn.ensemble.partial_dependence was removed in scikit-learn 0.24;
# newer versions provide this via sklearn.inspection (PartialDependenceDisplay)
from sklearn.ensemble.partial_dependence import plot_partial_dependence
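# The snippet ends with the imports, so here is a minimal, self-contained sketch
# (added for illustration; the estimator, data, and feature indices are placeholders,
# not from the original source) of a partial dependence plot with the modern
# sklearn.inspection API:

import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import PartialDependenceDisplay

X_demo, y_demo = make_regression(n_samples=500, n_features=4, random_state=0)
gbr = GradientBoostingRegressor(random_state=0).fit(X_demo, y_demo)

# plot partial dependence of the prediction on the first two features
PartialDependenceDisplay.from_estimator(gbr, X_demo, features=[0, 1])
plt.show()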