Example n. 1
def perm_importances(model,
                     X,
                     y,
                     features=None,
                     n_examples=None,
                     n_mc_samples=100):
    """
	Calculate permutation importances for a BNN or its mimic. Also returns the time taken
    so result is a 2-tuple (array of importance values, time)

	Args:
		model: a BNN_Classifier, RandomForestClassifier or GradientBoostingClassifier
		X, y: examples and labels. The permutation importances are computed by shuffling columns
			  of X and seeing how the prediction accuracy for y is affected
		features: How many features to compute importances for. Default (None) is to compute
				  for every feature. Otherwise use a list of integers
		n_examples: How many examples to use in the computation. Default (None) uses all the
					features. Otherwise choose a positive integer that is less than 
					the number of rows of X/y.
		n_mc_samples: number of MC samples (BNN only)

	Returns a 1D array of permutation importance values in the same order as the columns of X
	"""
    X_df, y_df = pd.DataFrame(X), pd.DataFrame(y)
    X_df.columns = X_df.columns.map(
        str)  # rfpimp doesn't like integer column names

    if n_examples is None:
        n_examples = -1  # rfpimp convention: a negative n_samples means use all rows
    if features is not None:
        features = [str(f) for f in features]  # match the stringified column names
    start_time = time.time()
    if isinstance(model, BNN_Classifier):
        imp_vals = np.squeeze(
            rfp.importances(model,
                            X_df,
                            y_df,
                            features=features,
                            metric=lambda model, X, y, sw: model.score(
                                X, y, n_mc_samples, sample_weight=sw),
                            n_samples=n_examples,
                            sort=False).values)
    elif isinstance(model, (RandomForestClassifier, GradientBoostingClassifier)):
        imp_vals = np.squeeze(
            rfp.importances(model,
                            X_df,
                            y_df,
                            features=features,
                            n_samples=n_examples,
                            sort=False).values)
    else:
        raise TypeError("model must be a BNN_Classifier, RandomForestClassifier "
                        "or GradientBoostingClassifier")
    time_taken = time.time() - start_time
    return imp_vals, time_taken
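A minimal usage sketch of perm_importances with a scikit-learn forest (the BNN path needs the project's own BNN_Classifier, so it is skipped here), assuming the imports the function already relies on (numpy as np, pandas as pd, rfpimp as rfp, time) are in scope:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=8, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# perm_importances expects an already-fitted model
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

imp_vals, time_taken = perm_importances(rf, X_test, y_test)
print(imp_vals, time_taken)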
Example n. 2
def feature_importance(x, y, **kwargs):
    """Calculate and display features importance

    :param x: features set
    :param y: target
    :keyword n_best: number of displayed features, show all if None
    :keyword n_jobs: number of parallel jobs
    :return: list of n_best most important features
    """
    n_best = kwargs.get('n_best', None)
    n_jobs = kwargs.get('n_jobs', 1)

    best_features = []

    model = RandomForestRegressor(n_estimators=50, n_jobs=n_jobs)
    model.fit(x, y)

    logger.info('%5s | %s' % ('Imp', 'Feature'))

    # use permutation importance instead of the impurity-based (MDI) mechanism
    feat_imp = rfpimp.importances(model, x, y)

    for i, (index, row) in enumerate(feat_imp.iterrows(), start=1):
        logger.info('%5.2f | %s' % (row['Importance'], index))
        best_features.append(index)

        if n_best is not None and i >= n_best:
            break

    return best_features
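A hypothetical call, assuming x is a feature DataFrame, y the matching target, and logger a configured logging.Logger:

top5 = feature_importance(x, y, n_best=5, n_jobs=-1)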
Example n. 3
def fn_imp(model, X_vl, y_vl):
    imp = importances(model,
                      X_vl,
                      y_vl,
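                      # make_scorer produces scorer(model, X, y, sample_weight=None),
                      # which lines up with rfpimp's callable-metric convention
                      # metric(model, X, y, sample_weights) shown in Example n. 1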
                      metric=make_scorer(roc_auc_score),
                      sort=False)
    return imp['Importance']
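Note that make_scorer(roc_auc_score) scores hard predict() outputs; for a probability-based AUC, a 4-argument lambda in the style of Example n. 1 could be used instead. A sketch, assuming a binary classifier with predict_proba:

from sklearn.metrics import roc_auc_score

imp = importances(model, X_vl, y_vl,
                  metric=lambda m, X, y, sw: roc_auc_score(y, m.predict_proba(X)[:, 1]),
                  sort=False)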
Example n. 4
def feature_importance(x, y, n_best=None, n_jobs=1):
    """Calculate and display features importance.
    """

    best_features = []

    model = RandomForestRegressor(n_estimators=50, n_jobs=n_jobs)
    model.fit(x, y)

    logger.info('%5s | %s' % ('Imp', 'Feature'))

    # use permutation importance instead of the impurity-based (MDI) mechanism
    feat_imp = rfpimp.importances(model, x, y)

    for i, (index, row) in enumerate(feat_imp.iterrows(), start=1):
        logger.info('%5.2f | %s' % (row['Importance'], index))
        best_features.append(index)

        if n_best is not None and i >= n_best:
            break

    return best_features
Example n. 5
def show_rf_feature_importance(clf, x: pd.DataFrame, y: pd.DataFrame):
    def fbeta2(clf, x, y, sample_weights=None):
        # rfpimp calls a custom metric as metric(model, X, y, sample_weights)
        return fbeta_score(y, clf.predict(x), beta=2)

    # pass the metric by keyword: the fourth positional argument of
    # importances() is `features`, not `metric`
    imp = importances(clf, x, y, metric=fbeta2)
    viz = plot_importances(imp)
    viz.view()
Example n. 6
def get_feature_imp(model,
                    X_train,
                    y_train,
                    X_test,
                    y_test,
                    return_n_top_features=10):
    model.fit(X_train, y_train)
    imp = importances(model, X_test, y_test)
    return imp.head(n=return_n_top_features), imp
Example n. 7
def get_feature_imp(model,
                    X_train,
                    y_train,
                    X_test,
                    y_test,
                    return_n_top_features=75):

    model.fit(X_train, y_train)
    imp = importances(model, X_test, y_test)
    return imp.head(n=return_n_top_features), imp
Example n. 8
def permutation_importance(model, X_test, y_test):
    imp = importances(model, X_test, y_test)
    viz = plot_importances(imp[0:9], yrot=0,
                           label_fontsize=12,
                           width=12,
                           minheight=1.5,
                           vscale=2.0,
                           imp_range=(0, imp['Importance'].max() + .03),
                           color='#484c51',
                           bgcolor='#F1F8FE',  # seaborn uses '#F1F8FE'
                           xtick_precision=2,
                           title='Permutation Importances')
    return viz
Example n. 9
def get_rf_feat_importance(rf:ForestRegressor, inputs:pd.DataFrame, targets:np.ndarray, weights:Optional[np.ndarray]=None) -> pd.DataFrame:
    r'''
    Compute feature importance for a Random Forest model using rfpimp.

    Arguments:
        rf: trained Random Forest model
        inputs: input data as Pandas DataFrame
        targets: target data as Numpy array
        weights: optional data weights as Numpy array

    Returns:
        DataFrame of features and their permutation importances
    '''

    return importances(rf, inputs, targets, features=inputs.columns, sample_weights=weights).reset_index()
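A hypothetical call, assuming a fitted Random Forest rf, a validation DataFrame X_val, a target array y_val, and optional per-row weights w:

imp_df = get_rf_feat_importance(rf, X_val, y_val, weights=w)
print(imp_df.head())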
Example n. 10
    def _permutationimportance(self):
        """
        Finds the permutation importances and saves the importance.csv file
        in the Artefacts/exp_num folder
        """
        X_valid = self.df_holdout.drop(columns=eval(self.params['ignorecols']))
        y_valid = self.df_holdout[self.params['targetcol']]

        imp = importances(self.pipeline, X_valid, y_valid).reset_index()

        # normalise so that the importances sum to 1
        imp['Importance'] = imp['Importance'] / imp['Importance'].sum()

        return imp
Example n. 11
    def feature_import(self):
        '''
        Determine relative feature importance in the model using permutation importance.
        '''
        # rfpimp.importances returns a df with feature, importance columns;
        # rebuild the frames with the original column names so that the
        # `features` argument matches them
        X_test_df = pd.DataFrame(self.X_test, columns=self.colnames)
        y_test_df = pd.DataFrame(self.y_test)
        imp = rfpimp.importances(self.model, X_test_df, y_test_df, self.colnames)
        viz = rfpimp.plot_importances(imp)
        viz.view()

        # Compute permutation feature importances for scikit-learn models using
        # k-fold cross-validation (default k=3).
        if self.cv is not None:
            cv_imp = rfpimp.cv_importances(self.model, self.X_train, self.y_train, k=self.cv)
Example n. 12
    def feature_extraction_method(self, method=Names.ELI5_PERMUTATION):
        print("Starting Feature Extraction...")
        start_time = time.time()

        if method is True:
            method = Names.ELI5_PERMUTATION

        if method == Names.ELI5_PERMUTATION:
            pi_object = PermutationImportance(self.base_run_instance.test_harness_model.model)
            pi_object.fit(self.base_run_instance.testing_data[self.base_run_instance.feature_cols_to_use],
                          self.base_run_instance.testing_data[self.base_run_instance.col_to_predict]
                          )
            feature_importances_df = pd.DataFrame()
            feature_importances_df["Feature"] = self.base_run_instance.feature_cols_to_use
            feature_importances_df["Importance"] = pi_object.feature_importances_
            feature_importances_df["Importance_Std"] = pi_object.feature_importances_std_
            feature_importances_df.sort_values(by='Importance', inplace=True, ascending=False)
            self.feature_importances = feature_importances_df.copy()
        elif method == Names.RFPIMP_PERMUTATION:
            pis = rfpimp.importances(self.base_run_instance.test_harness_model.model,
                                     self.base_run_instance.testing_data[self.base_run_instance.feature_cols_to_use],
                                     self.base_run_instance.testing_data[self.base_run_instance.col_to_predict])
            pis['Feature'] = pis.index
            pis.reset_index(inplace=True, drop=True)
            pis = pis[['Feature', 'Importance']]
            pis.sort_values(by='Importance', inplace=True, ascending=False)
            self.feature_importances = pis.copy()
        elif method == "sklearn_rf_default":
            pass  # TODO

        elif method == Names.BBA_AUDIT:
            self.bba_plots_dict = {}
            data = self.perform_bba_audit(training_data=self.base_run_instance.training_data.copy(),
                                          testing_data=self.base_run_instance.testing_data.copy(),
                                          features=self.base_run_instance.feature_cols_to_use,
                                          classifier=self.base_run_instance.test_harness_model.model,
                                          col_to_predict=self.base_run_instance.col_to_predict)
            feature_importances_df = pd.DataFrame(data, columns=["Feature", "Importance"])
            self.feature_importances = feature_importances_df.copy()

        elif method == Names.SHAP_AUDIT:
            self.shap_plots_dict = {}
            data = self.perform_shap_audit()
            feature_importances_df = pd.DataFrame(data, columns=["Feature", "Importance"])
            self.feature_importances = feature_importances_df.copy()

        print(("Feature Extraction time with method {0} was: {1:.2f} seconds".format(method, time.time() - start_time)))
Example n. 13
def feature_import(model, X_test, y_test):
    '''
    Determine relative feature importance in the model using permutation importance.
    Dependency: import rfpimp
    Input:
        model = fitted sklearn model (e.g. a RandomForestRegressor)
        X_test = pandas DataFrame containing the features (not the target)
        y_test = array-like containing the target only
    '''
    # rfpimp.importances returns a df with feature, importance columns
    colnames = X_test.columns
    X_test_df = pd.DataFrame(X_test, columns=colnames)
    y_test_df = pd.DataFrame(y_test)
    imp = rfpimp.importances(model, X_test_df, y_test_df, colnames)
    viz = rfpimp.plot_importances(imp)
    print("Permutation Feature Importance")
    viz.view()
    return imp
Example n. 14
    def score(self, X_test, y_test, use_rfpimp=True, use_sklearn=True):

        print(
            "The following are the results of a random forest fit to the original features appended to the Weights Matrix:"
        )
        print("\naccuracy:", round(self.model.score(X_test, y_test), 3))
        print("precision:", round(precision_score(y_test, self.y_pred), 3))
        print("recall:", round(recall_score(y_test, self.y_pred), 3))

        if use_rfpimp:
            pimp_imps = rfpimp.importances(self.model, self.X, self.y)
            rfpimp.plot_importances(
                pimp_imps[0:9],
                yrot=0,
                label_fontsize=12,
                width=12,
                minheight=1.5,
                vscale=2.0,
                imp_range=(0, pimp_imps['Importance'].max() + .03),
                color='#484c51',
                bgcolor='#F1F8FE',  # seaborn uses '#F1F8FE'
                xtick_precision=2,
                title='Permutation Importances')

        if use_sklearn:
            importances = self.model.feature_importances_
            indices = np.argsort(importances)[::-1]

            print("\nFeature ranking:")
            for feat in range(0, 10):
                print("%d. %s (%f)" % (feat + 1, self.X.columns[indices[feat]],
                                       importances[indices[feat]]))

            # plot feat imps
            plt.figure(figsize=(12, 6))
            plt.ylabel('Feature Name', size=12)
            plt.xlabel('Relative Feature Importance', size=12)
            plt.title('Sklearn Feature Importances', size=18)
            feat_importances = pd.Series(importances, index=self.X.columns)
            feat_importances.nlargest(10).plot(kind='barh')
            plt.grid(color='grey', ls=':')
            plt.show()
Example n. 15
def permutation_importances(clf, val_x, val_y, viz=False, log=True):
	out_dict = {}

	# Get feature importances via permutation
	if log:
		sys.stderr.write("o Feature permutation importances:\n\n")
	imp = importances(clf, val_x, val_y)
	for i in range(len(imp["Importance"])):
		key, val = imp.index[i], imp["Importance"].values[i]
		out_dict[key] = val
		if log:
			sys.stderr.write(key + "=" + str(val) + "\n")
	if viz:
		viz = plot_importances(imp)
		viz.view()

		viz = plot_corr_heatmap(val_x, figsize=(7,5))
		viz.view()
	if log:
		sys.stderr.write("\n")

	return out_dict
Example n. 16
              metrics.precision_score(VAL_Y, PRD_Y, average='weighted'),
              metrics.recall_score(VAL_Y, PRD_Y, average='weighted'),
              metrics.jaccard_score(VAL_Y, PRD_Y, average='weighted'))
 report = metrics.classification_report(VAL_Y, PRD_Y)
 confusionMat = metrics.plot_confusion_matrix(
     rf,
     VAL_X,
     VAL_Y,
     display_labels=list(range(len(set(outputs[outputs.columns[0]])))),
     cmap=cm.Blues,
     normalize=None)
 plt.savefig(strMod + '_RF.jpg', dpi=300)
 featImportance = list(rf.feature_importances_)
 impDC = rfp.oob_dropcol_importances(rf, TRN_X, TRN_Y.values.ravel())
 impDCD = impDC.to_dict()['Importance']
 impPM = rfp.importances(rf, TRN_X, TRN_Y)
 impPMD = impPM.to_dict()['Importance']
 ###########################################################################
 # Interpretability Plots
 ###########################################################################
 for feat in FEATS:
     isolate = pdp.pdp_isolate(model=rf,
                               dataset=TRN_X,
                               model_features=FEATS,
                               feature=feat)
     fracPlot = 2500
     (fig, axes) = pdp.pdp_plot(pdp_isolate_out=isolate,
                                feature_name=feat,
                                center=False,
                                x_quantile=True,
Example n. 17
def get_importance(args: argparse.Namespace) -> None:
    """

    Source: https://explained.ai/rf-importance/index.html

    :param args: path/to/data/file.xlsx
    :return: feature importance, fit model, gridsearch results, and data transform mask.
    """

    select_from_model = False
    transform_first = False

    input_ = args.input

    p = Path(input_).parent.parent

    importance = p / 'importance'
    model_checkpoints = p / 'model_checkpoints'
    rf_best_params = p / 'rf_best_params'
    transform_mask = p / 'transform_mask'

    for folder in (importance, model_checkpoints, rf_best_params, transform_mask):
        folder.mkdir(exist_ok=True)

    df_orig = pd.read_excel(input_)

    orig = df_orig.to_numpy()[:, 1:]

    feature_names = list(df_orig.columns)[1:-1]

    whereNan = np.isnan(orig[:, -1].astype(float))

    olds = orig[np.logical_not(whereNan)]

    news = orig[whereNan]

    y_train = olds[:, -1]
    X_train = olds[:, :-1]

    X_test = news[:, :-1]

    Xdf = pd.DataFrame(X_train, columns=feature_names)
    ydf = pd.Series(y_train)

    # Initial feature elimination if you have a predetermined mask
    if transform_first is True:
        transform_mask_init = pd.read_csv(
            '../transform_mask/Transform_FILENAME_HERE.csv')
        X_train = X_train[:, transform_mask_init['0'].to_numpy()]

        print("The initially masked Xdf is shape: ")
        print(X_train.shape)

        truth_series = pd.Series(transform_mask_init['0'], name='bools')
        Xdf = pd.DataFrame(Xdf.iloc[:, truth_series.values])

        # save_new_df = pd.DataFrame(X_train)
        # Xdf.to_excel("test_new_cols_1.xlsx")
        # save_new_df.to_excel("test_1.xlsx")

    # Feature elimination based on importance and Select From Model method
    if select_from_model is True:
        print("Selecting the best features in your dataset.")
        rf = sklearn.ensemble.RandomForestRegressor(n_jobs=-1,
                                                    random_state=42,
                                                    bootstrap=True,
                                                    n_estimators=2000,
                                                    max_features=0.5)

        print("The original Xdf is shape: ")
        print(X_train.shape)

        select_fm = sklearn.feature_selection.SelectFromModel(
            estimator=rf, threshold=-np.inf, max_features=8)

        select_fm.fit_transform(X_train, y_train)

        feature_conds = select_fm.get_support()
        transform_df = pd.DataFrame(feature_conds)
        transform_df.to_csv(
            str(transform_mask) + "/Transform_FILENAME_HERE" +
            str(time.strftime("%Y-%m-%d-%I-%M")) + ".csv")
        X_train = select_fm.transform(X_train)

        print("Finished transforming the data; new xdf shape is: ")
        print(X_train.shape)

        Xdf = Xdf[Xdf.columns[feature_conds]]

    rf = sklearn.ensemble.RandomForestRegressor(n_jobs=-1,
                                                random_state=42,
                                                bootstrap=True)

    gs = sklearn.model_selection.GridSearchCV(
        rf,
        param_grid={
            'n_estimators': [i for i in range(10, 110, 10)],
            # sklearn >= 1.0 renamed 'mse'/'mae' to these
            'criterion': ['squared_error', 'absolute_error'],
            'max_features': [i for i in range(1, X_train.shape[1])]
        },
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
        refit=True,
        verbose=1)

    print("Optimizing the Hyperparameters. Please be patient.")
    yay = gs.fit(X_train, y_train)

    grid_search_df = pd.DataFrame(gs.cv_results_)
    grid_search_df.to_csv(
        str(rf_best_params) + '/gridsearch_FILENAME_HERE_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.csv')
    best_results_df = pd.DataFrame(gs.best_params_, index=[0])
    best_results_df.to_csv(
        str(rf_best_params) +
        '/gridsearch_Calphad_FILENAME_HERE_best_params_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.csv')

    rf = sklearn.ensemble.RandomForestRegressor(**yay.best_params_,
                                                random_state=42,
                                                n_jobs=-1,
                                                bootstrap=True,
                                                verbose=0)

    print(
        "Optimal Hyperparameters located. Fitting model to these parameters now."
    )
    rf.fit(X_train, y_train)

    imp = rfpimp.importances(rf, Xdf, ydf)

    viz = rfpimp.plot_importances(imp)
    viz.save(
        str(importance) +
        f'/importances_FILENAME_HERE_-{int(time.time())}.png')
    viz.view()

    dump(
        rf,
        str(model_checkpoints) + '/model_checkpoint_FILENAME_HERE_' +
        str(time.strftime("%Y-%m-%d-%I-%M")) + '.joblib')
Example n. 18
# drop players who don't have a variance for the past season:
df = df[~df['var_ppg_y'].isna()]

#Simple model:
X=df[['ppg_y','MP_y','Age_x','FG%_y','FGA_y','eFG%_y','FT%_y','FTA_y','3P%_y','3PA_y','PF_y','mean_ppg_y','var_ppg_y']]
y=df[['ppg_x']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
rf_reg = RandomForestRegressor(max_depth=5, random_state=0,n_estimators=200)
rf_reg.fit(X_train,y_train)

# cross-validation:
scores = cross_validate(rf_reg, X=X_train, y=y_train, scoring='neg_mean_squared_error')

# error (the scoring is negated MSE, so flip the sign)
print(-scores['test_score'].mean())


# feature importances:
imp = importances(rf_reg, X_test, y_test)  # permutation
viz = plot_importances(imp, width=6, vscale=2)
viz.view()

# plot residuals:
y_pred = rf_reg.predict(X_test)
plt.scatter(y_test.values, y_test.values - y_pred.reshape(-1, 1), alpha=0.3, c='orange')
plt.title('y_test vs residuals')
plt.xlabel('y_test')
plt.ylabel('residuals')
Example n. 19
def get_multiple_imps(dataset,
                      X, y,
                      X_train, y_train, X_test, y_test,
                      n_shap=300,
                      drop_high_variance_features=True,
                      sortby='Importance',
                      stratpd_min_samples_leaf=15,
                      stratpd_cat_min_samples_leaf=5,
                      imp_n_trials=1,
                      imp_pvalues_n_trials=0,
                      n_stratpd_trees=1,
                      rf_bootstrap=False,
                      bootstrap=True,
                      catcolnames=set(),
                      min_slopes_per_x=5,
                      supervised=True,
                      # include=['Spearman', 'PCA', 'OLS', 'OLS SHAP', 'RF SHAP', "RF perm", 'StratImpact'],
                      normalize=True):
    spear_I = pca_I = ols_I = ols_shap_I = rf_I = perm_I = ours_I = None

    # compute every importance measure
    include = ['Spearman', 'PCA', 'OLS', 'OLS SHAP', 'RF SHAP', "RF perm", 'StratImpact']

    if dataset=='bulldozer':
        include.remove('OLS')
        include.remove('OLS SHAP')

    if 'Spearman' in include:
        spear_I = spearmans_importances(X, y)

    if 'PCA' in include:
        pca_I = pca_importances(X)

    if "OLS" in include:
        # since we use coefficients, look at all data
        X_ = StandardScaler().fit_transform(X)
        X_ = pd.DataFrame(X_, columns=X.columns)
        lm = LinearRegression()
        lm.fit(X_, y)
        ols_I, score = linear_model_importance(lm, X_, y)
        print("OLS\n",ols_I)

    if "OLS SHAP" in include:
        # since we use coefficients, look at all data, explain n_shap
        X_ = StandardScaler().fit_transform(X)
        X_ = pd.DataFrame(X_, columns=X.columns)
        lm = LinearRegression()
        lm.fit(X_, y)
        ols_shap_I = shap_importances(lm, X_, X_, n_shap=n_shap)

    if "RF SHAP" in include:
        tuned_params = models[(dataset, "RF")]
        rf = RandomForestRegressor(**tuned_params, n_jobs=-1)
        rf.fit(X_train, y_train)
        rf_I = shap_importances(rf, X_train, X_test, n_shap, normalize=normalize)
        print("RF SHAP\n",rf_I)

    if "RF perm" in include:
        tuned_params = models[(dataset, "RF")]
        rf = RandomForestRegressor(**tuned_params, n_jobs=-1)
        rf.fit(X_train, y_train)
        perm_I = rfpimp.importances(rf, X_test, y_test) # permutation; drop in test accuracy
        print("RF perm\n",perm_I)

    if "StratImpact" in include:
        # RF SHAP and RF perm look at the test data to decide which features
        # are more predictive and useful, for generality's sake. Since fixing
        # the featimp measure (May 17, 2020) we use just the training data here.
        ours_I = featimp.importances(X_train, y_train,
                                     verbose=False,
                                     sortby=sortby,
                                     min_samples_leaf=stratpd_min_samples_leaf,
                                     cat_min_samples_leaf=stratpd_cat_min_samples_leaf,
                                     n_trials=imp_n_trials,
                                     pvalues=imp_pvalues_n_trials > 0,
                                     pvalues_n_trials=imp_pvalues_n_trials,
                                     n_trees=n_stratpd_trees,
                                     bootstrap=bootstrap,
                                     rf_bootstrap=rf_bootstrap,
                                     catcolnames=catcolnames,
                                     min_slopes_per_x=min_slopes_per_x,
                                     supervised=supervised,
                                     normalize=normalize,
                                     drop_high_stddev=2.0 if drop_high_variance_features else 9999)
        print("OURS\n",ours_I)

    if "PDP" in include:
        tuned_params = models[(dataset, "RF")]
        rf = RandomForestRegressor(**tuned_params, n_jobs=-1)
        rf.fit(X, y)
        pdpy = featimp.friedman_partial_dependences(rf, X, mean_centered=True)
        pdp_I = pd.DataFrame(data={'Feature': X.columns})
        pdp_I = pdp_I.set_index('Feature')
        pdp_I['Importance'] = np.mean(np.abs(pdpy), axis=1)

    d = OrderedDict()
    d['Spearman'] = spear_I
    d['PCA'] = pca_I
    d['OLS'] = ols_I
    d['OLS SHAP'] = ols_shap_I
    d['RF SHAP'] = rf_I
    d["RF perm"] = perm_I
    d['Strat'] = ours_I

    # Put both orders for Strat approach into same imps dictionary
    I = featimp.Isortby(ours_I, 'Importance')
    d['StratImport'] = pd.DataFrame(I['Importance'])

    I = featimp.Isortby(ours_I, 'Impact')
    d['StratImpact'] = pd.DataFrame(I['Impact'])

    print(d['StratImport'])
    print(d['StratImpact'])
    return d
Example n. 20
    def eval_importance(
        self,
        groups: Optional[Dict[str, Sequence[str]]] = None,
        n_times: int = 10,
        ignore: Optional[Union[str, Sequence[str]]] = None,
        n_jobs: int = 1,
    ) -> pd.DataFrame:
        """
        Evaluate permutation feature importances.

        Args:
            groups: Groups of related features. One feature can appear
                on several groups at the same time.
            n_times: Number of times to calculate importances. Uses the
                mean of results.
            ignore: Features to exclude from the importance calculation.
            n_jobs: Number of CPUs to use. -1 to use all available.

        Returns:
            DataFrame
        """
        # Prepare features list (or nested list).
        features = self.features
        if ignore:
            features = [feat for feat in features if feat not in ignore]
        if groups:
            features = self._manage_groups(groups, features)

        # Split dataset.
        n_samples = 5000
        ratio = 0.2
        datasets = autolearn.split(x=self._x,
                                   y=self._y,
                                   test_samples=n_samples,
                                   test_ratio=ratio)
        x_train, x_test, y_train, y_test = datasets

        model = autolearn.Model(task=self.task)
        model.tune(x_train, y_train, test_ratio=ratio, n_jobs=n_jobs)
        model.fit(x_train, y_train)

        kwargs = {
            "model": model,
            "X_valid": x_test,
            "y_valid": y_test,
            "features": features,
            "n_samples": -1,
        }

        # Get importances.
        imps = [rfpimp.importances(**kwargs) for _ in range(n_times)]
        imp = pd.concat(imps).groupby(level=0).mean()
        imp = imp.sort_values("Importance", ascending=False)

        # Create new columns.
        # Handle negative values by shifting all values up by |min(Importance)|.
        non_negatives = imp["Importance"].add(np.abs(imp["Importance"].min()))
        imp["Normalised Importance"] = non_negatives / non_negatives.sum()
        imp["Cumulative Importance"] = imp["Normalised Importance"].cumsum()

        self._importances = imp
        return imp
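For comparison, rfpimp can also permute groups of features directly: a nested features list shuffles each sublist's columns together as one meta-feature. A self-contained sketch (dataset and column names are illustrative only):

import numpy as np
import pandas as pd
import rfpimp
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(300, 4)), columns=['lat', 'lon', 'sqft', 'age'])
y = X['lat'] * X['lon'] + X['sqft'] + rng.normal(scale=0.1, size=300)

model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

# 'lat' and 'lon' are shuffled together; 'sqft' and 'age' individually
imp = rfpimp.importances(model, X, y, features=[['lat', 'lon'], 'sqft', 'age'])
print(imp)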
Example n. 21
    skplt.metrics.plot_roc(y, test_probs)
    clf_names = ['SVM']
    skplt.metrics.plot_calibration_curve(y, [test_probs], clf_names)
    plot_learning_curve(svm, "SVM", Xenc, Y)

if args.importances:
    rf = RandomForestClassifier(n_jobs=-1)
    Xnum = X.drop(['cc_type', 'diff_addresses'], axis=1)
    xnum = x.drop(['cc_type', 'diff_addresses'], axis=1)
    Xpoly = poly.fit_transform(Xnum)
    xpoly = poly.transform(xnum)
    Xpoly = pd.DataFrame(Xpoly, columns=poly.get_feature_names(Xnum.columns))
    xpoly = pd.DataFrame(xpoly, columns=poly.get_feature_names(xnum.columns))
    rf.fit(Xnum, Y)
    # here we assume there are no categorical variables
    imp = importances(rf, xnum, y)  # permutation
    plot_importances(imp, figsize=(8, 12))
    plot_corr_heatmap(Xnum, figsize=(11, 11))

if args.randomgrid:
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start=1000, stop=2500, num=4)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
Example n. 22
random_sel = [10 * i + 10 for i in range(0, 10)]
ax.plot(cumulative_sum_clients,random_sel,label="Baseline")
plt.legend(loc="upper left")

#plt.savefig('Cumulative_gain.png', bbox_inches='tight')
#pickle.dump(fig,open("Cumulative_gain.pickle","wb"))

# endregion


# region Group permutation
# To get output that is easier to read (importance not split across correlated
# features), use either conditional permutation importance (not available in
# Python) or the rfpimp library to compute group permutations.

np.random.seed(123)
group_imp=importances(best_rf.named_steps["classifier"],X_test,y_test,features=list(cluster_feature),metric=custom_scorer)
fig, ax = plt.subplots()
ax.set(xlabel="Drop in $F_2$ score when the variable is perturbed")
plot_importances(group_imp,ax=ax)
plt.xticks(np.arange(group_imp['Importance'].min(), group_imp['Importance'].max() + 0.03, 0.01))
fig.set_size_inches(10,10)
ax.set_xlim([0, 0.10])
fig.tight_layout()




#plt.savefig('Feature_importance_group.png', bbox_inches='tight')
#pickle.dump(fig,open("Feature_importance_group.pickle","wb"))
# endregion
Example n. 23
    def importance(self):
        """Permutation feature importance using the rfpimp library."""
        imp = importances(self.rf_reg, self.X_test, self.y_test)
        viz = plot_importances(imp, width=6, vscale=2)
        viz.view()
        print(imp)