Example #1
# Save data into files
do_save = input('Do you want to save trained model for final submission? (y/n)')
if do_save == 'y':
    dump(X_feat_list, os.path.join(dir_out, "featlist.joblib") )
    dump(scaler, os.path.join(dir_out, "scaler.joblib"))
    dump(clf_final, os.path.join(dir_out, "filename.joblib"))
    dump(best_th_pr, os.path.join(dir_out, "bestTHR.joblib"))
    dump(imputer, os.path.join(dir_out, "imputer.joblib"))


#%%
do_pi = input('Do you want to compute Permutation Importance? (y/n)')
if do_pi == 'y':
    from eli5.sklearn import PermutationImportance

    perm = PermutationImportance(clf_final, random_state=1).fit(X_new, y)

    # perm.results_ holds one array of per-feature score decreases per shuffle iteration
    results_0 = perm.results_[0]

    results_mean = np.zeros(results_0.shape)
    results_std  = np.zeros(results_0.shape)

    # mean/std of each feature's importance across the shuffle iterations
    perm_means = np.mean(perm.results_, axis=0)
    perm_stds  = np.std (perm.results_, axis=0)

    results_0_copy = np.copy(perm_means)

    variable_to_show = 15

    importances_normalized = results_0_copy
    indices_sorted = np.argsort(importances_normalized)[::-1]
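    # Possible continuation (not in the original snippet): plot the top `variable_to_show`
    # features by mean permutation importance, using the per-iteration std as an error bar.
    # This assumes matplotlib is available and that X_feat_list (saved above) holds the
    # feature names in column order.
    import matplotlib.pyplot as plt

    top = indices_sorted[:variable_to_show]
    plt.figure(figsize=(8, 5))
    plt.bar(range(len(top)), importances_normalized[top], yerr=perm_stds[top])
    plt.xticks(range(len(top)), [X_feat_list[i] for i in top], rotation=90)
    plt.ylabel('Mean permutation importance')
    plt.tight_layout()
    plt.show()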
Example #2
    def permutation_importance(self, model, X_val, y_val):
        '''check Feature importance on the Validation data for the fitted model'''
        perm = PermutationImportance(model, random_state=1).fit(X_val, y_val)
        return eli5.show_weights(perm, feature_names=X_val.columns.tolist())
Example #3

def sort_by_imp(train_cols, importance):
    return sorted(zip(train_cols, importance),
                  key=lambda x: x[1],
                  reverse=True)


# Importances from models
logreg_imp = sort_by_imp(train_cols, logreg.coef_[0])
rf_imp = sort_by_imp(train_cols, rf.feature_importances_)

# Try permutation importance
perm_imps = {}
for label, model in [('rf', rf), ('logreg', logreg)]:
    perm = PermutationImportance(model, n_iter=10,
                                 cv='prefit').fit(X_valid, y_valid)
    perm_imp = sort_by_imp(train_cols, perm.feature_importances_)
    perm_imps[label] = perm_imp

    fig, ax = plt.subplots()
    ax = sns.boxplot(data=np.array(perm.results_), ax=ax)
    _ = ax.set_xticklabels(train_cols, rotation=90)
    _ = ax.set_ylabel('Improvement in log_loss')
    _ = ax.set_title(label)
    fig.tight_layout()
    plt.show(block=False)
"""
train_cols = [x[0] for x in perm_imp if x[1] > 0]
"""

# Try out SHAP
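# A minimal sketch of the SHAP step hinted at above (not part of the original snippet;
# it assumes the `shap` package is installed and reuses rf, X_valid and train_cols from above).
import shap

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_valid)
# For a classifier, shap_values may be a list with one array per class; take the positive class.
sv = shap_values[1] if isinstance(shap_values, list) else shap_values
shap.summary_plot(sv, X_valid, feature_names=train_cols)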
Example #4
A heatmap makes it easy to identify which features are most related to the target variable; we will plot a heatmap of the correlated features using the seaborn library.
'''
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt  # needed for plt.figure below
data = pd.read_csv("D://Blogs//train.csv")
X = data.iloc[:, 0:20]  #independent columns
y = data.iloc[:, -1]  #target column i.e price range
#get correlations of each features in dataset
corrmat = data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
#plot heat map
g = sns.heatmap(data[top_corr_features].corr(), annot=True, cmap="RdYlGn")
'''
Permutation Importance:
With permutation importance we instead ask the following question:
if I randomly shuffle a single column of the validation data, leaving the target and all other columns in place,
how would that affect the accuracy of predictions on that now-shuffled data?
'''

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
my_model = RandomForestClassifier(n_estimators=100,
                                  random_state=0).fit(train_X, train_y)
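
# Illustrative addition (not in the original snippet): the question above can be answered
# directly by shuffling one column at a time and measuring the drop in accuracy. This is
# only a sketch of the idea; it reuses my_model, val_X and val_y defined above, and
# PermutationImportance (used below) performs roughly this computation with extra
# bookkeeping of the per-iteration results.
import numpy as np
from sklearn.metrics import accuracy_score

def manual_permutation_importance(model, X_valid, y_valid, n_repeats=5, seed=0):
    rng = np.random.RandomState(seed)
    baseline = accuracy_score(y_valid, model.predict(X_valid))
    drops = {}
    for col in X_valid.columns:
        scores = []
        for _ in range(n_repeats):
            X_shuffled = X_valid.copy()
            X_shuffled[col] = rng.permutation(X_shuffled[col].values)
            scores.append(accuracy_score(y_valid, model.predict(X_shuffled)))
        drops[col] = baseline - np.mean(scores)  # larger drop => more important feature
    return drops

# Example: manual_permutation_importance(my_model, val_X, val_y)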

import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names=val_X.columns.tolist())
Example #5
  # plt.title('True label:' + str(N_test[i,-2]) + '   likelihood of label ' + str(N_test[i,-2]) + ': ' + str(softmax1_cnn[i][int(y_test[i])]))
  plt.title('True label:' + str(y_test[i]) + '   likelihood of label ' + str(y_test[i]) + ': ' + str(softmax1_cnn[i][int(y_test[i])]))
  plt.clim(0.003,0.010)
  plt.colorbar()
  plt.show()

#permutation feature weights

import eli5
from eli5 import format_as_image
from eli5.sklearn import PermutationImportance
from sklearn.neural_network import MLPClassifier
NNMLP_clf = MLPClassifier(random_state=48, max_iter=50)
NNMLP_clf.fit(new_last_conv1, y_test1[:])

perm_all = PermutationImportance(NNMLP_clf).fit(new_last_conv1, y_test1)
print('CNN results')
exp = eli5.explain_weights_df(perm_all, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

perm_corr = PermutationImportance(NNMLP_clf).fit(new_last_conv1[correct_cnn[:]], y_test1[correct_cnn[:]])
print('CNN Correct results')
exp_corr = eli5.explain_weights_df(perm_corr, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

perm_mis = PermutationImportance(NNMLP_clf).fit(new_last_conv1[misclass_cnn[:]], y_test1[misclass_cnn[:]])
print('CNN Misclass results')
exp_mis = eli5.explain_weights_df(perm_mis, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

from sklearn.preprocessing import normalize

n0= normalize(final_last_conv1[correct_cnn[:]])
n1= normalize(final_last_conv1[misclass_cnn[:]])
Example #6
    def QC(self, cleaned_Data_frm, cleaned_Data_frm1, y, cursor, conn):
        #         try:
        print('Models Building')
        float_cols = self.float_col
        result = pd.concat(
            [cleaned_Data_frm, cleaned_Data_frm1, y, float_cols], axis=1)
        self.data_sorted1 = result.sort_values(self.i)
        self.data_sorted = self.data_sorted1.loc[:, ~self.data_sorted1.columns.
                                                 duplicated()]
        print(self.data_sorted.shape)
        new_list = [
            list(set(self.x.columns).difference(self.data_sorted.columns))
        ]
        uploaded_cols = []
        for self.col in list(
                self.data_sorted.select_dtypes(include=[np.float64])):
            if not self.col in uploaded_cols:
                print(self.col)
                X = self.data_sorted.drop([self.col], axis=1)
                y = self.data_sorted[self.col]
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=self.test_size, random_state=42)
                X_train, X_test = train_test(X_train, X_test)
                print(X_train.shape)
                Modles_reuslts = []
                Names = []
                target = self.col
                print('Models Building')
                models = ['Random Forest', 'KNN', 'XGB', 'SVR']
                l = 0
                features = []
                for Regressor, params, model in zip(self.Regressor,
                                                    self.Regressor_grids,
                                                    models):
                    print(model)
                    gd = RandomizedSearchCV(Regressor,
                                            params,
                                            cv=5,
                                            n_jobs=-1,
                                            verbose=True)
                    gd.fit(X_train, y_train)
                    y_pred = gd.predict(X_test)
                    random_best = gd.best_estimator_.predict(X_test)
                    errors = abs(random_best - y_test)
                    mape = np.mean(100 * (errors / y_test))
                    Accuracy = 100 - mape
                    grid = gd.best_params_
                    estimator = gd.best_estimator_
                    l += 1
                    if model == 'KNN':
                        perm = PermutationImportance(gd, random_state=1).fit(
                            X_train, y_train)
                        importances = perm.feature_importances_
                        DB_upload(Accuracy, X_train, X_test, y_test, y_pred,
                                  importances, grid, estimator, l, None,
                                  target, model)
                    elif model == 'SVR':
                        weights = gd.best_estimator_.coef_
                        test = ' '.join(str(weights).split())
                        # replace spaces with commas so the string parses as a JSON list
                        test = test.replace(" ", ",")
                        #str to json loads
                        lists = json.loads(test)
                        importances = lists[0]
                        DB_upload(Accuracy, X_train, X_test, y_test, y_pred,
                                  importances, grid, estimator, l, None,
                                  target, model)
                    else:
                        importances = gd.best_estimator_.feature_importances_.tolist(
                        )  #._final_estimator
                        print(Accuracy)
                        features.append(importances)
                        DB_upload(Accuracy, X_train, X_test, y_test, y_pred,
                                  importances, grid, estimator, l, None,
                                  target, model)

                def Reg_model():
                    model = Sequential()
                    model.add(
                        Dense(500,
                              input_dim=X_train.shape[1],
                              activation="relu"))
                    model.add(Dense(100, activation="relu"))
                    model.add(Dense(50, activation="relu"))
                    model.add(Dense(1))
                    model.compile(loss="mean_squared_error",
                                  optimizer="adam",
                                  metrics=["accuracy"])
                    return model

                model = KerasClassifier(build_fn=Reg_model, verbose=0)
                # define the grid search parameters
                batch_size = [10, 20, 40, 60, 80, 100]
                epochs = [10, 50, 100]
                param_grid = dict(batch_size=batch_size, epochs=epochs)
                grid = GridSearchCV(estimator=model,
                                    param_grid=param_grid,
                                    n_jobs=-1,
                                    cv=3)
                grid_result = grid.fit(X_train, y_train)
                grid = grid.best_params_
                model = 'DNN'
                print("DNN", features)
                DB_upload(Accuracy, X_train, X_test, y_test, y_pred,
                          features[0], grid, grid, l, None, target, model)
                print("Best: %f using %s" %
                      (grid_result.best_score_, grid_result.best_params_))
Example #7
import eli5
from eli5.sklearn import PermutationImportance

# supporting imports used later in this snippet
import pandas as pd
import matplotlib.pyplot as plt
from numpy.random import rand
from sklearn.ensemble import RandomForestRegressor
from pdpbox import pdp

n_samples = 20000

# Create array holding predictive feature
X1 = 4 * rand(n_samples) - 2
X2 = 4 * rand(n_samples) - 2
# Create y; the expression should involve both X1 and X2
y = X1*X2

# create dataframe because pdp_isolate expects a dataFrame as an argument
my_df = pd.DataFrame({'X1': X1, 'X2': X2, 'y': y})
predictors_df = my_df.drop(['y'], axis=1)

my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, my_df.y)


pdp_dist = pdp.pdp_isolate(model=my_model, dataset=my_df, model_features=['X1', 'X2'], feature='X1')
pdp.pdp_plot(pdp_dist, 'X1')
plt.show()

perm = PermutationImportance(my_model).fit(predictors_df, my_df.y)

# Check your answer
q_7.check()

# show the weights for the permutation importance you just calculated
eli5.show_weights(perm, feature_names = ['X1', 'X2'])
Example #8
    def optimize_pipeline(self, seq, X, y):
        """
        Constructs and optimizes a pipeline according to the steps passed through `seq` which is a tuple of
        estimators and transformers.

        :param seq: the tuple of steps of the pipeline to be optimized
        :param X: numpy array of training features
        :param y: numpy array of training values
        :return: the optimized pipeline and its score
        """
        from .structsearch import SurrogateRandomCV
        if self.couldBfirst == []:
            from sklearn.pipeline import Pipeline
        else:
            from imblearn.pipeline import Pipeline
        OPTIM = None
        n = len(seq)
        idx = 0
        ent_idx = 0
        steps = []
        config = {}
        task_name = self.check_point + '_'.join(seq)
        while ent_idx < n:
            est = seq[ent_idx]
            clss = self._get_class(est)
            pre = 'stp_%d' % idx
            if self.config_types[est] in ['regressor', 'classifier'] and ent_idx < n - 1:
                mdl = clss()
                steps.append((pre, StackingEstimator(mdl, res=self.stack_res,
                                                     probs=self.stack_probs,
                                                     decision=self.stack_decision)))
                ent_idx += 1
            elif est == 'sklearn.pipeline.FeatureUnion':
                self.config[est] = dict()
                int_idx = 1
                int_steps = []
                next_est = seq[ent_idx + int_idx]
                while ((self.config_types[next_est] in ['regressor', 'classifier']) or (
                        next_est in self.known_feature_selectors)) and (ent_idx + int_idx < n - 1):
                    int_pre = "int_%d" % int_idx
                    if next_est in self.known_feature_selectors:
                        int_mdl = self._get_class(next_est)()
                        # set the parameter's dictionary
                        for kw in self.config[next_est]:
                            self.config[est][int_pre + '__' + kw] = self.config[next_est][kw]
                    else:
                        from eli5.sklearn import PermutationImportance
                        from sklearn.feature_selection import SelectFromModel
                        from numpy import inf
                        int_est = self._get_class(next_est)()
                        int_mdl = SelectFromModel(PermutationImportance(int_est, cv=3),
                                                  threshold=-inf)
                        self.config[est][int_pre + '__' + 'max_features'] = Integer(1, self.num_features)
                        for kw in self.config[next_est]:
                            self.config[est][int_pre + '__' + 'estimator__estimator__' + kw] = \
                                self.config[next_est][kw]
                    int_steps.append((int_pre, int_mdl))
                    int_idx += 1
                    next_est = seq[ent_idx + int_idx]
                if int_steps != []:
                    mdl = clss(int_steps)
                    steps.append((pre, mdl))
                ent_idx += int_idx
            else:
                mdl = clss()
                steps.append((pre, mdl))
                ent_idx += 1
            for kw in self.config[est]:
                config[pre + '__' + kw] = self.config[est][kw]
            idx += 1
        ppln = Pipeline(steps)
        if self.verbose > 0:
            print("=" * 90)
            print(seq)
            print("-" * 90)
        for srgt in self.surrogates:
            OPTIM = SurrogateRandomCV(ppln,
                                      params=config,
                                      max_iter=srgt[1],
                                      min_evals=self.min_random_evals,
                                      scoring=self.scoring,
                                      cv=self.cv,
                                      verbose=max(self.verbose - 1, 0),
                                      sampling=srgt[2],
                                      regressor=srgt[0],
                                      scipy_solver=srgt[3],
                                      task_name=task_name,
                                      Continue=True,
                                      warm_start=True)
            OPTIM.fit(X, y)
        return OPTIM.best_estimator_, OPTIM.best_estimator_score
Example #9
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
    'dropoff_latitude', 'passenger_count'
]

X = data[base_features]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
first_model = RandomForestRegressor(n_estimators=50,
                                    random_state=1).fit(train_X, train_y)

# show data
print("Data sample:")
print(data.head())

# Show permutation importance
perm = PermutationImportance(first_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names=val_X.columns.tolist())
print(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=val_X.columns.tolist())))

############
### Creating new features
############

data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)

features_2 = [
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
    'dropoff_latitude', 'abs_lat_change', 'abs_lon_change'
Example #10
def main():
    data_path = './'
    filename = 'task_data.csv'
    
    data = pd.read_csv(data_path+filename, index_col='sample index')
    y_tot1 = data.pop('class_label')
    y_tot = y_tot1.replace(-1, 0)
    feat_cols = data.columns
    X_train, X_test, y_train, y_test = train_test_split(data, y_tot, test_size = 0.33, random_state=42)


    model = xgboost.XGBClassifier(objective='binary:logistic')
    model.fit(X_train,y_train)

    pred = model.predict_proba(X_test)[:, 1]
    predictionsT = model.predict(X_train)
    predictions = model.predict(X_test)
    print( classification_report(y_test.values, pred.round()))
    auc = roc_auc_score(y_test.values, pred)
    print( "Area under ROC curve: %.4f"%(auc))
    print('test')
    print(confusion_matrix(y_test, predictions))

    plot_confusion_matrix(model, X_test, y_test)
    plt.savefig("Plots/conf_m.png", transparent=True)


    # drop column 
    m1 = xgboost.XGBClassifier(objective='binary:logistic') 
    im, co = dropcol_importances(m1, X_train, y_train, X_test, y_test)
    impdrop = perf_out(im,co,"Drop Column")
    print(impdrop)


    # Permutation importance
    result = permutation_importance(model, X_test, y_test, n_repeats=100, random_state=41)
    impper = perf_out(result.importances_mean,X_test.columns,"Permutation Importance")
    print(impper)

    # Permutation importance v2
    perm = PermutationImportance(model, random_state=41).fit(X_test,y_test)
    imppereli5 = perf_out(perm.feature_importances_,X_test.columns,"Permutation Importance ELI5")
    print(imppereli5)

    df_tot = pd.concat([impdrop,impper,imppereli5],axis=1,sort=False)

    if debug==True:
        plot_importance(model)
        plt.savefig("Plots/xgb_importance.png", transparent=True)   

    imp_types = ["weight","gain","cover","total_gain","total_cover"]
    for i in range(len(imp_types)):
        imp_vals = model.get_booster().get_score(importance_type=imp_types[i])
        imp_vals = sorted(imp_vals.items(), key=lambda x: x[1], reverse=True)
        dftype = pd.DataFrame(imp_vals,columns=['Feature',imp_types[i]])
        cur_feats = dftype['Feature'].values
        diff = set(feat_cols)-set(cur_feats)
        if len(diff)!=0:
            null_imp = [0]*len(diff)
            miss_features = zip(diff, null_imp)
            dftype = dftype.append(pd.DataFrame(miss_features,columns=['Feature',imp_types[i]]), ignore_index=True)

        dftype = dftype.set_index('Feature') 
        df_tot = pd.concat([df_tot,dftype],axis=1,sort=False)
        print(dftype)
        if imp_types[i]=="total_gain":
            dftype.to_csv("rank_"+imp_types[i]+".csv")
        if debug == True:
            dftype.plot.barh(y=imp_types[i], label=imp_types[i]).invert_yaxis()
            plt.savefig("Plots/rank_"+imp_types[i]+".png", transparent=True)   

    df_tot = df_tot.sort_values('total_gain', ascending=False)
    df_tot = df_tot/df_tot.max()
    print(df_tot)
    if debug == True:
        df_tot.plot.barh().invert_yaxis()
        plt.savefig("Plots/ranks.png", transparent=True)   
Example #11
def module4():
    config = configparser.ConfigParser()
    config.read('./ml_box.ini')
    runtime_settings = config['RUNTIME']
    labelField = runtime_settings['label_field']
    labelField = stripNonAlphanumeric([labelField])[0]  #clean labelField
    dash_data_path = runtime_settings['dash_data_path']

    ingest_settings = config['INGEST']
    data_type = ingest_settings['datatype']

    if data_type == 'sql':
        data_source = ingest_settings['TABLE_NAME']
    else:
        data_source = ingest_settings['file_name']

    #Load best model
    def pickle_load(name):
        PIK = str(name) + ".pickle"
        with open(PIK, "rb") as f:
            temp_item = pickle.load(f)
        return temp_item

    best_model_path = dash_data_path + 'best_model_automl'
    best_model = pickle_load(best_model_path)

    X_train_path = dash_data_path + 'X_train'
    X_train = pickle_load(X_train_path)
    X_test_path = dash_data_path + 'X_test'
    X_test = pickle_load(X_test_path)
    Y_train_path = dash_data_path + 'Y_train'
    Y_train = pickle_load(Y_train_path)
    Y_test_path = dash_data_path + 'Y_test'
    Y_test = pickle_load(Y_test_path)

    def pickle_save(name, item):
        PIK = str(name) + ".pickle"
        with open(PIK, "wb") as f:
            pickle.dump(item, f)

    Y_pred = best_model.predict(X_test)

    #ROC curve
    #calculate the fpr and tpr for all thresholds of the classification
    probs = best_model.predict_proba(X_test)
    preds = probs[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(Y_test, preds)
    roc_auc = metrics.auc(fpr, tpr)

    #feature metrics for sensitivity analysis
    #    featureMeans = X_train.mean()
    featureMins = X_train.min()
    featureMaxs = X_train.max()
    featureMetrics = pd.concat([featureMins, featureMaxs], axis=1)
    featureMetrics.columns = ['min', 'max']

    #confusion matrix
    conf = confusion_matrix(y_true=Y_test, y_pred=Y_pred)

    # ## Market Basket
    mb_ignore = runtime_settings['mb_ignore']

    if mb_ignore == 'false':
        clean_df_path = dash_data_path + 'clean_df'
        norm_df_clean = pickle_load(clean_df_path)

        #drop non-categorical columns
        norm_df = norm_df_clean.select_dtypes(
            include=['int64', 'uint8', 'float64'])

        #one more filter - get rid of int columns that are not categorical (tenure)
        #get rid of columns that are never 'negative' (always 1 or greater)
        continuous_int_cols = list(
            norm_df.loc[:, (norm_df <= 0).sum() == 0].columns)
        print("Market basket analysis - removed columns that are never 0:",
              continuous_int_cols)

        norm_df.drop(continuous_int_cols, axis=1, inplace=True)

        #drop target variable
        norm_df.drop(labelField, axis=1, inplace=True)

        # drop collinear columns (ex. dropping _No phone services attributes, since PhoneService_No covers these)
        # NOTE - gives preference for columns appearing first in the data
        cols_to_drop = []
        for x in norm_df.columns:
            for y in norm_df.columns:
                corr = norm_df[x].corr(norm_df[y])
                if x != y and (corr >= 0.99):
                    if y not in cols_to_drop and x not in cols_to_drop:
                        cols_to_drop.append(y)

        norm_df.drop(labels=cols_to_drop, axis=1, inplace=True)

        def encode_units(x):
            if x < 1:
                return 0
            if x >= 1:
                return 1

        basket_sets = norm_df.applymap(encode_units)

        #If support is too high, lower to allow values in
        try:
            frequent_itemsets = apriori(basket_sets,
                                        min_support=0.1,
                                        use_colnames=True,
                                        max_len=3)
            rules = association_rules(frequent_itemsets,
                                      metric="lift",
                                      min_threshold=1)
        except ValueError:
            frequent_itemsets = apriori(basket_sets,
                                        min_support=0.001,
                                        use_colnames=True,
                                        max_len=3)
            rules = association_rules(frequent_itemsets,
                                      metric="lift",
                                      min_threshold=1)

        max_lift = rules[(rules['lift'] >= 1)
                         & (rules['conviction'] != np.inf)]
        market_basket = max_lift.sort_values(by='lift', ascending=False).head(
            100)  #limit to top 100
        market_basket['antecedents'] = market_basket['antecedents'].map(
            lambda x: list(x))
        market_basket['consequents'] = market_basket['consequents'].map(
            lambda x: list(x))

        #Drop baskets containing the same items but reversed
        market_basket['unique'] = (
            market_basket['antecedents'] +
            market_basket['consequents']).apply(lambda x: ' '.join(sorted(x)))

        market_basket.drop_duplicates(subset=['unique'],
                                      keep='first',
                                      inplace=True)  #drop a<->c baskets

        #Calculate top Lift and Support cells, set dummy value in new column for later market basket conditional formatting
        lift_top10_thresh = market_basket['lift'].quantile(
            q=0.9)  #90th percentile threshold #
        market_basket['lift_highlight'] = np.where(
            market_basket['lift'] >= lift_top10_thresh, 1, 0)
        support_top10_thresh = market_basket['support'].quantile(
            q=0.9)  #90th percentile threshold #
        market_basket['support_highlight'] = np.where(
            market_basket['support'] >= support_top10_thresh, 1, 0)

        #    market_basket['lift'] = market_basket['lift'].map(lambda x: '{:.2f}x'.format(x))
        #    market_basket['support'] = market_basket['support'].map(lambda x: '{:.0%}'.format(x))
        market_basket['Basket'] = (
            market_basket['antecedents'] +
            market_basket['consequents']).apply(lambda x: ', '.join(x))

        market_basket = market_basket[[
            'Basket', 'support', 'lift', 'lift_highlight', 'support_highlight'
        ]]
        market_basket.columns = [
            'Basket', 'Support', 'Lift', 'lift_highlight', 'support_highlight'
        ]

        #Remove any baskets whose items are a subset of a larger basket (preference for basket specificity)
        all_baskets = list(market_basket['Basket'].unique())
        all_baskets = [set(x.split(', ')) for x in all_baskets]

        print("Removing basket subsets...")
        remove_baskets = []

        for index, row in market_basket.iterrows():
            b = row['Basket']
            b_set = set(b.split(', '))

            for a in all_baskets:
                if b_set != a and b_set.issubset(
                        a):  #if basket has superset in all_baskets, remove
                    remove_baskets.append(b)

        market_basket = market_basket[~market_basket['Basket'].
                                      isin(remove_baskets)]

        market_basket.reset_index(inplace=True, drop=True)

        # ### Get averages and sums for all other columns by these market baskets

        calc_dict = {}
        calc_dict_cols = []

        for b in market_basket.Basket.values:
            b_parsed = b.split(', ')

            basket_filtered_df = norm_df_clean.copy(
            )  #create df where all items in basket will be true
            for item in b_parsed:
                basket_filtered_df = basket_filtered_df[
                    basket_filtered_df[item] >= 1]

            for c in basket_filtered_df.columns:  #find agg measures of each column
                basket_filtered_column = basket_filtered_df[
                    c]  #in basket universe
                m = basket_filtered_column.mean()
                s = basket_filtered_column.sum()
                cnt = basket_filtered_column.count()

                population_column = norm_df_clean[c]  #in total universe
                p_m = population_column.mean()
                p_s = population_column.sum()
                p_cnt = population_column.count()

                basket_col_name = c + '_basket'
                pop_col_name = c + '_pop'

                if b in calc_dict.keys():  # data
                    calc_dict[b] += [s, m, cnt, p_s, p_m, p_cnt]
                else:
                    calc_dict[b] = [s, m, cnt, p_s, p_m, p_cnt]

                # columns
                for name_of_column in [
                        basket_col_name + '_sum', basket_col_name + '_mean',
                        basket_col_name + '_count', pop_col_name + '_sum',
                        pop_col_name + '_mean', pop_col_name + '_count'
                ]:
                    if name_of_column not in calc_dict_cols:
                        calc_dict_cols.append(name_of_column)

        market_basket_calcs = pd.DataFrame.from_dict(calc_dict,
                                                     orient='index',
                                                     columns=calc_dict_cols)

        #Format calc columns
        #    calc_format = [col for col in market_basket_calcs if col.endswith('sum') or col.endswith('count') or col.endswith('mean')]
        #    market_basket_calcs[calc_format] = market_basket_calcs[calc_format].applymap(lambda x: '{:.2f}'.format(x)
        #                                              if len(str(round(x))) <= 1
        #                                              else '{:.0f}'.format(x))

        market_basket_calcs['Basket'] = market_basket_calcs.index

        #join calcs to market basket
        market_basket = market_basket.merge(market_basket_calcs,
                                            on='Basket',
                                            how='left')

        # Find top 10th percentile for each calculated field
        calc_cols = [
            col for col in market_basket if col.endswith('sum')
            or col.endswith('count') or col.endswith('mean')
        ]
        for col in calc_cols:
            top10_thresh = market_basket[col].quantile(
                q=0.9)  #90th percentile threshold
            market_basket[col + '_highlight'] = np.where(
                market_basket[col] >= top10_thresh, 1, 0)

        #Replace "_" with " = " for readability
    #    market_basket['Basket'] = market_basket['Basket'].str.replace('_', ' = ')

        market_basket_csv_path = dash_data_path + 'market_basket.csv'
        market_basket.to_csv(market_basket_csv_path, index=False)

    # ### Data summary
    norm_df_summary = generateDf()

    data_summary = norm_df_summary.head(1)
    data_summary_csv_path = dash_data_path + 'data_summary.csv'
    data_summary.to_csv(data_summary_csv_path, index=False)
    clean_df_path = dash_data_path + 'clean_df'
    data_post_transform = pickle_load(clean_df_path)
    data_summary_post_transform = data_post_transform.head(1)
    data_summary_post_transform_csv_path = dash_data_path + 'data_summary_post_transform.csv'
    data_summary_post_transform.to_csv(data_summary_post_transform_csv_path,
                                       index=False)
    data_post_transform_csv_path = dash_data_path + 'data_post_transform.csv'
    data_post_transform.to_csv(data_post_transform_csv_path, index=False)

    # ## Permutation-based feature importance

    feature_names = list(X_test.columns.values)
    perm = PermutationImportance(best_model).fit(X_test, Y_test)

    ex = eli5.explain_weights(perm, feature_names=feature_names)

    perm_feature_wt = eli5.formatters.as_dataframe.format_as_dataframe(ex)

    perm_feature_wt = dict(perm_feature_wt[['feature', 'weight']])

    #create perm_feature_wt for pre-dummified data (for single instance prediction tab)
    dummy_memory_path = dash_data_path + 'dummy_memory'
    dummy_memory = pickle_load(dummy_memory_path)
    dummy_memory = {x[:-3]: y
                    for x, y in dummy_memory.items()
                    }  #remove ' = ' separator from parent

    def returnPreDummyCol(
            postDummyCol):  #returns pre-dummification column/parent
        for dummy_parent, dummy_children in dummy_memory.items():
            if postDummyCol in dummy_children:
                return dummy_parent
        return postDummyCol

    perm_feature_wt_predummy = perm_feature_wt.copy()
    perm_feature_wt_predummy = pd.DataFrame(perm_feature_wt_predummy)
    perm_feature_wt_predummy['feature'] = perm_feature_wt_predummy[
        'feature'].map(returnPreDummyCol)
    perm_feature_wt_predummy = perm_feature_wt_predummy[['feature']]
    perm_feature_wt_predummy.drop_duplicates(inplace=True)
    perm_feature_wt_predummy.reset_index(inplace=True, drop=True)

    #Create pre-dummified features_metrics for use in single instance prediction tab
    featureMetricsPreDummy = featureMetrics.copy()
    featureMetricsPreDummy['preDummy'] = featureMetricsPreDummy.index.map(
        returnPreDummyCol)
    featureMetricsPreDummy.drop_duplicates(subset=['preDummy'], inplace=True)
    #erase calculations for predummy variables
    # feature_metrics['mean'] = np.where(feature_metrics.index != feature_metrics['preDummy'], float('nan'), feature_metrics['mean'])
    featureMetricsPreDummy['min'] = np.where(
        featureMetricsPreDummy.index != featureMetricsPreDummy['preDummy'],
        float('nan'), featureMetricsPreDummy['min'])
    featureMetricsPreDummy['max'] = np.where(
        featureMetricsPreDummy.index != featureMetricsPreDummy['preDummy'],
        float('nan'), featureMetricsPreDummy['max'])

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, Y_train)

    rf = RandomForestClassifier()
    rf.fit(X_train, Y_train)

    rf_feature_importances = pd.DataFrame(rf.feature_importances_,
                                          index=X_train.columns,
                                          columns=['importance']).sort_values(
                                              'importance', ascending=False)

    #export pre-dummified data sample for use in single instance prediction
    sample_size = 200
    if X_test.shape[0] < sample_size:
        sample_size = X_test.shape[0]
    rand_prediction = X_test.sample(n=sample_size)

    #Given set of observations, return probability of 1 using best model
    def get_pos_proba_from_x(observation):
        best_model_automl_path = dash_data_path + 'best_model_automl'
        best_model_pipeline = pickle_load(best_model_automl_path)
        probabilities = best_model_pipeline.predict_proba([observation])
        return probabilities[0][1]

    rand_prediction['pos_proba'] = rand_prediction.apply(get_pos_proba_from_x,
                                                         axis=1)

    #create index sorted by pos_proba for eventual slider
    rand_prediction.sort_values(by='pos_proba', inplace=True)

    #get pre-dummified data for same set of indices
    sample_indices = rand_prediction.index

    pre_dummified_clean_df_path = dash_data_path + 'pre_dummified_clean_df'
    pre_dummy_data = pickle_load(pre_dummified_clean_df_path)

    rand_prediction_pre_dummy = pre_dummy_data.loc[sample_indices]
    rand_prediction_pre_dummy['pos_proba'] = rand_prediction['pos_proba']
    rand_prediction_pre_dummy.reset_index(inplace=True, drop=True)

    rf_feature_importances = rf_feature_importances[0:6]

    feature_names = list(X_test.columns.values)

    clf = LogisticRegression(random_state=0,
                             solver='lbfgs',
                             multi_class='multinomial').fit(X_train, Y_train)
    ex = eli5.explain_weights(clf, feature_names=feature_names)
    reg_feature_weight = eli5.formatters.as_dataframe.format_as_dataframe(ex)
    top_3 = reg_feature_weight.head(3)
    bottom_3 = reg_feature_weight.tail(3)

    analysis_lines = []

    first_feature_pos = top_3.iloc[[0]]['feature'].item()
    second_feature_pos = top_3.iloc[[1]]['feature'].item()
    third_feature_pos = top_3.iloc[[2]]['feature'].item()

    second_feature_ratio = top_3.iloc[[1]]['weight'].item() / top_3.iloc[[
        0
    ]]['weight'].item()
    third_feature_ratio = top_3.iloc[[2]]['weight'].item() / top_3.iloc[[
        0
    ]]['weight'].item()

    analysis_lines.append(
        'For predicting %s classes of %s, the strongest predictors are: ' %
        (runtime_settings['pos_label'], runtime_settings['label_field']))
    analysis_lines.append('The strongest predictor is %s' %
                          (first_feature_pos))
    analysis_lines.append(
        'The second strongest predictor is %s with a weight %0.2f%% of %s' %
        (second_feature_pos, second_feature_ratio, first_feature_pos))
    analysis_lines.append(
        'The third strongest predictor is %s with a weight %0.2f%% of %s' %
        (third_feature_pos, third_feature_ratio, first_feature_pos))

    first_feature_neg = bottom_3.iloc[[2]]['feature'].item()
    second_feature_neg = bottom_3.iloc[[1]]['feature'].item()
    third_feature_neg = bottom_3.iloc[[0]]['feature'].item()

    second_feature_ratio = bottom_3.iloc[[
        0
    ]]['weight'].item() / bottom_3.iloc[[1]]['weight'].item()
    third_feature_ratio = bottom_3.iloc[[0]]['weight'].item() / bottom_3.iloc[[
        2
    ]]['weight'].item()

    #    analysis_lines.append('For predicting %s classes of %s, the strongest predictors are: ' % (runtime_settings['neg_label'], runtime_settings['label_field']))
    analysis_lines.append('The strongest predictor is %s' %
                          (first_feature_neg))
    analysis_lines.append(
        'The second strongest predictor is %s with a weight %0.2f%% of %s' %
        (second_feature_neg, second_feature_ratio, first_feature_neg))
    analysis_lines.append(
        'The third strongest predictor is %s with a weight %0.2f%% of %s' %
        (third_feature_neg, third_feature_ratio, first_feature_neg))

    # Time Series Analysis
    # load cleansed data with time variables
    clean_df_time_path = dash_data_path + 'clean_df_time'
    clean_df_time = pickle_load(clean_df_time_path)

    # datetime columns
    date_cols_path = dash_data_path + 'date_cols'
    date_cols = pickle_load(date_cols_path)
    all_date_cols_path = dash_data_path + 'all_date_cols'
    all_date_cols = pickle_load(all_date_cols_path)
    # all other columns
    non_date_cols_path = dash_data_path + 'non_date_cols'
    non_date_cols = pickle_load(non_date_cols_path)
    all_non_date_cols_path = dash_data_path + 'all_non_date_cols'
    all_non_date_cols = pickle_load(all_non_date_cols_path)

    # If ts_ignore = true, skip time series analysis.
    if runtime_settings['ts_ignore'] == 'true':
        print('Ignore Time Series = TRUE. Skipping time series analysis.')
        ts_acf_path = dash_data_path + 'ts_acf'
        pickle_save(ts_acf_path, None)
        ts_runs_path = dash_data_path + 'ts_runs'
        pickle_save(ts_runs_path, None)
        ts_trends_path = dash_data_path + 'ts_trends'
        pickle_save(ts_trends_path, None)
        ts_forecast_path = dash_data_path + 'ts_forecast'
        pickle_save(ts_forecast_path, None)
        ts_best_model_path = dash_data_path + 'ts_best_model'
        pickle_save(ts_best_model_path, None)
    # If no datetime data, there is no analysis to be done. Continue on.
    elif len(date_cols) == 0:
        print('No datetime data. Skipping time series analysis.')
        ts_acf_path = dash_data_path + 'ts_acf'
        pickle_save(ts_acf_path, None)
        ts_runs_path = dash_data_path + 'ts_runs'
        pickle_save(ts_runs_path, None)
        ts_trends_path = dash_data_path + 'ts_trends'
        pickle_save(ts_trends_path, None)
        ts_forecast_path = dash_data_path + 'ts_forecast'
        pickle_save(ts_forecast_path, None)
        ts_best_model_path = dash_data_path + 'ts_best_model'
        pickle_save(ts_best_model_path, None)
    else:
        # resample and get process control stats for all time-feature pairs
        for tc in date_cols:
            for cc in non_date_cols:
                control_stats_here = timeSeriesPreProcessing(
                    clean_df_time, tc, cc)

                if (non_date_cols.index(cc) == 0) & (date_cols.index(tc) == 0):
                    ts_control_stats = control_stats_here
                else:
                    ts_control_stats = ts_control_stats.append(
                        control_stats_here)

            # use process control stats to select which variables to run
            decreasing = ts_control_stats.loc[tc][(
                ts_control_stats.loc[(tc), 'decr'] > 7)]['decr'].sort_values(
                    ascending=False)
            increasing = ts_control_stats.loc[tc][(
                ts_control_stats.loc[(tc), 'incr'] > 7)]['incr'].sort_values(
                    ascending=False)
            below_mn = ts_control_stats.loc[tc][(ts_control_stats.loc[(
                tc), 'blw_mn'] > 7)]['blw_mn'].sort_values(ascending=False)
            above_mn = ts_control_stats.loc[tc][(ts_control_stats.loc[(
                tc), 'abv_mn'] > 7)]['abv_mn'].sort_values(ascending=False)
            below_lcl = ts_control_stats.loc[tc][(ts_control_stats.loc[(
                tc), 'blw_lcl'] > 0)]['blw_lcl'].sort_values(ascending=False)
            above_ucl = ts_control_stats.loc[tc][(ts_control_stats.loc[(
                tc), 'abv_ucl'] > 0)]['abv_ucl'].sort_values(ascending=False)
            top_n = 2  # we will choose top_n features from each category to run against each time variable
            to_run_list_here = list(
                itertools.product([tc], decreasing[:top_n].index)
            ) + list(itertools.product([tc], increasing[:top_n].index)) + list(
                itertools.product([tc], below_mn[:top_n].index)
            ) + list(itertools.product([tc], above_mn[:top_n].index)) + list(
                itertools.product([tc], below_lcl[:top_n].index)) + list(
                    itertools.product([tc], above_ucl[:top_n].index))
            to_run_list_here = list(set(to_run_list_here))

            if date_cols.index(tc) == 0:
                to_run_list = to_run_list_here
            else:
                to_run_list = to_run_list + to_run_list_here

        # remove duplicates - hopefully this step is unnecessary
        to_run_list = list(set(to_run_list))
        print('to run:', to_run_list)

        # save TS process control stats
        ts_control_stats_path = dash_data_path + 'ts_control_stats'
        pickle_save(ts_control_stats_path, ts_control_stats)

        # top 7 most informative features
        perm_feature_wt_df = pd.DataFrame(perm_feature_wt)
        num_features = min((perm_feature_wt_df.shape[0] - 1), 6)
        top_10 = perm_feature_wt_df.sort_values(
            'weight', ascending=False).reset_index(
                drop=True).loc[:num_features, 'feature'].values.tolist()
        top_10.append(labelField)
        #for cc in top_10:
        #    for tc in date_cols:
        for tuple in to_run_list:
            tc = tuple[0]
            cc = tuple[1]

            # optimal drop and resample parameters
            resample_period = ts_control_stats.loc[(tc, cc), 'period']
            drop_first = int(ts_control_stats.loc[(tc, cc), 'drop_first'])
            drop_last = int(ts_control_stats.loc[(tc, cc), 'drop_last'])

            acf_here, trends_here, forecast_here, best_model_here = runTimeSeries(
                clean_df_time, resample_period, drop_first, drop_last, tc, cc)

            # append results to the larger multi-index dfs
            #if (date_cols.index(tc) == 0) & (top_10.index(cc) == 0):
            if to_run_list.index(tuple) == 0:
                ts_acf = acf_here
                ts_trends = trends_here
                ts_forecast = forecast_here
                ts_best_model = best_model_here
                ts_runs = pd.DataFrame(data={
                    'time_var': [tc],
                    'feature': [cc]
                })
            else:
                ts_acf = ts_acf.append(acf_here)
                ts_trends = ts_trends.append(trends_here)
                ts_forecast = ts_forecast.append(forecast_here)
                ts_best_model = ts_best_model.append(best_model_here)
                ts_runs = ts_runs.append({
                    'time_var': tc,
                    'feature': cc
                },
                                         ignore_index=True)

            ts_acf_path = dash_data_path + 'ts_acf'
            pickle_save(ts_acf_path, ts_acf)
            ts_runs_path = dash_data_path + 'ts_runs'
            pickle_save(ts_runs_path, ts_runs)
            ts_trends_path = dash_data_path + 'ts_trends'
            pickle_save(ts_trends_path, ts_trends)
            ts_forecast_path = dash_data_path + 'ts_forecast'
            pickle_save(ts_forecast_path, ts_forecast)
            ts_best_model_path = dash_data_path + 'ts_best_model'
            pickle_save(ts_best_model_path, ts_best_model)

    # ## Prepare for export

    x_100_test = X_test[:100]
    # print(x_100_test.shape)

    #Time to classify 100 new samples
    start = time.perf_counter()

    best_model.predict(x_100_test)

    elapsed = time.perf_counter() - start
    # print(elapsed)

    #Prepare various metrics for export
    model_type = (' ').join(list(best_model.named_steps.keys()))
    timestamp = datetime.now()
    params_json = best_model.get_params()
    precision = precision_score(Y_test, Y_pred)
    recall = recall_score(Y_test, Y_pred)
    f1 = f1_score(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    log_loss_score = log_loss(Y_test, Y_pred)
    #    mae = mean_absolute_error(Y_test, Y_pred)
    #    mse = mean_squared_error(Y_test, Y_pred)
    # roc_auc = #roc_auc already defined above
    test_item_count = Y_test.count()
    run_time = elapsed

    #More metrics
    file_name = data_source
    row_count = norm_df_summary.shape[0]
    col_count = norm_df_summary.shape[1]
    target_variable = labelField

    model_metrics = [
        model_type,
        timestamp,
        params_json,
        precision,
        recall,
        f1,
        accuracy,
        log_loss_score,
        #        mae,
        #        mse,
        roc_auc,
        test_item_count,
        run_time,
        rf_feature_importances,
        file_name,
        row_count,
        col_count,
        target_variable,
        analysis_lines
    ]

    # In[447]:

    model_metrics_columns = [
        'model_type',
        'timestamp',
        'params_json',
        'precision',
        'recall',
        'f1',
        'accuracy',
        'log_loss_score',
        #        'mae',
        #        'mse',
        'roc_auc',
        'test_item_count',
        'run_time',
        'rf_feature_importances',
        'file_name',
        'row_count',
        'col_count',
        'target_variable',
        'analysis_lines'
    ]

    metrics_df = dict(zip(model_metrics_columns, model_metrics))

    metrics_df_path = dash_data_path + 'metrics_df'
    pickle_save(metrics_df_path, metrics_df)
    perm_feature_wt_path = dash_data_path + 'perm_feature_wt'
    pickle_save(perm_feature_wt_path, perm_feature_wt)
    rand_prediction_path = dash_data_path + 'rand_prediction'
    pickle_save(rand_prediction_path, rand_prediction_pre_dummy)
    perm_feature_wt_predummy_path = dash_data_path + 'perm_feature_wt_predummy'
    pickle_save(perm_feature_wt_predummy_path, perm_feature_wt_predummy)
    featureMetricsPreDummy_path = dash_data_path + 'featureMetricsPreDummy'
    pickle_save(featureMetricsPreDummy_path, featureMetricsPreDummy)
    fpr_path = dash_data_path + 'fpr'
    pickle_save(fpr_path, list(fpr))
    tpr_path = dash_data_path + 'tpr'
    pickle_save(tpr_path, list(tpr))
    #    rf_explanation_example_path = dash_data_path + 'rf_explanation_example'
    #    pickle_save(rf_explanation_example_path, rf_explanation_example)
    featureMetrics_path = dash_data_path + 'featureMetrics'
    pickle_save(featureMetrics_path, featureMetrics)
    conf_matrix_path = dash_data_path + 'conf_matrix'
    pickle_save(conf_matrix_path, conf)
Example #12
X_train_transformed = transformers.fit_transform(X_train)
X_val_transformed = transformers.transform(X_val)  # transform only; the transformers were fitted on X_train

model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_transformed, y_train)



#Permutation Importance
import eli5
from eli5.sklearn import PermutationImportance

#1. Calculate permutation importances
permuter = PermutationImportance(
    model, 
    scoring='accuracy',
    n_iter=5,
    random_state=42
)

permuter.fit(X_val_transformed, y_val)

feature_names = X_val.columns.tolist()
pd.Series(permuter.feature_importances_, feature_names).sort_values(ascending=False)

#Display permutation importances 
eli5.show_weights(
    permuter,
    top=None, #Shows all features
    feature_names=feature_names
)
Example #13
                        early_stopping_rounds=100,
                        random_state=42,
                        scale_pos_weight=15,
                        learning_rate=.005,
                        reg_lambda=.01,
                        verbosity=1)
print('fitting...')
model.fit(X_train, y_train, eval_set=eval_set, eval_metric='auc', verbose=True)

y_pred_proba = model.predict_proba(X_val)[:, 1]
print(f'Validation ROC AUC score: {roc_auc_score(y_val, y_pred_proba)}')

print('permuting...')
permuter = PermutationImportance(model,
                                 cv='prefit',
                                 n_iter=5,
                                 scoring='roc_auc',
                                 random_state=42)
permuter.fit(X_val, y_val)
features_of_import = pd.Series(permuter.feature_importances_,
                               val.columns).sort_values(ascending=True)
print('importance', features_of_import)

print('plotting...')
fig1 = go.Figure()
fig1.add_trace(go.Bar(x=features_of_import, y=val.columns))
py.iplot(fig1, filename='features1')

mask = features_of_import > 0
trimmed_columns = train.columns[mask]
train_trimmed = train[trimmed_columns]
Example #14
    df.drop(['Name'], axis=1, inplace=True, errors='ignore')
    df.drop(['Cabin'], axis=1, inplace=True, errors='ignore')
    df['Fare'].fillna(value=df['Fare'].mean(), inplace=True)
    fare = np.array(df['Fare'])
    df['Fare'] = normalize([fare]).T
    df.drop(['Fare'], axis=1, inplace=True, errors='ignore')

train_features = train_dataset.drop("Survived", axis=1)
train_labels = train_dataset["Survived"]

test_features = test_dataset

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(train_features, train_labels)

perm = PermutationImportance(random_forest,
                             random_state=1).fit(train_features, train_labels)
eli5.show_weights(perm, feature_names=train_features.columns.tolist())

Y_pred = random_forest.predict(test_features)

c_matrix = confusion_matrix(train_labels, random_forest.predict(train_features))
out_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
out_df['PassengerId'] = test_data_ID
out_df['Survived'] = Y_pred

submission_filepath = 'sample_submission.csv'
out_df.to_csv(submission_filepath, index=False)

sns.heatmap(c_matrix.T, square=True, annot=True, fmt='g', cbar=True)
plt.xlabel('true labels')
plt.ylabel('predicted labels')
Example #15
    (RobustScaler(), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
    (SelectKBest(selection_score_func, k=1), ['<NAME3>']),
    (SelectKBest(selection_score_func, k=2), ['<NAME2>', '<NAME3>']),
    (FeatureUnion([('k', SelectKBest(selection_score_func, k=2)),
                   ('p', SelectPercentile(selection_score_func, 30))
                   ]), ['k:<NAME2>', 'k:<NAME3>', 'p:<NAME3>']),
    (VarianceThreshold(0.0), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
    (VarianceThreshold(1.0), ['<NAME2>']),
    (GenericUnivariateSelect(), ['<NAME2>']),
    (GenericUnivariateSelect(mode='k_best', param=2), ['<NAME2>', '<NAME3>']),
    (SelectFromModel(LogisticRegression(
        'l1', C=0.01, random_state=42)), ['<NAME0>', '<NAME2>']),
    (SelectFromModel(
        PermutationImportance(
            LogisticRegression(random_state=42),
            cv=5,
            random_state=42,
            refit=False,
        ),
        threshold=0.1,
    ), ['<NAME2>', '<NAME3>']),
    (RFE(LogisticRegression(random_state=42), 2), ['<NAME1>', '<NAME3>']),
    (RFECV(LogisticRegression(random_state=42)),
     ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
] + _additional_test_cases)
def test_transform_feature_names_iris(transformer, expected, iris_train):
    X, y, _, _ = iris_train
    transformer.fit(X, y)
    # Test in_names being provided
    res = transform_feature_names(transformer,
                                  ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>'])
    assert res == expected
Example #16
auc(svm_fpr, svm_tpr)
auc(rf_fpr, rf_tpr)

#Permutation importance

diease_data_train_X_scaled_df = pd.DataFrame(diease_data_train_X_scaled)
temp = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal'
]
diease_data_train_X_scaled_df.columns = temp

import eli5  # for permutation importance
from eli5.sklearn import PermutationImportance

sgd_perm = PermutationImportance(sgd_predictor, random_state=1).fit(
    diease_data_train_X_scaled, diease_data_train_y)
sgd_importance = eli5.explain_weights(
    sgd_perm, feature_names=diease_data_train_X_scaled_df.columns.tolist())

svm_perm = PermutationImportance(svm_predictor, random_state=1).fit(
    diease_data_train_X_scaled, diease_data_train_y)
svm_importance = eli5.explain_weights(
    svm_perm, feature_names=diease_data_train_X_scaled_df.columns.tolist())

rf_perm = PermutationImportance(rf_predictor,
                                random_state=1).fit(diease_data_train_X_scaled,
                                                    diease_data_train_y)
rf_importance = eli5.explain_weights(
    rf_perm, feature_names=diease_data_train_X_scaled_df.columns.tolist())

#plots the features in descending order of relative importance
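# A minimal sketch of that plot (added for illustration, not from the original code;
# it assumes matplotlib and reuses rf_perm and diease_data_train_X_scaled_df from above):
import matplotlib.pyplot as plt

rf_series = pd.Series(rf_perm.feature_importances_,
                      index=diease_data_train_X_scaled_df.columns).sort_values()
rf_series.plot.barh()
plt.xlabel('Mean decrease in score when the feature is permuted')
plt.tight_layout()
plt.show()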
Example #17
sns.distplot(train_df.revenue, ax=ax[0])
ax[0].set_title("Train Set Revenue Histogram")
sns.distplot(predictions_extra_trees_tuned_test, ax=ax[1])
ax[1].set_title("Test Set Revenue Prediction Histogram")
f.tight_layout()

# ## Feature Selection

# ### Feature Selection with Eli5 for xgboost

# In[ ]:

import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(clf_stra_xgb, random_state=42).fit(xtrain, ytrain)

# In[ ]:

eli5.show_weights(perm, feature_names=xvalid.columns.tolist(), top=100)

# In[ ]:

from sklearn.feature_selection import SelectFromModel

max_selected_features = 10
sel = SelectFromModel(perm,
                      max_features=max_selected_features,
                      threshold=0.005,
                      prefit=True)
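
# In[ ]:

# A possible follow-up (not part of the original notebook; it reuses the fitted `sel`,
# `xtrain` and `xvalid` from above): apply the selector to reduce both matrices to the
# selected feature subset before refitting the model.
selected_mask = sel.get_support()
print('Selected features:', list(xvalid.columns[selected_mask]))
xtrain_selected = sel.transform(xtrain)
xvalid_selected = sel.transform(xvalid)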
Example #18
import eli5
from eli5.sklearn import PermutationImportance

encoder = GDB_pipeline.named_steps.ordinalencoder
X_train_encoded = encoder.fit_transform(X_train_cut)
X_val_encoded = encoder.transform(X_val_cut)

imputer = GDB_pipeline.named_steps.iterativeimputer
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_val_imputed = imputer.transform(X_val_encoded)  # transform only; the imputer was fitted on the training data

model = GDB_pipeline.named_steps.gradientboostingclassifier
# model.fit(X_train_imputed,y_train)

permuter = PermutationImportance(model, scoring='accuracy', n_iter=2)
permuter.fit(X_val_imputed, y_val)
feature_names = X_val_encoded.columns.tolist()
eli5.show_weights(permuter, top=None, feature_names=feature_names)

# In[78]:

from pdpbox import pdp

plt.style.use('seaborn-dark-palette')
feature = 'down'
model = GDB_pipeline.named_steps['gradientboostingclassifier']
model_features = X_train_cut.columns
X_train_imputed = pd.DataFrame(X_train_imputed)
X_train_imputed.columns = X_train_cut.columns
pdp_dist = pdp.pdp_isolate(model=model,
Example #19
    def classification(self, cleaned_Data_frm1, cleaned_Data_frm, y, cursor,
                       conn):
        #         try:
        Modles_reuslts = []
        Names = []
        print("Model building")
        float_cols = self.float_col
        result = pd.concat(
            [cleaned_Data_frm1, cleaned_Data_frm, y, float_cols], axis=1)
        self.data_sorted1 = result.loc[:, ~result.columns.duplicated()]
        self.data_sorted = self.data_sorted1.sort_values(self.i)
        new_list = [
            list(set(self.data_sorted.columns).difference(self.x.columns))
        ]
        X = self.data_sorted.drop([self.i], axis=1)
        y = self.data_sorted[self.i]
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.35,
                                                            random_state=42)
        X_train, X_test = train_test(X_train, X_test)
        # List of pipelines for ease of iteration
        l = 0
        access_key_id = self.access_key_id
        secret_access_key = self.secret_access_key
        models = ['Random Forest', 'KNN', 'XGB', 'SVC']
        for classifier, params, model in zip(self.Classifier,
                                             self.Classifiers_grids, models):
            print(classifier)
            l += 1
            gd = RandomizedSearchCV(classifier,
                                    params,
                                    cv=5,
                                    n_jobs=-1,
                                    verbose=True,
                                    refit=True)
            gd.fit(X_train, y_train)
            grid = gd.best_params_
            estimator = gd.best_estimator_
            y_pred = gd.predict(X_test)
            cm = confusion_matrix(y_test, y_pred)
            target = self.i
            Accuracy = metrics.accuracy_score(y_test, y_pred)
            print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

            if model == 'KNN':

                perm = PermutationImportance(gd, random_state=1).fit(
                    X_train, y_train)
                importances = perm.feature_importances_
                DB_upload(Accuracy, X_train, X_test, y_test, y_pred,
                          importances, grid, estimator, l, cm, target, model)
            elif model == 'SVC':
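                # Note: coef_ is only available for a linear-kernel SVC; for
                # non-linear kernels, permutation importance (as used for KNN
                # above) would be needed instead.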
                importances = gd.best_estimator_.coef_
                imp = importances.tolist()
                importances = imp[0]
                DB_upload(Accuracy, X_train, X_test, y_test, y_pred,
                          importances, grid, estimator, l, cm, target, model)
            else:
                importances = gd.best_estimator_.feature_importances_.tolist()
                DB_upload(Accuracy, X_train, X_test, y_test, y_pred,
                          importances, grid, estimator, l, cm, target, model)
#                     encoded_classes = list(self.cleaned_Data_frm)
# model architecture
        if self.types == 'Classification_problem':

            def DNN():
                model = Sequential()
                model.add(
                    Dense(512,
                          input_dim=X_train.shape[1],
                          kernel_initializer='normal',
                          activation='relu'))
                model.add(BatchNormalization())
                model.add(Dropout(0.5))
                model.add(Dense(32, kernel_initializer='normal', activation='relu'))
                model.add(BatchNormalization())
                model.add(Dropout(0.5))
                model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
                model.compile(loss='binary_crossentropy',
                              optimizer='adagrad',
                              metrics=['accuracy'])
                return model

            classifier = KerasClassifier(build_fn=DNN, verbose=1)
            batch_size = [10, 20, 40, 60, 80, 100]
            epochs = [10, 50, 100]
            param_grid = dict(batch_size=batch_size, epochs=epochs)
            grid = GridSearchCV(estimator=classifier,
                                param_grid=param_grid,
                                n_jobs=-1,
                                cv=3)
            grid_result = grid.fit(X_train, y_train)
            estimator = grid.best_estimator_
            Accuracy = grid_result.best_score_
            print("%s" % (estimator))
            perm = PermutationImportance(grid,
                                         scoring='accuracy',
                                         random_state=1).fit(X_train, y_train)
            print(perm.feature_importances_)
            importances = perm.feature_importances_.tolist()
            DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances,
                      grid, estimator, l, cm, target, model)
            # summarize results
            print("Best: %f using %s" %
                  (grid_result.best_score_, grid_result.best_params_))
        else:
            a = np.unique(self.y)
            a.sort()
            b = a[-1]
            b += 1

            def DNN(dropout_rate=0.0, weight_constraint=0):
                # create model
                model = Sequential()
                model.add(
                    Dense(42,
                          input_dim=X_train.shape[1],
                          kernel_initializer='uniform',
                          activation='relu',
                          kernel_constraint=maxnorm(weight_constraint)))
                model.add(Dropout(dropout_rate))
                model.add(
                    Dense(20, kernel_initializer='uniform', activation='relu'))
                model.add(Dense(b, activation='softmax'))
                model.compile(loss='sparse_categorical_crossentropy',
                              optimizer='adam',
                              metrics=['accuracy'])
                return model

            classifier = KerasClassifier(build_fn=DNN,
                                         epochs=50,
                                         batch_size=10,
                                         verbose=1)
            weight_constraint = [1, 2, 3, 4, 5]
            dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
            param_grid = dict(dropout_rate=dropout_rate,
                              weight_constraint=weight_constraint)
            grid = GridSearchCV(estimator=classifier,
                                param_grid=param_grid,
                                n_jobs=-1,
                                cv=3)
            grid_result = grid.fit(X_train, y_train)
            estimator = grid.best_estimator_
            Accuracy = grid_result.best_score_
            print(Accuracy)
            DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances,
                      grid, estimator, l, cm, target, model)
            print("%s" % (estimator))
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=2, min_weight_fraction_leaf=0,
                      n_estimators=14, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

model1.fit(X_train_transformed, y_train)

# Get permutation importances
! pip install eli5
from eli5.sklearn import PermutationImportance
import eli5

permuter = PermutationImportance(
    model1,
    scoring='r2',
    n_iter=2,
    random_state=42
)

permuter.fit(X_val_transformed, y_val)
feature_names = X_val.columns.tolist()

eli5.show_weights(
    permuter,
    top=None, # show permutation importances for all features
    feature_names=feature_names
)

from sklearn.metrics import mean_squared_error, r2_score

# Coefficient of determination r2 for the training set
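# A minimal sketch of that check (assuming model1 and the transformed splits
# from this example are still in scope):
print('Train R^2:', r2_score(y_train, model1.predict(X_train_transformed)))
print('Validation R^2:', r2_score(y_val, model1.predict(X_val_transformed)))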
Example #21
0

if is_labeled_data:
    feature_partial = variables.get("FEATURE_PARTIAL_PLOTS")
    feature_partial_plots = [x.strip() for x in feature_partial.split(',')]
    features_to_plot = variables.get("FEATURE_PARTIAL2D_PLOTS")
    features_to_plot2d = [x.strip() for x in features_to_plot.split(',')]
    shap_row_to_show = int(variables.get("SHAP_ROW_SHOW"))
    columns = [LABEL_COLUMN]
    dataframe_test = dataframe.drop(columns, axis=1, inplace=False)

    dataframe_label = dataframe.filter(columns, axis=1)
    feature_names = dataframe_test.columns.values

    # PERMUTATION IMPORTANCE
    perm = PermutationImportance(loaded_model, random_state=1).fit(
        dataframe_test.values, dataframe_label.values.ravel())
    html_table = eli5.show_weights(
        perm, feature_names=dataframe_test.columns.tolist(), top=50)

    # PARTIAL DEPENDENCE PLOTS
    partial_feature_find = [
        i for i in feature_partial_plots if i in feature_names
    ]

    html_partial_plot = ''
    for i in partial_feature_find:
        pdp_feature = pdp.pdp_isolate(model=loaded_model,
                                      dataset=dataframe_test,
                                      model_features=feature_names,
                                      feature=i)  # preg
        pdp_plot_feature = pdp.pdp_plot(pdp_isolate_out=pdp_feature,
                                        feature_name=i)
Example #22
0
        test_size=0.2,
        random_state=times)

    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test = np.array(x_test)
    total_predict = np.zeros(len(y_test))

    for i in range(len(MLA)):

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=times)

        clf = copy.deepcopy(MLA[i])
        clf.random_state = times
        sel = SelectFromModel(
            PermutationImportance(clf, cv=skf,
                                  random_state=times)).fit(x_train, y_train)
        x_train_trans = sel.transform(x_train)
        x_test_trans = sel.transform(x_test)

        vali_auc = np.mean(
            cross_val_score(clf,
                            x_train_trans,
                            y_train,
                            cv=skf,
                            scoring='roc_auc'))

        clf.fit(x_train_trans, y_train)
        predict_result = clf.predict_proba(x_test_trans)[:, 1]
        total_predict += predict_result

        test_auc = roc_auc_score(y_test, predict_result)
    print('{}----------'.format(j))
    print()
    print(multilabel_confusion_matrix(y_true=y_test, y_pred=target_y_pred[i]))
    print(
        classification_report(y_true=y_test, y_pred=target_y_pred[i],
                              digits=2))
# %%
# Computing feature importance
print('Best MLP estimator: {}'.format(target_clf[0]))
print()
print('Best results')
print(multilabel_confusion_matrix(y_true=y_test, y_pred=target_y_pred[0]))
print(classification_report(y_true=y_test, y_pred=target_y_pred[0], digits=2))

perm = PermutationImportance(estimator=target_clf[0],
                             n_iter=100,
                             random_state=42).fit(X_test, y_test)

# Create a dataframe of the variables and feature importances
feature_importances_df = pd.DataFrame({
    'Variable': X.columns,
    'Feature_Importances': perm.feature_importances_
})

# Sort the variables by permutation importance, most important first, and print them
feature_importances_df_sorted = feature_importances_df.sort_values(
    by='Feature_Importances', axis=0, ascending=False)
print()
print(feature_importances_df_sorted)
Example #24
0
ax.set_ylim([0.0, 1.0])
plt.show()


# 7.10 AUC
auc(fpr,tpr)      # 88.71%


# 7.11
#  Find feature importance of any BLACK Box model
#  Refer: https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html
#  See at the end:  How PermutationImportance works?

# 7.11.1 Instantiate the importance object
perm = PermutationImportance(
                            clf,
                            random_state=1
                            )

# 7.11.2 Fit data & learn
#        Takes some time

start = time.time()
perm.fit(X_test, y_test)
end = time.time()
print((end - start)/60)      # time taken in minutes


# 7.11.3 Conclude: Get feature weights

"""
# If you are using jupyter notebook, use:
Example #25
0
    return model_fe

estimator = KerasRegressor(build_fn=baseline_model, epochs=1, batch_size=1)

#estimator.fit(X, y)
#prediction1 = estimator.predict(X_test_enc)
#accuracy_score(Y_test_enc, prediction1)
keras_callbacks = [
      EarlyStopping(monitor='val_loss', mode='min', verbose=2, patience=8)
]
estimator.fit(X_train_enc, y_train_enc,  verbose=2, validation_split=0.10,callbacks=keras_callbacks)

#perm = PermutationImportance(estimator, random_state=1).fit(X_train_enc, y_train_enc)
#eli5.show_weights(perm, feature_names = X_train_enc.columns.tolist())

perm = PermutationImportance(estimator, random_state=1).fit(X_train_enc, y_train_enc)

# Note: this takes a very long time; eli5 seems to be slow in general when the number of features is large!
#
from google.colab import drive
drive.mount('/content/gdrive')

import pickle
with open("/content/gdrive/My Drive/perm.pkl", 'wb') as output:
  pickle.dump(perm, output, pickle.HIGHEST_PROTOCOL)

with open("/content/gdrive/My Drive/estimator.pkl", 'wb') as output:
  pickle.dump(estimator, output, pickle.HIGHEST_PROTOCOL)

eli5.show_weights(perm, feature_names = X_train_enc.columns.tolist())
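
# Reload sketch: because the permutation run above is slow, the pickled objects
# can be loaded back later instead of being recomputed (same Drive paths as above).
with open("/content/gdrive/My Drive/perm.pkl", 'rb') as f:
  perm_loaded = pickle.load(f)
eli5.show_weights(perm_loaded, feature_names = X_train_enc.columns.tolist())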
Example #26
0
# instantiate the tuned random forest
booster_grid_search = GridSearchCV(booster, param_grid, cv=3, n_jobs=-1)

# train the tuned random forest
booster_grid_search.fit(X_train, y_train)

# print best estimator parameters found during the grid search
print(booster_grid_search.best_params_)

best_random = RandomForestRegressor(bootstrap=True,
                                    criterion='mse',
                                    max_depth=30,
                                    max_features='sqrt',
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0,
                                    min_impurity_split=None,
                                    min_samples_leaf=1,
                                    min_samples_split=5,
                                    min_weight_fraction_leaf=0.0,
                                    n_estimators=1400,
                                    n_jobs=None,
                                    oob_score=False,
                                    random_state=42,
                                    verbose=0,
                                    warm_start=False)
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(best_random.fit(X_train, y_train),
                             random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names=list(X_df.columns))
Example #27
0
print2(features_weight.head())

# visualize the top 10 important feature affecting prices
top10 = features_weight[:10].sort_values(by="coefficients")
plt.barh(top10.features, top10.coefficients)
plt.xticks(rotation=45)
plt.axvline(x=0.05, color='red', linestyle='-')
plt.gcf().subplots_adjust(left=0.15)
plt.show()

# construct the data analysis pipeline
xgbpipe = Pipeline([("scaler", StandardScaler()),
                    ("XGBRegressor",
                     best_model.best_estimator_.get_params()['model'])])

xgbpipe.fit(X_train, y_train)

# visualize the top 10 important feature affecting prices using eli5 permutation
perm = PermutationImportance(xgbpipe).fit(X_test, y_test)

_imp_eli5 = dfform(perm.feature_importances_)

eli5_top10 = _imp_eli5.head(10).sort_values(by="coefficients")
plt.barh(eli5_top10.features, eli5_top10.coefficients)
plt.axvline(x=0.05, color='red', linestyle='-')
plt.xlabel("Importance Weight")
plt.title("Airbnb Listing in New York")
plt.gcf().subplots_adjust(left=0.15)
plt.savefig("plot.jpeg")
plt.show()
Example #28
0
def create_model():
    model = Sequential()
    model.add(Dense(100, input_dim=features_number, activation='relu'))
    model.add(Dense(25, activation='relu'))
    if use_binary:
        # single sigmoid output for the binary case
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    else:
        # one softmax output layer for the multi-class case
        model.add(Dense(Y.shape[1], activation=activations.softmax))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


create_model()
head = open('/home/nader/workspace/robo/cyrus/script/DataAnalysisPath/feature_import/head').readlines()[0][:-2].split(',')[:features_number]
# create model
train_epoch_number = 20
model = KerasClassifier(build_fn=create_model, epochs=train_epoch_number, batch_size=10, verbose=1)

# if use_new_model:
model.fit(X, Y)
perm = PermutationImportance(model, random_state=1).fit(X,Y)

a = eli5.explain_weights(perm, feature_names=head, top=features_number)
fi = a.feature_importances.importances
nnfeatures = open('feature_import/nn_out_desc.csv', 'w')
for f in fi:
    nnfeatures.write(f.feature + ',' + str(f.weight) + '\n')
    print(f.feature, f.weight, f.std, f.value)
nnfeatures.close()
print(len(fi))
Example #29
0
# OK, so it's working well.

# <a id='section4'></a>

# # The Explanation
#
# Now let's see what the model gives us from the ML explainability tools.
#
# **Permutation importance** is the first tool for understanding a machine-learning model, and involves shuffling individual variables in the validation data (after a model has been fit), and seeing the effect on accuracy. Learn more [here](https://www.kaggle.com/dansbecker/permutation-importance).
#
# Let's take a look,
#

# In[ ]:

perm = PermutationImportance(model, random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())

# So, it looks like the most important factor in terms of permutation importance is a thalassemia result of 'reversable defect'. The high importance of 'max heart rate achieved' also makes sense, as this reflects the immediate state of the patient at the time of examination (as opposed to, say, age, which is a much more general factor).
#
# Let's take a closer look at the number of major vessels using a **Partial Dependence Plot** (learn more [here](https://www.kaggle.com/dansbecker/partial-plots)). These plots vary a single variable across a range of values for one row at a time, observe the effect on the prediction, and then average that effect over many rows. Let's look at the 'num_major_vessels' variable, which was at the top of the permutation importance list,

# In[ ]:

base_features = dt.columns.values.tolist()
base_features.remove('target')

feat_name = 'num_major_vessels'
pdp_dist = pdp.pdp_isolate(model=model,
                           dataset=X_test,
                           model_features=base_features,
                           feature=feat_name)
Example #30
0
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1]
}

grid_search = GridSearchCV(SVC(), params, cv=5)

grid_search.fit(train, target)

grid_search.best_params_
import eli5
from eli5.sklearn import PermutationImportance

importance_model = SVC(C=10, gamma=0.1, probability=True)
importance_model.fit(train, target)

perm = PermutationImportance(importance_model, random_state=42).fit(test_data, test_target)
eli5.show_weights(perm, feature_names=test_data.columns.tolist())
import shap

data_for_prediction = test_data.iloc[0]

k_explainer = shap.KernelExplainer(importance_model.predict_proba, train_data)
k_shap_values = k_explainer.shap_values(data_for_prediction)

shap.initjs()
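# The force plot below shows how each feature value pushes this single prediction
# above or below the explainer's expected (base) value.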
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], data_for_prediction)
test_target.iloc[0], target_string.iloc[0]
model = SVC(C=10, gamma=0.1)
model.fit(train, target_string)
predictions = model.predict(test)
predictions[:10]