# Validation score of the (classification) ensemble fitted above.
ens.score(X_val, y_val)
#Regression
from sklearn.ensemble import BaggingRegressor
# FIX: import DecisionTreeRegressor — the class actually instantiated below.
# The original imported DecisionTreeClassifier, so the next line raised NameError.
from sklearn.tree import DecisionTreeRegressor
ens = BaggingRegressor(DecisionTreeRegressor(random_state=101))
ens.fit(X_train, y_train)
ens.score(X_val, y_val)

#Once you are confident about your final model, measure its performance on the test set to estimate the generalization error

#Model interpretability
#Feature importance
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance: shuffle one column at a time on held-out data and
# measure the score drop (model-agnostic).
# NOTE(review): `model` is defined in an earlier part of the file not visible here.
perm = PermutationImportance(model, random_state=101).fit(X_val, y_val)
eli5.show_weights(perm, feature_names=X_val.columns.tolist())

#Partial dependence plot
#New integration in sklearn, might not work with older versions
# NOTE(review): plot_partial_dependence was removed in scikit-learn 1.2 in
# favour of PartialDependenceDisplay.from_estimator — confirm pinned version.
from sklearn.inspection import partial_dependence, plot_partial_dependence
# 'feature' -> 1-D plot; ('feat1', 'feat2') -> 2-D interaction plot.
partial_dependence(model, X_train, features=['feature', ('feat1', 'feat2')])
plot_partial_dependence(model,
                        X_train,
                        features=['feature', ('feat1', 'feat2')])
#With external module for legacy editions
from pdpbox import pdp, get_dataset, info_plots

#Create the data that we will plot
pdp_goals = pdp.pdp_isolate(model=model,
                            dataset=X_val,
Example #2
0
                   ('p', SelectPercentile(selection_score_func, 30))
                   ]), ['k:<NAME2>', 'k:<NAME3>', 'p:<NAME3>']),
    (VarianceThreshold(0.0), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
    (VarianceThreshold(1.0), ['<NAME2>']),
    (GenericUnivariateSelect(), ['<NAME2>']),
    (GenericUnivariateSelect(mode='k_best', param=2), ['<NAME2>', '<NAME3>']),
    (SelectFromModel(
        LogisticRegression('l1',
                           C=0.01,
                           solver='liblinear',
                           random_state=42,
                           multi_class='ovr')), ['<NAME0>', '<NAME2>']),
    (SelectFromModel(
        PermutationImportance(
            LogisticRegression(solver='liblinear', random_state=42),
            cv=5,
            random_state=42,
            refit=False,
        ),
        threshold=0.1,
    ), ['<NAME2>', '<NAME3>']),
    (RFE(
        LogisticRegression(solver='liblinear',
                           random_state=42,
                           multi_class='ovr'), 2), ['<NAME1>', '<NAME3>']),
    (RFECV(LogisticRegression(
        solver='liblinear', random_state=42, multi_class='ovr'),
           cv=3), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
] + _additional_test_cases)
def test_transform_feature_names_iris(transformer, expected, iris_train):
    # Parametrized over (transformer, expected) pairs; iris_train is a fixture
    # yielding (X, y, X_test, y_test) — only the train split is used here.
    # NOTE(review): body appears truncated in this chunk — no assertion
    # against `expected` is visible.
    X, y, _, _ = iris_train
    transformer.fit(X, y)
Example #3
0
def sk_process(df_train,
               param,
               message,
               df_test=None,
               trial=None,
               is_output_feature_importance=False,
               trial_level=0):
    """
    Cross-validated train/predict driver for sklearn-compatible models.

    >>>param = {
    >>>    'columns': columns,
    >>>    'cv': {
    >>>        'cls': 'KFold',
    >>>        'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}
    >>>    },
    >>>    'scaler': {
    >>>        'cls': 'StandardScaler', 'init': {}, 'fit': {}
    >>>    },
    >>>    'model': {
    >>>        'cls': 'lgb.LGBMRegressor',
    >>>        'init': {
    >>>            'learning_rate': 0.35395923077843333,
    >>>            'feature_fraction': 0.8840483697334669,
    >>>            'bagging_fraction': 0.7017457378676857,
    >>>            'min_data_in_leaf': 616,
    >>>            'lambda_l1': 0.00013058988949929333,
    >>>            'lambda_l2': 0.004991992636437704,
    >>>            'max_bin': 74,
    >>>            'num_leaves': 64,
    >>>            'random_state': 2928,
    >>>            'n_jobs': 16
    >>>        },
    >>>        'fit': {}
    >>>    },
    >>>    'metric': 'mean_absolute_error'
    >>>}
    :param df_train: training frame; must contain 'y', 'index' and every
        feature in param['columns'] ('label'/'group' for stratified/grouped CV)
    :param param: configuration dict, see example above
    :param message: free-text tag stored with the trial record
    :param df_test: optional test frame with an 'index' column; per-fold
        predictions for it are produced when given
    :param trial: optional list; when given, a summary dict is appended to it
    :param is_output_feature_importance: also compute per-fold model-native
        and permutation feature importances
    :param trial_level: >0 additionally stores per-fold frames in the trial
    :return: (df_his, df_feature_importances, df_valid_pred, df_test_pred)
    """

    columns = param['columns']

    # Fail fast on malformed inputs before any expensive work.
    assert 'y' in df_train.columns.tolist(), 'y is not in df_train'
    assert 'index' in df_train.columns.tolist(), 'index is not in df_train'
    assert 'index' not in param['columns'], 'index is in features'
    assert 'y' not in param['columns'], 'y is in features'
    assert 'label' not in param['columns'], 'label is in features'
    assert 'group' not in param['columns'], 'group is in features'
    # FIX: isinstance / `is None` instead of `(type(trial) == list) |
    # (trial == None)` — bitwise `|` on comparisons and `== None` are fragile.
    assert isinstance(trial, list) or trial is None, \
        'trial is neither list nor none'
    assert len(columns) != 0, 'columns size is 0'

    df_test_pred = None
    if isinstance(df_test, pd.DataFrame):
        assert 'index' in df_test.columns.tolist(), 'index is not in df_test'
        # pd.concat silently drops the leading None, so this copies the id col.
        df_test_pred = pd.concat([df_test_pred, df_test[['index']]], axis=1)

    # Resolve class/function names from strings, e.g. 'KFold' -> sklearn KFold.
    CV = processutil._str2class(param['cv']['cls'])
    MODEL = processutil._str2class(param['model']['cls'])
    if 'scaler' in param:
        SCALER = processutil._str2class(param['scaler']['cls'])
    metric = processutil._str2class(param['metric'])

    history = []
    df_valid_pred = pd.DataFrame()
    df_feature_importances_i_list = []

    # StratifiedKFold, KFold, RepeatedKFold, TimeSeriesSplit, GroupKFold
    if 'splits' in param['cv']:
        # Pre-computed splits take precedence over building a CV object.
        splits = param['cv']['splits']
    else:
        cv = CV(**param['cv']['init'])
        if param['cv']['cls'] == 'StratifiedKFold':
            assert 'label' in df_train.columns.tolist(
            ), 'label is not in df_train'
            splits = list(cv.split(df_train, df_train['label']))
        elif param['cv']['cls'] == 'GroupKFold':
            assert 'group' in df_train.columns.tolist(
            ), 'group is not in df_train'
            splits = list(cv.split(df_train, groups=df_train['group']))
        else:
            splits = list(cv.split(df_train))

    for fold_n, (train_index, valid_index) in enumerate(splits):

        X_train, X_valid = df_train[columns].values[
            train_index, :], df_train[columns].values[valid_index, :]
        y_train, y_valid = df_train['y'].values[train_index], df_train[
            'y'].values[valid_index]

        if 'scaler' in param:
            # Fit the scaler on the fold's training split only, so no
            # validation statistics leak into preprocessing.
            scaler = SCALER(**param['scaler']['init'])
            X_train = scaler.fit_transform(X_train)
            X_valid = scaler.transform(X_valid)

        model = MODEL(**param['model']['init'])
        model.fit(X_train, y_train, **param['model']['fit'])

        y_valid_pred = model.predict(X_valid)
        y_train_pred = model.predict(X_train)

        # Map fold-local positions back to the caller-supplied 'index' ids.
        original_index = df_train['index'].values[valid_index]
        df_valid_pred_i = pd.DataFrame \
            ({'index': original_index, 'predict': y_valid_pred, 'fold_n': np.zeros(y_valid_pred.shape[0]) + fold_n})
        df_valid_pred = pd.concat([df_valid_pred, df_valid_pred_i], axis=0)

        if is_output_feature_importance:
            # Model-native importances...
            df_feature_importances_i = pd.DataFrame({
                'feature':
                columns,
                'model_weight':
                model.feature_importances_
            })
            df_feature_importances_i = df_feature_importances_i.sort_values(
                by=['feature'])
            df_feature_importances_i = df_feature_importances_i.reset_index(
                drop=True)
            # ...joined with permutation importances on the validation fold.
            perm = PermutationImportance(model, random_state=42).fit(
                X_valid, y_valid)
            df_feature_importances_i2 = eli5.explain_weights_dfs(
                perm, feature_names=columns,
                top=len(columns))['feature_importances']
            df_feature_importances_i2 = df_feature_importances_i2.sort_values(
                by=['feature'])
            df_feature_importances_i2 = df_feature_importances_i2.reset_index(
                drop=True)
            df_feature_importances_i = pd.merge(df_feature_importances_i,
                                                df_feature_importances_i2,
                                                on='feature')
            df_feature_importances_i_list.append(df_feature_importances_i)

        if isinstance(df_test, pd.DataFrame):
            X_test = df_test[columns].values
            if 'scaler' in param:
                X_test = scaler.transform(X_test)
            y_test_pred = model.predict(X_test)
            # One column per fold; averaging across folds is left to callers.
            df_test_pred_i = pd.DataFrame({fold_n: y_test_pred})
            df_test_pred = pd.concat([df_test_pred, df_test_pred_i], axis=1)

        # NOTE(review): `group` is the full-length column while y_train /
        # y_valid are fold subsets — the custom metric presumably handles the
        # length mismatch itself; confirm, otherwise pass
        # df_train['group'].values[train_index] / [valid_index].
        history.append \
            ({'fold_n': fold_n, 'train': metric(y_train, y_train_pred, group=df_train['group']), 'valid': metric(y_valid, y_valid_pred, group=df_train['group'])})

    df_his = pd.DataFrame(history)

    df_feature_importances = None
    if is_output_feature_importance:
        df_feature_importances = df_feature_importances_i_list[0]
        for idx, df_feature_importances_i in enumerate(
                df_feature_importances_i_list[1:]):
            # FIX: merge suffixes must be strings; the original passed the
            # int `idx + 1`, which fails on the (guaranteed) column collisions.
            df_feature_importances = pd.merge(df_feature_importances,
                                              df_feature_importances_i,
                                              on='feature',
                                              suffixes=('', str(idx + 1)))

    df_valid_pred = df_valid_pred.sort_values(by=['index'])
    df_valid_pred = df_valid_pred.reset_index(drop=True)

    if isinstance(df_test, pd.DataFrame):
        df_test_pred = df_test_pred.sort_values(by=['index'])
        df_test_pred = df_test_pred.reset_index(drop=True)

    if isinstance(trial, list):
        datetime_ = datetime.datetime.now()
        val_metric_mean = np.mean(df_his.valid)
        val_metric_std = np.std(df_his.valid)
        train_metric_mean = np.mean(df_his.train)
        train_metric_std = np.std(df_his.train)

        trial_i_d_ = {
            'datetime': datetime_,
            'message': message,
            'val_metric_mean': val_metric_mean,
            'train_metric_mean': train_metric_mean,
            'val_metric_std': val_metric_std,
            'train_metric_std': train_metric_std,
            'trn_val_metric_diff': val_metric_mean - train_metric_mean,
            'df_feature_importances': df_feature_importances,
            'param': param.copy(),
            'nfeatures': len(columns)
        }
        if trial_level > 0:
            # Verbose mode: keep the per-fold frames alongside the summary.
            trial_i_d_ = {
                'df_his': df_his,
                'df_valid_pred': df_valid_pred,
                'df_test_pred': df_test_pred,
                **trial_i_d_
            }
        trial.append(trial_i_d_)

    return df_his, df_feature_importances, df_valid_pred, df_test_pred
Example #4
0
# Feature/target split; 'ID_code' is an identifier column, not a predictor.
X = df.drop(columns=['target', 'ID_code'])
y = df.target

# In[6]:

#split data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# In[19]:

#importance of the feature
clf = RandomForestClassifier(random_state=0, n_jobs=1).fit(X_train, y_train)
# Permutation importance on held-out data, so it reflects generalization
# rather than training fit.
perm = PermutationImportance(clf, random_state=1).fit(X_test, y_test)
weight = eli5.show_weights(perm,
                           feature_names=X_test.columns.tolist(),
                           top=100)

# In[44]:

# Bare expression: renders the eli5 HTML table in a notebook cell.
weight

# In[119]:

#select feature
df_train = df.loc[:199999, [
    'var_81', 'var_26', 'var_44', 'var_110', 'var_109', 'var_190', 'var_78',
    'var_21', 'var_1', 'var_99', 'var_133', 'var_166', 'var_34', 'var_148',
    'var_122', 'var_139', 'var_164', 'var_12', 'var_165', 'var_119', 'var_76',
# - it has a good mean score
# - it has a low variance

# ### Fine-tune the model

# This part will be implemented soon

# I was reading about how to select good feature from [here](https://www.kaggle.com/dansbecker/permutation-importance?utm_medium=email&utm_source=mailchimp&utm_campaign=ml4insights) so I decided to try it now that I can't add features on myself, so let's do it.

# In[ ]:

import eli5
from eli5.sklearn import PermutationImportance

# NOTE(review): importance is computed on the training data here, so it
# measures fit rather than generalization — a held-out split would be safer.
log_reg.fit(learning_data, labels)
perm_imp = PermutationImportance(log_reg,
                                 random_state=1).fit(learning_data, labels)
eli5.show_weights(perm_imp, feature_names=COLUMNS)

# The features are ordered by impact on the model, so the Sex feature has the biggest impact on our model.
#
# I repeated this process many time and tried to combine features to end up with adding is_child_and_sex feature.

# ### Run on test data

# In[ ]:

test_set = pd.read_csv("../input/test.csv")
# NOTE(review): fit_transform re-fits the preprocessing pipeline on the test
# set, leaking test statistics (scaler means etc.); transform() alone is
# usually intended — confirm.
pred = pipeline.fit_transform(test_set)

# In[ ]:
# Standardize the expert's guesses with the already-fitted scaler so they are
# comparable to the model's standardized predictions.
y_expert_stand = scaler.transform(y_expert)

LR = LinearRegression()
lr = LR.fit(x_train_stand, y_train_stand)
y_pred = lr.predict(x_test_stand)

# Compare the model against the human-expert baseline on the same test split.
m1 = mean_squared_error(y_test_stand, y_pred)
m2 = mean_squared_error(y_test_stand, y_expert_stand)
print('Linear Regression Model\'s MSE is', m1)
print('Expert Guess\'s MSE is', m2)


# In[51]:


# NOTE(review): `i_min` is defined in an earlier cell not visible here.
perm = PermutationImportance(lr, random_state=i_min).fit(x_test_stand, y_test_stand)
eli5.show_weights(perm, feature_names = x_test.columns.tolist(), top=50)


# ### SGD Model

# In[52]:


# Pre-allocate one MSE slot per bootstrap iteration (N_bs from earlier cell).
MSE_vec_sgd = np.zeros(N_bs)


# In[53]:

for bs_ind in range(N_bs):
Example #7
0
# Export a small sample and sanity-check the fitted model's predictions.
data1_x_bin[:10].to_csv("testing_data.csv")

list(data1_x_bin[:10].columns)

str(lr.predict(data1_x_bin[:10]))

"""# Model Interpretation"""

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
dtree.predict(X_test)

"""## Eli5"""

perm = PermutationImportance(dtree , random_state=101).fit(X_test, y_test)      # Evaluate the permutation importance 
eli5.show_weights(perm, feature_names = X_test.columns.values)

"""## Shap"""

row_to_show = 7                                                                # The row for which we want to check the SHAP explanations
data_to_predict = X_test.iloc[row_to_show]

# NOTE(review): 'preddict' is a typo carried through the names below; left
# unchanged here since renaming would be a code change.
# Reshape the single row to (1, n_features) as the estimator expects 2-D.
data_to_preddict_array = data_to_predict.values.reshape(1,-1)

dtree_pred = dtree.predict_proba(data_to_preddict_array)

dtree.predict(data_to_preddict_array)

# Object that can calculate Shap values
explainer = shap.TreeExplainer(dtree)                                       # SHAP Tree Explainer
Example #8
0
# Shuffle the whole dataset, then carve off the first 500 examples for test
# and keep the rest for training; batch(1) yields one example per step.
all_dataset = dataset.shuffle(len(df)).batch(1)
test_dataset = all_dataset.take(500)
train_dataset = all_dataset.skip(500)

def get_compiled_model():
    """Build and compile a small sigmoid MLP (4 -> 10 -> 1) for regression."""
    layers = tf.keras.layers
    net = tf.keras.Sequential(
        [layers.Dense(units, activation='sigmoid') for units in (4, 10, 1)])

    net.compile(optimizer='adam',
                loss='mean_squared_error',
                metrics=['mean_squared_error'])
    return net


model = get_compiled_model()
model.fit(train_dataset, epochs=2)

# NOTE(review): the compiled metric is mean_squared_error, so the second
# value unpacked here is MSE, not accuracy — the name `test_acc` misleads.
test_loss, test_acc = model.evaluate(test_dataset, verbose=2)

import eli5
from eli5.sklearn import PermutationImportance

# NOTE(review): PermutationImportance expects a sklearn-style estimator;
# passing a raw Keras model relies on predict/score compatibility with the
# given scoring — confirm this runs on the pinned eli5 version.
perm = PermutationImportance(model,
                             random_state=1,
                             scoring="neg_mean_squared_error").fit(
                                 X, target.values)
print(eli5.explain_weights(perm, activities))
Example #9
0
y_pred = clf.predict(X_test)
# confusion_matrix rows are TRUE labels and columns are PREDICTED labels, in
# sorted order, so for binary {0, 1}:
#   cm[0, 0]=TN   cm[0, 1]=FP   cm[1, 0]=FN   cm[1, 1]=TP
cm = confusion_matrix(y_test, y_pred)
# FIX: the original printed cm[0, 0] as "True positives" and cm[1, 1] as
# "True negatives" — those two cells were swapped.
print("True positives: {}\nFalse positives: {}".format(cm[1, 1], cm[0, 1]))
print("True negatives: {}\nFalse negatives: {}".format(cm[0, 0], cm[1, 0]))

# visualize confusion matrix with seaborn heatmap
# FIX: rows of cm are actual labels and columns are predicted labels (the
# original labelled columns "Actual" and rows "Predict", in inverted order).
cm_matrix = pd.DataFrame(
    data=cm,
    index=['Actual Negative:0', 'Actual Positive:1'],
    columns=['Predicted Negative:0', 'Predicted Positive:1'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

X_list = X_test.columns.tolist()

# NOTE: rebinds `clf` from the earlier estimator to an XGBoost classifier.
clf = xgb.XGBClassifier(n_estimators=150, random_state=2020)
clf.fit(X_train, y_train)
perm = PermutationImportance(clf, random_state=2010)
perm.fit(X_test, y_test)
# Store feature weights in an object
html_obj = eli5.show_weights(perm, feature_names=X_list)
# Write html object to a file (adjust file path; Windows path is used here)
with open(
        r'C:\Users\lukem\Desktop\Github AI Projects\Higgs-Boson-machine-learning-challenge\boson-importance.htm',
        'wb') as f:
    f.write(html_obj.data.encode("UTF-8"))

lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
# NOTE(review): MAE on hard 0/1 class predictions equals the
# misclassification rate; accuracy_score would state the intent directly.
mae = mean_absolute_error(y_test, pred)
print(f"logistic regression, mae: {mae}")
# NOTE(review): `ax` is not defined in this chunk — presumably created by an
# earlier plotting call; confirm.
vals = ax.get_yticks()
ax.set_ylim(0.2875, 0.2925)
ax.set_yticklabels(["{:,.2%}".format(i) for i in vals])
plt.show()
##########################################
# Random forest with bootstrap disabled, so every tree sees the full sample.
n = 520
rf = RandomForestRegressor(n_estimators=n,
                           random_state=0,
                           n_jobs=multiprocessing.cpu_count(),
                           bootstrap=False)
rf.fit(x, y)
#SKLean uses Gini Importance by default
# One row per tree: the per-tree impurity-based importances.
GI = pd.DataFrame(data=[tree.feature_importances_ for tree in rf.estimators_],
                  columns=x.columns)
### Using eli5 to compute permutation accuracy importance on fitted random forest
# cv="prefit" reuses the already-fitted rf without refitting; n_iter=10
# permutation rounds per feature.
perm = PermutationImportance(rf, cv="prefit",
                             n_iter=10).fit(x.values, y.values)
# Permutation Accuracy Importance
# perm.results_ holds one row of per-feature score drops per iteration.
PI = pd.DataFrame(data=perm.results_, columns=x.columns)
#Rename columns to conform to formulae used in paper
formula = {
    'considered-farm-plots': "$S$",
    'compare_quality': '$F_{Qual}$',
    'compare_distance': '$F_{Dist}$',
    'homophily_age': '$F_{HAge}$',
    'desire_migration': '$F_{Mig}$',
    'compare_yeild': '$F_{Yield}$',
    'homophily_agricultural_productivity': '$F_{HAgri}$',
    'compare_dryness': '$F_{Dry}$',
    'compare_water_availability': '$F_{Water}$',
    'desire_social_presence': '$F_{Soc}$'
}
    def classification(self,cleaned_Data_frm1, cleaned_Data_frm,y,cursor ,conn):
        """Build, tune and persist a classification model.

        Routes the work to one of three back-ends depending on flags on
        ``self``: sklearn RandomizedSearchCV over preset classifiers, H2O
        AutoML, or a Keras DNN wrapped in GridSearchCV. Results are pushed
        to a database via ``DB_upload``.

        :param cleaned_Data_frm1: first cleaned feature frame (concatenated).
        :param cleaned_Data_frm: second cleaned feature frame (concatenated).
        :param y: target frame/column, concatenated into the working frame.
        :param cursor: DB cursor — unused in the visible body; TODO confirm.
        :param conn: DB connection — unused in the visible body; TODO confirm.
        """
        try:
            Modles_reuslts =[]
            Names = []
            print("Model building")
            float_cols = self.float_col
            # Combine all inputs, drop duplicated columns, sort by the target
            # column self.i, then drop columns that are >50% NaN and finally
            # any remaining rows with NaNs.
            result = pd.concat([cleaned_Data_frm1,cleaned_Data_frm,y,float_cols], axis=1)
            self.data_sorted1 = result.loc[:,~result.columns.duplicated()]
            self.data_sorted2 =  self.data_sorted1.sort_values(self.i)
            self.data_sorted = self.data_sorted2.dropna(thresh=self.data_sorted2.shape[0]*0.5,how='all',axis=1)
            self.data_sorted  = self.data_sorted.dropna()
            new_list = [list(set(self.data_sorted.columns).difference(self.x.columns))]
            X = self.data_sorted.drop([self.i],axis=1)
            print(X.shape)
            Y = self.data_sorted[self.i]
            print(Y.unique())
            X= X.fillna(X.mean())
            # y becomes the target column NAME from here on (shadows the
            # `y` frame parameter).
            y = (', '.join(["%s" % self.i]))
            print(y)
            cols = list(self.data_sorted.columns)
            x = cols
            x.remove(y)
#             List of pipelines for ease of iteration
            l = 0
            access_key_id = self.access_key_id 
            secret_access_key = self.secret_access_key
            models = ['Random Forest','KNN','XGB','SVC']
            if  'sklearn'== (', '.join(["%s" %  self.sklearn])):
                print("good")
                def sklearn(X,Y,algos):
                    # Randomized hyper-parameter search for the algos-th preset
                    # classifier, then upload metrics + feature importances.
                    model = models[algos]
                    X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2,random_state = 42)
                    X_train, X_test = train_test(X_train, X_test)
                    gd = RandomizedSearchCV(self.Classifier[algos],self.Classifiers_grids[algos],cv = 5, n_jobs=-1,
                                            verbose=True,refit = True)
                    gd.fit(X_train, y_train)
                    grid = gd.best_params_
                    estimator = gd.best_estimator_
                    y_pred=gd.predict(X_test)
                    cm =confusion_matrix(y_test, y_pred)
                    target = self.i
                    Accuracy = metrics.accuracy_score(y_test, y_pred)
                    print(cm)
                    print(grid)
                    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
                    if model=='KNN':
                        # KNN exposes no native importances — use permutation.
                        perm = PermutationImportance(gd, random_state=1).fit(X_train,y_train)
                        importances = perm.feature_importances_
                        DB_upload(Accuracy,X_train,X_test,y_test,y_pred,
                                       importances,grid,estimator,l,cm,target,model)
                    elif model == 'SVC':
                        # Linear SVC exposes coefficients instead.
                        importances = gd.best_estimator_.coef_
                        imp = importances.tolist()
                        importances = imp[0]
                        DB_upload(Accuracy,X_train,X_test,y_test,y_pred, 
                                      importances,grid,estimator,l,cm,target,model)
                    else:
                        importances = gd.best_estimator_.feature_importances_.tolist()
                        #create a feature list from the original dataset (list of columns)
                        # What are this numbers? Let's get back to the columns of the original dataset
                        feature_list = list(X_train.columns)
                        #create a list of tuples
                        feature_importance= sorted(zip(importances, feature_list), reverse=True)
                        DB_upload(Accuracy,X_train,X_test,y_test,y_pred, 
                                      importances,grid,estimator,l,cm,target,model)
                    return Accuracy
                # NOTE(review): `algos` is not defined in this scope — this
                # call raises NameError (then swallowed by the bare except
                # below); presumably a loop over range(len(models)) was
                # intended. TODO confirm.
                sklearn(X,Y,algos)
            elif 'ai' == (', '.join(["%s" %  self.ai])):
                print('H2o')
                def H2o(x,y,X,Y):
                    # H2O AutoML over the combined frame; returns leaderboard.
                    df = h2o.H2OFrame(self.data_sorted)
                    train,  test = df.split_frame(ratios=[.8])
                    train[y] = train[y].asfactor()
                    test[y] = test[y].asfactor()
                    X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2,random_state = 42)
                    print(X_train.shape)
                    # Run AutoML for 20 base models (limited to 1 hour max runtime by default)
                    aml = H2OAutoML(max_models=5, seed=1)
                    aml.train(x=x, y=y, training_frame=train)
                    # View the AutoML Leaderboard
                    lb = aml.leaderboard
                    print(lb.head(rows=lb.nrows))
                    m = h2o.get_model(lb[2,"model_id"])
                    data_as_list = h2o.as_list(m, use_pandas=False)
                    return lb
                H2o(x,y ,X,Y)
            else:
                    print('Dnn')
                    if self.types == 'Classification_problem':
                        def DNN():
                            # Small binary-classification MLP.
                            model = Sequential()
                            model.add(Dense(512, input_dim=X_train.shape[1], init='normal', activation='relu'))
                            model.add(BatchNormalization())
                            model.add(Dropout(0.5))
                            model.add(Dense(32, init='normal', activation='relu'))
                            model.add(BatchNormalization())
                            model.add(Dropout(0.5))
                            model.add(Dense(1, init='normal', activation='sigmoid'))
                            model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])
                            return model
                        X = self.data_sorted.drop([self.i],axis=1)
                        Y = self.data_sorted[self.i]
                        X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2,random_state = 42)
                        X_train, X_test = train_test(X_train, X_test)
                        classifier = KerasClassifier(build_fn=DNN, verbose=1)
                        batch_size = [10 ,20, 40, 60, 80, 100]
                        epochs = [10, 50, 100]
                        param_grid = dict(batch_size=batch_size, epochs=epochs)
                        grid = GridSearchCV(estimator=classifier, param_grid=param_grid, n_jobs=-1, cv=3)
                        grid_result = grid.fit(X_train, y_train)
                        estimator = grid.best_estimator_
                        Accuracy= grid_result.best_score_
                        print("%s" % (estimator))
                        y_pred=grid.predict(X_test)
                        perm = PermutationImportance(grid, scoring='accuracy', random_state=1).fit(X_train,y_train)
                        print(perm.feature_importances_)
                        # NOTE(review): `importances`, `cm`, `target` and
                        # `model` are not defined in this branch — this call
                        # raises NameError (caught by the bare except below).
                        DB_upload(Accuracy,X_train,X_test,y_test, None,importances,grid,estimator,l,
                                      cm,target,model)
                        # summarize results
                        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
                    else:
                        # Multi-class DNN: output layer size = max label + 1.
                        a = np.unique(self.y)
                        a.sort()
                        b=a[-1]
                        b +=1
                        def DNN(dropout_rate=0.0, weight_constraint=0):
                            # create model
                            model = Sequential()
                            model.add(Dense(42, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu', kernel_constraint=maxnorm(weight_constraint)))
                            model.add(Dropout(dropout_rate))
                            model.add(Dense(20, kernel_initializer='uniform', activation='relu'))
                            model.add(Dense(b,activation='softmax'))
                            model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
                            return model
                        classifier = KerasClassifier(build_fn=DNN, epochs=10, batch_size=10, verbose=1)
                        weight_constraint = [1] #2, 3, 4, 5]
                        dropout_rate = [0.0]#, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
                        param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)
                        grid = GridSearchCV(estimator=classifier, param_grid=param_grid, n_jobs=-1, cv=3)
                        grid_result = grid.fit(X_train, y_train)
                        estimator = grid.best_estimator_
                        Accuracy= grid_result.best_score_
                        y_pred=grid.predict(X_test)
                        print(y_pred)
                        # NOTE(review): `importances`, `cm`, `target`, `model`
                        # are undefined here as well — see the branch above.
                        DB_upload(Accuracy,X_train,X_test,y_test,None,importances,grid,estimator,l,
                                  cm,target,model)
                        print("%s" % (estimator))
        except:
            # NOTE(review): bare except silently swallows every error
            # (including the NameErrors flagged above), and the message says
            # "Regression" inside a classification method — tighten this.
            print('Regression model building failed')
def test_estimator_type():
    """The wrapper must mirror its base estimator's type (classifier vs regressor)."""
    for base_estimator, type_check in ((LogisticRegression(), is_classifier),
                                       (RandomForestRegressor(), is_regressor)):
        assert type_check(PermutationImportance(base_estimator, cv=3))
def test_invalid_params():
    """An unrecognized `cv` value must be rejected with ValueError."""
    with pytest.raises(ValueError):
        # The instance is never used, so don't bind it to a dead local (F841).
        PermutationImportance(SVR(), cv="hello")
Example #14
0
def main_run_linear_models(train_ds,
                           val_ds,
                           test_ds,
                           data_props,
                           max_backlooking=None,
                           layer_type='dense',
                           activation_funcs=['sigmoid', 'relu', 'tanh'],
                           max_serach_iterations=200,
                           NN_max_depth=3,
                           MAX_EPOCHS=800,
                           patience=25,
                           model_name='linear',
                           examples=None,
                           return_permutation_importances=True,
                           redo_serach_best_model=False):
    """Hyperopt-search, train and document small dense/linear Keras models.

    Runs (or reloads from a JSON cache) a TPE hyper-parameter search over
    layer count, layer widths, activation function and backlooking window,
    logs every trial to MLflow, reloads the best model, and optionally
    computes coefficients/p-values (1-layer dense case) and eli5
    permutation importances.

    NOTE: ``max_serach_iterations``/``redo_serach_best_model`` keep their
    historical (misspelled) names for backward compatibility with callers.
    The mutable list default for ``activation_funcs`` is read-only here and
    must stay a ``list`` so ``_hp_tranform_param_dict`` turns it into an
    ``hp.choice``.

    Returns:
        dict: the best-model record plus optional ``coef_``, ``p_values``,
        ``examples_pred_y`` and ``feature_importance`` entries; always
        contains ``status='ok'`` on success.
    """
    mlflow.set_experiment(model_name)
    experiment_date_time = int(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    # Dense layers need flat (2-D) inputs; recurrent/conv layer types keep
    # the time dimension.
    flatten_input = True if layer_type == 'dense' else False

    def _extract_just_important_data_props(data_props):
        # Flatten the nested data_props dict into the flat kwargs that get
        # logged to MLflow / the best-model JSON store.
        kwargs = {}
        kwargs['dataset_cols_X_just_these'] = data_props['third_filter'][
            'cols_just_these']
        kwargs['dataset_cols_X_exclude'] = data_props['third_filter'][
            'cols_drop']
        kwargs['dataset_cols_y'] = data_props['third_filter'][
            'y_cols_just_these']
        kwargs['dataset_hash_input'] = int(data_props['first_step']['dataset'])
        kwargs['dataset_hash_first'] = data_props['first_step_data_hash']
        kwargs['dataset_hash_second'] = data_props['second_step_data_hash']
        kwargs['dataset_split_method'] = data_props['second_step'][
            'split_method']
        kwargs['dataset_split_steps_train'] = data_props['second_step'][
            'split_props']['train_time_steps']
        kwargs['dataset_split_steps_val'] = data_props['second_step'][
            'split_props']['val_time_steps']
        kwargs['dataset_split_steps_test'] = data_props['second_step'][
            'split_props']['test_time_steps']
        kwargs['dataset_iter_step'] = data_props['iter_step']
        kwargs['dataset_normalization'] = data_props['second_step'][
            'normalize_method']
        kwargs['dataset_window_backlooking'] = data_props['first_step'][
            'window_input_width']
        kwargs['dataset_window_prediction'] = data_props['first_step'][
            'window_pred_width']
        kwargs['dataset_window_shift'] = data_props['first_step'][
            'window_shift']
        return kwargs

    def _hp_tranform_param_dict(param_dict):
        # Turn lists into hp.choice, sets into hp.uniform ranges; anything
        # else is passed through to the objective as a constant.
        new_param_dict = {}
        for key, value in param_dict.items():
            if type(value) == list:
                new_param_dict[key] = hp.choice(key, value)
            elif type(value) == set:
                # BUG FIX: was ``*values`` (undefined name) which raised
                # NameError for any set-valued hyper-parameter.
                new_param_dict[key] = hp.uniform(key, *value)
            else:
                new_param_dict[key] = value
        return new_param_dict

    max_backlooking = data_props['first_step'][
        'window_input_width'] if max_backlooking is None else max_backlooking

    param_grid = dict(
        n_layers=list(range(1, NN_max_depth + 1)),
        first_layer_nodes=[0] if NN_max_depth == 1 else [128, 64, 32, 16, 8],
        last_layer_nodes=[0] if NN_max_depth == 1 else [64, 32, 16, 8, 4],
        activation_func=activation_funcs,
        backlooking_window=list(range(1, max_backlooking + 1)))
    hp_param_dict = _hp_tranform_param_dict(param_dict=param_grid)
    hp_param_dict['model_name'] = model_name
    hp_param_dict['data_props'] = data_props
    hp_param_dict['layer_type'] = layer_type

    def _optimize_objective(*args, **kwargs):
        # Hyperopt calls the objective with a single positional dict.
        if args != ():
            kwargs = args[
                0]  # if positional arguments expect first to be dictionary with all kwargs
        if type(kwargs) != dict:
            raise Exception(
                f'kwargs is not  dict - it is {type(kwargs)} with values: {kwargs}'
            )

        backlooking_window = kwargs.pop('backlooking_window')
        n_layers = kwargs.pop('n_layers')
        first_layer_nodes = kwargs.pop('first_layer_nodes')
        last_layer_nodes = kwargs.pop('last_layer_nodes')
        activation_func = kwargs.pop('activation_func')
        return_everything = kwargs.pop('return_everything', False)
        verbose = kwargs.pop('verbose', 0)
        model_name = kwargs.pop('model_name', 'linear')
        data_props = kwargs.pop('data_props')
        layer_type = kwargs.pop('layer_type', 'dense')

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=flatten_input,
                                 keep_last_n_periods=backlooking_window)

        now = datetime.datetime.now()
        date_time = str(now.strftime("%y%m%d%H%M%S"))
        model_name = f"{date_time}_{model_name}_w{backlooking_window}_l{n_layers}_a{activation_func}"

        kwargs = dict(
            model_name=model_name,
            n_layers=n_layers,
            first_layer_nodes=first_layer_nodes,
            last_layer_nodes=last_layer_nodes,
            activation_func=activation_func,
            input_size=dataset['input_shape'] if layer_type == 'dense' else
            tuple(list(train_ds.element_spec[0].shape)[1:]),
            output_size=dataset['output_shape'],
            backlooking_window=backlooking_window,
            layer_type=layer_type)

        model = createmodel(**kwargs)
        history, mlflow_additional_params = compile_and_fit(
            model=model,
            train=dataset['train_ds'],
            val=dataset['val_ds'],
            MAX_EPOCHS=MAX_EPOCHS,
            patience=patience,
            model_name=model_name,
            verbose=verbose)

        # Get all data props for documentation in MLflow
        kwargs.update(_extract_just_important_data_props(data_props))
        kwargs['run'] = experiment_date_time
        mlflow_additional_params['kwargs'] = kwargs

        train_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['train_ds'])))
        val_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['val_ds'])))
        test_performance = dict(
            zip(
                model.metrics_names,
                evaluate_model(
                    model=model,
                    tf_data=dataset['test_ds'],
                    mlflow_additional_params=mlflow_additional_params)))
        mlflow_additional_params['data_props'] = data_props

        # Only save model if close to 15% best models
        try:
            best_loss = float(trials.best_trial['result']['loss'])
            current_loss = min(history.history['val_loss'])
            if current_loss <= best_loss * (1 + 0.15):
                save_model = True
            else:
                save_model = False
        except Exception:
            # First trial (no best yet -> NameError/KeyError) or an
            # incomparable loss: keep the model to be safe.
            save_model = True
        mlflow_saved = my_helpers.mlflow_last_run_add_param(
            param_dict=mlflow_additional_params, save_model=save_model)

        tf.keras.backend.clear_session()

        return_metrics = dict(loss=val_performance['loss'],
                              all_metrics={
                                  'train': train_performance,
                                  'val': val_performance,
                                  'test': test_performance
                              },
                              status=STATUS_OK,
                              mlflow=mlflow_saved,
                              model_name=model_name)

        if return_everything:
            return_metrics['model'] = model
            return_metrics['history'] = history

        return return_metrics

    ###### Get old best model records ######

    storage_file_path = os.path.join(
        my_helpers.get_project_directories(key='cache_dir'),
        'storage_best_model.json')
    if not os.path.exists(storage_file_path):
        best_model_storage = {}
    else:
        with open(storage_file_path) as json_file:
            best_model_storage = json.load(json_file)

    ######## Search for best model ########

    if redo_serach_best_model or model_name not in best_model_storage or data_props[
            'iter_step'] not in best_model_storage[model_name]:
        warnings.filterwarnings('ignore')
        trials = Trials()
        best = fmin(fn=_optimize_objective,
                    space=hp_param_dict,
                    algo=tpe.suggest,
                    max_evals=max_serach_iterations,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=int(
                        max_serach_iterations / 4),
                                                   percent_increase=0.025))
        warnings.simplefilter('always')

        # getting all parameters for best model storage
        mlflow_best_model = trials.best_trial['result']['mlflow']
        best_params = {}
        # NOTE: fmin returns hp.choice *indices*; this lookup assumes every
        # searched dimension came from a list (true for the current grid).
        for key, idx in best.items():
            best_params[key] = param_grid[key][idx]

        coef_names_ = list(
            data_props['look_ups']['out_lookup_col_name']['X'].keys())
        coef_names_ = coef_names_ + [
            col + f'_sft_{i}'
            for i in range(1, best_params['backlooking_window'])
            for col in coef_names_
        ]

        # Saving best model to storage
        if model_name not in best_model_storage:
            best_model_storage[model_name] = {}
        if data_props['iter_step'] not in best_model_storage[model_name]:
            best_model_storage[model_name][data_props['iter_step']] = {
                'best_model': {
                    'result': {
                        'loss': 10**10  # sentinel: any real loss beats this
                    }
                },
                'history': {}
            }

        best_model_param = dict(
            result={
                'loss': trials.best_trial['result']['loss'],
                'all_metrics': trials.best_trial['result']['all_metrics']
            },
            model_name=trials.best_trial['result']['model_name'],
            model_id=trials.best_trial['result']['mlflow']['model_id'],
            run_id=experiment_date_time,
            input_coefs=coef_names_,
            path_saved_model=trials.best_trial['result']['mlflow']
            ['saved_model_path'],
            status=trials.best_trial['result']['status'],
            params=best_params,
            data=_extract_just_important_data_props(data_props))

        best_model_storage[model_name][data_props['iter_step']]['history'][
            experiment_date_time] = best_model_param
        if trials.best_trial['result']['loss'] < best_model_storage[model_name][
                data_props['iter_step']]['best_model']['result']['loss']:
            best_model_storage[model_name][
                data_props['iter_step']]['best_model'] = best_model_param

        with open(storage_file_path, 'w') as outfile:
            json.dump(best_model_storage, outfile)

    else:
        # Get best model from storage
        best_model_param = best_model_storage[model_name][
            data_props['iter_step']]['best_model']

    ######## Get Best model again ########
    best_model = tf.keras.models.load_model(
        best_model_param['path_saved_model'])
    best_model.compile(loss=tf.losses.MeanAbsoluteError(),
                       optimizer=tf.optimizers.Adam(),
                       metrics=[
                           tf.metrics.MeanAbsoluteError(),
                           CustomMeanDirectionalAccuracy(),
                           tf.losses.Huber(),
                           tf.metrics.MeanAbsolutePercentageError(),
                           tf.metrics.MeanSquaredError(),
                           tf.metrics.MeanSquaredLogarithmicError()
                       ])
    print('Best model is:', best_model_param)

    out = dict(best_model_param)

    ####### Get examples for plotting #######
    if examples is not None:
        example_X = examples['X']
        periods = best_model_param['params']['backlooking_window']
        if layer_type == 'dense':
            # Dense models were trained on flattened windows: keep only the
            # last `periods` time steps and collapse (time, feature) dims.
            example_X = tf.data.Dataset.from_tensors(
                np.reshape(example_X[:, -periods:, :],
                           (example_X.shape[0], -1)))
        else:
            example_X = tf.data.Dataset.from_tensors(example_X)
        out['examples_pred_y'] = best_model.predict(example_X)

    ###### For 1 layer dense/linear models get coef & p-values ######
    if NN_max_depth == 1 and isinstance(best_model.layers[0],
                                        tf.keras.layers.Dense):
        # Get coefs
        intercept_ = best_model.layers[0].bias.numpy()
        coef_ = best_model.layers[0].weights[0].numpy()
        out['coef_'] = pd.Series(
            dict(
                zip(['intercept_'] + best_model_param['input_coefs'],
                    intercept_.tolist() + coef_.squeeze().tolist())))

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=True,
                                 keep_last_n_periods=best_model_param['params']
                                 ['backlooking_window'])

        # get p-values
        import app.d_prediction.my_custom_pvalue_calc as my_p_lib

        out['p_values'] = {}
        for data_set in ['train', 'val', 'test']:
            y_pred = best_model.predict(dataset[f'{data_set}_X'])
            y_pred = np.reshape(y_pred, (-1, 1))
            try:
                p_values = my_p_lib.coef_pval(dataset[f'{data_set}_X'],
                                              dataset[f'{data_set}_y'], coef_,
                                              intercept_, y_pred)
                p_values = pd.Series(
                    dict(zip(best_model_param['input_coefs'], p_values)))
                out['p_values'][data_set] = p_values
            except Exception:
                warnings.warn(
                    "P-Values: ValueError: Input contains infinity or nan.")
                out['p_values'][data_set] = pd.Series(
                    dict(
                        zip(best_model_param['input_coefs'],
                            ['error'] * len(best_model_param['input_coefs']))))
        out['p_values'] = pd.DataFrame(out['p_values'])

    ##### Get Column Feature Importance #####
    if return_permutation_importances:
        if 'feature_importance' in best_model_param:
            # Already computed in an earlier run: reuse the cached values.
            out['feature_importance'] = best_model_param['feature_importance']

        else:
            import eli5
            from eli5.sklearn import PermutationImportance

            # Wrap the fitted Keras model so eli5 can treat it like a
            # (pre-fit) sklearn regressor.
            sklearn_model = KerasRegressor(build_fn=best_model)
            sklearn_model.model = best_model

            dataset = _get_prep_data(
                train_ds,
                val_ds,
                test_ds,
                flatten=flatten_input,
                keep_last_n_periods=best_model_param['params']
                ['backlooking_window'])

            out['feature_importance'] = {}
            for data_set in ['train', 'val']:
                # Calculate actual FeatureImporttance
                try:
                    perm = PermutationImportance(
                        sklearn_model, cv='prefit').fit(
                            dataset[f'{data_set}_X'].numpy(),
                            np.reshape(dataset[f'{data_set}_y'].numpy(),
                                       (-1, 1)))
                    feature_importances = eli5.format_as_dataframe(
                        eli5.explain_weights(
                            perm,
                            feature_names=best_model_param['input_coefs'],
                            top=10**10))
                    out['feature_importance'][
                        data_set] = feature_importances.set_index(
                            'feature').to_dict()
                except Exception:
                    warnings.warn(
                        "PermutationImportance: ValueError: Input contains infinity or a value too large for dtype('float16')."
                    )

            if out['feature_importance'] != {}:
                # Persist the freshly computed importances back to the
                # JSON best-model store so later runs take the cache path.
                best_model_param['feature_importance'] = out[
                    'feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['best_model'][
                        'feature_importance'] = out['feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['history'][experiment_date_time][
                        'feature_importance'] = out['feature_importance']

                with open(storage_file_path, 'w') as outfile:
                    json.dump(best_model_storage, outfile)

    out['status'] = 'ok'
    return out
# Feature matrix: columns 3..193 of the dataframe; target: first column.
X = df.iloc[:, 3:194]
Y_tmp = df.iloc[:, 0]

# Normalise each target value by the total number of sentences.
# (Was an index-based append loop; the comprehension is the idiomatic form.)
total_sents = len(Y_tmp)
Y = [y_val / total_sents for y_val in Y_tmp]

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
Y = numpy.asanyarray(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)


# Permutation importance of the fitted pipeline, evaluated on the
# held-out split.
perm = PermutationImportance(pipeline, random_state=1)

res = perm.fit(X_test, y_test)

ret = eli5.format_as_dict(eli5.explain_weights(res))

print(ret)

# Print each feature's importance record individually.
for i in ret['feature_importances']['importances']:
    print(i)

print('------')
print(perm.feature_importances_)
def genderate_PermutationImportance(X_train, y_train, is_test=True):
    """Fit a RandomForest and export permutation feature importances to CSV.

    With ``is_test == False`` the model is fit and evaluated on the full
    ``(X_train, y_train)`` and written to ``../data/``.  With
    ``is_test == True`` the data is split 70/30 first and importances for
    both splits are written to ``./data/``.

    NOTE(review): the function name ("genderate") and the differing output
    directories ('../data' vs './data') are kept as-is for compatibility
    with existing callers/artifacts — confirm whether the path difference
    is intentional.
    """
    import eli5
    from eli5.sklearn import PermutationImportance

    def _fit_forest(X, y):
        # Single place for the (fixed) model configuration.
        return RandomForestClassifier(n_estimators=500,
                                      class_weight='balanced',
                                      random_state=2019).fit(X, y)

    def _perm_importance_frame(model, X, y):
        # Permutation importances as a (feature, imp) frame, sorted
        # descending by importance.
        perm = PermutationImportance(model, random_state=1).fit(X, y)
        frame = pd.concat(
            [pd.Series(X.columns),
             pd.Series(perm.feature_importances_)],
            axis=1).sort_values(by=1, ascending=False)
        frame.columns = ['feature', 'imp']
        return frame.reset_index(drop=True)

    # The explicit ``== False`` / ``== True`` comparisons are preserved:
    # the original ran *neither* branch for non-boolean truthy values.
    if is_test == False:
        model = _fit_forest(X_train, y_train)
        _perm_importance_frame(model, X_train, y_train).to_csv(
            '../data/perm_feature_importance_train.csv', index=False)

    if is_test == True:
        X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.3)
        model = _fit_forest(X_train, y_train)
        _perm_importance_frame(model, X_train, y_train).to_csv(
            './data/perm_feature_importance_train.csv', index=False)
        _perm_importance_frame(model, X_test, y_test).to_csv(
            './data/perm_feature_importance_test.csv', index=False)
# Pull the fitted classifier out of the full pipeline so importance is
# computed on already-encoded/imputed features.
model_predictor = Rand_forest.named_steps['randomforestclassifier']

# Preprocessing-only pipeline (no classifier): categorical encoding
# followed by median imputation.
Rand_pipeline = make_pipeline(
    OrdinalEncoder(), 
    SimpleImputer(strategy='median'))

# fit the preprocessing pipeline on the training data
Rand_pipeline.fit(X_train, y_train)

# apply the fitted preprocessing to the validation features
TT_val = Rand_pipeline.transform(X_val)

# Permutation importance of the extracted classifier, scored by accuracy
# over 7 shuffles per feature.
model_permuter = PermutationImportance(
    model_predictor,
    scoring='accuracy',
    n_iter=7,
    random_state=42
)

model_permuter.fit(TT_val, y_val);

# eli5 graph with weight and feature with my 14 selecting features
# NOTE(review): feature_names come from the *untransformed* X_val — this
# assumes the preprocessing keeps column order/count; confirm.
eli5.show_weights(
    model_permuter,
    top=None,
    feature_names=X_val.columns.tolist()
)

"""### Model Interpretation

### Isolated Partial Dependence Plots with 1 feature
Example #18
0
    (RobustScaler(), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
    (SelectKBest(selection_score_func, k=1), ['<NAME3>']),
    (SelectKBest(selection_score_func, k=2), ['<NAME2>', '<NAME3>']),
    (FeatureUnion([('k', SelectKBest(selection_score_func, k=2)),
                   ('p', SelectPercentile(selection_score_func, 30))
                   ]), ['k:<NAME2>', 'k:<NAME3>', 'p:<NAME3>']),
    (VarianceThreshold(0.0), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
    (VarianceThreshold(1.0), ['<NAME2>']),
    (GenericUnivariateSelect(), ['<NAME2>']),
    (GenericUnivariateSelect(mode='k_best', param=2), ['<NAME2>', '<NAME3>']),
    (SelectFromModel(LogisticRegression(
        'l1', C=0.01, random_state=42)), ['<NAME0>', '<NAME2>']),
    (SelectFromModel(
        PermutationImportance(
            LogisticRegression(random_state=42),
            cv=5,
            random_state=42,
            refit=False,
        ),
        threshold=0.1,
    ), ['<NAME2>', '<NAME3>']),
    (RFE(LogisticRegression(random_state=42), 2), ['<NAME1>', '<NAME3>']),
    (RFECV(LogisticRegression(random_state=42)),
     ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
    (RandomizedLogisticRegression(random_state=42),
     ['<NAME1>', '<NAME2>', '<NAME3>']),
])
def test_transform_feature_names_iris(transformer, expected, iris_train):
    X, y, _, _ = iris_train
    transformer.fit(X, y)
    # Test in_names being provided
    res = transform_feature_names(transformer,
# https://medium.com/@hupinwei/%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92-%E5%8F%AF%E8%A7%A3%E9%87%8B%E6%80%A7-machine-learning-explainability-%E7%AC%AC%E4%BA%8C%E8%AC%9B-c090149f0772

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the FIFA 2018 match statistics.
data = pd.read_csv(
    '../input/fifa-2018-match-statistics/FIFA 2018 Statistics.csv')
# Binary target: True where "Man of the Match" is "Yes".
y = data['Man of the Match'] == "Yes"
# Use only the integer-typed columns as model features.
feature_names = [col for col in data.columns if data[col].dtype in [np.int64]]
X = data[feature_names]

# Hold out a validation split for scoring the importances.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Fit the random-forest classifier on the training split.
my_model = RandomForestClassifier(n_estimators=100, random_state=0)
my_model.fit(train_X, train_y)

# Permutation importance via eli5, evaluated on the validation split.
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names=val_X.columns.tolist())
Example #20
0
# svc = SVC(kernel="linear")
# rfecv = RFECV(estimator=svc)
# rfecv.fit(X, y)
# y_pos = np.arange(len(X.columns))
# plt.bar(y_pos, rfecv.grid_scores_, color=(0.2, 0.4, 0.6, 0.6))
# plt.ylim(0.0, 1.0)
# plt.xlabel('Number of features selected')
# plt.ylabel('Cross validation score (nb of correct classifications)')
# plt.title('Feature Analisys')
# plt.draw()


# Use feature importance
def build_model():
    """Build the base Keras model sized to the feature count of X.

    Delegates to ``base_model`` (defined elsewhere); ``X`` is the
    module-level feature frame.
    """
    return base_model(input_dim=len(X.columns))


# evaluate model with standardized dataset LENTOO
# estimator = KerasClassifier(build_fn=build_model, epochs=100, verbose=0)
# kfold = StratifiedKFold(n_splits=10, shuffle=True)
# results = cross_val_score(estimator, features, rpta, cv=kfold)
# print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

# Wrap the Keras model in the sklearn adapter so eli5 can score it, then
# train on the full dataset.
model = KerasClassifier(build_fn=build_model)
model.fit(X, y, epochs=50, batch_size=128)

# Permutation importance over the same data the model was trained on.
# NOTE(review): importances on training data can be optimistic — consider
# a held-out split.
perm = PermutationImportance(model, random_state=1).fit(X, y)
print(eli5.show_weights(perm, feature_names=X.columns.tolist()).data)

plt.show()
Example #21
0
def NN_train(filetrain, targetname, setname):
    def warn(*args, **kwargs):
        pass

    import warnings
    warnings.warn = warn

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import os
    from scipy import stats

    from eli5.sklearn import PermutationImportance  #get feature importance per K-fold

    import pickle
    #    from sklearn.externals import joblib
    #ML functions
    from sklearn.neural_network import MLPRegressor
    from sklearn.model_selection import KFold, GridSearchCV

    from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
    import sklearn.preprocessing as skpp

    #%% Import data

    dfp = filetrain

    print(setname + ' column list:')
    print(dfp.columns)
    print('-----------------------------------')

    print('Data imported')

    dfp = dfp[dfp['M500c(41)'] > 13.5]

    dfp = dfp.dropna(axis=1)

    print('New column list:')

    #    targetname=['G3XMgas(80)','G3XMstar(81)' ,'G3XTgas_mw(82)', 'G3XYx(84)', 'G3XYsz(85)']

    dfp.reset_index(drop=True, inplace=True)

    Y_train = dfp[targetname]

    dfptrain = dfp.copy()

    dfptrain.drop(labels=targetname, inplace=True, axis=1)

    coldrop = [col for col in dfptrain.columns if 'G3X' in col]
    dfptrain.drop(labels=coldrop, inplace=True, axis=1)

    new_col_list = dfptrain.columns
    print(new_col_list)
    #plot hist
    plt.figure('Mass hist')
    plt.hist(dfp['M500c(41)'], bins=20, zorder=1, label=[setname + ' set'])

    plt.legend()

    #%% Preprocessing of data

    #    Y_train=np.log10(Y_train)

    #    Ysz_error=Y_train.index[Y_train['G3XYsz(85)'] == -np.inf]
    #    print('This are the index of log10(Ysz)=-inf')
    #    print(Ysz_error)
    #
    #    Y_train.drop(labels=Ysz_error, axis=0, inplace=True)
    #    dfptrain.drop(labels=Ysz_error, axis=0, inplace=True)

    #statistical data from Y_train
    #    mu=np.mean(Y_train) #median
    #    sigma=np.std(Y_train) #standard deviation

    #%% Analysis of train data

    #we add back target data for correlation analysis
    #dfptrain= dfptrain.copy()
    dfptrain[targetname] = Y_train

    corr = dfptrain.corr()

    plt.figure('Correlation matrix - training data', figsize=(9, 9))
    nticks = len(dfptrain.columns)
    plt.xticks(range(nticks), dfptrain.columns, rotation='vertical')
    plt.yticks(range(nticks), dfptrain.columns)
    _ = plt.colorbar(
        plt.imshow(corr,
                   interpolation='nearest',
                   vmin=-1.,
                   vmax=1.,
                   cmap=plt.get_cmap('YlOrBr')))
    plt.title('Correlation matrix - Training data', fontsize=20)
    #plt.savefig('plots/correlation/correlation_plot.png')
    #    plt.show()
    #%% NN on training data - Creation of NN algorithm

    #We get the test/train index
    indexFolds = KFold(n_splits=5, shuffle=True, random_state=11)
    lVarsTarg = dfptrain.columns

    R2_NN = []
    MAE_NN = []
    MSE_NN = []
    tuned_parameters = [
        #                       {'hidden_layer_sizes' :  [(300,200,100)],
        {
            'hidden_layer_sizes': [(20, 20, 20)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs']
        }
        #                       'solver' : ['lbfgs', 'sgd', 'adam']}
    ]

    # OG Layer size : [(300,200,100)]
    # Recorremos las particiones
    ind = 0

    Ypred = np.zeros(np.shape(dfptrain[targetname]))
    Ytarg = np.zeros(np.shape(dfptrain[targetname]))

    Feature_mean = np.zeros([
        new_col_list.shape[0],
    ])

    if len(targetname) == 1:
        Ypred = np.ravel(Ypred)
        Ytarg = np.ravel(Ytarg)

    for idxTr, idxTs in indexFolds.split(dfptrain):

        ind = ind + 1
        print()
        print()
        print('K-fold:', ind)

        #Making Min-Max Scaler
        Scaler = skpp.MinMaxScaler()
        X = dfptrain.drop(labels=targetname, axis=1)
        print(X.columns)
        print(X.columns.shape)
        Scaler.fit(X)  #Fit scaler to data, then transform

        y_min = dfptrain[targetname].min(axis=0)
        # Per-target max kept so predictions can later be inverse-transformed
        # back from the [0, 1] min-max scale (see the inverse loop further down).
        y_max = dfptrain[targetname].max(axis=0)  #stat data for inv transform

        print('y_min:')
        print(y_min)
        print('y_max:')
        print(y_max)
        '''
            dfp_scaled= (dfp - dfp.min(axis=0)) / (dfp.max(axis=0) - dfp.min(axis=0))
            dfp_inv= dfp_scaled * (dfp.max(axis=0) - dfp.min(axis=0)) + dfp.min(axis=0)
        
            y_min=dfp.min(axis=0)
            y_max=dfp.max(axis=0)
        '''
        dfptrain_old = dfptrain.copy()  #backup
        #        dfptrain_scaled=(dfptrain - dfptrain.min(axis=0)) / (dfptrain.max(axis=0) - dfptrain.min(axis=0))
        # NOTE(review): Scaler is fitted outside this fragment and applied to the
        # FULL feature matrix X before the train/test row split below -- confirm it
        # was fitted on training rows only, otherwise test statistics leak in.
        dfptrain_X = Scaler.transform(X)
        dfptrain_X = pd.DataFrame(dfptrain_X, columns=X.columns)

        # Targets are min-max scaled to [0, 1] column-wise.
        Y = dfptrain[targetname]
        dfptrain_Y = (Y - Y.min(axis=0)) / (Y.max(axis=0) - Y.min(axis=0))

        #        dfptrain_scaled=pd.DataFrame(dfptrain_scaled, columns= dfptrain_old.columns)
        print('Scaling done')
        # Split the data into training and test sets
        #        X_train = dfptrain_X.values[idxTr,:-len(targetname)]
        #        Y_train = dfptrain_Y.values[idxTr,-len(targetname):]
        #        X_test = dfptrain_X.values[idxTs,:-len(targetname)]
        #        Y_test = dfptrain_Y.values[idxTs,-len(targetname):]
        # Row split uses the k-fold indices idxTr/idxTs from the enclosing loop
        # (outside this fragment).
        X_train = dfptrain_X.values[idxTr, :]
        Y_train = dfptrain_Y.values[idxTr, :]
        X_test = dfptrain_X.values[idxTs, :]
        Y_test = dfptrain_Y.values[idxTs, :]
        if len(targetname) == 1:
            # Single-target case: switch to 1-D target arrays instead of (n, 1).
            Y_train = dfptrain_Y.values[idxTr, -len(targetname)]
            Y_test = dfptrain_Y.values[idxTs, -len(targetname)]

        # Standardize the data by removing the mean and scaling to unit variance
#        norm_train = skpp.StandardScaler().fit(X_train) #Normal L2 transform
#        norm_train = skpp.PowerTransformer().fit(X_train) #Power transform to gaussian like

#        X_train = skpp.StandardScaler().fit_transform(X_train) #Normal L2 transform
#        X_train = skpp.PowerTransformer().fit_transform(X_train) #Power transform to gaussian like

#Transform back into dataframe
        X_train = pd.DataFrame(X_train, columns=X.columns)
        Y_train = pd.DataFrame(Y_train, columns=targetname)
        X_test = pd.DataFrame(X_test, columns=X.columns)
        Y_test = pd.DataFrame(Y_test, columns=targetname)
        print('Sets ready')

        #GRID SEARCH ON NEURAL NETWORK
        # 5-fold cross-validated search over MLP hyperparameters
        # (tuned_parameters is defined outside this fragment).
        clf_bp = GridSearchCV(MLPRegressor(max_iter=500),
                              tuned_parameters,
                              cv=5,
                              n_jobs=-1)  #multitasking
        clf_bp.fit(X_train, Y_train)
        print(clf_bp.best_params_)
        hidden_layer_sizes = clf_bp.best_params_[
            'hidden_layer_sizes']  # best parameter found by the search
        activation = clf_bp.best_params_['activation']
        solver = clf_bp.best_params_['solver']
        # Rebuild a regressor with the winning hyperparameters.
        # NOTE(review): clf_bp.best_estimator_ is already refit by GridSearchCV,
        # and this manual rebuild drops max_iter=500 -- confirm that is intended.
        clf = MLPRegressor(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver
        )  # create the neural-network regressor with those parameters
        clf.fit(X_train, Y_train)
        score = clf.score(
            X_train, Y_train
        )  # score on the training data for this parameter set

        #    feature_importances.append(clf.feature_importances_) # save each parameter's importance in the simulation
        # Permutation importance of each feature, measured on the held-out fold.
        perm = PermutationImportance(clf).fit(X_test, Y_test)
        perm_weight = perm.feature_importances_
        print(perm_weight.shape)
        feature_imp = {'Feature': X_train.columns, 'Importance': perm_weight}
        feature_imp = pd.DataFrame(feature_imp)
        feature_imp_name = 'feature_importance-kfold_' + str(ind) + '-' + str(
            setname) + '.csv'

        # Running mean over the 5 folds (Feature_mean initialised outside fragment).
        Feature_mean += perm_weight / 5

        #feature_imp.to_csv(feature_imp_name) #export feature importance of dataframe in kfold ind for setname

        print("Score training = ", score)

        score_t = clf.score(X_test, Y_test)  # test-fold score
        print("Score test = ", score_t)

        y_pred = clf.predict(
            X_test)  #(dfP.values[:,:-1]) # predictions for X_test
        y_target = Y_test  #dfP.values[:,-1] #Y_tot
        if len(targetname) == 1:
            y_target = np.ravel(y_target)

        # Store this fold's predictions/targets at their original row positions.
        Ypred[idxTs, ] = y_pred
        Ytarg[idxTs, ] = y_target

        print('MSE = ', (mean_squared_error(y_target, y_pred)))
        print('MAE = ', (mean_absolute_error(y_target, y_pred)))
        print('R^2 score =', (r2_score(y_target, y_pred)))

        # Save these per-fold results
        MSE_NN.append(mean_squared_error(y_target, y_pred))
        MAE_NN.append(mean_absolute_error(y_target, y_pred))
        R2_NN.append(r2_score(y_target, y_pred))

    print()
    print()
    # Aggregate the per-fold metrics collected in the loop above.
    MSE_NN = np.array(MSE_NN)
    MAE_NN = np.array(MAE_NN)
    R2_NN = np.array(R2_NN)

    print('MSE - 5 Folds : ', MSE_NN.mean())
    print('MAE - 5 Folds : ', MAE_NN.mean())
    print('R^2 - 5 Folds : ', R2_NN.mean())

    #    Feature_mean= Feature_mean.mean(axis=0, skipna=True)
    # Export the fold-averaged permutation importances.
    print(Feature_mean)
    Feature_name = 'Feature importance for NN' + str(targetname) + '.csv'
    Features = {'Name': X_train.columns, 'Weight': Feature_mean}
    Features = pd.DataFrame(Features)

    Features.to_csv(Feature_name)
    print('Feature importance exported')

    # Out-of-fold predictions/targets for every row, still in [0, 1] scale.
    Ypred_NN = pd.DataFrame(Ypred, columns=targetname)
    Ytarg_NN = pd.DataFrame(Ytarg, columns=targetname)
    #%%
    # Inverse of the min-max target scaling: x * (max - min) + min.
    for target in targetname:
        Ypred_NN[target] = Ypred_NN[target] * (y_max[target] -
                                               y_min[target]) + y_min[target]
        Ytarg_NN[target] = Ytarg_NN[target] * (y_max[target] -
                                               y_min[target]) + y_min[target]
    #name=str(targetname)+'data.pickle'
    #with open(name, 'wb') as f:
    #    pickle.dump([Ypred_RF, Ytarg_RF], f)

    # Persist the model plus the scaler and target stats needed to reproduce
    # the transform at prediction time.
    # NOTE(review): clf here is the model from the FINAL fold only -- confirm
    # that is the intended model to export.
    name = 'NN' + str(targetname) + setname + 'monotargetV7.pickle'
    pickle.dump(clf, open('saved_models/' + name, 'wb'),
                protocol=2)  #Export NN algorithm
    #    joblib.dump(clf,name)
    #    pickle.dump(norm_train, open('normalicer'+name, 'wb'), protocol=2) #normalicer for data
    pickle.dump(Scaler,
                open('saved_models/normalicer' + name, 'wb'),
                protocol=2)  #normalicer for data

    pickle.dump([y_max, y_min],
                open('saved_models/statdata' + name, 'wb'),
                protocol=2)
    #with open(name, 'wb') as f:
    #    pickle.dump(clf, f)

    #target_NN=['G3XMgas_NN(80)','G3XMstar_NN(81)' ,'G3XTgas_mw_NN(82)', 'G3XYx_NN(84)', 'G3XYsz_NN(85)']

    # Append the NN predictions as new columns and export the augmented table.
    Y_NN = pd.DataFrame(data=Ypred_NN.values, columns=targetname)

    dfptrain_old = pd.concat([dfptrain_old, Y_NN], axis=1)
    dfptrain_old_name = 'NN_' + setname + 'V7'
    dfptrain_old.to_csv(dfptrain_old_name)

    print('Data exported')
    #%% training plotting

    #    f1=plt.figure('Mgas NN'+ setname)
    #    plt.scatter(Ytarg_NN['G3XMgas(80)'].values, Ypred_NN['G3XMgas(80)'].values,marker='o', s=(72./f1.dpi)**2,lw=0)
    #    plt.plot(np.linspace(min(Ytarg_NN['G3XMgas(80)'].values), max(Ytarg_NN['G3XMgas(80)'].values)), \
    #             np.linspace(min(Ytarg_NN['G3XMgas(80)'].values), max(Ytarg_NN['G3XMgas(80)'].values)), '-r' )
    #    plt.title('Mgas - NN vs real '+ setname)
    #    plt.xlabel('Mgas real - log scale')
    #    plt.ylabel('Mgas NN - log scale')
    #    f1.savefig('Mgas NN'+ setname + ".pdf", bbox_inches='tight')
    #    plt.close()
    #
    #    f2=plt.figure('Mstar NN'+ setname)
    #    plt.scatter(Ytarg_NN['G3XMstar(81)'].values, Ypred_NN['G3XMstar(81)'].values,marker='o', s=(72./f2.dpi)**2,lw=0)
    #    plt.plot(np.linspace(min(Ytarg_NN['G3XMstar(81)'].values), max(Ytarg_NN['G3XMstar(81)'].values)), \
    #             np.linspace(min(Ytarg_NN['G3XMstar(81)'].values), max(Ytarg_NN['G3XMstar(81)'].values)), '-r' )
    #    plt.title('Mstar - NN vs real ' + setname)
    #    plt.xlabel('Mstar real - log scale')
    #    plt.ylabel('Mstar NN - log scale')
    #    f2.savefig('Mstar NN'+ setname+ ".pdf", bbox_inches='tight')
    #    plt.close()
    #
    #    f3=plt.figure('Tgas NN'+ setname)
    #    plt.scatter(Ytarg_NN['G3XTgas_mw(82)'].values, Ypred_NN['G3XTgas_mw(82)'].values,marker='o', s=(72./f3.dpi)**2,lw=0)
    #    plt.plot(np.linspace(min(Ytarg_NN['G3XTgas_mw(82)'].values), max(Ytarg_NN['G3XTgas_mw(82)'].values)), \
    #             np.linspace(min(Ytarg_NN['G3XTgas_mw(82)'].values), max(Ytarg_NN['G3XTgas_mw(82)'].values)), '-r' )
    #    plt.title('Tgas - NN vs real ' + setname)
    #    plt.xlabel('Tgas real - log scale')
    #    plt.ylabel('Tgas NN - log scale')
    #    f3.savefig('Tgas NN'+ setname+ ".pdf", bbox_inches='tight')
    #    plt.close()
    #
    #    f4=plt.figure('G3XYx NN'+ setname)
    #    plt.scatter(Ytarg_NN['G3XYx(84)'].values, Ypred_NN['G3XYx(84)'].values,marker='o', s=(72./f4.dpi)**2,lw=0)
    #    plt.plot(np.linspace(min(Ytarg_NN['G3XYx(84)'].values), max(Ytarg_NN['G3XYx(84)'].values)), \
    #             np.linspace(min(Ytarg_NN['G3XYx(84)'].values), max(Ytarg_NN['G3XYx(84)'].values)), '-r' )
    #    plt.title('Yx - NN vs real ' + setname)
    #    plt.xlabel('G3XYx real - log scale')
    #    plt.ylabel('G3XYx NN - log scale')
    #    f4.savefig('G3XYx NN'+ setname+ ".pdf", bbox_inches='tight')
    #    plt.close()
    #
    #    f5=plt.figure('G3XYsz NN'+setname)
    #    plt.scatter(Ytarg_NN['G3XYsz(85)'].values, Ypred_NN['G3XYsz(85)'].values,marker='o', s=(72./f5.dpi)**2,lw=0)
    #    plt.plot(np.linspace(min(Ytarg_NN['G3XYsz(85)'].values), max(Ytarg_NN['G3XYsz(85)'].values)), \
    #             np.linspace(min(Ytarg_NN['G3XYsz(85)'].values), max(Ytarg_NN['G3XYsz(85)'].values)), '-r' )
    #    plt.title('Yx - NN vs real '+setname)
    #    plt.xlabel('G3XYsz real - log scale')
    #    plt.ylabel('G3XYsz NN - log scale')
    #    f5.savefig('G3XYsz NN'+setname+ ".pdf", bbox_inches='tight')
    #    plt.close()

    return
# ===== Example #22 =====
                    epochs=200,
                    verbose=0)
#Change dataallwo to dataall to include census data

# Average training loss across epochs (history produced by model.fit, outside
# this fragment).  NOTE(review): the label says RMSE but this averages raw
# 'loss' values -- if the loss is MSE this is not an RMSE; confirm the loss used.
print("NN Average RMSE: ", np.average(history.history['loss']))

# Normalise by the target range so the figure is comparable across targets.
print("NN Average Normalized RMSE: ",
      np.average(history.history['loss']) / (max(yLabels) - min(yLabels)))

#%%
#Model evaluation

# batch_size=1 makes evaluation slow but processes one sample at a time.
evaltest = model.evaluate(x, y, batch_size=1)
#print('Accuracy: %.2f' % (accuracy*100))
print(evaltest)

#%%

#Plot loss over epochs

plt.plot(history.history['val_loss'])
plt.show()

#%%

#Not used in preliminary results

# Permutation importance of each input feature, scored by accuracy.
# NOTE(review): scoring="accuracy" assumes a classification target -- confirm,
# since the prints above treat the loss as a regression RMSE.
permut = PermutationImportance(model,
                               scoring="accuracy").fit(testdata, yLabels)
eli5.show_weights(permut, feature_names=dataall.columns.tolist())
# ===== Example #23 =====
    'random_state': [0],
}

# Instantiate the grid search model
# Exhaustive 5-fold CV search over group_param for the random forest
# (rf_model and group_param are defined outside this fragment).
hyperp_srch = GridSearchCV(estimator=rf_model,
                           param_grid=group_param,
                           cv=5,
                           return_train_score=False)

hyperp_srch.fit(x_train, y_train)
#print(hyperp_srch.best_params_)
best_hyper = hyperp_srch.best_estimator_
# Rebuild and refit a forest with the winning hyperparameters.
# NOTE(review): best_hyper is already refit on the full training set by
# GridSearchCV (refit=True by default); this second fit is redundant -- confirm.
rf_model = RandomForestClassifier(**best_hyper.get_params())
rf_model.fit(x_train, y_train)

y_pred_train = rf_model.predict(x_train)
y_pred_val = rf_model.predict(x_val)

## End
print('Classification Report: \n')
print(classification_report(y_val, y_pred_val))
print('\nConfusion Matrix: \n')
print(confusion_matrix(y_val, y_pred_val))

# Permutation importance computed on the TRAINING split.
# NOTE(review): training-set importances can overrate features the forest
# memorised; validation data is the usual choice -- confirm intent.
permutation = PermutationImportance(rf_model,
                                    random_state=2).fit(x_train, y_train)
eli5.explain_weights(permutation, feature_names=x.columns.tolist())
print(
    eli5.format_as_text(
        eli5.explain_weights(permutation, feature_names=x.columns.tolist())))
# ===== Example #24 =====
# 70/30 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

# Define the model.
regressor = RandomForestRegressor(n_estimators=1000,
                                  random_state=42,
                                  max_depth=12,
                                  max_samples=None)

# Fit to the data.
regressor.fit(x_train, y_train)

# Print both types of feature importance.
# Permutation importance is computed on the held-out test set; MDI comes from
# the fitted trees themselves.
perm = PermutationImportance(regressor, random_state=42).fit(x_test, y_test)
print("Feature importances using permutation", perm.feature_importances_)

print("Feature importances using MDI ", regressor.feature_importances_)

# Calculate the average percentage error.
# NOTE(review): this blows up if any y_test value is 0 -- confirm targets are
# strictly non-zero.
predictions = regressor.predict(x_test)
mean_perc_error = np.average((np.abs(y_test - predictions) * 100 / y_test))
print("Average percentage error ", mean_perc_error)

# Predict for the full dataset and keep the values alongside the raw data.
all_predictions = regressor.predict(x)
data["predictions"] = all_predictions

# select one speed for plotting
speed = 3000
test = data[data.rpm == speed]
# ===== Example #25 =====
        test_size=0.2,
        random_state=times)

    # Work on plain numpy arrays from here on.
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test = np.array(x_test)
    # Accumulates the summed probability predictions of every model in MLA,
    # i.e. a simple soft-voting ensemble over the test set.
    total_predict = np.zeros(len(y_test))

    for i in range(len(MLA)):

        # FIX: StratifiedKFold ignores random_state unless shuffle=True, and
        # scikit-learn >= 0.24 raises ValueError for random_state with
        # shuffle=False.  shuffle=True makes the per-repetition seed (times)
        # take effect, which is clearly what the seeding intends.
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=times)

        # Fresh copy of the candidate estimator, reseeded per repetition.
        clf = copy.deepcopy(MLA[i])
        clf.random_state = times
        # Feature selection: keep features whose cross-validated permutation
        # importance clears SelectFromModel's default threshold.
        sel = SelectFromModel(
            PermutationImportance(clf, cv=skf,
                                  random_state=times)).fit(x_train, y_train)
        x_train_trans = sel.transform(x_train)
        x_test_trans = sel.transform(x_test)

        # Cross-validated AUC on the training split (selected features only).
        vali_auc = np.mean(
            cross_val_score(clf,
                            x_train_trans,
                            y_train,
                            cv=skf,
                            scoring='roc_auc'))

        # Refit on all training rows, then add this model's probabilities to
        # the ensemble total.
        clf.fit(x_train_trans, y_train)
        predict_result = clf.predict_proba(x_test_trans)[:, 1]
        total_predict += predict_result

        test_auc = roc_auc_score(y_test, predict_result)
# ===== Example #26 =====
#                              num_leaves=13,
#                              max_depth=5,
#                              learning_rate=0.01,
#                              min_split_gain=0,
#                              min_child_samples=2,
#                              colsample_bytree=0.4,
#                              objective='binary',
#                              random_state=42,
#                              eval_metric='roc_auc',
#                              n_jobs=-1)

# Sanity check of the fitted model.
# NOTE(review): shuffle_verify is defined outside this fragment; presumably a
# shuffled-target sanity check -- confirm against its definition.
shuffle_verify(X, y, lgb_clf)

# Permutation Importance

# Importance measured on the held-out test split.
perm = PermutationImportance(lgb_clf, random_state=42).fit(test_X, test_y)

eli5.show_weights(perm, feature_names=feature_names)

# Partial Dependence PLots - outliers make it difficult to see.

def pdp_plotter(feature, model):
    """Isolate and plot the partial dependence of *feature* for *model*.

    Parameters
    ----------
    feature : str
        Column name; must be one of the global ``feature_names``.
    model : fitted estimator
        Model to explain.  FIX: the original body ignored this argument and
        always used the global ``lgb_clf``; it now uses the argument, which
        is backward compatible with the existing ``pdp_plotter(..., lgb_clf)``
        call below.
    """
    # Compute PDP values on the (global) held-out test set.
    pdp_feat = pdp.pdp_isolate(model=model,
                               dataset=test_X,
                               model_features=feature_names,
                               feature=feature)
    pdp.pdp_plot(pdp_feat, feature)
    plt.show()

# Plot the partial dependence of one feature of the fitted LightGBM model.
pdp_plotter('service_to_uza_area', lgb_clf)
# Collect validation metrics for this model (rmse is the list initialised
# before this fragment; val_pred/Y_val come from the fit above).
# FIX: sklearn metrics take (y_true, y_pred) in that order.  MSE is symmetric
# so RMSE is unchanged, but r2_score is NOT symmetric and the original
# r2_score(val_pred, Y_val) computed R^2 with truth and prediction swapped.
rmse.append(np.sqrt(mean_squared_error(Y_val, val_pred)))
r2 = []
r2.append(r2_score(Y_val, val_pred))
d = {'RMSE': rmse}
d1 = {'R2': r2}
print(d, d1)


# #### This model is pretty good since we have an R squared value close to 1 and very low RMSE value but lets try to optimize it

# In[80]:


import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(model, random_state=1).fit(X_train, Y_train)
eli5.show_weights(perm, feature_names = X_train.columns.tolist())


# In[ ]:


# Here we can see that the features that have the biggest impact of predicting the GDP per capita value are "Population" and "HDI"
# So we are going to take those two features now for our new model


# In[81]:


X_new = df_std[["POP", "HDI"]]
# ===== Example #28 =====
        # Performance
        # Adjusted R^2 for the best model of this group (adj_r2 is defined
        # outside this fragment; n, p are sample and feature counts).
        best_adj_r2 = adj_r2(best_r2, n, p)
        r2_dict[model_filename] = best_adj_r2

        # Permutation Importance
        # Reload the exact X/y used for this model; the variable-group id and
        # target id are encoded at fixed positions of the model filename.
        X_data = pd.read_csv(data_folder /
                             'X_varGroup{}.csv'.format(model_filename[-8]),
                             index_col='t10_cen_uid_u_2010',
                             dtype={'t10_cen_uid_u_2010': object})
        # NOTE(review): read_csv(squeeze=True) was removed in pandas 2.0; use
        # .squeeze('columns') if the environment is upgraded.
        y_data = pd.read_csv(data_folder /
                             'y_{}.csv'.format(model_filename[-6:-4]),
                             index_col='t10_cen_uid_u_2010',
                             dtype={'t10_cen_uid_u_2010': object},
                             squeeze=True)
        # FIX: in eli5, cv='prefit' is a constructor argument of
        # PermutationImportance, not a fit() argument.  Passed to fit() it
        # lands in **fit_params where it is at best ignored.  'prefit' means:
        # score the already-fitted pipe as-is, without refitting.
        perm = PermutationImportance(pipe, scoring='r2', cv='prefit') \
            .fit(X_data.values, y_data.values)

        # Average importance over the n_iter shuffles of each feature.
        perm_results = np.mean(np.array(perm.results_), axis=0)
        perm_df = pd.DataFrame({
            # 'feature': [x[8:] for x in colnames[model_filename[-8]]],
            'feature': X_data.columns.tolist(),
            'importance': perm_results
        }) \
            .sort_values('importance', ascending=False) \
            .set_index('feature')

        # Coefficients
        # Expanded (polynomial) feature names for the coefficient table below.
        features_final = pipe.named_steps['poly'].get_feature_names(
            colnames[model_filename[-8]])
        coef_df = pd.DataFrame.from_dict(
            {
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=2, min_weight_fraction_leaf=0,
                      n_estimators=29, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

# Fit the pre-configured random forest on the transformed training features.
model1.fit(X_train_transformed, y_train)

# Get permutation importances
# ! pip install eli5  -- IPython/Jupyter magic, not valid Python; run it in a
# notebook cell or install eli5 from the shell before executing this script.
from eli5.sklearn import PermutationImportance
import eli5

# Permutation importance of model1, scored by R^2 on the validation split.
# n_iter=2 keeps it fast at the cost of noisier importance estimates.
permuter = PermutationImportance(
    model1,
    scoring='r2',
    n_iter=2,
    random_state=42
)

permuter.fit(X_val_transformed, y_val)
feature_names = X_val.columns.tolist()

eli5.show_weights(
    permuter,
    top=None, # show permutation importances for all features
    feature_names=feature_names
)

from sklearn.metrics import mean_squared_error, r2_score

# Coefficient of determination r2 for the training set
    
    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model


#sk_params=[]

batch_size=32
nb_epoch=20
#my_model = KerasClassifier(build_fn=base_model)    
# Wrap the Keras model so it exposes the scikit-learn estimator API (needed by
# eli5's PermutationImportance below).  base_model and callbacks_list are
# defined outside this fragment.
classifier = KerasClassifier(build_fn = base_model,validation_split=0.2,batch_size=batch_size,shuffle=True,epochs=nb_epoch,verbose=1,callbacks=callbacks_list)
classifier.fit(X_train, y_train)

# Single-pass (n_iter=1) permutation importance on the test split; one pass is
# fast but gives a noisy per-feature estimate.
perm = PermutationImportance(classifier, random_state=1,n_iter=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = ['TantraLines', 'TantraHits', 'Mestimator', 'TantraZenith',
       'TantraAzimuth', 'TantraAngularEstimator', 'TantraX', 'TantraY',
       'TantraZ', 'Lambda', 'Beta', 'TrackLength', 'TantraEnergy', 'TantraRho',
       'IntegralCharge', 'MeanCharge', 'StdCharge', 'TriggerCounter',
       'GridQuality', 'AAZenith', 'AAAzimuth', 'Trigger3N', 'TriggerT3',
       'NOnTime'])

# Bar chart of the first permutation round's importances (perm.results_[0]),
# using the same hard-coded feature-name list as above.
perm_train_feat_imp_df = pd.DataFrame({'val': perm.results_[0],
                                      'lab':['TantraLines', 'TantraHits', 'Mestimator', 'TantraZenith',
       'TantraAzimuth', 'TantraAngularEstimator', 'TantraX', 'TantraY',
       'TantraZ', 'Lambda', 'Beta', 'TrackLength', 'TantraEnergy', 'TantraRho',
       'IntegralCharge', 'MeanCharge', 'StdCharge', 'TriggerCounter',
       'GridQuality', 'AAZenith', 'AAAzimuth', 'Trigger3N', 'TriggerT3','NOnTime'] } )
perm_train_feat_imp_df.plot.barh(x='lab', y='val')