ens.score(X_val, y_val) #Regression from sklearn.ensemble import BaggingRegressor from sklearn.tree import DecisionTreeClassifier ens = BaggingRegressor(DecisionTreeRegressor(random_state=101)) ens.fit(X_train, y_train) ens.score(X_val, y_val) #Once you are confident about your final model, measure its performance on the test set to estimate the generalization error #Model interpretability #Feature importance import eli5 from eli5.sklearn import PermutationImportance perm = PermutationImportance(model, random_state=101).fit(X_val, y_val) eli5.show_weights(perm, feature_names=X_val.columns.tolist()) #Partial dependence plot #New integration in sklearn, might not work with older versions from sklearn.inspection import partial_dependence, plot_partial_dependence partial_dependence(model, X_train, features=['feature', ('feat1', 'feat2')]) plot_partial_dependence(model, X_train, features=['feature', ('feat1', 'feat2')]) #With external module for legacy editions from pdpbox import pdp, get_dataset, info_plots #Create the data that we will plot pdp_goals = pdp.pdp_isolate(model=model, dataset=X_val,
('p', SelectPercentile(selection_score_func, 30)) ]), ['k:<NAME2>', 'k:<NAME3>', 'p:<NAME3>']), (VarianceThreshold(0.0), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']), (VarianceThreshold(1.0), ['<NAME2>']), (GenericUnivariateSelect(), ['<NAME2>']), (GenericUnivariateSelect(mode='k_best', param=2), ['<NAME2>', '<NAME3>']), (SelectFromModel( LogisticRegression('l1', C=0.01, solver='liblinear', random_state=42, multi_class='ovr')), ['<NAME0>', '<NAME2>']), (SelectFromModel( PermutationImportance( LogisticRegression(solver='liblinear', random_state=42), cv=5, random_state=42, refit=False, ), threshold=0.1, ), ['<NAME2>', '<NAME3>']), (RFE( LogisticRegression(solver='liblinear', random_state=42, multi_class='ovr'), 2), ['<NAME1>', '<NAME3>']), (RFECV(LogisticRegression( solver='liblinear', random_state=42, multi_class='ovr'), cv=3), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']), ] + _additional_test_cases) def test_transform_feature_names_iris(transformer, expected, iris_train): X, y, _, _ = iris_train transformer.fit(X, y)
def sk_process(df_train, param, message, df_test=None, trial=None, is_output_feature_importance=False, trial_level=0):
    """Run a cross-validated sklearn-style train/predict loop driven by ``param``.

    For every CV fold: optionally scale, fit the configured model, collect
    out-of-fold predictions, optionally compute model + permutation feature
    importances, optionally predict on ``df_test``, and record train/valid
    metric values.  Class names in ``param`` are resolved dynamically via
    ``processutil._str2class``.

    Example ``param``::

        param = {
            'columns': columns,
            'cv': {'cls': 'KFold',
                   'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}},
            'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}},
            'model': {'cls': 'lgb.LGBMRegressor',
                      'init': {'learning_rate': 0.35395923077843333,
                               'feature_fraction': 0.8840483697334669,
                               'bagging_fraction': 0.7017457378676857,
                               'min_data_in_leaf': 616,
                               'lambda_l1': 0.00013058988949929333,
                               'lambda_l2': 0.004991992636437704,
                               'max_bin': 74, 'num_leaves': 64,
                               'random_state': 2928, 'n_jobs': 16},
                      'fit': {}},
            'metric': 'mean_absolute_error'
        }

    :param df_train: training frame; must contain 'index', 'y' and, depending
        on the CV class, 'label' (StratifiedKFold) or 'group' (GroupKFold).
    :param param: configuration dict (see example above).
    :param message: free-text tag stored in the trial record.
    :param df_test: optional test frame (must contain 'index'); per-fold test
        predictions are collected column-wise when given.
    :param trial: optional list; when given, a summary record is appended.
    :param is_output_feature_importance: when True, per-fold model weights and
        eli5 permutation importances are merged into one frame.
    :param trial_level: when > 0, raw per-fold frames are also stored in the
        trial record.
    :return: tuple ``(df_his, df_feature_importances, df_valid_pred, df_test_pred)``.
    """
    columns = param['columns']
    # Fail fast on malformed inputs; reserved column names must not be features.
    assert 'y' in df_train.columns.tolist(), 'y is not in df_train'
    assert 'index' in df_train.columns.tolist(), 'index is not in df_train'
    assert 'index' not in param['columns'], 'index is in features'
    assert 'y' not in param['columns'], 'y is in features'
    assert 'label' not in param['columns'], 'label is in features'
    assert 'group' not in param['columns'], 'group is in features'
    assert (type(trial) == list) | (trial == None), 'trial is neither list nor none'
    assert len(columns) != 0, 'columns size is 0'
    df_test_pred = None
    if type(df_test) == pd.DataFrame:
        assert 'index' in df_test.columns.tolist(), 'index is not in df_test'
        # pd.concat ignores the leading None, so this seeds df_test_pred with
        # just the test 'index' column.
        df_test_pred = pd.concat([df_test_pred, df_test[['index']]], axis=1)
    # Resolve configured class names to actual classes/callables.
    CV = processutil._str2class(param['cv']['cls'])
    MODEL = processutil._str2class(param['model']['cls'])
    if 'scaler' in param:
        SCALER = processutil._str2class(param['scaler']['cls'])
    metric = processutil._str2class(param['metric'])
    history = []
    df_valid_pred = pd.DataFrame()
    df_feature_importances_i_list = []
    # Supported CV classes: StratifiedKFold, KFold, RepeatedKFold,
    # TimeSeriesSplit, GroupKFold.  Precomputed splits may also be passed in.
    if 'splits' in param['cv']:
        splits = param['cv']['splits']
    else:
        cv = CV(**param['cv']['init'])
        if param['cv']['cls'] == 'StratifiedKFold':
            assert 'label' in df_train.columns.tolist(
            ), 'label is not in df_train'
            splits = list(cv.split(df_train, df_train['label']))
        elif param['cv']['cls'] == 'GroupKFold':
            assert 'group' in df_train.columns.tolist(
            ), 'group is not in df_train'
            splits = list(cv.split(df_train, groups=df_train['group']))
        else:
            splits = list(cv.split(df_train))
    for fold_n, (train_index, valid_index) in enumerate(splits):
        X_train, X_valid = df_train[columns].values[
            train_index, :], df_train[columns].values[valid_index, :]
        y_train, y_valid = df_train['y'].values[train_index], df_train[
            'y'].values[valid_index]
        if 'scaler' in param:
            # Fit the scaler on the training fold only to avoid leakage.
            scaler = SCALER(**param['scaler']['init'])
            X_train = scaler.fit_transform(X_train)
            X_valid = scaler.transform(X_valid)
        model = MODEL(**param['model']['init'])
        model.fit(X_train, y_train, **param['model']['fit'])
        y_valid_pred = model.predict(X_valid)
        y_train_pred = model.predict(X_train)
        # Keep the original row ids so out-of-fold predictions can be re-sorted.
        original_index = df_train['index'].values[valid_index]
        df_valid_pred_i = pd.DataFrame(
            {'index': original_index,
             'predict': y_valid_pred,
             'fold_n': np.zeros(y_valid_pred.shape[0]) + fold_n})
        df_valid_pred = pd.concat([df_valid_pred, df_valid_pred_i], axis=0)
        if is_output_feature_importance:
            # Native model importances, aligned by feature name...
            df_feature_importances_i = pd.DataFrame({
                'feature': columns,
                'model_weight': model.feature_importances_
            })
            df_feature_importances_i = df_feature_importances_i.sort_values(
                by=['feature'])
            df_feature_importances_i = df_feature_importances_i.reset_index(
                drop=True)
            # ...merged with eli5 permutation importances on the valid fold.
            perm = PermutationImportance(model, random_state=42).fit(
                X_valid, y_valid)
            df_feature_importances_i2 = eli5.explain_weights_dfs(
                perm, feature_names=columns,
                top=len(columns))['feature_importances']
            df_feature_importances_i2 = df_feature_importances_i2.sort_values(
                by=['feature'])
            df_feature_importances_i2 = df_feature_importances_i2.reset_index(
                drop=True)
            df_feature_importances_i = pd.merge(df_feature_importances_i,
                                                df_feature_importances_i2,
                                                on='feature')
            df_feature_importances_i_list.append(df_feature_importances_i)
        if type(df_test) == pd.DataFrame:
            X_test = df_test[columns].values
            if 'scaler' in param:
                X_test = scaler.transform(X_test)
            y_test_pred = model.predict(X_test)
            # One column per fold, keyed by the fold number.
            df_test_pred_i = pd.DataFrame({fold_n: y_test_pred})
            df_test_pred = pd.concat([df_test_pred, df_test_pred_i], axis=1)
        # NOTE(review): the full df_train['group'] column is passed here while
        # y_train/y_valid are fold subsets — lengths differ, so the configured
        # metric must index it itself (or ignore it); also this requires a
        # 'group' column for ALL cv classes, not just GroupKFold — confirm.
        history.append(
            {'fold_n': fold_n,
             'train': metric(y_train, y_train_pred, group=df_train['group']),
             'valid': metric(y_valid, y_valid_pred, group=df_train['group'])})
    df_his = pd.DataFrame(history)
    df_feature_importances = None
    if is_output_feature_importance:
        # Merge per-fold importance frames side by side on 'feature'.
        df_feature_importances = df_feature_importances_i_list[0]
        for idx, df_feature_importances_i in enumerate(
                df_feature_importances_i_list[1:]):
            # NOTE(review): suffixes receives an int (idx + 1); pandas expects
            # string suffixes — confirm this works on the pinned pandas version.
            df_feature_importances = pd.merge(df_feature_importances,
                                              df_feature_importances_i,
                                              on='feature',
                                              suffixes=('', idx + 1))
    # Restore the original row order of the out-of-fold predictions.
    df_valid_pred = df_valid_pred.sort_values(by=['index'])
    df_valid_pred = df_valid_pred.reset_index(drop=True)
    if type(df_test) == pd.DataFrame:
        df_test_pred = df_test_pred.sort_values(by=['index'])
        df_test_pred = df_test_pred.reset_index(drop=True)
    if type(trial) == list:
        # Append a summary record (plus raw frames when trial_level > 0).
        datetime_ = datetime.datetime.now()
        val_metric_mean = np.mean(df_his.valid)
        val_metric_std = np.std(df_his.valid)
        train_metric_mean = np.mean(df_his.train)
        train_metric_std = np.std(df_his.train)
        trial_i_d_ = {
            'datetime': datetime_,
            'message': message,
            'val_metric_mean': val_metric_mean,
            'train_metric_mean': train_metric_mean,
            'val_metric_std': val_metric_std,
            'train_metric_std': train_metric_std,
            'trn_val_metric_diff': val_metric_mean - train_metric_mean,
            'df_feature_importances': df_feature_importances,
            'param': param.copy(),
            'nfeatures': len(columns)
        }
        if trial_level > 0:
            trial_i_d_ = {
                'df_his': df_his,
                'df_valid_pred': df_valid_pred,
                'df_test_pred': df_test_pred,
                **trial_i_d_
            }
        trial.append(trial_i_d_)
    return df_his, df_feature_importances, df_valid_pred, df_test_pred
X = df.drop(columns=['target', 'ID_code']) y = df.target # In[6]: #split data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) # In[19]: #importance of the feature clf = RandomForestClassifier(random_state=0, n_jobs=1).fit(X_train, y_train) perm = PermutationImportance(clf, random_state=1).fit(X_test, y_test) weight = eli5.show_weights(perm, feature_names=X_test.columns.tolist(), top=100) # In[44]: weight # In[119]: #select feature df_train = df.loc[:199999, [ 'var_81', 'var_26', 'var_44', 'var_110', 'var_109', 'var_190', 'var_78', 'var_21', 'var_1', 'var_99', 'var_133', 'var_166', 'var_34', 'var_148', 'var_122', 'var_139', 'var_164', 'var_12', 'var_165', 'var_119', 'var_76',
# - it has a good mean score # - it has a low variance # ### Fine-tune the model # This part will be implemented soon # I was reading about how to select good feature from [here](https://www.kaggle.com/dansbecker/permutation-importance?utm_medium=email&utm_source=mailchimp&utm_campaign=ml4insights) so I decided to try it now that I can't add features on myself, so let's do it. # In[ ]: import eli5 from eli5.sklearn import PermutationImportance log_reg.fit(learning_data, labels) perm_imp = PermutationImportance(log_reg, random_state=1).fit(learning_data, labels) eli5.show_weights(perm_imp, feature_names=COLUMNS) # The features are ordered by impact on the model, so the Sex feature has the biggest impact on our model. # # I repeated this process many time and tried to combine features to end up with adding is_child_and_sex feature. # ### Run on test data # In[ ]: test_set = pd.read_csv("../input/test.csv") pred = pipeline.fit_transform(test_set) # In[ ]:
y_expert_stand = scaler.transform(y_expert) LR = LinearRegression() lr = LR.fit(x_train_stand, y_train_stand) y_pred = lr.predict(x_test_stand) m1 = mean_squared_error(y_test_stand, y_pred) m2 = mean_squared_error(y_test_stand, y_expert_stand) print('Linear Regression Model\'s MSE is', m1) print('Expert Guess\'s MSE is', m2) # In[51]: perm = PermutationImportance(lr, random_state=i_min).fit(x_test_stand, y_test_stand) eli5.show_weights(perm, feature_names = x_test.columns.tolist(), top=50) # ### SGD Model # In[52]: MSE_vec_sgd = np.zeros(N_bs) # In[53]: for bs_ind in range(N_bs):
# Dump a small sample for manual inspection and sanity-check predictions
# of the (externally fitted) lr model on the same ten rows.
data1_x_bin[:10].to_csv("testing_data.csv")
list(data1_x_bin[:10].columns)
str(lr.predict(data1_x_bin[:10]))

"""# Model Interpretation"""

# Train a plain decision tree (X_train/y_train defined earlier) to interpret.
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
dtree.predict(X_test)

"""## Eli5"""

# Permutation importance of the fitted tree, scored on the held-out split.
perm = PermutationImportance(dtree , random_state=101).fit(X_test, y_test)
# Evaluate the permutation importance
eli5.show_weights(perm, feature_names = X_test.columns.values)

"""## Shap"""

row_to_show = 7
# The row for which we want to check the SHAP explanations
data_to_predict = X_test.iloc[row_to_show]
# Reshape the single row to the (1, n_features) shape predict expects.
# (sic: "preddict" typo kept — the name is reused consistently below)
data_to_preddict_array = data_to_predict.values.reshape(1,-1)
dtree_pred = dtree.predict_proba(data_to_preddict_array)
dtree.predict(data_to_preddict_array)

# Object that can calculate Shap values
explainer = shap.TreeExplainer(dtree)
# SHAP Tree Explainer
# Shuffle the full tf.data pipeline, batch one row at a time, and carve off
# the first 500 batches as the test split; the remainder is for training.
all_dataset = dataset.shuffle(len(df)).batch(1)
test_dataset = all_dataset.take(500)
train_dataset = all_dataset.skip(500)


def get_compiled_model():
    """Build and compile a small 3-layer sigmoid MLP trained with MSE."""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(4, activation='sigmoid'),
        tf.keras.layers.Dense(10, activation='sigmoid'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='mean_squared_error',
                  metrics=['mean_squared_error'])
    return model


model = get_compiled_model()
model.fit(train_dataset, epochs=2)
# evaluate() returns [loss, metric]; both are MSE here, so "test_acc" is
# actually the MSE metric, not an accuracy.
test_loss, test_acc = model.evaluate(test_dataset, verbose=2)

import eli5
from eli5.sklearn import PermutationImportance

# NOTE(review): the raw Keras model is passed straight to eli5's sklearn
# wrapper; that wrapper expects a scikit-learn style estimator, so this relies
# on the model duck-typing fit/predict — confirm it works with this eli5 version.
perm = PermutationImportance(model, random_state=1,
                             scoring="neg_mean_squared_error").fit(
                                 X, target.values)
print(eli5.explain_weights(perm, activities))
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# NOTE(review): with sklearn's default (sorted) label ordering, cm[0, 0]
# counts samples whose TRUE label is the first class; the TP/FP/TN/FN names
# below (and the heatmap axis labels) assume labels=[1, 0] — confirm the
# label encoding before trusting these printouts.
print("True positives: {}\nFalse positives: {}".format(cm[0, 0], cm[0, 1]))
print("True negatives: {}\nFalse negatives: {}".format(cm[1, 1], cm[1, 0]))

# visualize confusion matrix with seaborn heatmap
cm_matrix = pd.DataFrame(data=cm,
                         columns=['Actual Positive:1', 'Actual Negative:0'],
                         index=['Predict Positive:1', 'Predict Negative:0'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

X_list = X_test.columns.tolist()

# Refit an XGBoost classifier (rebinding clf) and compute permutation
# importance on the held-out split.
clf = xgb.XGBClassifier(n_estimators=150, random_state=2020)
clf.fit(X_train, y_train)
perm = PermutationImportance(clf, random_state=2010)
perm.fit(X_test, y_test)

# Store feature weights in an object
html_obj = eli5.show_weights(perm, feature_names=X_list)

# Write html object to a file (adjust file path; Windows path is used here)
with open(
        r'C:\Users\lukem\Desktop\Github AI Projects\Higgs-Boson-machine-learning-challenge\boson-importance.htm',
        'wb') as f:
    f.write(html_obj.data.encode("UTF-8"))

# Logistic-regression baseline; with 0/1 labels the MAE of hard predictions
# equals the misclassification rate — TODO confirm labels are 0/1.
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
mae = mean_absolute_error(y_test, pred)
print(f"logistic regression, mae: {mae}")
# Reformat the y-axis ticks as percentages within a narrow fixed range.
vals = ax.get_yticks()
ax.set_ylim(0.2875, 0.2925)
# NOTE(review): labels are built from ticks read BEFORE set_ylim changed the
# limits, so they may not match the final tick positions — confirm intended.
ax.set_yticklabels(["{:,.2%}".format(i) for i in vals])
plt.show()

##########################################

# Fit a forest with bootstrap disabled so every tree sees the full sample.
n = 520
rf = RandomForestRegressor(n_estimators=n,
                           random_state=0,
                           n_jobs=multiprocessing.cpu_count(),
                           bootstrap=False)
rf.fit(x, y)

# sklearn uses Gini importance by default: one row per tree in the forest.
GI = pd.DataFrame(data=[tree.feature_importances_ for tree in rf.estimators_],
                  columns=x.columns)

### Using eli5 to compute permutation accuracy importance on fitted random forest
# cv="prefit" reuses the already-fitted model; n_iter=10 repeats the column
# shuffling ten times, and perm.results_ holds the per-iteration importances.
perm = PermutationImportance(rf, cv="prefit", n_iter=10).fit(x.values, y.values)
# Permutation Accuracy Importance
PI = pd.DataFrame(data=perm.results_, columns=x.columns)

# Rename columns to conform to formulae used in paper
formula = {
    'considered-farm-plots': "$S$",
    'compare_quality': '$F_{Qual}$',
    'compare_distance': '$F_{Dist}$',
    'homophily_age': '$F_{HAge}$',
    'desire_migration': '$F_{Mig}$',
    'compare_yeild': '$F_{Yield}$',
    'homophily_agricultural_productivity': '$F_{HAgri}$',
    'compare_dryness': '$F_{Dry}$',
    'compare_water_availability': '$F_{Water}$',
    'desire_social_presence': '$F_{Soc}$'
}
def classification(self, cleaned_Data_frm1, cleaned_Data_frm, y, cursor, conn):
    """Build and evaluate a classification model for the target ``self.i``.

    Depending on flags on ``self`` this dispatches to one of three back ends:
    a scikit-learn RandomizedSearchCV over ``self.Classifier`` /
    ``self.Classifiers_grids``, an H2O AutoML run, or a Keras DNN grid search.
    Results are pushed to the database via the externally defined
    ``DB_upload`` helper.

    :param cleaned_Data_frm1: first cleaned feature frame (concatenated below).
    :param cleaned_Data_frm: second cleaned feature frame.
    :param y: target frame/series joined into the working frame.
    :param cursor: DB cursor — not used directly here; presumably consumed by
        DB_upload — TODO confirm.
    :param conn: DB connection — likewise unused directly here.

    NOTE(review): the whole body is wrapped in a bare ``except`` whose message
    says 'Regression model building failed' — it looks copy-pasted from a
    regression twin of this method, and the bare except also swallows the
    genuine NameErrors flagged inline below.
    """
    try:
        Modles_reuslts = []  # (sic) never used after initialisation
        Names = []           # never used after initialisation
        print("Model building")
        float_cols = self.float_col
        # Join all cleaned pieces column-wise, drop duplicated columns and
        # sort by the target column.
        result = pd.concat([cleaned_Data_frm1, cleaned_Data_frm, y, float_cols], axis=1)
        self.data_sorted1 = result.loc[:, ~result.columns.duplicated()]
        self.data_sorted2 = self.data_sorted1.sort_values(self.i)
        # Keep columns with at least 50% non-null values, then drop any
        # remaining rows containing NaN.
        self.data_sorted = self.data_sorted2.dropna(thresh=self.data_sorted2.shape[0] * 0.5, how='all', axis=1)
        self.data_sorted = self.data_sorted.dropna()
        new_list = [list(set(self.data_sorted.columns).difference(self.x.columns))]  # unused
        X = self.data_sorted.drop([self.i], axis=1)
        print(X.shape)
        Y = self.data_sorted[self.i]
        print(Y.unique())
        X = X.fillna(X.mean())
        # y is rebound to the target column NAME (a string), shadowing the
        # y DataFrame parameter from here on.
        y = (', '.join(["%s" % self.i]))
        print(y)
        cols = list(self.data_sorted.columns)
        x = cols
        x.remove(y)  # feature-name list (target removed) — used by the H2O branch
        # List of pipelines for ease of iteration
        l = 0
        access_key_id = self.access_key_id          # unused here
        secret_access_key = self.secret_access_key  # unused here
        models = ['Random Forest', 'KNN', 'XGB', 'SVC']
        if 'sklearn' == (', '.join(["%s" % self.sklearn])):
            print("good")

            def sklearn(X, Y, algos):
                # Randomized hyper-parameter search for the algos-th estimator.
                model = models[algos]
                X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
                X_train, X_test = train_test(X_train, X_test)
                gd = RandomizedSearchCV(self.Classifier[algos], self.Classifiers_grids[algos], cv=5, n_jobs=-1, verbose=True, refit=True)
                gd.fit(X_train, y_train)
                grid = gd.best_params_
                estimator = gd.best_estimator_
                y_pred = gd.predict(X_test)
                cm = confusion_matrix(y_test, y_pred)
                target = self.i
                Accuracy = metrics.accuracy_score(y_test, y_pred)
                print(cm)
                print(grid)
                print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
                if model == 'KNN':
                    # KNN has no native importances: use permutation importance,
                    # computed on the training split.
                    perm = PermutationImportance(gd, random_state=1).fit(X_train, y_train)
                    importances = perm.feature_importances_
                    DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, cm, target, model)
                elif model == 'SVC':
                    # Linear SVC: first row of coef_ serves as the importances.
                    importances = gd.best_estimator_.coef_
                    imp = importances.tolist()
                    importances = imp[0]
                    DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, cm, target, model)
                else:
                    importances = gd.best_estimator_.feature_importances_.tolist()
                    # create a feature list from the original dataset (list of columns)
                    feature_list = list(X_train.columns)
                    # create a list of (importance, feature) tuples, best first
                    feature_importance = sorted(zip(importances, feature_list), reverse=True)
                    DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, cm, target, model)
                return Accuracy

            # NOTE(review): `algos` is not defined anywhere in this method —
            # this call raises NameError, silently swallowed by the outer
            # bare except.
            sklearn(X, Y, algos)
        elif 'ai' == (', '.join(["%s" % self.ai])):
            print('H2o')

            def H2o(x, y, X, Y):
                # H2O AutoML over the prepared frame; y is the target NAME here.
                df = h2o.H2OFrame(self.data_sorted)
                train, test = df.split_frame(ratios=[.8])
                train[y] = train[y].asfactor()
                test[y] = test[y].asfactor()
                X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
                print(X_train.shape)
                # Run AutoML for 20 base models (limited to 1 hour max runtime by default)
                aml = H2OAutoML(max_models=5, seed=1)
                aml.train(x=x, y=y, training_frame=train)
                # View the AutoML Leaderboard
                lb = aml.leaderboard
                print(lb.head(rows=lb.nrows))
                m = h2o.get_model(lb[2, "model_id"])
                data_as_list = h2o.as_list(m, use_pandas=False)  # unused
                return lb

            H2o(x, y, X, Y)
        else:
            print('Dnn')
            if self.types == 'Classification_problem':

                def DNN():
                    # Binary classifier (sigmoid head + binary_crossentropy).
                    model = Sequential()
                    model.add(Dense(512, input_dim=X_train.shape[1], init='normal', activation='relu'))
                    model.add(BatchNormalization())
                    model.add(Dropout(0.5))
                    model.add(Dense(32, init='normal', activation='relu'))
                    model.add(BatchNormalization())
                    model.add(Dropout(0.5))
                    model.add(Dense(1, init='normal', activation='sigmoid'))
                    model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])
                    return model

                X = self.data_sorted.drop([self.i], axis=1)
                Y = self.data_sorted[self.i]
                X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
                X_train, X_test = train_test(X_train, X_test)
                classifier = KerasClassifier(build_fn=DNN, verbose=1)
                batch_size = [10, 20, 40, 60, 80, 100]
                epochs = [10, 50, 100]
                param_grid = dict(batch_size=batch_size, epochs=epochs)
                grid = GridSearchCV(estimator=classifier, param_grid=param_grid, n_jobs=-1, cv=3)
                grid_result = grid.fit(X_train, y_train)
                estimator = grid.best_estimator_
                Accuracy = grid_result.best_score_
                print("%s" % (estimator))
                y_pred = grid.predict(X_test)
                perm = PermutationImportance(grid, scoring='accuracy', random_state=1).fit(X_train, y_train)
                print(perm.feature_importances_)
                # NOTE(review): `importances`, `cm`, `target` and `model` are
                # not assigned in this branch — this DB_upload raises
                # NameError (swallowed by the outer bare except).
                DB_upload(Accuracy, X_train, X_test, y_test, None, importances, grid, estimator, l, cm, target, model)
                # summarize results
                print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
            else:
                # Multi-class: output width b = max label value + 1.
                a = np.unique(self.y)
                a.sort()
                b = a[-1]
                b += 1

                def DNN(dropout_rate=0.0, weight_constraint=0):
                    # create model
                    model = Sequential()
                    model.add(Dense(42, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu', kernel_constraint=maxnorm(weight_constraint)))
                    model.add(Dropout(dropout_rate))
                    model.add(Dense(20, kernel_initializer='uniform', activation='relu'))
                    model.add(Dense(b, activation='softmax'))
                    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
                    return model

                # NOTE(review): X_train/X_test/y_train/y_test are never defined
                # in this branch before use — NameError swallowed by the outer
                # bare except.
                classifier = KerasClassifier(build_fn=DNN, epochs=10, batch_size=10, verbose=1)
                weight_constraint = [1]  # 2, 3, 4, 5]
                dropout_rate = [0.0]  # , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
                param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)
                grid = GridSearchCV(estimator=classifier, param_grid=param_grid, n_jobs=-1, cv=3)
                grid_result = grid.fit(X_train, y_train)
                estimator = grid.best_estimator_
                Accuracy = grid_result.best_score_
                y_pred = grid.predict(X_test)
                print(y_pred)
                DB_upload(Accuracy, X_train, X_test, y_test, None, importances, grid, estimator, l, cm, target, model)
                print("%s" % (estimator))
    except:
        # NOTE(review): bare except hides every failure mode above; the
        # message mentions "Regression" although this is the classification path.
        print('Regression model building failed')
def test_estimator_type():
    """PermutationImportance must report the estimator type of the model it wraps."""
    clf_wrapper = PermutationImportance(LogisticRegression(), cv=3)
    assert is_classifier(clf_wrapper)

    reg_wrapper = PermutationImportance(RandomForestRegressor(), cv=3)
    assert is_regressor(reg_wrapper)
def test_invalid_params():
    """An unrecognised string ``cv`` argument must be rejected with ValueError."""
    with pytest.raises(ValueError):
        # The instance is never used, so don't bind it (the original left an
        # unused local `reg`, which lints as F841).
        PermutationImportance(SVR(), cv="hello")
def main_run_linear_models(train_ds, val_ds, test_ds, data_props, max_backlooking=None, layer_type='dense',
                           activation_funcs=['sigmoid', 'relu', 'tanh'], max_serach_iterations=200,
                           NN_max_depth=3, MAX_EPOCHS=800, patience=25, model_name='linear', examples=None,
                           return_permutation_importances=True, redo_serach_best_model=False):
    """Hyperopt-search small dense/RNN Keras models, cache and reload the best one.

    Pipeline: build a hyperopt search space over depth/width/activation/window,
    run ``fmin`` unless a cached best model exists in the JSON storage file,
    persist the winner, reload and recompile it, then optionally compute
    example predictions, coefficients and p-values (1-layer dense case), and
    eli5 permutation importances.

    :param train_ds / val_ds / test_ds: tf.data datasets fed to the models.
    :param data_props: nested dict describing the dataset build (see
        ``_extract_just_important_data_props`` for the keys read).
    :param max_backlooking: upper bound on the input window; defaults to the
        dataset's own window_input_width.
    :param layer_type: 'dense' flattens inputs; anything else keeps sequences.
    :param examples: optional dict with an 'X' array to produce example
        predictions for plotting.
    :return: dict with the best model's parameters, metrics and optional
        coef_/p_values/feature_importance entries, plus 'status'.

    NOTE(review): mutable default argument ``activation_funcs=[...]`` — shared
    across calls; fine as long as no caller mutates it.  The misspelled
    parameter names ``max_serach_iterations`` / ``redo_serach_best_model`` are
    part of the public interface and are kept as-is.
    """
    mlflow.set_experiment(model_name)
    # Timestamp doubling as the run id for storage/history keys.
    experiment_date_time = int(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    flatten_input = True if layer_type == 'dense' else False

    def _extract_just_important_data_props(data_props):
        # Flatten the nested data_props into the subset of keys logged to MLflow.
        kwargs = {}
        kwargs['dataset_cols_X_just_these'] = data_props['third_filter'][
            'cols_just_these']
        kwargs['dataset_cols_X_exclude'] = data_props['third_filter'][
            'cols_drop']
        kwargs['dataset_cols_y'] = data_props['third_filter'][
            'y_cols_just_these']
        kwargs['dataset_hash_input'] = int(data_props['first_step']['dataset'])
        kwargs['dataset_hash_first'] = data_props['first_step_data_hash']
        kwargs['dataset_hash_second'] = data_props['second_step_data_hash']
        kwargs['dataset_split_method'] = data_props['second_step'][
            'split_method']
        kwargs['dataset_split_steps_train'] = data_props['second_step'][
            'split_props']['train_time_steps']
        kwargs['dataset_split_steps_val'] = data_props['second_step'][
            'split_props']['val_time_steps']
        kwargs['dataset_split_steps_test'] = data_props['second_step'][
            'split_props']['test_time_steps']
        kwargs['dataset_iter_step'] = data_props['iter_step']
        kwargs['dataset_normalization'] = data_props['second_step'][
            'normalize_method']
        kwargs['dataset_window_backlooking'] = data_props['first_step'][
            'window_input_width']
        kwargs['dataset_window_prediction'] = data_props['first_step'][
            'window_pred_width']
        kwargs['dataset_window_shift'] = data_props['first_step'][
            'window_shift']
        return kwargs

    def _hp_tranform_param_dict(param_dict):
        # Lists become hp.choice, sets hp.uniform, scalars pass through.
        new_param_dict = {}
        for key, value in param_dict.items():
            if type(value) == list:
                new_param_dict[key] = hp.choice(key, value)
            elif type(value) == set:
                # NOTE(review): `values` (plural) is undefined here — this
                # branch raises NameError if a set is ever passed; presumably
                # `*value` was intended. No set appears in param_grid below,
                # so the bug is currently dormant.
                new_param_dict[key] = hp.uniform(key, *values)
            else:
                new_param_dict[key] = value
        return new_param_dict

    max_backlooking = data_props['first_step'][
        'window_input_width'] if max_backlooking is None else max_backlooking

    param_grid = dict(
        n_layers=list(range(1, NN_max_depth + 1)),
        first_layer_nodes=[0] if NN_max_depth == 1 else [128, 64, 32, 16, 8],
        last_layer_nodes=[0] if NN_max_depth == 1 else [64, 32, 16, 8, 4],
        activation_func=activation_funcs,
        backlooking_window=list(range(1, max_backlooking + 1)))

    hp_param_dict = _hp_tranform_param_dict(param_dict=param_grid)
    hp_param_dict['model_name'] = model_name
    hp_param_dict['data_props'] = data_props
    hp_param_dict['layer_type'] = layer_type

    def _optimize_objective(*args, **kwargs):
        # Objective evaluated by hyperopt for one sampled configuration.
        # NOTE(review): this closure reads `trials`, which is only defined
        # later inside the search branch — safe because fmin creates trials
        # before calling this, but fragile if reused standalone.
        if args != ():
            kwargs = args[
                0]  # if positional arguments expect first to be dictionary with all kwargs
        if type(kwargs) != dict:
            raise Exception(
                f'kwargs is not dict - it is {type(kwargs)} with values: {kwargs}'
            )
        backlooking_window = kwargs.pop('backlooking_window')
        n_layers = kwargs.pop('n_layers')
        first_layer_nodes = kwargs.pop('first_layer_nodes')
        last_layer_nodes = kwargs.pop('last_layer_nodes')
        activation_func = kwargs.pop('activation_func')
        return_everything = kwargs.pop('return_everything', False)
        verbose = kwargs.pop('verbose', 0)
        model_name = kwargs.pop('model_name', 'linear')
        data_props = kwargs.pop('data_props')
        layer_type = kwargs.pop('layer_type', 'dense')

        dataset = _get_prep_data(train_ds, val_ds, test_ds,
                                 flatten=flatten_input,
                                 keep_last_n_periods=backlooking_window)

        now = datetime.datetime.now()
        date_time = str(now.strftime("%y%m%d%H%M%S"))
        # Unique per-evaluation name encoding window/depth/activation.
        model_name = f"{date_time}_{model_name}_w{backlooking_window}_l{n_layers}_a{activation_func}"

        kwargs = dict(
            model_name=model_name,
            n_layers=n_layers,
            first_layer_nodes=first_layer_nodes,
            last_layer_nodes=last_layer_nodes,
            activation_func=activation_func,
            input_size=dataset['input_shape'] if layer_type == 'dense' else
            tuple(list(train_ds.element_spec[0].shape)[1:]),
            output_size=dataset['output_shape'],
            backlooking_window=backlooking_window,
            layer_type=layer_type)

        model = createmodel(**kwargs)
        history, mlflow_additional_params = compile_and_fit(
            model=model,
            train=dataset['train_ds'],
            val=dataset['val_ds'],
            MAX_EPOCHS=MAX_EPOCHS,
            patience=patience,
            model_name=model_name,
            verbose=verbose)

        # Get all data props for documentation in MLflow
        kwargs.update(_extract_just_important_data_props(data_props))
        kwargs['run'] = experiment_date_time
        mlflow_additional_params['kwargs'] = kwargs

        train_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['train_ds'])))
        val_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['val_ds'])))
        test_performance = dict(
            zip(
                model.metrics_names,
                evaluate_model(
                    model=model,
                    tf_data=dataset['test_ds'],
                    mlflow_additional_params=mlflow_additional_params)))
        mlflow_additional_params['data_props'] = data_props

        # Only save model if close to 15% best models
        # (the except path covers the very first trial, when no best exists yet)
        try:
            best_loss = float(trials.best_trial['result']['loss'])
            current_loss = min(history.history['val_loss'])
            if current_loss <= best_loss * (1 + 0.15):
                save_model = True
            else:
                save_model = False
        except:
            save_model = True

        mlflow_saved = my_helpers.mlflow_last_run_add_param(
            param_dict=mlflow_additional_params, save_model=save_model)

        # Free graph memory between hyperopt evaluations.
        tf.keras.backend.clear_session()

        return_metrics = dict(loss=val_performance['loss'],
                              all_metrics={
                                  'train': train_performance,
                                  'val': val_performance,
                                  'test': test_performance
                              },
                              status=STATUS_OK,
                              mlflow=mlflow_saved,
                              model_name=model_name)

        if return_everything:
            return_metrics['model'] = model
            return_metrics['history'] = history

        return return_metrics

    ###### Get old best model records ######
    storage_file_path = os.path.join(
        my_helpers.get_project_directories(key='cache_dir'),
        'storage_best_model.json')
    if not os.path.exists(storage_file_path):
        best_model_storage = {}
    else:
        with open(storage_file_path) as json_file:
            best_model_storage = json.load(json_file)

    ######## Search for best model ########
    # Only search when forced or when no cached result exists for this
    # model_name / iter_step combination.
    if redo_serach_best_model or model_name not in best_model_storage or data_props[
            'iter_step'] not in best_model_storage[model_name]:
        warnings.filterwarnings('ignore')
        trials = Trials()
        best = fmin(fn=_optimize_objective,
                    space=hp_param_dict,
                    algo=tpe.suggest,
                    max_evals=max_serach_iterations,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=int(
                        max_serach_iterations / 4),
                                                   percent_increase=0.025))
        warnings.simplefilter('always')

        # getting all parameters for best model storage
        mlflow_best_model = trials.best_trial['result']['mlflow']
        best_params = {}
        # fmin returns indices into the hp.choice lists; map back to values.
        for key, idx in best.items():
            best_params[key] = param_grid[key][idx]

        # Input coefficient names: base columns plus shifted copies for each
        # extra backlooking period.
        coef_names_ = list(
            data_props['look_ups']['out_lookup_col_name']['X'].keys())
        coef_names_ = coef_names_ + [
            col + f'_sft_{i}'
            for i in range(1, best_params['backlooking_window'])
            for col in coef_names_
        ]

        # Saving best model to storage
        if model_name not in best_model_storage:
            best_model_storage[model_name] = {}
        if data_props['iter_step'] not in best_model_storage[model_name]:
            best_model_storage[model_name][data_props['iter_step']] = {
                'best_model': {
                    'result': {
                        'loss': 10**10  # sentinel: any real loss beats this
                    }
                },
                'history': {}
            }

        best_model_param = dict(
            result={
                'loss': trials.best_trial['result']['loss'],
                'all_metrics': trials.best_trial['result']['all_metrics']
            },
            model_name=trials.best_trial['result']['model_name'],
            model_id=trials.best_trial['result']['mlflow']['model_id'],
            run_id=experiment_date_time,
            input_coefs=coef_names_,
            path_saved_model=trials.best_trial['result']['mlflow']
            ['saved_model_path'],
            status=trials.best_trial['result']['status'],
            params=best_params,
            data=_extract_just_important_data_props(data_props))

        best_model_storage[model_name][data_props['iter_step']]['history'][
            experiment_date_time] = best_model_param
        if trials.best_trial['result']['loss'] < best_model_storage[model_name][
                data_props['iter_step']]['best_model']['result']['loss']:
            best_model_storage[model_name][
                data_props['iter_step']]['best_model'] = best_model_param

        with open(storage_file_path, 'w') as outfile:
            json.dump(best_model_storage, outfile)
    else:
        # Get best model from storage
        best_model_param = best_model_storage[model_name][
            data_props['iter_step']]['best_model']

    ######## Get Best model again ########
    best_model = tf.keras.models.load_model(
        best_model_param['path_saved_model'])
    # Recompile with the full evaluation metric set used project-wide.
    best_model.compile(loss=tf.losses.MeanAbsoluteError(),
                       optimizer=tf.optimizers.Adam(),
                       metrics=[
                           tf.metrics.MeanAbsoluteError(),
                           CustomMeanDirectionalAccuracy(),
                           tf.losses.Huber(),
                           tf.metrics.MeanAbsolutePercentageError(),
                           tf.metrics.MeanSquaredError(),
                           tf.metrics.MeanSquaredLogarithmicError()
                       ])

    print('Best model is:', best_model_param)
    out = dict(best_model_param)

    ####### Get examples for plotting #######
    if examples is not None:
        example_X = examples['X']
        periods = best_model_param['params']['backlooking_window']
        if layer_type == 'dense':
            # Dense models get the last `periods` steps flattened per sample.
            example_X = tf.data.Dataset.from_tensors(
                np.reshape(example_X[:, -periods:, :],
                           (example_X.shape[0], -1)))
        else:
            example_X = tf.data.Dataset.from_tensors(example_X)
        out['examples_pred_y'] = best_model.predict(example_X)

    ###### For 1 layer dense/linear models get coef & p-values ######
    if NN_max_depth == 1 and isinstance(best_model.layers[0],
                                        tf.keras.layers.Dense):
        # Get coefs
        intercept_ = best_model.layers[0].bias.numpy()
        coef_ = best_model.layers[0].weights[0].numpy()
        out['coef_'] = pd.Series(
            dict(
                zip(['intercept_'] + best_model_param['input_coefs'],
                    intercept_.tolist() + coef_.squeeze().tolist())))

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=True,
                                 keep_last_n_periods=best_model_param['params']
                                 ['backlooking_window'])

        # get p-values
        import app.d_prediction.my_custom_pvalue_calc as my_p_lib

        out['p_values'] = {}
        for data_set in ['train', 'val', 'test']:
            y_pred = best_model.predict(dataset[f'{data_set}_X'])
            y_pred = np.reshape(y_pred, (-1, 1))
            try:
                p_values = my_p_lib.coef_pval(dataset[f'{data_set}_X'],
                                              dataset[f'{data_set}_y'], coef_,
                                              intercept_, y_pred)
                p_values = pd.Series(
                    dict(zip(best_model_param['input_coefs'], p_values)))
                out['p_values'][data_set] = p_values
            except:
                # Best-effort: record a placeholder row instead of failing.
                warnings.warn(
                    "P-Values: ValueError: Input contains infinity or nan.")
                out['p_values'][data_set] = pd.Series(
                    dict(
                        zip(best_model_param['input_coefs'],
                            ['error'] * len(best_model_param['input_coefs']))))
        out['p_values'] = pd.DataFrame(out['p_values'])

    ##### Get Column Feature Importance #####
    if return_permutation_importances:
        if 'feature_importance' in best_model_param:
            # Cached from an earlier run — reuse instead of recomputing.
            out['feature_importance'] = best_model_param['feature_importance']
        else:
            import eli5
            from eli5.sklearn import PermutationImportance

            # NOTE(review): build_fn usually receives a builder callable, not a
            # model instance; assigning .model afterwards makes the wrapper
            # behave as prefit — confirm against the KerasRegressor version used.
            sklearn_model = KerasRegressor(build_fn=best_model)
            sklearn_model.model = best_model
            dataset = _get_prep_data(
                train_ds,
                val_ds,
                test_ds,
                flatten=flatten_input,
                keep_last_n_periods=best_model_param['params']
                ['backlooking_window'])
            out['feature_importance'] = {}
            for data_set in ['train', 'val']:
                # Calculate actual FeatureImporttance
                try:
                    perm = PermutationImportance(
                        sklearn_model, cv='prefit').fit(
                            dataset[f'{data_set}_X'].numpy(),
                            np.reshape(dataset[f'{data_set}_y'].numpy(),
                                       (-1, 1)))
                    feature_importances = eli5.format_as_dataframe(
                        eli5.explain_weights(
                            perm,
                            feature_names=best_model_param['input_coefs'],
                            top=10**10))
                    out['feature_importance'][
                        data_set] = feature_importances.set_index(
                            'feature').to_dict()
                except:
                    warnings.warn(
                        "PermutationImportance: ValueError: Input contains infinity or a value too large for dtype('float16')."
                    )
            if out['feature_importance'] != {}:
                # Persist freshly computed importances back into the storage file.
                best_model_param['feature_importance'] = out[
                    'feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['best_model'][
                        'feature_importance'] = out['feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['history'][experiment_date_time][
                        'feature_importance'] = out['feature_importance']
                with open(storage_file_path, 'w') as outfile:
                    json.dump(best_model_storage, outfile)

    out['status'] = 'ok'
    return out
# Features are columns 3..193; the target is column 0 normalised by the
# number of rows (each raw value becomes a fraction of total_sents).
X = df.iloc[:, 3:194]
Y_tmp = df.iloc[:, 0]
Y = []
total_sents = len(Y_tmp)
for i in range(0,total_sents):
    Y.append(Y_tmp[i]/total_sents)

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
Y = numpy.asanyarray(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Permutation importance of the externally defined pipeline, scored on the
# held-out split.  NOTE(review): whether the pipeline is refit here or
# assumed prefit depends on this eli5 version's default `cv` — confirm.
perm = PermutationImportance(pipeline, random_state=1)
res = perm.fit(X_test,y_test)
#ret = eli5.format_as_text(eli5.explain_weights(perm))
ret = eli5.format_as_dict(eli5.explain_weights(res))
#ret = eli5.show_weights(perm, feature_names = X.columns.tolist())
print(ret)
# Print one importance entry per line, separated for readability.
for i in ret['feature_importances']['importances']:
    print(i)
    print('------')
print(perm.feature_importances_)
def genderate_PermutationImportance(X_train, y_train, is_test=True):
    """Fit a RandomForest and export eli5 permutation importances to CSV.

    Parameters
    ----------
    X_train : pd.DataFrame
        Feature matrix; column names become the 'feature' labels.
    y_train : array-like
        Target vector.
    is_test : bool, default True
        When True, 30% of the data is held out and importances are exported
        for both the train and test splits; when False, the model is fitted
        on all the data and only train importances are exported.

    Side effects
    ------------
    Writes CSV files with columns ['feature', 'imp'], sorted descending.
    NOTE(review): the two branches write to different directories
    ('../data' vs './data'); kept as in the original — confirm which path
    is intended.
    """
    import eli5  # kept for parity with the original local imports
    from eli5.sklearn import PermutationImportance

    def _perm_importance_df(model, X, y):
        # One permutation-importance pass -> sorted ['feature', 'imp'] frame.
        perm = PermutationImportance(model, random_state=1).fit(X, y)
        imp_df = pd.concat(
            [pd.Series(X.columns),
             pd.Series(perm.feature_importances_)],
            axis=1).sort_values(by=1, ascending=False)
        imp_df.columns = ['feature', 'imp']
        return imp_df.reset_index(drop=True)

    if not is_test:
        # model = LGBMClassifier(**self.params).fit(X_train,y_train)
        model = RandomForestClassifier(n_estimators=500,
                                       class_weight='balanced',
                                       random_state=2019).fit(X_train, y_train)
        _perm_importance_df(model, X_train, y_train).to_csv(
            '../data/perm_feature_importance_train.csv', index=False)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.3)
        model = RandomForestClassifier(n_estimators=500,
                                       class_weight='balanced',
                                       random_state=2019).fit(X_train, y_train)
        _perm_importance_df(model, X_train, y_train).to_csv(
            './data/perm_feature_importance_train.csv', index=False)
        _perm_importance_df(model, X_test, y_test).to_csv(
            './data/perm_feature_importance_test.csv', index=False)
# NOTE(review): script fragment — `Rand_forest`, the encoder/imputer classes,
# the train/val splits and `eli5` are defined earlier in the file.
model_predictor = Rand_forest.named_steps['randomforestclassifier']
Rand_pipeline = make_pipeline(
    OrdinalEncoder(), SimpleImputer(strategy='median'))
# fit the model
Rand_pipeline.fit(X_train, y_train)
# transform the model
TT_val = Rand_pipeline.transform(X_val)
# Permutation importance of the bare classifier on the transformed val set.
model_permuter = PermutationImportance(
    model_predictor,
    scoring='accuracy',
    n_iter=7,
    random_state=42
)
model_permuter.fit(TT_val, y_val);
# eli5 graph with weight and feature with my 14 selecting features
eli5.show_weights(
    model_permuter,
    top=None,
    feature_names=X_val.columns.tolist()
)
"""### Model Interpretation

### Isolated Partial Dependence Plots with 1 feature
# NOTE(review): fragment of a pytest parametrize list — the opening
# "@pytest.mark.parametrize(..., [" and the earlier cases are outside this
# view, and the final call below is cut off mid-expression.
    # Each case: (fitted transformer, expected transformed feature names).
    (RobustScaler(), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
    (SelectKBest(selection_score_func, k=1), ['<NAME3>']),
    (SelectKBest(selection_score_func, k=2), ['<NAME2>', '<NAME3>']),
    (FeatureUnion([('k', SelectKBest(selection_score_func, k=2)),
                   ('p', SelectPercentile(selection_score_func, 30))]),
     ['k:<NAME2>', 'k:<NAME3>', 'p:<NAME3>']),
    (VarianceThreshold(0.0), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
    (VarianceThreshold(1.0), ['<NAME2>']),
    (GenericUnivariateSelect(), ['<NAME2>']),
    (GenericUnivariateSelect(mode='k_best', param=2), ['<NAME2>', '<NAME3>']),
    (SelectFromModel(LogisticRegression('l1', C=0.01, random_state=42)),
     ['<NAME0>', '<NAME2>']),
    (SelectFromModel(
        PermutationImportance(
            LogisticRegression(random_state=42),
            cv=5,
            random_state=42,
            refit=False,
        ),
        threshold=0.1,
    ), ['<NAME2>', '<NAME3>']),
    (RFE(LogisticRegression(random_state=42), 2), ['<NAME1>', '<NAME3>']),
    (RFECV(LogisticRegression(random_state=42)),
     ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
    (RandomizedLogisticRegression(random_state=42),
     ['<NAME1>', '<NAME2>', '<NAME3>']),
])
def test_transform_feature_names_iris(transformer, expected, iris_train):
    X, y, _, _ = iris_train
    transformer.fit(X, y)
    # Test in_names being provided
    res = transform_feature_names(transformer,
# Reference walkthrough:
# https://medium.com/@hupinwei/%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92-%E5%8F%AF%E8%A7%A3%E9%87%8B%E6%80%A7-machine-learning-explainability-%E7%AC%AC%E4%BA%8C%E8%AC%9B-c090149f0772
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the match statistics and derive a binary target from the
# "Man of the Match" column.
data = pd.read_csv(
    '../input/fifa-2018-match-statistics/FIFA 2018 Statistics.csv')
y = data['Man of the Match'] == "Yes"

# Keep only the integer-typed columns as model features.
feature_names = [col for col in data.columns if data[col].dtype in [np.int64]]
X = data[feature_names]

# Hold out a validation split, then fit a random forest on the rest.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
my_model = RandomForestClassifier(n_estimators=100, random_state=0)
my_model = my_model.fit(train_X, train_y)

# Score feature relevance by permutation on the validation split.
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(my_model, random_state=1)
perm = perm.fit(val_X, val_y)
eli5.show_weights(perm, feature_names=val_X.columns.tolist())
# svc = SVC(kernel="linear") # rfecv = RFECV(estimator=svc) # rfecv.fit(X, y) # y_pos = np.arange(len(X.columns)) # plt.bar(y_pos, rfecv.grid_scores_, color=(0.2, 0.4, 0.6, 0.6)) # plt.ylim(0.0, 1.0) # plt.xlabel('Number of features selected') # plt.ylabel('Cross validation score (nb of correct classifications)') # plt.title('Feature Analisys') # plt.draw() # Use feature importance def build_model(): return base_model(input_dim=len(X.columns)) # evaluate model with standardized dataset LENTOO # estimator = KerasClassifier(build_fn=build_model, epochs=100, verbose=0) # kfold = StratifiedKFold(n_splits=10, shuffle=True) # results = cross_val_score(estimator, features, rpta, cv=kfold) # print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100)) model = KerasClassifier(build_fn=build_model) model.fit(X, y, epochs=50, batch_size=128) perm = PermutationImportance(model, random_state=1).fit(X, y) print(eli5.show_weights(perm, feature_names=X.columns.tolist()).data) plt.show()
def NN_train(filetrain, targetname, setname):
    """Train 5-fold cross-validated MLP regressors on galaxy-cluster data.

    Parameters
    ----------
    filetrain : pd.DataFrame
        Training table; must contain an 'M500c(41)' mass column and the
        target column(s).
    targetname : list of str
        Target column name(s) to regress on.
    setname : str
        Label used in printed output, figure titles and exported file names.

    Side effects
    ------------
    Prints progress/metrics, draws matplotlib figures, exports a feature
    importance CSV, the predictions CSV, and pickles the last fold's model,
    scaler and target min/max under 'saved_models/'. Returns None.
    """

    # Globally silence warnings by replacing warnings.warn with a no-op.
    def warn(*args, **kwargs):
        pass

    import warnings
    warnings.warn = warn
    import warnings  # NOTE(review): duplicate import kept from original
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt
        import os
        from scipy import stats
        from eli5.sklearn import PermutationImportance  #get feature importance per K-fold
        import pickle
        # from sklearn.externals import joblib
        #ML functions
        from sklearn.neural_network import MLPRegressor
        from sklearn.model_selection import KFold, GridSearchCV
        from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
        import sklearn.preprocessing as skpp

    #%% Import data
    dfp = filetrain
    print(setname + ' column list:')
    print(dfp.columns)
    print('-----------------------------------')
    print('Data imported')
    # Keep only massive clusters and drop columns containing NaNs.
    dfp = dfp[dfp['M500c(41)'] > 13.5]
    dfp = dfp.dropna(axis=1)
    print('New column list:')
    # targetname=['G3XMgas(80)','G3XMstar(81)' ,'G3XTgas_mw(82)', 'G3XYx(84)', 'G3XYsz(85)']
    dfp.reset_index(drop=True, inplace=True)
    Y_train = dfp[targetname]
    dfptrain = dfp.copy()
    dfptrain.drop(labels=targetname, inplace=True, axis=1)
    # Drop every remaining simulation ('G3X') column from the feature set.
    coldrop = [col for col in dfptrain.columns if 'G3X' in col]
    dfptrain.drop(labels=coldrop, inplace=True, axis=1)
    new_col_list = dfptrain.columns
    print(new_col_list)

    #plot hist
    plt.figure('Mass hist')
    plt.hist(dfp['M500c(41)'], bins=20, zorder=1, label=[setname + ' set'])
    plt.legend()

    #%% Preprocessing of data
    # Y_train=np.log10(Y_train)
    # Ysz_error=Y_train.index[Y_train['G3XYsz(85)'] == -np.inf]
    # print('This are the index of log10(Ysz)=-inf')
    # print(Ysz_error)
    #
    # Y_train.drop(labels=Ysz_error, axis=0, inplace=True)
    # dfptrain.drop(labels=Ysz_error, axis=0, inplace=True)
    #statistical data from Y_train
    # mu=np.mean(Y_train) #median
    # sigma=np.std(Y_train) #standard deviation

    #%% Analysis of train data
    #we add back target data for correlation analysis
    #dfptrain= dfptrain.copy()
    dfptrain[targetname] = Y_train
    corr = dfptrain.corr()
    plt.figure('Correlation matrix - training data', figsize=(9, 9))
    nticks = len(dfptrain.columns)
    plt.xticks(range(nticks), dfptrain.columns, rotation='vertical')
    plt.yticks(range(nticks), dfptrain.columns)
    _ = plt.colorbar(
        plt.imshow(corr,
                   interpolation='nearest',
                   vmin=-1.,
                   vmax=1.,
                   cmap=plt.get_cmap('YlOrBr')))
    plt.title('Correlation matrix - Training data', fontsize=20)
    #plt.savefig('plots/correlation/correlation_plot.png')
    # plt.show()

    #%% NN on training data - Creation of NN algorithm
    #We get the test/train index
    indexFolds = KFold(n_splits=5, shuffle=True, random_state=11)
    lVarsTarg = dfptrain.columns
    R2_NN = []
    MAE_NN = []
    MSE_NN = []
    tuned_parameters = [
        # {'hidden_layer_sizes' : [(300,200,100)],
        {
            'hidden_layer_sizes': [(20, 20, 20)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs']
        }
        # 'solver' : ['lbfgs', 'sgd', 'adam']}
    ]
    # OG Layer size : [(300,200,100)]

    # Loop over the K-fold partitions.
    ind = 0
    Ypred = np.zeros(np.shape(dfptrain[targetname]))
    Ytarg = np.zeros(np.shape(dfptrain[targetname]))
    Feature_mean = np.zeros([
        new_col_list.shape[0],
    ])
    if len(targetname) == 1:
        # Single-target case works with 1-D arrays throughout.
        Ypred = np.ravel(Ypred)
        Ytarg = np.ravel(Ytarg)
    for idxTr, idxTs in indexFolds.split(dfptrain):
        ind = ind + 1
        print()
        print()
        print('K-fold:', ind)
        #Making Min-Max Scaler
        Scaler = skpp.MinMaxScaler()
        X = dfptrain.drop(labels=targetname, axis=1)
        print(X.columns)
        print(X.columns.shape)
        Scaler.fit(X)  #Fit scaler to data, then transform
        y_min = dfptrain[targetname].min(axis=0)
        y_max = dfptrain[targetname].max(axis=0)  #stat data for inv transform
        print('y_min:')
        print(y_min)
        print('y_max:')
        print(y_max)
        '''
        dfp_scaled= (dfp - dfp.min(axis=0)) / (dfp.max(axis=0) - dfp.min(axis=0))
        dfp_inv= dfp_scaled * (dfp.max(axis=0) - dfp.min(axis=0)) + dfp.min(axis=0)
        y_min=dfp.min(axis=0)
        y_max=dfp.max(axis=0)
        '''
        dfptrain_old = dfptrain.copy()  #backup
        # dfptrain_scaled=(dfptrain - dfptrain.min(axis=0)) / (dfptrain.max(axis=0) - dfptrain.min(axis=0))
        dfptrain_X = Scaler.transform(X)
        dfptrain_X = pd.DataFrame(dfptrain_X, columns=X.columns)
        Y = dfptrain[targetname]
        # Min-max scale the targets by hand so they can be inverted later.
        dfptrain_Y = (Y - Y.min(axis=0)) / (Y.max(axis=0) - Y.min(axis=0))
        # dfptrain_scaled=pd.DataFrame(dfptrain_scaled, columns= dfptrain_old.columns)
        print('Scaling done')

        # Split the fold into training and testing parts.
        # X_train = dfptrain_X.values[idxTr,:-len(targetname)]
        # Y_train = dfptrain_Y.values[idxTr,-len(targetname):]
        # X_test = dfptrain_X.values[idxTs,:-len(targetname)]
        # Y_test = dfptrain_Y.values[idxTs,-len(targetname):]
        X_train = dfptrain_X.values[idxTr, :]
        Y_train = dfptrain_Y.values[idxTr, :]
        X_test = dfptrain_X.values[idxTs, :]
        Y_test = dfptrain_Y.values[idxTs, :]
        if len(targetname) == 1:
            Y_train = dfptrain_Y.values[idxTr, -len(targetname)]
            Y_test = dfptrain_Y.values[idxTs, -len(targetname)]
        # (Alternative standardisations, kept for reference.)
        # norm_train = skpp.StandardScaler().fit(X_train) #Normal L2 transform
        # norm_train = skpp.PowerTransformer().fit(X_train) #Power transform to gaussian like
        # X_train = skpp.StandardScaler().fit_transform(X_train) #Normal L2 transform
        # X_train = skpp.PowerTransformer().fit_transform(X_train) #Power transform to gaussian like
        #Transform back into dataframe
        X_train = pd.DataFrame(X_train, columns=X.columns)
        Y_train = pd.DataFrame(Y_train, columns=targetname)
        X_test = pd.DataFrame(X_test, columns=X.columns)
        Y_test = pd.DataFrame(Y_test, columns=targetname)
        print('Sets ready')

        #GRID SEARCH ON NEURAL NETWORK
        clf_bp = GridSearchCV(MLPRegressor(max_iter=500),
                              tuned_parameters,
                              cv=5,
                              n_jobs=-1)  #multitasking
        clf_bp.fit(X_train, Y_train)
        print(clf_bp.best_params_)
        hidden_layer_sizes = clf_bp.best_params_[
            'hidden_layer_sizes']  # best parameter from the grid search
        activation = clf_bp.best_params_['activation']
        solver = clf_bp.best_params_['solver']
        clf = MLPRegressor(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver
        )  # build the neural-network regressor with those parameters
        clf.fit(X_train, Y_train)
        score = clf.score(
            X_train, Y_train
        )  # training-set score for the selected parameters
        # feature_importances.append(clf.feature_importances_)
        # Permutation importance of each feature on the held-out fold.
        perm = PermutationImportance(clf).fit(X_test, Y_test)
        perm_weight = perm.feature_importances_
        print(perm_weight.shape)
        feature_imp = {'Feature': X_train.columns, 'Importance': perm_weight}
        feature_imp = pd.DataFrame(feature_imp)
        feature_imp_name = 'feature_importance-kfold_' + str(ind) + '-' + str(
            setname) + '.csv'
        # Accumulate the running mean over the 5 folds.
        Feature_mean += perm_weight / 5
        #feature_imp.to_csv(feature_imp_name) #export feature importance of dataframe in kfold ind for setname
        print("Score training = ", score)
        score_t = clf.score(X_test, Y_test)  # test-fold score
        print("Score test = ", score_t)
        y_pred = clf.predict(
            X_test)  #(dfP.values[:,:-1]) # predictions for X_test
        y_target = Y_test  #dfP.values[:,-1] #Y_tot
        if len(targetname) == 1:
            y_target = np.ravel(y_target)
        Ypred[idxTs, ] = y_pred
        Ytarg[idxTs, ] = y_target
        print('MSE = ', (mean_squared_error(y_target, y_pred)))
        print('MAE = ', (mean_absolute_error(y_target, y_pred)))
        print('R^2 score =', (r2_score(y_target, y_pred)))
        # Store the per-fold metrics.
        MSE_NN.append(mean_squared_error(y_target, y_pred))
        MAE_NN.append(mean_absolute_error(y_target, y_pred))
        R2_NN.append(r2_score(y_target, y_pred))
        print()
        print()

    # Aggregate per-fold metrics.
    MSE_NN = np.array(MSE_NN)
    MAE_NN = np.array(MAE_NN)
    R2_NN = np.array(R2_NN)
    print('MSE - 5 Folds : ', MSE_NN.mean())
    print('MAE - 5 Folds : ', MAE_NN.mean())
    print('R^2 - 5 Folds : ', R2_NN.mean())
    # Feature_mean= Feature_mean.mean(axis=0, skipna=True)
    print(Feature_mean)
    Feature_name = 'Feature importance for NN' + str(targetname) + '.csv'
    Features = {'Name': X_train.columns, 'Weight': Feature_mean}
    Features = pd.DataFrame(Features)
    Features.to_csv(Feature_name)
    print('Feature importance exported')
    Ypred_NN = pd.DataFrame(Ypred, columns=targetname)
    Ytarg_NN = pd.DataFrame(Ytarg, columns=targetname)

    #%%
    # Invert the min-max scaling of predictions and targets.
    # NOTE(review): uses y_min/y_max computed in the LAST fold — they are
    # derived from the whole dfptrain each iteration, so values coincide,
    # but confirm this is intended.
    for target in targetname:
        Ypred_NN[target] = Ypred_NN[target] * (y_max[target] -
                                               y_min[target]) + y_min[target]
        Ytarg_NN[target] = Ytarg_NN[target] * (y_max[target] -
                                               y_min[target]) + y_min[target]
    #name=str(targetname)+'data.pickle'
    #with open(name, 'wb') as f:
    #    pickle.dump([Ypred_RF, Ytarg_RF], f)
    # Persist the last fold's model, scaler and target min/max.
    name = 'NN' + str(targetname) + setname + 'monotargetV7.pickle'
    pickle.dump(clf, open('saved_models/' + name, 'wb'),
                protocol=2)  #Export NN algorithm
    # joblib.dump(clf,name)
    # pickle.dump(norm_train, open('normalicer'+name, 'wb'), protocol=2) #normalicer for data
    pickle.dump(Scaler, open('saved_models/normalicer' + name, 'wb'),
                protocol=2)  #normalicer for data
    pickle.dump([y_max, y_min],
                open('saved_models/statdata' + name, 'wb'),
                protocol=2)
    #with open(name, 'wb') as f:
    #    pickle.dump(clf, f)

    #target_NN=['G3XMgas_NN(80)','G3XMstar_NN(81)' ,'G3XTgas_mw_NN(82)', 'G3XYx_NN(84)', 'G3XYsz_NN(85)']
    # Append the de-scaled predictions to the backup frame and export.
    Y_NN = pd.DataFrame(data=Ypred_NN.values, columns=targetname)
    dfptrain_old = pd.concat([dfptrain_old, Y_NN], axis=1)
    dfptrain_old_name = 'NN_' + setname + 'V7'
    dfptrain_old.to_csv(dfptrain_old_name)
    print('Data exported')

    #%% training plotting
    # f1=plt.figure('Mgas NN'+ setname)
    # plt.scatter(Ytarg_NN['G3XMgas(80)'].values, Ypred_NN['G3XMgas(80)'].values,marker='o', s=(72./f1.dpi)**2,lw=0)
    # plt.plot(np.linspace(min(Ytarg_NN['G3XMgas(80)'].values), max(Ytarg_NN['G3XMgas(80)'].values)), \
    #          np.linspace(min(Ytarg_NN['G3XMgas(80)'].values), max(Ytarg_NN['G3XMgas(80)'].values)), '-r' )
    # plt.title('Mgas - NN vs real '+ setname)
    # plt.xlabel('Mgas real - log scale')
    # plt.ylabel('Mgas NN - log scale')
    # f1.savefig('Mgas NN'+ setname + ".pdf", bbox_inches='tight')
    # plt.close()
    #
    # f2=plt.figure('Mstar NN'+ setname)
    # plt.scatter(Ytarg_NN['G3XMstar(81)'].values, Ypred_NN['G3XMstar(81)'].values,marker='o', s=(72./f2.dpi)**2,lw=0)
    # plt.plot(np.linspace(min(Ytarg_NN['G3XMstar(81)'].values), max(Ytarg_NN['G3XMstar(81)'].values)), \
    #          np.linspace(min(Ytarg_NN['G3XMstar(81)'].values), max(Ytarg_NN['G3XMstar(81)'].values)), '-r' )
    # plt.title('Mstar - NN vs real ' + setname)
    # plt.xlabel('Mstar real - log scale')
    # plt.ylabel('Mstar NN - log scale')
    # f2.savefig('Mstar NN'+ setname+ ".pdf", bbox_inches='tight')
    # plt.close()
    #
    # f3=plt.figure('Tgas NN'+ setname)
    # plt.scatter(Ytarg_NN['G3XTgas_mw(82)'].values, Ypred_NN['G3XTgas_mw(82)'].values,marker='o', s=(72./f3.dpi)**2,lw=0)
    # plt.plot(np.linspace(min(Ytarg_NN['G3XTgas_mw(82)'].values), max(Ytarg_NN['G3XTgas_mw(82)'].values)), \
    #          np.linspace(min(Ytarg_NN['G3XTgas_mw(82)'].values), max(Ytarg_NN['G3XTgas_mw(82)'].values)), '-r' )
    # plt.title('Tgas - NN vs real ' + setname)
    # plt.xlabel('Tgas real - log scale')
    # plt.ylabel('Tgas NN - log scale')
    # f3.savefig('Tgas NN'+ setname+ ".pdf", bbox_inches='tight')
    # plt.close()
    #
    # f4=plt.figure('G3XYx NN'+ setname)
    # plt.scatter(Ytarg_NN['G3XYx(84)'].values, Ypred_NN['G3XYx(84)'].values,marker='o', s=(72./f4.dpi)**2,lw=0)
    # plt.plot(np.linspace(min(Ytarg_NN['G3XYx(84)'].values), max(Ytarg_NN['G3XYx(84)'].values)), \
    #          np.linspace(min(Ytarg_NN['G3XYx(84)'].values), max(Ytarg_NN['G3XYx(84)'].values)), '-r' )
    # plt.title('Yx - NN vs real ' + setname)
    # plt.xlabel('G3XYx real - log scale')
    # plt.ylabel('G3XYx NN - log scale')
    # f4.savefig('G3XYx NN'+ setname+ ".pdf", bbox_inches='tight')
    # plt.close()
    #
    # f5=plt.figure('G3XYsz NN'+setname)
    # plt.scatter(Ytarg_NN['G3XYsz(85)'].values, Ypred_NN['G3XYsz(85)'].values,marker='o', s=(72./f5.dpi)**2,lw=0)
    # plt.plot(np.linspace(min(Ytarg_NN['G3XYsz(85)'].values), max(Ytarg_NN['G3XYsz(85)'].values)), \
    #          np.linspace(min(Ytarg_NN['G3XYsz(85)'].values), max(Ytarg_NN['G3XYsz(85)'].values)), '-r' )
    # plt.title('Yx - NN vs real '+setname)
    # plt.xlabel('G3XYsz real - log scale')
    # plt.ylabel('G3XYsz NN - log scale')
    # f5.savefig('G3XYsz NN'+setname+ ".pdf", bbox_inches='tight')
    # plt.close()
    return
# NOTE(review): fragment — the fit call opens outside this view; `model`,
# `history`, `x`, `y`, `testdata`, `yLabels` and `dataall` are defined earlier.
    epochs=200,
    verbose=0)
#Change dataallwo to dataall to include census data
print("NN Average RMSE: ", np.average(history.history['loss']))
print("NN Average Normalized RMSE: ",
      np.average(history.history['loss']) / (max(yLabels) - min(yLabels)))
#%%
#Model evaluation
evaltest = model.evaluate(x, y, batch_size=1)
#print('Accuracy: %.2f' % (accuracy*100))
print(evaltest)
#%%
#Plot loss over epochs
plt.plot(history.history['val_loss'])
plt.show()
#%%
#Not used in preliminary results
permut = PermutationImportance(model, scoring="accuracy").fit(testdata, yLabels)
eli5.show_weights(permut, feature_names=dataall.columns.tolist())
# NOTE(review): fragment — the opening of `group_param` and the definitions
# of `rf_model`, `x`, the train/val splits and `eli5` are outside this view.
    'random_state': [0],
}
# Instantiate the grid search model
hyperp_srch = GridSearchCV(estimator=rf_model,
                           param_grid=group_param,
                           cv=5,
                           return_train_score=False)
hyperp_srch.fit(x_train, y_train)
#print(hyperp_srch.best_params_)
best_hyper = hyperp_srch.best_estimator_
# Refit a fresh forest with the winning hyper-parameters.
rf_model = RandomForestClassifier(**best_hyper.get_params())
rf_model.fit(x_train, y_train)
y_pred_train = rf_model.predict(x_train)
y_pred_val = rf_model.predict(x_val)
## End
print('Classification Report: \n')
print(classification_report(y_val, y_pred_val))
print('\nConfusion Matrix: \n')
print(confusion_matrix(y_val, y_pred_val))
# Permutation importance computed on the training split.
permutation = PermutationImportance(rf_model, random_state=2).fit(x_train, y_train)
eli5.explain_weights(permutation, feature_names=x.columns.tolist())
print(
    eli5.format_as_text(
        eli5.explain_weights(permutation, feature_names=x.columns.tolist())))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42) # Define the model. regressor = RandomForestRegressor(n_estimators=1000, random_state=42, max_depth=12, max_samples=None) # Fit to the data. regressor.fit(x_train, y_train) # Print both types of feature importance. perm = PermutationImportance(regressor, random_state=42).fit(x_test, y_test) print("Feature importances using permutation", perm.feature_importances_) print("Feature importances using MDI ", regressor.feature_importances_) # Calculate the average percentage error. predictions = regressor.predict(x_test) mean_perc_error = np.average((np.abs(y_test - predictions) * 100 / y_test)) print("Average percentage error ", mean_perc_error) all_predictions = regressor.predict(x) data["predictions"] = all_predictions # select one speed for plotting speed = 3000 test = data[data.rpm == speed]
# NOTE(review): fragment — the train_test_split call opens outside this view;
# `MLA`, `times`, `y_test`, `copy` and the sklearn/eli5 imports come earlier.
    test_size=0.2,
    random_state=times)
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
total_predict = np.zeros(len(y_test))
for i in range(len(MLA)):
    skf = StratifiedKFold(n_splits=5, random_state=times)
    clf = copy.deepcopy(MLA[i])
    clf.random_state = times
    # Feature selection driven by cross-validated permutation importance.
    sel = SelectFromModel(
        PermutationImportance(clf, cv=skf,
                              random_state=times)).fit(x_train, y_train)
    x_train_trans = sel.transform(x_train)
    x_test_trans = sel.transform(x_test)
    vali_auc = np.mean(
        cross_val_score(clf, x_train_trans, y_train, cv=skf,
                        scoring='roc_auc'))
    clf.fit(x_train_trans, y_train)
    # Accumulate each model's positive-class probabilities into the ensemble.
    predict_result = clf.predict_proba(x_test_trans)[:, 1]
    total_predict += predict_result
    test_auc = roc_auc_score(y_test, predict_result)
# NOTE(review): fragment — commented-out LightGBM constructor arguments
# retained from the original; `lgb_clf`, `X`, `y`, `test_X`, `test_y`,
# `feature_names`, `shuffle_verify`, `pdp`, `eli5` and `plt` come earlier.
#                          num_leaves=13,
#                          max_depth=5,
#                          learning_rate=0.01,
#                          min_split_gain=0,
#                          min_child_samples=2,
#                          colsample_bytree=0.4,
#                          objective='binary',
#                          random_state=42,
#                          eval_metric='roc_auc',
#                          n_jobs=-1)

shuffle_verify(X, y, lgb_clf)

# Permutation Importance
perm = PermutationImportance(lgb_clf, random_state=42).fit(test_X, test_y)
eli5.show_weights(perm, feature_names=feature_names)


# Partial Dependence PLots - outliers make it difficult to see.
def pdp_plotter(feature, model):
    """Isolate and plot the partial dependence of `feature` for `model`.

    BUGFIX: the original body ignored the `model` argument and always used
    the module-level `lgb_clf`; it now uses the model actually passed in
    (the only visible call passes `lgb_clf`, so its output is unchanged).
    """
    pdp_feat = pdp.pdp_isolate(model=model,
                               dataset=test_X,
                               model_features=feature_names,
                               feature=feature)
    pdp.pdp_plot(pdp_feat, feature)
    plt.show()


pdp_plotter('service_to_uza_area', lgb_clf)
# NOTE(review): notebook fragment — `rmse`, `val_pred`, `Y_val`, `model`,
# `X_train`, `Y_train`, `df_std` and the metric imports come earlier.
rmse.append(np.sqrt(mean_squared_error(val_pred, Y_val)))
r2 = []
r2.append(r2_score(val_pred, Y_val))
d = {'RMSE': rmse}
d1 = {'R2': r2}
print(d, d1)

# #### This model is pretty good since we have an R squared value close to 1 and very low RMSE value but lets try to optimize it

# In[80]:

import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance is computed on the TRAINING data here.
perm = PermutationImportance(model, random_state=1).fit(X_train, Y_train)
eli5.show_weights(perm, feature_names=X_train.columns.tolist())

# In[ ]:

# Here we can see that the features that have the biggest impact of predicting the GDP per capita value are "Population" and "HDI"
# So we are going to take those two features now for our new model

# In[81]:

X_new = df_std[["POP", "HDI"]]
# Performance best_adj_r2 = adj_r2(best_r2, n, p) r2_dict[model_filename] = best_adj_r2 # Permutation Importance X_data = pd.read_csv(data_folder / 'X_varGroup{}.csv'.format(model_filename[-8]), index_col='t10_cen_uid_u_2010', dtype={'t10_cen_uid_u_2010': object}) y_data = pd.read_csv(data_folder / 'y_{}.csv'.format(model_filename[-6:-4]), index_col='t10_cen_uid_u_2010', dtype={'t10_cen_uid_u_2010': object}, squeeze=True) perm = PermutationImportance(pipe, scoring='r2') \ .fit(X_data.values, y_data.values, cv='prefit') perm_results = np.mean(np.array(perm.results_), axis=0) perm_df = pd.DataFrame({ # 'feature': [x[8:] for x in colnames[model_filename[-8]]], 'feature': X_data.columns.tolist(), 'importance': perm_results }) \ .sort_values('importance', ascending=False) \ .set_index('feature') # Coefficients features_final = pipe.named_steps['poly'].get_feature_names( colnames[model_filename[-8]]) coef_df = pd.DataFrame.from_dict( {
# NOTE(review): notebook fragment — the RandomForest constructor opens
# outside this view; the transformed splits come earlier. The `! pip`
# line is IPython shell magic and only runs inside a notebook.
    max_samples=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=4,
    min_samples_split=2,
    min_weight_fraction_leaf=0,
    n_estimators=29,
    n_jobs=None,
    oob_score=False,
    random_state=0,
    verbose=0,
    warm_start=False)
model1.fit(X_train_transformed, y_train)

# Get permutation importances
! pip install eli5
from eli5.sklearn import PermutationImportance
import eli5

permuter = PermutationImportance(
    model1,
    scoring='r2',
    n_iter=2,
    random_state=42
)
permuter.fit(X_val_transformed, y_val)
feature_names = X_val.columns.tolist()
eli5.show_weights(
    permuter,
    top=None,  # show permutation importances for all features
    feature_names=feature_names
)
from sklearn.metrics import mean_squared_error, r2_score
# Coefficient of determination r2 for the training set
# NOTE(review): fragment — the lines below up to `return model` are the tail
# of a Keras model factory whose `def base_model():` header is outside this
# view; `KerasClassifier`, `callbacks_list`, the splits, `pd` and `eli5`
# are defined earlier.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

#sk_params=[]
batch_size=32
nb_epoch=20
#my_model = KerasClassifier(build_fn=base_model)
classifier = KerasClassifier(build_fn = base_model,
                             validation_split=0.2,
                             batch_size=batch_size,
                             shuffle=True,
                             epochs=nb_epoch,
                             verbose=1,
                             callbacks=callbacks_list)
classifier.fit(X_train, y_train)
# Single permutation pass (n_iter=1) on the test split.
perm = PermutationImportance(classifier, random_state=1, n_iter=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = ['TantraLines', 'TantraHits', 'Mestimator',
                                         'TantraZenith', 'TantraAzimuth',
                                         'TantraAngularEstimator', 'TantraX', 'TantraY',
                                         'TantraZ', 'Lambda', 'Beta', 'TrackLength',
                                         'TantraEnergy', 'TantraRho', 'IntegralCharge',
                                         'MeanCharge', 'StdCharge', 'TriggerCounter',
                                         'GridQuality', 'AAZenith', 'AAAzimuth',
                                         'Trigger3N', 'TriggerT3', 'NOnTime'])
# results_[0] holds the importances of the single permutation iteration.
perm_train_feat_imp_df = pd.DataFrame({'val': perm.results_[0],
                                       'lab': ['TantraLines', 'TantraHits', 'Mestimator',
                                               'TantraZenith', 'TantraAzimuth',
                                               'TantraAngularEstimator', 'TantraX', 'TantraY',
                                               'TantraZ', 'Lambda', 'Beta', 'TrackLength',
                                               'TantraEnergy', 'TantraRho', 'IntegralCharge',
                                               'MeanCharge', 'StdCharge', 'TriggerCounter',
                                               'GridQuality', 'AAZenith', 'AAAzimuth',
                                               'Trigger3N', 'TriggerT3', 'NOnTime']
                                       }
                                      )
perm_train_feat_imp_df.plot.barh(x='lab', y='val')