def adaboost_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    """Gradient-boosted regression trees (called "AdaBoost" in this code base) on one CV fold.

    Parameters
    ----------
    feature_sets, dim, dimsum : unused here; kept for the shared on-fold signature.
    train, test : index arrays selecting this fold's rows of X / y.
    y : target values; y_all : full annotation table (needs a 'Target gene' column).
    X : feature matrix.
    learn_options : dict of hyperparameters; 'adaboost_CV' switches on grid search.

    Returns
    -------
    (y_pred, clf) : column-vector predictions for the test rows, and the fitted model.
    """
    if learn_options['adaboost_version'] != 'python':
        raise NotImplementedError
    if not learn_options['adaboost_CV']:
        clf = en.GradientBoostingRegressor(
            loss=learn_options['adaboost_loss'],
            learning_rate=learn_options['adaboost_learning_rate'],
            n_estimators=learn_options['adaboost_n_estimators'],
            alpha=learn_options['adaboost_alpha'],
            subsample=1.0,
            min_samples_split=2,
            min_samples_leaf=1,
            max_depth=learn_options['adaboost_max_depth'],
            init=None,
            random_state=None,
            max_features=None,
            verbose=0,
            max_leaf_nodes=None,
            warm_start=False)
        clf.fit(X[train], y[train].flatten())
    else:
        print("Adaboost with GridSearch")
        from sklearn.model_selection import GridSearchCV, StratifiedKFold
        param_grid = {
            'learning_rate': [0.1, 0.05, 0.01],
            'max_depth': [4, 5, 6, 7],
            'min_samples_leaf': [5, 7, 10, 12, 15],
            'max_features': [1.0, 0.5, 0.3, 0.1]
        }
        # Stratify the inner CV by target gene so a gene never appears in both
        # an inner-train and an inner-validation split.
        label_encoder = sklearn.preprocessing.LabelEncoder()
        label_encoder.fit(y_all['Target gene'].values[train])
        gene_classes = label_encoder.transform(
            y_all['Target gene'].values[train])
        n_folds = len(np.unique(gene_classes))
        # BUG FIX: model_selection.StratifiedKFold takes n_splits in the
        # constructor and the class labels in split(); the old
        # cross_validation-style call StratifiedKFold(y, n_folds=...) raises.
        # Materialize the splits so the grid search stratifies on gene_classes
        # rather than on the continuous regression target.
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
        cv = list(skf.split(X[train], gene_classes))
        est = en.GradientBoostingRegressor(
            loss=learn_options['adaboost_loss'],
            n_estimators=learn_options['adaboost_n_estimators'])
        # NOTE: the deprecated 'iid' argument was removed from GridSearchCV
        # (scikit-learn 0.24); dropping it matches the old iid=False behavior.
        clf = GridSearchCV(est, param_grid, n_jobs=20, verbose=1, cv=cv,
                           scoring=spearman_scoring).fit(X[train], y[train].flatten())
        print(clf.best_params_)
    y_pred = clf.predict(X[test])[:, None]
    return y_pred, clf
def main(model='mlp', num_epochs=500, dataset='reuters', folder="", exp_start_time=None):
    """Train a network built by build_network_model on 'boston' or 'reuters*'.

    Trains for num_epochs, tracks train/validation/test metrics every epoch,
    then saves the metric curves and the trained parameters under `folder`.

    NOTE(review): relies on module-level helpers defined elsewhere in this
    file (build_network_model, build_functions, set_params_value,
    iterate_minibatches, load_dataset_reuters, print_param_stats) and on
    module-level imports (T = theano.tensor, lasagne, np, time).
    """
    # Load the dataset
    print("Loading data...")
    if dataset== 'boston':
        # Boston housing: regression setup, with a scikit-learn GBM baseline.
        # NOTE(review): sklearn.cross_validation was removed in sklearn>=0.20
        # (model_selection is the modern module) — this path needs old sklearn.
        from sklearn import cross_validation
        from sklearn import preprocessing
        from sklearn import datasets
        #from sklearn.utils import shuffle
        boston = datasets.load_boston()
        X, y = boston.data.astype('float32'), boston.target.astype('float32')
        #X, y = shuffle(boston.data, boston.target, random_state=13)
        scaler = preprocessing.StandardScaler()
        # Standardization is fit on the FULL dataset before the split, so the
        # test fold leaks into the scaler statistics — presumably intentional
        # for this quick benchmark; TODO confirm.
        X = scaler.fit_transform(X)
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X, y, test_size=0.1, random_state=42)
        #X_train = scaler.fit_transform(X_train)
        # The "validation" set is literally a copy of the training set here.
        X_val = X_train.copy()
        y_val = y_train.copy()
        print("validation is just a copy of X_train, so results will be similar but with no drop out")
        # Gradient-boosting baseline for comparison against the network.
        from sklearn import ensemble
        from sklearn.metrics import mean_squared_error
        params = {'n_estimators': 150, 'max_depth': 4, 'min_samples_split': 2,
                  'learning_rate': 0.01, 'loss': 'ls'}
        clf = ensemble.GradientBoostingRegressor(**params)
        clf.fit(X_train, y_train)
        mse_train = mean_squared_error(y_train, clf.predict(X_train))
        mse_test = mean_squared_error(y_test, clf.predict(X_test))
        print("GRAD BOOST MSE train: %.4f" % mse_train)
        print("GRAD BOOST MSE test: %.4f" % mse_test)
        # Theano symbolic inputs: feature matrix + target vector (regression).
        input_var = T.fmatrix('inputs')
        target_var = T.fvector('targets')
        input_shape= (None, X_train.shape[1])
        output_shape = 1  # single regression output
        batch_num = 128
        regress= True
        batch_norm =False
        print(output_shape)
        network = build_network_model(model, input_var, input_shape, output_shape,
                                      batch_norm=batch_norm, regress=regress)
        print(network.output_shape)
        train_fn, eval_fn, LR_params = build_functions(network, input_var,
                                                       target_var, regress=regress)
    elif dataset.startswith('reuters'):
        # Reuters: multi-label classification setup (matrix-valued targets).
        X_train, y_train, X_val, y_val, X_test, y_test =load_dataset_reuters('../datasets/reuters/')
        print ("Train: ",X_train.shape, "Val: ", X_val.shape, "Test: ",X_test.shape)
        input_var = T.fmatrix('inputs')
        target_var = T.fmatrix('targets')  # one column per class
        input_shape= (None, X_train.shape[1])
        output_shape = y_train.shape[1]
        regress = False
        batch_norm = True
        # (dead code dropped for clarity: commented-out scikit-learn
        # RandomForestClassifier / MLPClassifier baselines lived here.)
        batch_num = 16
        network = build_network_model(model, input_var, input_shape, output_shape,
                                      batch_norm=batch_norm, regress=regress)
        print(network.output_shape)
        train_fn, eval_fn, LR_params = build_functions(network, input_var,
                                                       target_var, regress=regress)
    # Prepare Theano variables for inputs and targets
    print("input shape:", input_shape)
    # Per-epoch history buffers.
    val_acc_list =[]
    tst_acc_list =[]
    val_err_list = []
    trn_err_list = []
    print("Model", model)
    # One global learning rate (lr_all) plus separate rates lr_mu/lr_si/lr_fw
    # for special parameter groups — presumably the focusing-layer
    # mean/sigma/weight parameters; TODO confirm against build_network_model.
    if model.startswith("mlp:"):
        lr_all = 5e-4
    else:
        lr_all = 1e-4 #reuters best 1e-4 for focused doesnt change 5e-5
    lr_all_decay = .9
    lr_mu = 0.001
    lr_mu_decay = 0.9
    lr_si = 0.001
    lr_si_decay = 0.9
    lr_fw = 0.001
    lr_fw_decay = .9
    decay_epoch = 30  # decay all learning rates every `decay_epoch` epochs
    print_int = 10    # progress-print interval in epochs
    if dataset=='boston':
        # boston overrides: larger rates; decay/print intervals exceed
        # num_epochs, i.e. effectively disabled.
        lr_all = 0.005
        lr_all_decay = .9
        lr_mu = 0.001
        lr_mu_decay = 0.9
        lr_si = 0.001
        lr_si_decay = 0.9
        lr_fw = 0.005
        lr_fw_decay = .9
        decay_epoch = 1000
        print_int = 1000
    set_params_value(LR_params,[lr_all,lr_mu,lr_si,lr_fw])
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        if (epoch>1 and epoch%decay_epoch==1):
            #lr_all = 0.001
            #lr_fw = 0.001
            # Multiplicative decay, pushed into the shared LR variables.
            lr_all = lr_all * lr_all_decay
            lr_mu = lr_mu * lr_mu_decay
            lr_si = lr_si * lr_si_decay
            lr_fw = lr_fw * lr_fw_decay
            set_params_value(LR_params,[lr_all,lr_mu,lr_si,lr_fw])
        for batch in iterate_minibatches(X_train, y_train, batch_num, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1
        trn_err_list.append(train_err/train_batches)
        # And a full pass over the validation data (one full-size batch):
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, y_val.shape[0], shuffle=False):
            inputs, targets = batch
            err, acc = eval_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1
        train_err = train_err / train_batches
        val_err = val_err / val_batches
        val_acc = val_acc / val_batches * 100
        val_err_list.append(val_err)
        # Then we print the results for this epoch:
        if (epoch%print_int==0):
            print("Model {} Epoch {} of {} took {:.3f}s".format(model, epoch + 1, num_epochs, time.time() - start_time))
            print(" training loss:\t\t{:2.6f}".format(train_err))
            print(" validation loss:\t\t{:2.6f}".format(val_err))
            if not regress:
                print(" validation accuracy:\t\t{:2.4f} %".format(val_acc))
            else:
                # Regression: repurpose val_acc as (1 - val error).
                # NOTE(review): this only runs on print epochs, so on other
                # epochs the stale val_acc is appended below — confirm intent.
                val_acc = 1-val_err
            print_param_stats(network)
            #debug_focus_vars(network)
        if np.isnan(train_err):
            print("Train error NAN")
            break
        # Evaluate the test set every epoch so we can later report the test
        # metric at the best-validation epoch (early-stopping style).
        tst_err, tst_acc = eval_fn(X_test, y_test)
        if not regress:
            tst_acc_list.append(tst_acc * 100)  # to pick the tst error at best val accuracy.
            val_acc_list.append(val_acc)
        else:
            tst_acc_list.append(tst_err)  # to pick the tst error at best val accuracy.
            val_acc_list.append(val_err)
    # After training, we compute and print the test error:
    val_ac_np = np.asarray(val_acc_list)
    if regress:
        best_val = np.argmin(val_ac_np)  # regression: lower val error is better
    else:
        best_val = np.argmax(val_ac_np)  # classification: higher val accuracy is better
    if np.isnan(train_err):
        return  # training diverged: nothing worth saving
    tst_err_fin, tst_acc_fin = eval_fn(X_test, y_test)
    print("\nFinal results:")
    print(" test loss:\t\t\t{:.6f}".format(tst_err_fin))
    print(" test accuracy:\t\t{:.4f} %".format(tst_acc_fin))
    print("\nTest result at best val epoch: ", best_val)
    print(" test accuracy:\t\t{:.4f} %".format(tst_acc_list[best_val]))
    best_test_early_stop = tst_acc_list[best_val]
    from datetime import datetime
    now = datetime.now()
    timestr = now.strftime("%Y%m%d-%H%M%S")
    print("_result_change")
    print(start_time, timestr)
    # Save the metric curves (single tuple inside the npz archive).
    filename= str(folder+dataset+"_result_"+model+"_"+exp_start_time+"_"+timestr)
    np.savez(filename,(trn_err_list, val_err_list, val_acc_list, tst_err_fin,
                       tst_acc_fin*100, tst_acc_list, best_test_early_stop))
    # save model and code
    filename= str(folder+dataset+"_model_"+model+"_"+timestr)
    fixed_params = lasagne.layers.get_all_params(network, trainable=False)
    fixed_params =[t.name for t in fixed_params]
    trn_params = lasagne.layers.get_all_params(network, trainable=True)
    trn_params =[t.name for t in trn_params]
    fixed_param_values = lasagne.layers.get_all_param_values(network, trainable=False)
    trn_param_values = lasagne.layers.get_all_param_values(network, trainable=True)
    np.savez(filename, trn_params, trn_param_values, fixed_params, fixed_param_values)
    # Optional diagnostic plot (disabled by default).
    plt_figures = False
    if plt_figures:
        import matplotlib.pyplot as plt
        plt.plot(trn_err_list)
        plt.plot(val_err_list)
        plt.ylim([0, 0.25])
        plt.title("Train and Validation Error")
        plt.legend(("Train","Validate"))
        plt.show()
print "\n\n\n mean err ", s.mean() #============================================================================== # GBoost #============================================================================== # Fit regression model params = { 'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1, 'learning_rate': 0.01, 'loss': 'ls', 'verbose': 1 } clf = ensemble.GradientBoostingRegressor(**params) i = 0 clf.fit(X_train, Y_train[:, i]) mse = mean_squared_error(Y_valid[:, i], clf.predict(X_valid)) print("MSE: %.4f" % mse) #============================================================================== # grid searching #============================================================================== #param_grid ={"kernel": ('linear', 'poly', 'rbf', 'sigmoid', 'precomputed')} # #params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1, # 'learning_rate': 0.01, 'loss': 'ls'} # #grid_search = GridSearchCV(sup_vec, param_grid=param_grid) #start = time()
def eval_bootstrap(df, features, md):
    """Repeated K-fold evaluation of one regressor family on `df`.

    Runs 4 repetitions (random_state 1..4) of N_FOLDS-fold CV, averages the
    fold metrics within each repetition, then across repetitions.

    Parameters
    ----------
    df : DataFrame holding the `features` columns plus LABEL_COLUMN_NAME.
    features : list of feature column names.
    md : model selector, 1..6 (GBM, RandomForest, SVR, MLP, Linear, Lasso).

    Returns
    -------
    (rmse, mae, r2) : metrics averaged over all repetitions.
    """
    X = df[features].values
    y = df[LABEL_COLUMN_NAME].values
    rep_rmse, rep_mae, rep_r2 = [], [], []
    for seed in range(1, 5):
        fold_rmse, fold_mae, fold_r2 = [], [], []
        cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
        for train, val in cv.split(X, y):
            # Fresh, unfitted estimator for every fold.
            regressor = _make_regressor(md)
            regressor = regressor.fit(X[train], y[train])
            pred = regressor.predict(X[val])
            fold_rmse.append(np.sqrt(np.mean((pred - y[val]) ** 2)))
            fold_mae.append(mean_absolute_error(y[val], pred))
            # BUG FIX: r2_score expects (y_true, y_pred); the original
            # reversed order silently computed a different statistic
            # (R^2 is not symmetric in its arguments).
            fold_r2.append(r2_score(y[val], pred))
        rep_rmse.append(np.mean(fold_rmse))
        rep_mae.append(np.mean(fold_mae))
        rep_r2.append(np.mean(fold_r2))
    return np.mean(rep_rmse), np.mean(rep_mae), np.mean(rep_r2)


def _make_regressor(md):
    """Build an unfitted regressor for model id `md` (1..6); raise otherwise."""
    if md == 1:
        return ensemble.GradientBoostingRegressor(
            n_estimators=30, max_depth=4, min_samples_split=2,
            learning_rate=0.1, loss='ls', random_state=RANDOM_STATE)
    if md == 2:
        return ensemble.RandomForestRegressor(
            n_estimators=30, max_depth=10, min_samples_split=4,
            random_state=RANDOM_STATE)
    if md == 3:
        return SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
    if md == 4:
        return MLPRegressor(hidden_layer_sizes=(20, 30, 30, 5,),
                            batch_size=10, activation='relu',
                            random_state=RANDOM_STATE)
    if md == 5:
        return LinearRegression()
    if md == 6:
        return Lasso(alpha=0.1, random_state=RANDOM_STATE)
    # Previously an unknown id reused the prior fold's model or raised
    # NameError on the first fold; fail loudly instead.
    raise ValueError("unknown model id: %r" % (md,))
model_random_forest_regressor = ensemble.RandomForestRegressor(n_estimators=20) # 使用20个决策树 ModelList.append([model_random_forest_regressor,'随机森林回归']) # 6.Adaboost回归 from sklearn import ensemble model_adaboost_regressor = ensemble.AdaBoostRegressor(n_estimators=50) # 这里使用50个决策树 ModelList.append([model_adaboost_regressor,'Adaboost回归']) # 7.GBRT回归 from sklearn import ensemble model_gradient_boosting_regressor = ensemble.GradientBoostingRegressor(n_estimators=100) # 这里使用100个决策树 ModelList.append([model_gradient_boosting_regressor,'GBRT回归']) # 8.Bagging回归 from sklearn import ensemble model_bagging_regressor = ensemble.BaggingRegressor() ModelList.append([model_bagging_regressor,'Bagging回归']) # 9.ExtraTree极端随机数回归 from sklearn.tree import ExtraTreeRegressor model_extra_tree_regressor = ExtraTreeRegressor() ModelList.append([model_extra_tree_regressor,'ExtraTree极端随机数回归'])
# data = boston.data # data = data / data.max(axis=0) x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.2) median = np.median(x_train, axis=0) mad = np.median(np.abs(x_train - median), axis=0) mad[mad == 0] = 1e-19 x_train /= mad x_test /= mad model = ensemble.GradientBoostingRegressor() model.fit(x_train, y_train) score = r2_score(y_test, model.predict(x_test)) print("R^2: %.4f" % score) categorical_features = np.argwhere( np.array( [len(set(boston.data[:, x])) for x in range(boston.data.shape[1])]) <= 10).flatten() explainer = lime.lime_tabular.LimeTabularExplainer( x_train, feature_names=boston.feature_names, class_names=['price'], categorical_features=categorical_features, mode='regression')
# Section numbers mirror the scikit-learn user-guide chapter numbering.
# (tail of the DecisionTreeRegressor entry whose append starts above this chunk)
          "model": tree.DecisionTreeRegressor(random_state=0)} )
models.append( {"name": "1.10. ExtraTreeRegressor", \
                "model": tree.ExtraTreeRegressor(random_state=0)} )
## 1.11. Ensemble methods
# averaging methods
models.append( {"name": "1.11.1. Bagging meta-estimator", \
                "model": ensemble.BaggingRegressor(neighbors.KNeighborsRegressor())} )
models.append( {"name": "1.11.2.1. Random Forests", \
                "model": ensemble.RandomForestRegressor()} )
models.append( {"name": "1.11.2.2. Extremely Randomized Trees", \
                "model": ensemble.ExtraTreesRegressor()} )
# boosting methods
models.append( {"name": "1.11.3. AdaBoost", \
                "model": ensemble.AdaBoostRegressor()} )
models.append( {"name": "1.11.4. Gradient Tree Boosting", \
                "model": ensemble.GradientBoostingRegressor()} )
## 1.12. Multiclass and multilabel algorithms
# not regression
## 1.13. Feature selection
# not about estimator
## 1.14. Semi-Supervised
# all samples have price data, so doesn't apply
## 1.15. Isotonic regression
# ValueError("X should be a 1d array")
#models.append( {"name": "1.14. Semi-Supervised", \
#                "model": IsotonicRegression()} )
# Load the weather/call-volume table; `calls` is the regression target.
df = pd.read_csv("weather_calls.csv")
# BUG FIX: as_matrix() was removed in pandas 1.0 — use to_numpy().
y = df['calls'].to_numpy()
del df["calls"]
X = df.to_numpy()
# Evaluate stability across 10 different random splits/seeds.
for i in range(0, 10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    # Fit regression model, seeded with the same i as the split.
    model = ensemble.GradientBoostingRegressor(
        n_estimators=1000,
        learning_rate=0.029,
        max_depth=3,
        min_samples_leaf=3,
        max_features=0.2,
        loss='huber',
        random_state=i
    )
    model.fit(X_train, y_train)
    err = mean_absolute_error(y_train, model.predict(X_train))
    print("Error (train", i, "):", err)
    err = mean_absolute_error(y_test, model.predict(X_test))
    print("Error (test", i, "):", err)
# Visualize the t-SNE embedding against the (transformed) target.
plot_data_3d_regression(tsne_data, y_train)
# KNN: sweep n_neighbors 3..19, plot the validation curve, keep the best model.
knn_estimator = neighbors.KNeighborsRegressor()
knn_grid = {'n_neighbors': list(range(3, 20))}
grid_search_plot_one_parameter_curves(knn_estimator, knn_grid, X_train1, y_trans, scoring=scoring)
knn_model = get_best_model(knn_estimator, knn_grid, X_train1, y_trans, scoring=scoring)
# Gradient boosting: small 3-D grid over ensemble size, shrinkage and depth.
gb_estimator = ensemble.GradientBoostingRegressor()
gb_grid = {
    'n_estimators': list(range(100, 501, 200)),
    'learning_rate': [0.1, 1.0],
    'max_depth': [1, 3, 5]
}
gb_model = get_best_model(gb_estimator, gb_grid, X_train1, y_trans, scoring=scoring)
# Test rows are the tail of house3 beyond the training frame; run the same
# feature selector before predicting.
X_test = house3[house_train.shape[0]:]
X_test1 = select_features(rf_selector, X_test)
# expm1 undoes a log1p target transform — presumably applied upstream when
# building y_trans; TODO confirm.
house_test['SalePrice'] = np.expm1(gb_model.predict(X_test1))
# BUG FIX: Series.as_matrix() was removed in pandas 1.0 — use to_numpy().
y = df['sale_price'].to_numpy()  # Expected output to predict
# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,   # number of boosted trees; more is usually more accurate but slower to train
    learning_rate=0.1,   # per-tree contribution; lower values need more estimators
    max_depth=6,         # maximum layers per individual decision tree
    min_samples_leaf=9,  # minimum occurrences required before a split decision uses a value
    max_features=0.1,    # fraction of features randomly considered per branch
    loss='huber'         # robust loss, less influenced by outliers
)
# Train on the training split.
model.fit(X_train, y_train)
# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'trained_house_classifier_model.pkl')
# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
from sklearn.model_selection import KFold
#import plot_learning_curve
csv_file = 'listStoreValue-long.csv'
test = np.array(pd.read_csv(csv_file))
X = test[:, 0:6]  # first six columns are features
y = test[:, -1]   # last column is the target
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    # BUG FIX: the 'mse' criterion was renamed and then removed in
    # scikit-learn 1.2; 'squared_error' is the identical replacement.
    'criterion': 'squared_error'
}
gradient_boosting_regressor_model = ensemble.GradientBoostingRegressor(
    **params)
# BUG FIX: KFold raises if random_state is set while shuffle=False
# (scikit-learn >= 0.24); shuffle=True makes the seed meaningful.
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=7)
model = gradient_boosting_regressor_model.fit(X, y)
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=crossvalidation)
#y_pred = gradient_boosting_regressor_model.predict(X_test)
#RMSE = np.sqrt(mean_squared_error(y_test,y_pred))
RMSE = np.sqrt(-scores.mean())  # scoring is negated MSE; flip the sign before sqrt
print(RMSE)
#plt.figure(figsize=(12,6))
def gradientBoost():
    """Return an unfitted GradientBoostingRegressor with this project's defaults."""
    # BUG FIX: min_samples_split must be >= 2 — scikit-learn rejects 1.
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01, 'loss': 'ls'}
    return ensemble.GradientBoostingRegressor(**params)
def GradientBoostingRegressorTest(self):
    """Fit a 100-tree gradient-boosting regressor on the elution data and score it."""
    train_x, train_y, test_x, test_y = Elution().get_data()
    regressor = ensemble.GradientBoostingRegressor(n_estimators=100)
    Elution().try_different_method(regressor, train_x, train_y, test_x, test_y)
def boosting(parameter):
    """Build a GradientBoostingRegressor from a possibly-short parameter list.

    `parameter` is padded with defaults [loss, learning_rate, n_estimators]
    (matching the estimator's own defaults 'ls', 0.1, 100).

    BUG FIX: previously only split[0] (the loss) was forwarded, silently
    ignoring the padded learning_rate and n_estimators; behavior is unchanged
    whenever those slots hold the defaults.
    """
    defaults = ['ls', 0.1, 100]
    split = pad(parameter, defaults)
    return ensemble.GradientBoostingRegressor(loss=split[0],
                                              learning_rate=split[1],
                                              n_estimators=int(split[2]))
trainFrame = dataclean.cleanDataset(dataclean.loadTrainData()) trainData = dataclean.convertPandasDataFrameToNumpyArray(trainFrame) testFrame = dataclean.cleanDataset(dataclean.loadTestData(), True) testData = dataclean.convertPandasDataFrameToNumpyArray(testFrame) trainX = trainData[:, 1:] trainY = trainData[:, 0] testX = testData[:, 1:] """ Cross Validation """ crossvalidationTree = ensemble.GradientBoostingRegressor(n_estimators=400, learning_rate=0.01, max_depth=6, random_state=1, presort=True) cvCount = 10 crossvalidation = Metrics.crossValidationScore( ensemble.GradientBoostingRegressor(random_state=1), trainX, trainY, cvCount=cvCount) xTrain, xTest, yTrain, yTest = Metrics.traintestSplit(trainX, trainY, randomState=1) """ #{'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.01
print("arr2_hi_E_n.shape: ", arr2_hi_E_n.shape) #--- select 70% of sample for training and 30% for testing: offset = int(arr2_hi_E_n.shape[0] * 0.7) arr2_hi_E_train, arr3_hi_E_train = arr2_hi_E_n[: offset], arr3_hi_E[:offset].reshape( -1) # train sample arr2_hi_E_test, arr3_hi_E_test = arr2_hi_E_n[offset:], arr3_hi_E[ offset:].reshape(-1) # test sample print("train shape: ", arr2_hi_E_train.shape, " label: ", arr3_hi_E_train.shape) print("test shape: ", arr2_hi_E_test.shape, " label: ", arr3_hi_E_test.shape) print("training BDTG...") net_hi_E = ensemble.GradientBoostingRegressor(**params) net_hi_E.fit(arr2_hi_E_train, arr3_hi_E_train) net_hi_E mse = mean_squared_error(arr3_hi_E_test, net_hi_E.predict(arr2_hi_E_test)) print("MSE: %.4f" % mse) print("events at training & test samples: ", len(arr_hi_E0)) print("events at train sample: ", len(arr2_hi_E_train)) print("events at test sample: ", len(arr2_hi_E_test)) test_score = np.zeros((params['n_estimators'], ), dtype=np.float64) for i, y_pred in enumerate(net_hi_E.staged_predict(arr2_hi_E_test)): test_score[i] = net_hi_E.loss_(arr3_hi_E_test, y_pred) # fig,ax=plt.subplots(ncols=1, sharey=True)
# Create the X and y arrays
# BUG FIX: as_matrix() was removed in pandas 1.0 — use to_numpy().
X = features_df.to_numpy()
y = df['sale_price'].to_numpy()
# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Fit regression model
model = ensemble.GradientBoostingRegressor(n_estimators=1000,
                                           learning_rate=0.1,
                                           max_depth=6,
                                           min_samples_leaf=9,
                                           max_features=0.1,
                                           loss='huber',
                                           random_state=0  # default parameters
                                           )
model.fit(X_train, y_train)
# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'trained_house_classifier_model.pkl')
# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)
# Find the error rate on the test set
mse = mean_absolute_error(y_test, model.predict(X_test))
def adaboost_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum,
                     learn_options, classification=False):
    ''' AdaBoostRegressor/Classifier from scikitlearn.

    Fits gradient-boosted trees (regression by default, classification when
    `classification=True`) on one CV fold. When learn_options['adaboost_CV']
    is set, hyperparameters are tuned first — either by Bayesian optimization
    (hyperopt, "bo") or grid search ("grid"), selected by
    learn_options["algorithm_hyperparam_search"].

    Returns (y_pred, clf): column-vector test predictions and fitted model.

    NOTE(review): Python-2 code relying on the legacy sklearn.cross_validation
    and sklearn.grid_search modules (both removed in scikit-learn 0.20).
    '''
    if learn_options['adaboost_version'] == 'python':
        if not learn_options['adaboost_CV']:
            # Plain fit with hyperparameters taken straight from learn_options.
            if not classification:
                clf = en.GradientBoostingRegressor(
                    loss=learn_options['adaboost_loss'],
                    learning_rate=learn_options['adaboost_learning_rate'],
                    n_estimators=learn_options['adaboost_n_estimators'],
                    alpha=learn_options['adaboost_alpha'],
                    subsample=1.0,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=learn_options['adaboost_max_depth'],
                    init=None,
                    random_state=None,
                    max_features=None,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False)
            else:
                # Classifier variant: no loss/alpha (regression-only options).
                clf = en.GradientBoostingClassifier(
                    learning_rate=learn_options['adaboost_learning_rate'],
                    n_estimators=learn_options['adaboost_n_estimators'],
                    subsample=1.0,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=learn_options['adaboost_max_depth'],
                    init=None,
                    random_state=None,
                    max_features=None,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False)
            clf.fit(X[train], y[train].flatten())
            y_pred = clf.predict(X[test])[:, None]
        else:
            # optimize the parameters if the adaboosted algorithm
            if learn_options["algorithm_hyperparam_search"] == "bo":
                print  # bare py2 print: emits a blank line
                from hyperopt import hp, fmin, tpe, rand

                def adaboost_scoring_bo(params):
                    # Objective for hyperopt: median CV score of a 1000-tree
                    # GBM under the proposed hyperparameters.
                    # label_encoder = sklearn.preprocessing.LabelEncoder()
                    # label_encoder.fit(y_all['Target gene'].values[train])
                    # gene_classes = label_encoder.transform(y_all['Target gene'].values[train])
                    # n_folds = len(np.unique(gene_classes))
                    cv = sklearn.cross_validation.KFold(
                        y_all['Target gene'].values[train].shape[0],
                        n_folds=20,
                        shuffle=True)
                    est = en.GradientBoostingRegressor(
                        n_estimators=1000,
                        learning_rate=params['learning_rate'],
                        max_depth=params['max_depth'],
                        min_samples_leaf=params['min_samples_leaf'],
                        max_features=params['max_features'])
                    scorer = cross_val_score(est,
                                             X[train],
                                             y[train].flatten(),
                                             cv=cv,
                                             n_jobs=20)
                    return np.median(scorer)

                # Search space. NOTE(review): hp.quniform returns floats, so
                # best['max_depth'] / best['min_samples_leaf'] below are
                # floats; strict scikit-learn versions require ints — confirm.
                space = {
                    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
                    'max_depth': hp.quniform('max_depth', 1, 8, 1),
                    'min_samples_leaf': hp.quniform('min_samples_leaf', 3, 20, 1),
                    'max_features': hp.uniform('max_features', 0.05, 1.0)
                }
                best = fmin(adaboost_scoring_bo,
                            space,
                            algo=tpe.suggest,
                            max_evals=50,
                            verbose=1)
                print best
                # Refit on the whole training fold with the best parameters
                # (n_estimators reverts to the configured value, not 1000).
                clf = en.GradientBoostingRegressor(
                    n_estimators=learn_options['adaboost_n_estimators'],
                    learning_rate=best['learning_rate'],
                    max_depth=best['max_depth'],
                    min_samples_leaf=best['min_samples_leaf'],
                    max_features=best['max_features'])
                clf.fit(X[train], y[train].flatten())
            elif learn_options["algorithm_hyperparam_search"] == "grid":
                assert not classification, "need to tweak code below to do classificaton, as above"
                n_jobs = 20
                print "Adaboost with GridSearch"
                from sklearn.grid_search import GridSearchCV
                # Reduced grid; the fuller original is kept for reference:
                #param_grid = {'learning_rate': [0.1, 0.05, 0.01],
                #              'max_depth': [4, 5, 6, 7],
                #              'min_samples_leaf': [5, 7, 10, 12, 15],
                #              'max_features': [1.0, 0.5, 0.3, 0.1]}
                param_grid = {
                    'learning_rate': [0.1, 0.01],
                    'max_depth': [4, 7],
                    'min_samples_leaf': [5, 15],
                    'max_features': [1.0, 0.1]
                }
                # Stratify the inner CV by target gene so a gene never spans
                # an inner-train and inner-validation split.
                label_encoder = sklearn.preprocessing.LabelEncoder()
                label_encoder.fit(y_all['Target gene'].values[train])
                gene_classes = label_encoder.transform(
                    y_all['Target gene'].values[train])
                n_folds = len(np.unique(gene_classes))
                cv = sklearn.cross_validation.StratifiedKFold(gene_classes,
                                                              n_folds=n_folds,
                                                              shuffle=True)
                est = en.GradientBoostingRegressor(
                    loss=learn_options['adaboost_loss'],
                    n_estimators=learn_options['adaboost_n_estimators'])
                clf = GridSearchCV(est,
                                   param_grid,
                                   n_jobs=n_jobs,
                                   verbose=1,
                                   cv=cv,
                                   scoring=spearman_scoring,
                                   iid=False).fit(X[train], y[train].flatten())
                print clf.best_params_
            else:
                raise Exception(
                    "if using adaboost_CV then need to specify grid (grid search) or bo (bayesian optimization)"
                )
            y_pred = clf.predict(X[test])[:, None]
    else:
        raise NotImplementedError
    return y_pred, clf
def double_chain_testing(self, chain_steps, column_to_overwrite_1,
                         column_to_overwrite_2, model_2_inputs, model_2_output,
                         result_indexes=None, mape=False, filename_2=""):
    """Multi-step chained forecasting with the primary model plus a helper model.

    For every window of `chain_steps` rows in self.test_frame, predictions
    from self.model are fed back into `column_to_overwrite_1` of the next
    row, while a second regressor refreshes `column_to_overwrite_2` the same
    way. Windows starting at an index in `result_indexes` (or all windows if
    -1 is present) have their prediction chains collected.

    Returns
    -------
    mse_result, plus mape_result and/or predictions_result depending on the
    `mape` flag and whether any chains were collected (same shapes as before).
    NOTE(review): `mape` doubles as flag and per-window value, so the final
    `if mape:` tests the last window's MAPE — preserved as-is.
    """
    # BUG FIX: the default was a shared mutable list; normalize None here.
    if result_indexes is None:
        result_indexes = []
    tmp_list = []
    chain_predictions = []
    mse_list = []
    mape_list = []
    predictions_result = []
    # Load the secondary model from disk, or train it on the training frame.
    if filename_2:
        model_2 = load(filename_2)
    else:
        # BUG FIX: max_features="auto" was removed from
        # GradientBoostingRegressor; for regressors "auto" meant
        # "all features", which is exactly max_features=None.
        model_2 = ensemble.GradientBoostingRegressor(
            n_estimators=self.num_of_trees,
            max_depth=8,
            max_features=None,
            verbose=0,
            max_leaf_nodes=32)
        model_2.fit(X=self.train_frame[model_2_inputs],
                    y=self.train_frame[model_2_output])
    for starting_index in tqdm(range(len(self.test_frame) - chain_steps)):
        tmp_test_frame = self.test_frame.copy()
        # BUG FIX: np.float was removed in NumPy 1.24; builtin float is the
        # documented replacement and is what it aliased.
        tmp_test_frame["AvgP"] = tmp_test_frame["AvgP"].astype(float)
        for i in range(starting_index, starting_index + chain_steps):
            # One-step prediction with the primary model, written into the
            # next row so the chain feeds on its own output.
            tmp_list.append(tmp_test_frame.loc[i, self.inputs])
            result = self.model.predict(tmp_list)
            tmp_list.clear()
            tmp_test_frame.at[i + 1, column_to_overwrite_1] = result[0]
            chain_predictions.append(result[0])
            # The secondary model refreshes its own column the same way.
            tmp_list.append(tmp_test_frame.loc[i, model_2_inputs])
            result = model_2.predict(tmp_list)
            tmp_list.clear()
            tmp_test_frame.at[i + 1, column_to_overwrite_2] = result[0]
        # Window error versus the untouched ground-truth frame.
        mse = mean_squared_error(
            self.test_frame.loc[starting_index:(starting_index + chain_steps - 1),
                                self.output],
            np.array(chain_predictions))
        mse_list.append(mse)
        if mape:
            # Mean absolute percentage error over the chained window.
            mape = np.mean(
                np.abs((np.array(
                    list(self.test_frame.loc[starting_index:(
                        starting_index + chain_steps - 1), self.output])) -
                        np.array(chain_predictions)) / (np.array(
                            list(self.test_frame.loc[starting_index:(
                                starting_index + chain_steps - 1),
                                self.output]))))) * 100
            mape_list.append(mape)
        if (starting_index in result_indexes) or (-1 in result_indexes):
            predictions_result.append(chain_predictions.copy())
        chain_predictions.clear()
    # Aggregate: sqrt of the mean MSE across all chain windows.
    mse_result = np.sqrt(np.mean(mse_list))
    print("MSE: " + str(mse_result))
    if mape:
        mape_result = np.mean(mape_list)
        print("MAPE: " + str(mape_result))
        if len(predictions_result) > 0:
            return mse_result, mape_result, predictions_result
        else:
            return mse_result, mape_result
    else:
        if len(predictions_result) > 0:
            return mse_result, predictions_result
        else:
            return mse_result
# Forward-fill missing `type` values.
data_type_nan = data_new[data_new['type'].isnull()]  # NOTE(review): computed but unused below
data_type_no_nan = data_new.type.fillna(method='ffill')
data_new['type'] = data_type_no_nan
# Replace missing `params` values with the column median.
data_params_nan = data_new[data_new['params'].isnull()]  # NOTE(review): computed but unused below
data_params_no_nan = data_new.params.fillna(data_new.params.median())
data_new['params'] = data_params_no_nan
#We have full dataset without any null values or outliers
#Define the different models to see which will fit best :
linreg = LinearRegression()
# NOTE(review): LogisticRegression and DecisionTreeClassifier are
# classifiers while the `duration` target below looks continuous — confirm.
logreg = LogisticRegression()
gbr = ensemble.GradientBoostingRegressor(n_estimators=50,
                                         max_depth=8,
                                         min_samples_split=2,
                                         learning_rate=0.1,
                                         loss='ls')
tree_clf = tree.DecisionTreeClassifier(criterion='entropy')
# Target is `duration`; drop it (and the leftover CSV index column) from X.
labels = data_new['duration']
version_1 = data_new.drop(['duration', 'Unnamed: 0'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(version_1,
                                                    labels,
                                                    test_size=0.10,
                                                    random_state=2)
#Linear Regression
linreg.fit(x_train, y_train)
lin_reg_score = linreg.score(x_test, y_test)
lin_reg_predict = linreg.predict(x_test)
#removing rows with empty values df.dropna(axis = 0, how = 'any', thresh = None, subset = None, inplace = True) # apply one hot encoding df = pd.get_dummies(df, columns = ["Suburb", "CouncilArea", "Type"]) # assign X (indipendent var) and y (dependat variables) X = df.drop('Price', axis = 1) y = df["Price"] # now split dataset - training data and test data - 70% train 30% test X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = True) # Select and algorithm and config hyperparameters model = ensemble.GradientBoostingRegressor(n_estimators = 150, learning_rate = 0.1, max_depth = 30, min_samples_split = 4, min_samples_leaf = 6, max_features = 0.6, loss = "huber") """ we have selected the gradient boosting - n_estimators = nr of decision trees - learning_reate = rate at which additional trees influence the prediction. - max_depth max number of layers for each decision trees - min_samples_split = min samples to execute a binary split - min_samples_leaf = - max_features = - loss = how the model loss is calculated """ print("Start training the model") # train the prediction model model.fit(X_train, y_train) print("Model trained")
#Imputing and scalling data imp = Imputer(missing_values='NaN', strategy='median', axis=0) scaler = MinMaxScaler() X = imp.fit_transform(X) X = scaler.fit_transform(X) #spliting data to x_train, x_test, y_train, y_test x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) #GradientBoostingRegressor print 'GradientBoostingRegressor' pca = decomposition.PCA() reg_GB = ensemble.GradientBoostingRegressor() pipe = Pipeline(steps=[('pca', pca), ('GradientBoostingRegressor', reg_GB)]) n_components = [23] # 21 n_estimators = [300] #150 learning_rate = [0.05] #0.01 max_depth = [4] #6 min_samples_leaf = [200] #100 estimator = GridSearchCV( pipe, param_grid=dict( pca__n_components=n_components, GradientBoostingRegressor__n_estimators=n_estimators, GradientBoostingRegressor__learning_rate=learning_rate, GradientBoostingRegressor__max_depth=max_depth, GradientBoostingRegressor__min_samples_leaf=min_samples_leaf))
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn import ensemble

main_data = pd.read_csv("kc_house_data.csv")

# Binarise the sale date: 1 if the sale happened in 2014, else 0.
# NOTE(review): kc_house_data `date` values are typically strings such as
# '20141013T000000'; comparing them to the int 2014 would always be False —
# confirm the column's dtype/parsing upstream.
converting_dates = [1 if values == 2014 else 0 for values in main_data.date]
main_data['date'] = converting_dates

# Features: everything except the row id and the target price.
x_set = main_data.drop(['id', 'price'], axis=1)
y_set = main_data['price']

# 85/15 train/test split with a fixed seed.
x_trainingSet, x_testingSet, y_trainingSet, y_testingSet = train_test_split(
    x_set, y_set, test_size=0.15, random_state=2)

# Baseline: ordinary least squares.
# FIX: `.score` returns R^2, not classification accuracy — the printed
# message previously claimed "Accuracy".
regression = LinearRegression()
regression.fit(x_trainingSet, y_trainingSet)
result_reg = regression.score(x_testingSet, y_testingSet)
print("R^2 score of Linear Regression Model in percentage : ", result_reg * 100)

# Gradient-boosted trees (a regressor, despite the variable name).
# FIX: the message previously described this as "Linear Regression with
# Gradient Booster", which it is not.
classification = ensemble.GradientBoostingRegressor(
    n_estimators=400, max_depth=5, min_samples_split=2,
    learning_rate=0.7, loss='ls')
classification.fit(x_trainingSet, y_trainingSet)
result_GBreg = classification.score(x_testingSet, y_testingSet)
print("R^2 score of Gradient Boosting Regressor in percentage : ", result_GBreg * 100)
# Bin total cholesterol into labelled categories.
ndn['categories'] = pd.cut(ndn['Total Cholesterol( mg/dL)'], bins,
                           labels=group_names)

# FIX: `categories` was referenced as a bare name (NameError in this scope);
# use the column that was just created on `ndn`.
x1 = ndn['categories'].astype(int)

# Replace missing values with the mean, then wrap back into a DataFrame.
y1 = x1.fillna(x1.mean())
y1 = pd.DataFrame(y1)

# Weight each column by its 1-based position, then collapse to one series.
for i, col in enumerate(y1.columns.tolist(), 1):
    y1.loc[:, col] *= i
y1 = y1.sum(axis=1)

parameters = {
    'n_estimators': 500,
    'max_depth': 3,
    'learning_rate': 0.02,
    'loss': 'ls'
}
from sklearn import ensemble
from sklearn import metrics

classifier = ensemble.GradientBoostingRegressor(**parameters)
classifier.fit(X_train, Y_train)

predictions = classifier.predict(X_test)
mse = metrics.mean_squared_error(Y_test, predictions)
print('Mean Square Error: {:.3f}'.format(mse))

# NOTE(review): label_ranking_average_precision_score expects binary
# indicator labels; applying it to continuous regression outputs is
# questionable — verify intent.
from sklearn.metrics import label_ranking_average_precision_score
label_ranking_average_precision_score(Y_test, predictions)

# Predicted vs. actual scatter.
plt.figure(figsize=(16, 12))
plt.scatter(range(predictions.shape[0]), predictions,
            label='predictions', c='#348ABD', alpha=0.4)
plt.scatter(range(Y_test.shape[0]), Y_test,
            label='actual values', c='#A60628', alpha=0.4)
plt.ylim([Y_test.min(), predictions.max()])
plt.xlim([0, predictions.shape[0]])
plt.legend()

# Test-set loss after each boosting stage.
test_score = [classifier.loss_(Y_test, Y_pred)
              for Y_pred in classifier.staged_decision_function(X_test)]
# Shuffle the standardised and raw feature sets together with the labels,
# keeping rows aligned across all three arrays.
train_features_st, train_features, train_labels = shuffle(
    train_features_st, train_features, train_labels, random_state=5)

# 90/10 splits — one on the raw features, one on the standardised ones,
# both with the same seed so the partitions match.
x_train, x_test, y_train, y_test = train_test_split(
    train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(
    train_features_st, train_labels, test_size=0.1, random_state=200)

# --- Elastic Net (cross-validated alphas / l1 ratios) ---
ENSTest = linear_model.ElasticNetCV(
    alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
    l1_ratio=[.01, .1, .5, .9, .99],
    max_iter=5000,
).fit(x_train_st, y_train_st)

# --- Gradient Boosting (Huber loss, sqrt feature sampling) ---
GBest = ensemble.GradientBoostingRegressor(
    n_estimators=3000, learning_rate=0.05, max_depth=3,
    max_features='sqrt', min_samples_leaf=15, min_samples_split=10,
    loss='huber')

# Refit both models on the complete training data.
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)

# SalePrice estimate: targets were log-transformed, so exponentiate each
# model's prediction and average the two.
gb_test = np.exp(GB_model.predict(test_features))
en_test = np.exp(ENST_model.predict(test_features_st))
Final_labels = (gb_test + en_test) / 2

en_train = np.exp(ENST_model.predict(train_features_st))
gb_train = np.exp(GB_model.predict(train_features))
Final_labels_train = (en_train + gb_train) / 2
get_score(Final_labels_train, np.exp(train_labels))

# Saving to CSV
# pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission12.csv', index=False)
# Training matrices for the selected features / target, forced into
# C-contiguous numpy arrays.
Player_Career_Data_Training_x = np.array(
    Player_Career_Data_Training[Features_Selected_From_Correlation])
Player_Career_Data_Training_x = Player_Career_Data_Training_x.copy(order='C')
Player_Career_Data_Training_y = np.array(
    Player_Career_Data_Training[Prediction_Column])
Player_Career_Data_Training_y = Player_Career_Data_Training_y.copy(order='C')

# Holdout set related to 4 players.
Player_Career_Data_Testing_x = np.array(
    Player_Career_Data_Testing[Features_Selected_From_Correlation])
Player_Career_Data_Testing_x = Player_Career_Data_Testing_x.copy(order='C')
Player_Career_Data_Testing_y = np.array(
    Player_Career_Data_Testing[Prediction_Column])
Player_Career_Data_Testing_y = Player_Career_Data_Testing_y.copy(order='C')

# Global feature importance from a gradient-boosted regressor.
GBR_Parameters = {
    'n_estimators': 500,
    'max_depth': 4,
    'min_samples_split': 2,
    'learning_rate': 0.001,
    'loss': 'ls',
}
GBR_Model = ensemble.GradientBoostingRegressor(**GBR_Parameters)
GBR_Model.fit(Player_Career_Data_Training_x,
              Player_Career_Data_Training_y.ravel())

# Print features from most to least important.
sorted_indices = np.argsort(GBR_Model.feature_importances_)[::-1]
for idx in sorted_indices:
    print(f"{Numerical_Features_Model_Validation[idx]}: "
          f"{GBR_Model.feature_importances_[idx]}")

# Horizontal bar chart of the same importances (ascending order).
Feature_Importances = GBR_Model.feature_importances_
sorted_idx = np.argsort(Feature_Importances)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(8, 14))
plt.barh(pos, Feature_Importances[sorted_idx], align='center')
plt.yticks(pos, Player_Numerical_Features.columns[sorted_idx],
           rotation=40, ha="right")
plt.xlabel('Relative Importance')
plt.ylabel('Feature Names')
# Use 32-bit floats to halve memory and match sklearn's internal dtype.
X = X.astype(np.float32)
# Hold out the last 10% of rows as the test split (no shuffling).
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# #############################################################################
# Fit regression model
original_params = {
    'n_estimators': 500,
    'max_depth': 4,
    'min_samples_split': 2,
    'learning_rate': 0.01,
    'loss': 'ls'
}

clf = ensemble.GradientBoostingRegressor(**original_params)
clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

# #############################################################################
# gbm regularzation - 1) subsampling
#                   - 2) shrinkage
#                   - 3) early stopping
# Compare regularisation settings against the baseline; each tuple is
# (plot label, plot colour, parameter overrides).
# NOTE(review): the settings list (and the loop body) continues past this
# chunk boundary — the expression below is intentionally left incomplete.
for label, color, setting in [('No shrinkage', 'orange', {
        'learning_rate': 1.0,
        'subsample': 1.0
}), ('subsample=0.5', 'blue', {
        'learning_rate': 1.0,
        'subsample': 0.5
# ), # ( # 'MLPRegressor', # make_pipeline(MLPRegressor(hidden_layer_sizes=(128,), max_iter=10000)) # ), # ( # 'MLPRegressor(100, 100)', # make_pipeline(MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=100000)) # ), ( 'RandomForestRegressor', make_pipeline(ensemble.RandomForestRegressor()) ), ( 'ensemble_GradientBoostingRegressor', make_pipeline(ensemble.GradientBoostingRegressor(**ensemble_params)) ), ( 'XGBRegressor', make_pipeline(xgb.XGBRegressor(objective="reg:linear", random_state=42)) ), ( 'PolynomialFeatures-2-RandomForestRegressor', make_pipeline(PolynomialFeatures(2), ensemble.RandomForestRegressor()) ), ( 'PolynomialFeatures-2-ensemble_GradientBoostingRegressor', make_pipeline(PolynomialFeatures(2), ensemble.GradientBoostingRegressor(**ensemble_params)) ), (
# Replace every missing value with 0, then separate target from features.
train.fillna(0, inplace=True)
Y = train.loc[:, "SalePrice"]
X = train.drop(columns=["SalePrice"])

# Rescale the feature matrix to [0, 1]; the target is left untouched.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
# Y = scaler.fit_transform(Y.values.reshape(-1, 1))

from sklearn import ensemble
from sklearn.model_selection import cross_val_score

# --- Gradient boosting: mean and std of R^2 over 10 CV folds ---
model = ensemble.GradientBoostingRegressor(n_estimators=110, criterion="mse")
scores = cross_val_score(model, X, Y, cv=10)
print("*GradientBoosted - R2:")
print(scores.mean())
print("*GradientBoosted - Desvio Padrão:")
print(scores.std())

# --- Random forest: same evaluation for comparison ---
model = ensemble.RandomForestRegressor(n_estimators=110)
scores = cross_val_score(model, X, Y, cv=10)
print("\nRandomForest - R2:")
print(scores.mean())
def DT_main_seq(start, stop, testGroup, segmentName):
    """Train, evaluate, retrain, and report a gradient-boosted regressor
    over ranges of flow rates.

    Workflow: extract data for train / test / retrain flow-rate groups,
    impute and scale, fit a GradientBoostingRegressor, report training and
    test metrics, save them to a per-group CSV, then refit on the combined
    (train + test) data and append the new metrics.

    Parameters
    ----------
    start, stop : int
        Inclusive range of training flow rates, stepped by 10.
    testGroup : int
        First flow rate of the held-out test group (one step of 10).
    segmentName : str
        Segment identifier used in output file names.

    NOTE(review): relies on module-level names (getXData, preProcess,
    featureImportance, KPI_fileName, objectName, segment_Numbers, features,
    destinationFolder) defined elsewhere in the file.
    """
    print('\n----------Start-----------\n')
    # (n_estimators,
    #  max_depth,
    #  min_samples_split,
    #  learning_rate,
    #  loss,
    #  start,
    #  stop,
    #  testGroup,
    #  segmentName) = parsingInit()
    # Fixed GBR hyperparameters (CLI parsing above is disabled).
    n_estimators = 1000
    max_depth = 2
    min_samples_split = 2
    learning_rate = 0.01
    loss = 'ls'

    # Flow-rate groups: training range, one held-out test rate, and their union.
    flowRates_Train = np.array([i for i in range(start, stop + 10, 10)])
    flowRates_Test = np.array(
        [i for i in range(testGroup, testGroup + 10, 10)])
    flowRates_reTrain = np.append(flowRates_Train, flowRates_Test)
    # The 160 flow rate data is corrupted!!
    # TODO: recollect the data
    flowRates_Train = np.delete(flowRates_Train,
                                np.where(flowRates_Train == 160))
    flowRates_Test = np.delete(flowRates_Test,
                               np.where(flowRates_Test == 160))
    flowRates_reTrain = np.delete(flowRates_reTrain,
                                  np.where(flowRates_reTrain == 160))
    print('Train: ', flowRates_Train)
    print('Test: ', flowRates_Test)
    print('reTrain: ', flowRates_reTrain)

    print('1. Extracting Data... ')
    # Train Data
    X_Train, y_thic_Train, y_flow_Train = getXData(KPI_fileName, objectName,
                                                   segment_Numbers,
                                                   flowRates_Train,
                                                   segmentName, features)
    featureNames = X_Train.columns
    # Test Data
    X_Test, y_thic_Test, y_flow_Test = getXData(KPI_fileName, objectName,
                                                segment_Numbers,
                                                flowRates_Test, segmentName,
                                                features)
    # ReTrain Data (train + test combined)
    X_reTrain, y_thic_reTrain, y_flow_reTrain = getXData(
        KPI_fileName, objectName, segment_Numbers, flowRates_reTrain,
        segmentName, features)

    #%% Preprocessing Data converting to float32 and removing NaN
    print('2. Preprocessing Data...')
    # Mean-impute remaining NaNs after preProcess.
    imp1 = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # imp2 = Imputer(missing_values=0, strategy='mean', axis=0)
    X_Train, y_thic_Train = preProcess(X_Train, y_thic_Train)
    X_Train = imp1.fit_transform(X_Train)
    X_Test, y_thic_Test = preProcess(X_Test, y_thic_Test)
    # NOTE(review): the imputer is re-fit on the test/retrain sets rather
    # than reusing the training fit — confirm this is intended.
    X_Test = imp1.fit_transform(X_Test)
    X_reTrain, y_thic_reTrain = preProcess(X_reTrain, y_thic_reTrain)
    X_reTrain = imp1.fit_transform(X_reTrain)

    #%%
    if not os.path.exists(destinationFolder):
        os.makedirs(destinationFolder)
    paramsGBR = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'learning_rate': learning_rate,
        'loss': loss
    }
    model = ensemble.GradientBoostingRegressor(**paramsGBR)
    # NOTE(review): this clone is immediately overwritten by `model.fit(...)`
    # below, so `model` itself is (re)fit twice in this function — the
    # retrain in step 10 mutates the same estimator object.
    clf_Tr = clone(model)

    #%%
    print('3. Building Model with all the Samples...')
    X_Train, y_thic_Train = shuffle(X_Train, y_thic_Train)
    print('\t Shape Train: ', X_Train.shape)
    print('\t DataType Train: ', X_Train.dtype)
    print('\t Shape Train: ', y_thic_Train.shape)
    print('\t DataType Train: ', y_thic_Train.dtype)
    # Standardise then min-max scale the training features.
    # NOTE(review): the MinMaxScaler is fit on the RAW X_Train but applied
    # to the standardised X_Tr — verify this double-scaling is intentional.
    min_max_scaler_Train_X = preprocessing.MinMaxScaler().fit(X_Train)
    scaler_Train_X = preprocessing.StandardScaler().fit(X_Train)
    X_Tr = scaler_Train_X.transform(X_Train)
    X_Tr = min_max_scaler_Train_X.transform(X_Tr)
    clf_Tr = model.fit(X_Tr, y_thic_Train)

    #%%
    print('4. Results for Training:')
    # In-sample metrics (train vs. its own predictions).
    y_pred1 = clf_Tr.predict(X_Tr)
    featureImportance(clf_Tr, featureNames,
                      str(testGroup) + '_initialRankings_' + segmentName)
    mse_Test = mean_squared_error(y_thic_Train, y_pred1)
    mae_Test = mean_absolute_error(y_thic_Train, y_pred1)
    medae_Test = median_absolute_error(y_thic_Train, y_pred1)
    r2_Test = r2_score(y_thic_Train, y_pred1)
    exvs_Test = explained_variance_score(y_thic_Train, y_pred1)
    print('\t Mean Squared Error :', mse_Test)
    print('\t Mean Absolute Error :', mae_Test)
    print('\t Median Absolute Error :', medae_Test)
    print('\t R2 Score :', r2_Test)
    print('\t Explained Variance Score:', exvs_Test)

    #%%
    print('\n5. Processing emissions Signals for Group ', flowRates_Test,
          ' ...')
    X_Test, y_thic_Test = shuffle(X_Test, y_thic_Test)
    print('\t Shape Test: ', X_Test.shape)
    print('\t DataType Train: ', X_Test.dtype)
    print('\t Shape y Test: ', y_thic_Test.shape)
    print('\t DataType y Test: ', y_thic_Test.dtype)

    print('6. Transforming emissions Signals for Group ', flowRates_Test,
          ' ...')
    # Apply the scalers fitted on the training data to the test features.
    X_Te = scaler_Train_X.transform(X_Test)
    X_Te = min_max_scaler_Train_X.transform(X_Te)
    print('\t Shape X_Te: ', X_Te.shape)
    print('\t DataType X_te: ', X_Te.dtype)

    print('7. Predicting KPI for Signals for Group ', flowRates_Test, ' ...')
    y_pred_Te = clf_Tr.predict(X_Te)

    print('8. Results for Predicting KPI for Signals for Group ',
          flowRates_Test, ' ...')
    # Out-of-sample metrics on the held-out group.
    mse_Test = mean_squared_error(y_thic_Test, y_pred_Te)
    mae_Test = mean_absolute_error(y_thic_Test, y_pred_Te)
    medae_Test = median_absolute_error(y_thic_Test, y_pred_Te)
    r2_Test = r2_score(y_thic_Test, y_pred_Te)
    exvs_Test = explained_variance_score(y_thic_Test, y_pred_Te)
    print('\t Mean Squared Error :', mse_Test)
    print('\t Mean Absolute Error :', mae_Test)
    print('\t Median Absolute Error :', medae_Test)
    print('\t R2 Score :', r2_Test)
    print('\t Explained Variance Score:', exvs_Test)

    fileNamecsv = destinationFolder + '/FeatureRanking_' + str(
        testGroup) + '_' + segmentName + '.csv'
    print('9. Saving Results', fileNamecsv, ' ...')
    np.savetxt(
        fileNamecsv, [[mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test]],
        delimiter=',',
        header=
        'Mean Squared Error, Mean Absolute Error, Median Absolute Error,R2 Score, Explained Variance Score',
        comments='')

    print('10. Retraining the Model with new emission Signal...')
    X_reTrain, y_thic_reTrain = shuffle(X_reTrain, y_thic_reTrain)
    # NOTE(review): the first two prints show y_thic_reTrain under the label
    # "Shape reTrain" and the next two show y_thic_Test under "y reTrain" —
    # labels look swapped/mismatched, output text kept as-is.
    print('\t Shape reTrain: ', y_thic_reTrain.shape)
    print('\t DataType reTrain: ', y_thic_reTrain.dtype)
    print('\t Shape y reTrain: ', y_thic_Test.shape)
    print('\t DataType y reTrain: ', y_thic_Test.dtype)
    # Fresh scalers for the combined retrain data (same double-scaling
    # pattern as step 3).
    min_max_scaler_Train_X2 = preprocessing.MinMaxScaler().fit(X_reTrain)
    scaler_Train_X2 = preprocessing.StandardScaler().fit(X_reTrain)
    X_reTr = scaler_Train_X2.transform(X_reTrain)
    X_reTr = min_max_scaler_Train_X2.transform(X_reTr)
    print('\t Shape X_reTr: ', X_reTr.shape)
    print('\t DataType X_reTr: ', X_reTr.dtype)
    # Re-transform the test set with the ORIGINAL training scalers.
    X_Te = scaler_Train_X.transform(X_Test)
    X_Te = min_max_scaler_Train_X.transform(X_Te)
    print('\t Shape X_Te: ', X_Te.shape)
    print('\t DataType X_Te: ', X_Te.dtype)
    clf_reTr = model.fit(X_reTr, y_thic_reTrain)

    print('11. New Results with emission signals Incorporated:')
    # Metrics on the test set after retraining on train + test data
    # (the test group is now part of the training data — in-sample).
    y_pred_Te = clf_reTr.predict(X_Te)
    mse_Test = mean_squared_error(y_thic_Test, y_pred_Te)
    mae_Test = mean_absolute_error(y_thic_Test, y_pred_Te)
    medae_Test = median_absolute_error(y_thic_Test, y_pred_Te)
    r2_Test = r2_score(y_thic_Test, y_pred_Te)
    exvs_Test = explained_variance_score(y_thic_Test, y_pred_Te)
    print('\t Mean Squared Error :', mse_Test)
    print('\t Mean Absolute Error :', mae_Test)
    print('\t Median Absolute Error :', medae_Test)
    print('\t R2 Score :', r2_Test)
    print('\t Explained Variance Score:', exvs_Test)

    print('12. Saving the new Results', fileNamecsv, ' ...')
    # Append the retrain metrics as a second row of the same CSV.
    f = open(fileNamecsv, 'a')
    df = pd.DataFrame([[mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test]])
    df.to_csv(f, index=False, header=False)
    f.close()
    featureImportance(clf_reTr, featureNames,
                      str(testGroup) + '_reTrainedRankings_' + segmentName)
    print('-----------:Finished!:--------------- \n')