def build_models(df_train_x, df_train_y, df_validation_x, df_validation_y, seed):
    """Train three tree-ensemble regressors and report validation metrics.

    Fits an ExtraTrees, a RandomForest and a GradientBoosting regressor on
    the training frame, prints each model's validation MSE and R^2 ("Accuracy",
    scaled to percent), and returns the fitted models.

    :param df_train_x: training features
    :param df_train_y: training target
    :param df_validation_x: validation features
    :param df_validation_y: validation target
    :param seed: random_state forwarded to the tree ensembles
    :return: list [ExtraTrees model, RandomForest model, GradientBoosting model]
    """
    # Extra trees
    model = ExtraTreesRegressor(random_state=seed, n_estimators=100)
    model.fit(df_train_x, df_train_y)
    predictions = model.predict(df_validation_x)
    print(mean_squared_error(df_validation_y, predictions))
    print("Accuracy --> ", model.score(df_validation_x, df_validation_y) * 100)

    # Random forest
    model_rf = RandomForestRegressor(random_state=seed, n_estimators=100)
    model_rf.fit(df_train_x, df_train_y)
    predictions_rf = model_rf.predict(df_validation_x)
    print(mean_squared_error(df_validation_y, predictions_rf))
    # FIX: previously printed model.score (the ExtraTrees model) here.
    print("Accuracy --> ", model_rf.score(df_validation_x, df_validation_y) * 100)

    # Gradient boosting
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01, 'loss': 'ls'}
    model_gb = ensemble.GradientBoostingRegressor(**params)
    model_gb.fit(df_train_x, df_train_y)
    predictions_gb = model_gb.predict(df_validation_x)
    print(mean_squared_error(df_validation_y, predictions_gb))
    # FIX: previously printed model.score (the ExtraTrees model) here.
    print("Accuracy --> ", model_gb.score(df_validation_x, df_validation_y) * 100)

    return [model, model_rf, model_gb]
def SelectFromETR(X, X_train, X_test, y_train, y_test):
    """Keep only the columns of X whose ExtraTrees importance is >= 0.009.

    Fits an ExtraTreesRegressor on the train split, prints train/test R^2,
    and returns X restricted to the important columns.

    :param X: full feature DataFrame to be column-filtered
    :param X_train, X_test, y_train, y_test: pre-split data for fitting/scoring
    :return: X with only the selected columns
    """
    from sklearn.ensemble import ExtraTreesRegressor
    import numpy as np
    etr = ExtraTreesRegressor(n_estimators=200, n_jobs=2)
    etr.fit(X_train, y_train)
    # FIX: Python-2 print statements converted to print() calls — this file
    # already uses Python-3-only syntax (f-strings) elsewhere.
    print("R^2 on training set: %f" % etr.score(X_train, y_train))
    print("R^2 on test set: %f" % etr.score(X_test, y_test))
    importances = etr.feature_importances_
    # 0.009 importance threshold — keeps features with non-trivial signal.
    indices = np.where(importances >= 0.009)[0]
    X = X.iloc[:, list(indices)]
    return X
def et_regressor(x_trn: pd.DataFrame, y_trn: np.ndarray, x_val: pd.DataFrame, y_val: np.ndarray) -> tuple:
    """Fit a fixed-configuration ExtraTreesRegressor and score it.

    Returns the fitted model together with its R^2 on the training and
    validation splits.
    """
    # Defensive copies so the caller's frames/arrays are never mutated.
    x_trn = x_trn.copy()
    x_val = x_val.copy()
    y_trn = y_trn.copy()
    y_val = y_val.copy()

    model = ExtraTreesRegressor(
        n_estimators=400,
        min_samples_leaf=3,
        n_jobs=-1,
        random_state=7,
    )
    model.fit(x_trn, y_trn)

    return model, model.score(x_trn, y_trn), model.score(x_val, y_val)
def ExtraTreesregressor(self, data):
    """Train an ExtraTreesRegressor, report validation MAE, pickle the model,
    and return predictions on self.x_test.

    :param data: ((x_tr, y_tr), (x_val, y_val)) train/validation tuples
    :return: predictions on self.x_test
    """
    train, validacion = data
    x_tr, y_tr = train
    x_val, y_val = validacion
    print('Start training ExtraTreesRegressor...')
    start_time = self.timer()
    extr = ExtraTreesRegressor(n_estimators=100)
    extr.fit(x_tr, y_tr)
    print("The R2 is: {}".format(extr.score(x_tr, y_tr)))
    self.timer(start_time)
    print("Making prediction on validation data")
    # Targets appear to be on a log1p scale; invert before computing MAE.
    # (assumption from the expm1 calls — TODO confirm against the pipeline)
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(extr.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("El mean absolute error de es {}".format(mae))
    print('Saving model into a pickle')
    # FIX: replaced "try: os.mkdir ... except: pass" — the bare except hid
    # every error (permissions, bad path); exist_ok only ignores "exists".
    os.makedirs('pickles', exist_ok=True)
    with open('pickles/extr.pkl', 'wb') as f:
        pickle.dump(extr, f)
    print('Making prediction and saving into a csv')
    y_test = extr.predict(self.x_test)
    return y_test
def etr_search(X_train, X_test, y_train, y_test):
    """Brute-force hyper-parameter sweep for ExtraTreesRegressor.

    Iterates over max_depth/min_samples_split/min_samples_leaf combinations,
    prints the test-set R^2 and timing for each, then prints the best one.
    """
    # FIX: Python-2 print statements converted to print() calls.
    print("R^2 scores calculated on test set:")
    n_jobs = 2
    n = 600
    cv = 0
    max_features = 'auto'
    max_score = {}
    # FIX: was 0 — if every combination scored <= 0, max_score[max_sc]
    # raised KeyError; -inf guarantees max_sc is an observed key.
    max_sc = float('-inf')
    for depth in [9]:
        for split in range(30, 100, 10):
            for leaf in range(15, 50, 5):
                start = time.time()
                params = {'n_estimators': n, 'max_features': max_features,
                          'max_depth': depth, 'min_samples_split': split,
                          'min_samples_leaf': leaf, 'n_jobs': n_jobs}
                model = ExtraTreesRegressor(n_estimators=n, n_jobs=n_jobs,
                                            max_features=max_features,
                                            max_depth=depth,
                                            min_samples_split=split,
                                            min_samples_leaf=leaf)
                model.fit(X_train, y_train)
                sc = model.score(X_test, y_test)
                # NOTE: keyed by score, so two configs with the exact same
                # score keep only the latest params.
                max_score[sc] = params
                if sc > max_sc:
                    max_sc = sc
                end = time.time()
                print("%0.8f for %r [X_train.shape=%s, cv=%s] %0.2f min" %
                      (sc, params, str(X_train.shape), cv, (end - start) / 60))
    print("The best model is:\n%0.8f for %r [X_train.shape=%s, cv=%s]" %
          (max_sc, max_score[max_sc], str(X_train.shape), cv))
def dtrees(X_fit, y_fit, X_eval, y_eval, features, dt_file):
    """Fit four tree-based regressors and write their eval-set R^2 to dt_file.

    The single decision tree is also pickled to 'dtree.p' and its per-feature
    importances are logged.
    """
    # Single decision tree (plus per-feature importances).
    dtree = tree.DecisionTreeRegressor().fit(X_fit, y_fit)
    accuracy = dtree.score(X_eval, y_eval)
    dt_file.write(f'Single Dtree: {accuracy}\n')
    for feature, imp in zip(features, dtree.feature_importances_):
        dt_file.write("\tFeature %s: %s\n" % (feature, imp))
    pickle.dump(dtree, open('dtree.p', 'wb'))

    # Random forest.
    rf_dtree = RandomForestRegressor(n_estimators=8).fit(X_fit, y_fit)
    dt_file.write(f'Random Forest Dtrees: {rf_dtree.score(X_eval, y_eval)}\n')

    # Extremely randomized trees.
    extra_rf_dtree = ExtraTreesRegressor(n_estimators=8).fit(X_fit, y_fit)
    dt_file.write(f'Extremely Randomized Dtrees: {extra_rf_dtree.score(X_eval, y_eval)}\n')

    # Gradient boosting (note: no trailing newline, matching the original).
    gb_tree = GradientBoostingRegressor(n_estimators=50, learning_rate=1.0,
                                        max_depth=2, random_state=0).fit(X_fit, y_fit)
    dt_file.write(f'Gradient Boosting Dtrees: {gb_tree.score(X_eval, y_eval)}')
def ExtraTreesPredictor(X_train, y_train, X_test, y_test):
    """Fit an ExtraTreesRegressor, plot importances and predicted-vs-actual,
    and return (test R^2, test RMSE).

    :param X_train: training feature DataFrame (column names are used for
        the importance plot)
    :param y_train, X_test, y_test: remaining split
    :return: (extratree_score, extratreeRMSE)
    """
    extra_tree = ExtraTreesRegressor(n_estimators=200, random_state=1234)
    extra_tree.fit(X_train, y_train)
    extratree_score = extra_tree.score(X_test, y_test)
    # FIX: removed a bare "extratree_score" expression statement (no-op
    # outside a notebook).
    extratree_pred = extra_tree.predict(X_test)
    extratreeRMSE = sqrt(mean_squared_error(y_test, extratree_pred))
    print("Root mean squared error: %.2f" % extratreeRMSE)
    print('R-squared extra trees: %.2f' % r2_score(y_test, extratree_pred))
    # FIX: was "features = X.columns", silently depending on a module-level
    # global X; the passed-in training frame has the same columns.
    features = X_train.columns
    importances = extra_tree.feature_importances_
    indices = np.argsort(importances)
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), features[indices])
    plt.xlabel('Relative Importance')
    plt.show()
    plt.scatter(y_test, extratree_pred)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title('Extra Trees Predicted vs Actual')
    plt.show()
    chart_regression(extratree_pred, y_test, 'ExtraTrees Predictor')
    return extratree_score, extratreeRMSE
def extratrees():
    """Fit an ExtraTreesRegressor on the module-level split and return
    (test R^2, test RMSE).

    Relies on module globals X_train, X_test, y_train, y_test.
    """
    # FIX: the model was fitted twice — once via the constructor chain and
    # again on the next line; one fit is sufficient (and half the cost).
    regressor = ExtraTreesRegressor(n_estimators=50).fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Extratrees", regressor.feature_importances_)
    return (regressor.score(X_test, y_test),
            sqrt(mean_squared_error(y_test, y_predictions)))
def ET():
    """Fit an ExtraTreesRegressor on the global (x1, y1) split, record its
    R^2 on (x2, y2) in dict1, and plot predictions.

    Side effects: writes dict1["Extra Trees Regressor"] and calls plotgraph.
    """
    global x1, x2, y1, y2, dict1
    model = ExtraTreesRegressor()
    name = "Extra Trees Regressor"
    model.fit(x1, y1)
    y_pred = model.predict(x2)
    error = mean_squared_error(y2, y_pred)
    # FIX: score() takes (X, y_true); it was called as score(y2, y_pred),
    # which computes R^2 of predictions made *on the targets*.
    score = model.score(x2, y2)
    dict1[name] = score
    plotgraph(y_pred, name, error, score)
def classfication_ETR(X_train, y_train, X_test, y_test, ss_y, boston):
    """Fit an ExtraTreesRegressor and report R^2, MSE and MAE on the test set.

    MSE/MAE are computed on the original target scale via ss_y (a fitted
    target scaler); feature importances are printed against boston's names.
    """
    etr = ExtraTreesRegressor()
    # ravel() avoids sklearn's "column-vector y" warning.
    etr.fit(X_train, y_train.ravel())
    etr_y_predict = etr.predict(X_test)
    # FIX: Python-2 print statements converted to print() calls.
    print('R-squared value of ExtraTreesRegessor: ', etr.score(X_test, y_test))
    print('The mean squared error of ExtraTreesRegessor: ', mean_squared_error(
        ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))
    print('The mean absoluate error of ExtraTreesRegessor: ', mean_absolute_error(
        ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))
    # FIX: zip() is lazy on Python 3; materialize it before np.sort.
    print(np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
def dummie_columns_extra_trees(train, test):
    """Fit ExtraTrees on all numeric columns except the price targets and
    return (test R^2, predictions on test).

    Target is SOLDPRICE; LISTPRICE is excluded to avoid leakage.
    """
    from sklearn.ensemble import ExtraTreesRegressor
    # FIX: Python-2 print statements converted to print() calls.
    print("-- {} --".format(
        "Extremely Randomized Trees Regression using all but remarks"))
    predicting_columns = list(train._get_numeric_data().columns.values)
    predicting_columns.remove("LISTPRICE")
    predicting_columns.remove("SOLDPRICE")
    rf = ExtraTreesRegressor(n_estimators=300, n_jobs=-1)
    rf.fit(train[predicting_columns], train["SOLDPRICE"])
    score = rf.score(test[predicting_columns], test["SOLDPRICE"])
    predictions = rf.predict(test[predicting_columns])
    sample_predictions(test, predictions)
    print("Accuracy: {}\n".format(score))
    return score, predictions
def dummie_columns_extra_trees(train, test):
    """Fit ExtraTrees on all numeric columns except the price targets and
    return (test R^2, predictions on test).

    Target is SOLDPRICE; LISTPRICE is excluded to avoid leakage.
    NOTE(review): duplicate of an earlier definition in this file — if both
    live in one module, the later one wins.
    """
    from sklearn.ensemble import ExtraTreesRegressor
    # FIX: Python-2 print statements converted to print() calls.
    print("-- {} --".format("Extremely Randomized Trees Regression using all but remarks"))
    predicting_columns = list(train._get_numeric_data().columns.values)
    predicting_columns.remove("LISTPRICE")
    predicting_columns.remove("SOLDPRICE")
    rf = ExtraTreesRegressor(n_estimators=300, n_jobs=-1)
    rf.fit(train[predicting_columns], train["SOLDPRICE"])
    score = rf.score(test[predicting_columns], test["SOLDPRICE"])
    predictions = rf.predict(test[predicting_columns])
    sample_predictions(test, predictions)
    print("Accuracy: {}\n".format(score))
    return score, predictions
def simple_extremely_random_trees(data_train_x, data_test_x, data_train_y, data_test_y):
    """Fit ExtraTrees, print the test R^2 plus 5-fold CV scores and a 95%
    confidence interval.

    NOTE(review): cross_val_score is run on the *test* split here — confirm
    that is intentional.
    """
    from sklearn.ensemble import ExtraTreesRegressor
    # FIX: Python-2 print statements converted to print() calls.
    print("-- {} --".format("Extremely Randomized Trees Regression using all but remarks"))
    rf = ExtraTreesRegressor(n_estimators=300, n_jobs=-1)
    rf.fit(data_train_x, data_train_y)
    sample_predictions(rf.predict(data_test_x), data_test_y)
    score = rf.score(data_test_x, data_test_y)
    cross_validated_scores = cross_val_score(rf, data_test_x, data_test_y, cv=5)
    print("MSE Accuracy: {}".format(score))
    print("MSE Across 5 Folds: {}".format(cross_validated_scores))
    # 1.96 sigma ~ 95% interval; %% renders a literal percent sign.
    print("95%% Confidence Interval: %0.3f (+/- %0.3f)\n" % (
        cross_validated_scores.mean(), cross_validated_scores.std() * 1.96))
def trainRegressorsAndSave(computeScore=False):
    """For every database in the global `dbs`, train-and-save an ExtraTrees
    model if no saved classifier exists; optionally score it on that db's CSV.

    :param computeScore: when True, load/evaluate each model on its test CSV.
    """
    for db in dbs:
        clf_path = "clfs/" + db
        if not os.path.exists(clf_path):
            clf = ExtraTreesRegressor(n_estimators=500, random_state=1, n_jobs=-1)
            # presumably fits and persists the model — confirm saveTrainedClassifier
            saveTrainedClassifier(db, clf)
        elif computeScore:
            clf = joblib.load(clf_path)

        if computeScore:
            print("Loading test data...")
            loaded = loadDB(db + ".csv")
            # last column is the target, the rest are features
            X_test = loaded[:, 0:-1]
            y_test = loaded[:, -1]
            print("Normalized score is {}".format(clf.score(X_test, y_test)))
            # drop references to the large arrays before the next iteration
            X_test = y_test = 0
def simple_extremely_random_trees(data_train_x, data_test_x, data_train_y, data_test_y):
    """Fit ExtraTrees, print the test R^2 plus 5-fold CV scores and a 95%
    confidence interval.

    NOTE(review): duplicate of an earlier definition in this file, and
    cross_val_score runs on the *test* split — confirm both are intentional.
    """
    from sklearn.ensemble import ExtraTreesRegressor
    # FIX: Python-2 print statements converted to print() calls.
    print("-- {} --".format(
        "Extremely Randomized Trees Regression using all but remarks"))
    rf = ExtraTreesRegressor(n_estimators=300, n_jobs=-1)
    rf.fit(data_train_x, data_train_y)
    sample_predictions(rf.predict(data_test_x), data_test_y)
    score = rf.score(data_test_x, data_test_y)
    cross_validated_scores = cross_val_score(rf, data_test_x, data_test_y, cv=5)
    print("MSE Accuracy: {}".format(score))
    print("MSE Across 5 Folds: {}".format(cross_validated_scores))
    print("95%% Confidence Interval: %0.3f (+/- %0.3f)\n" % (
        cross_validated_scores.mean(), cross_validated_scores.std() * 1.96))
def extra_trees(X, y, n_est):
    '''
    INPUT: Dataframe with features (X), target variable dataframe (y),
           number of estimators (parameter)
    OUTPUT: Score of ExtaTrees model and a {column: importance} dict
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # FIX: removed two pieces of dead code — a standardization of X computed
    # *after* the split and never used, and clone(ext) which was immediately
    # overwritten by the fit result.  Behavior on the returned values is
    # unchanged (tree ensembles don't need scaling anyway).
    clf = ExtraTreesRegressor(n_estimators=n_est)
    clf.fit(X_train, y_train)
    scores = clf.score(X_test, y_test)
    # cols2 is a module-level list of column labels — presumably aligned with
    # X's columns; verify against the caller.
    return 'ExtraTrees Score: ' + str(scores), dict(zip(cols2, clf.feature_importances_))
def ExtraTrees(Xtrain, Ytrain, Xtest, Ytest):
    """ Apply the extra trees regressor """
    from sklearn.ensemble import ExtraTreesRegressor

    print('\nExtra trees regressor:')
    clf = ExtraTreesRegressor(n_estimators=100, n_jobs=-1).fit(Xtrain, Ytrain)
    # "Accuracy" here is the R^2 on the training data.
    print('Accuracy: {0}'.format(clf.score(Xtrain, Ytrain)))

    # training error (error() is a project helper defined elsewhere)
    Etrain = error(clf.predict(Xtrain), Ytrain)
    print('Training error: {0}'.format(Etrain))

    # test error (same local name reused, as in the original)
    Etrain = error(clf.predict(Xtest), Ytest)
    print('Test error: {0}'.format(Etrain))
def Ensemble_test():
    """Train RandomForest/ExtraTrees/GradientBoosting regressors and print
    R-squared, MAE and MSE for each (labels are in Chinese).

    Relies on module globals: X_train, X_test, y_train, y_test, ss_y (fitted
    target scaler) and boston (dataset bunch).  MAE/MSE are reported on the
    original target scale via ss_y.inverse_transform.
    """
    # Random forest baseline; ravel() silences sklearn's column-vector warning.
    rfr = RandomForestRegressor()
    rfr.fit(X_train, y_train.ravel())
    rfr_y_predict = rfr.predict(X_test)
    # Extremely randomized trees.
    etr = ExtraTreesRegressor()
    etr.fit(X_train, y_train.ravel())
    etr_y_predict = etr.predict(X_test)
    # Gradient boosting.
    gbr = GradientBoostingRegressor()
    gbr.fit(X_train, y_train.ravel())
    gbr_y_predict = gbr.predict(X_test)
    # Random forest: R-squared / MAE / MSE.
    print("对普通随机森林使用R-squared评价标准:{}".format(rfr.score(X_test, y_test)))
    print("对普通随机森林使用MAE评价标准:{}".format(
        mean_absolute_error(ss_y.inverse_transform(y_test),
                            ss_y.inverse_transform(rfr_y_predict))))
    print("对普通随机森林使用MSE评价标准:{}".format(
        mean_squared_error(ss_y.inverse_transform(y_test),
                           ss_y.inverse_transform(rfr_y_predict))))
    print("\n")
    # Extra trees: R-squared / MAE / MSE plus per-feature contributions.
    print("对极端回归森林使用R-squared评价标准:{}".format(etr.score(X_test, y_test)))
    print("对极端回归森林使用MAE评价标准:{}".format(
        mean_absolute_error(ss_y.inverse_transform(y_test),
                            ss_y.inverse_transform(etr_y_predict))))
    print("对极端回归森林使用MSE评价标准:{}".format(
        mean_squared_error(ss_y.inverse_transform(y_test),
                           ss_y.inverse_transform(etr_y_predict))))
    print("极端回归森林模型中每种特征对预测目标的贡献度:")
    # list() is required on Python 3 because zip() is lazy.
    print(
        np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
    print("\n")
    # Gradient boosting: R-squared / MAE / MSE.
    print("对梯度提升回归树使用R-squared评价标准:{}".format(gbr.score(X_test, y_test)))
    print("对梯度提升回归树使用MAE评价标准:{}".format(
        mean_absolute_error(ss_y.inverse_transform(y_test),
                            ss_y.inverse_transform(gbr_y_predict))))
    print("对梯度提升回归树使用MSE评价标准:{}".format(
        mean_squared_error(ss_y.inverse_transform(y_test),
                           ss_y.inverse_transform(gbr_y_predict))))
def TreeRegressor():
    """Compare RandomForest/ExtraTrees/GradientBoosting regressors on the
    Boston housing data, printing each model's default score (R^2) and the
    extra-trees feature importances.
    """
    # tree ensembles
    from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
    boston = load_boston()
    X_train, X_test, Y_train, Y_test = train_test_split(
        boston.data, boston.target, test_size=0.25, random_state=33)

    svr = RandomForestRegressor()
    svr.fit(X_train, Y_train)
    y_predict = svr.predict(X_test)
    # FIX: Python-2 print statements converted to print() calls.
    print('the value of default measurement of RandomForestRegressor is',
          svr.score(X_test, Y_test))

    etr = ExtraTreesRegressor()
    etr.fit(X_train, Y_train)
    y_predict = etr.predict(X_test)
    print('the value of default measurement of ExtraTreesRegressor is',
          etr.score(X_test, Y_test))

    svr = GradientBoostingRegressor()
    svr.fit(X_train, Y_train)
    y_predict = svr.predict(X_test)
    print('the value of default measurement of GradientBoostingRegressor is',
          svr.score(X_test, Y_test))

    import numpy as np
    # FIX: zip() is lazy on Python 3; materialize before np.sort.
    print(np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
# Defining the clean dataset
train_data = harmonize_data(train)
test_data = harmonize_data(test)

# Feature engineering
train_data["FamilySize"] = train_data["SibSp"] + train_data["Parch"] + 1
test_data["FamilySize"] = test_data["SibSp"] + test_data["Parch"] + 1

# Defining predictor
predictors = ["Sex", "Age", "Pclass", "FamilySize"]

# Applying method: pick the max_depth with the best mean 10-fold CV score.
max_score = 0
best_n = 0
for n in range(23, 24):
    dtc_scr = 0.
    dtc = ExtraTreesRegressor(max_depth=n)
    # FIX: the original loop ignored the fold indices entirely — it fitted
    # and scored on the full training set every iteration (and its loop
    # variables shadowed the outer ``train``/``test`` frames).  Now each
    # fold trains on its train split and scores on its held-out split.
    for fold_train, fold_test in KFold(len(train_data), n_folds=10, shuffle=True):
        dtc.fit(train_data[predictors].iloc[fold_train],
                train_data["Survived"].iloc[fold_train])
        dtc_scr += dtc.score(train_data[predictors].iloc[fold_test],
                             train_data["Survived"].iloc[fold_test]) / 10
    if dtc_scr > max_score:
        max_score = dtc_scr
        best_n = n

print(best_n, max_score)
dtc = ExtraTreesRegressor(max_depth=best_n)

# Creating submission
create_submission(dtc, train_data, test_data, predictors, "dtcsurvivors.csv")
# Report R-squared / MSE / MAE for the random-forest model.  Error metrics
# are computed on the original target scale via ss_y.inverse_transform
# (ss_y is a fitted target scaler defined earlier in this script).
print('R-squared value of uniform-weighted RandomForestRegressor is',
      rfr.score(X_test, y_test))
print(
    'The mean squared error of uniform-weighted RandomForestRegressor is',
    mean_squared_error(ss_y.inverse_transform(y_test),
                       ss_y.inverse_transform(rfr_y_predict)))
print(
    'The mean absolute error of uniform-weighted RandomForestRegressor is',
    mean_absolute_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(rfr_y_predict)))
print()
# Same three metrics for the extra-trees model.
print('R-squared value of uniform-weighted ExtraTreesRegressor is',
      etr.score(X_test, y_test))
print(
    'The mean squared error of uniform-weighted ExtraTreesRegressor is',
    mean_squared_error(ss_y.inverse_transform(y_test),
                       ss_y.inverse_transform(etr_y_predict)))
print(
    'The mean absolute error of uniform-weighted ExtraTreesRegressor is',
    mean_absolute_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(etr_y_predict)))
# feature importance
print(
    np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
print()
# (statement continues beyond this fragment)
print('R-squared value of uniform-weighted GradientBoostingRegressor is',
# Evaluate model 3 (clf3, fitted earlier) and the ExtraTrees model (clf4):
# KS statistic, feature importances and a top-ten importance bar chart.
test_pd3['label'] = y_test
# FIX: KS was computed on ``test_pd`` (an earlier model's frame) instead of
# the frame just built for this model; also Python-2 prints converted.
print(compute_ks(test_pd3[['label', 'predict']]))
print(clf3.feature_importances_)

# Top Ten
feature_importance = clf3.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
indices = np.argsort(feature_importance)[-10:]
plt.barh(np.arange(10), feature_importance[indices], color='dodgerblue', alpha=.4)
plt.yticks(np.arange(10 + 0.25), np.array(X.columns)[indices])
_ = plt.xlabel('Relative importance'), plt.title('Top Ten Important Variables')

# XTR
clf4 = ExtraTreesRegressor(n_jobs=-1, max_depth=10, random_state=0)
clf4.fit(x_train, y_train)
print(clf4.score(x_test, y_test))
test_pd4 = pd.DataFrame()
test_pd4['predict'] = clf4.predict(x_test)
test_pd4['label'] = y_test
# FIX: same copy-paste bug — KS must use test_pd4 here.
print(compute_ks(test_pd4[['label', 'predict']]))
print(clf4.feature_importances_)

# Top Ten
feature_importance = clf4.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
indices = np.argsort(feature_importance)[-10:]
plt.barh(np.arange(10), feature_importance[indices], color='dodgerblue', alpha=.4)
plt.yticks(np.arange(10 + 0.25), np.array(X.columns)[indices])
_ = plt.xlabel('Relative importance'), plt.title('Top Ten Important Variables')
# NOTE: Python-2 fragment ("print >> f" syntax).  Dumps the fitted
# ExtraTrees model ``xfr`` (defined earlier) to text files, predicts on
# data_test and writes a "<input-basename>_predict.csv" submission.
with open('model.txt', 'wt') as f:
    print >> f, xfr
with open('estimators_.txt', 'wt') as f:
    print >> f, xfr.estimators_
with open('feature_importances_.txt', 'wt') as f:
    print >> f, xfr.feature_importances_
# oob_score_/oob_prediction_ dumps were disabled in the original (they only
# exist when out-of-bag scoring is enabled on the estimator).
predict_loc_regres = xfr.predict(data_test)
# When ground truth is available, also compute R^2 and the normalized
# weighted gini (weight column: var11).
if 'target_test' in locals():
    score = xfr.score(data_test, target_test)
    gn = normalized_weighted_gini(target_test, predict_loc_regres, data_test.var11)
end = time.clock()
# Assemble the output frame: id, predicted target, and (optionally) truth.
if 'target_test' in locals():
    target_test.columns = ['true_target']
    outdf = pd.concat([data_test.ix[:, 'id'].astype(int),
                       pd.DataFrame(predict_loc_regres, columns=['target']),
                       target_test], axis=1)
else:
    outdf = pd.concat([data_test.ix[:, 'id'].astype(int),
                       pd.DataFrame(predict_loc_regres, columns=['target'])], axis=1)
out_filename = (os.path.splitext(os.path.basename(sys.argv[1]))[0] + "_predict.csv")
outdf.to_csv(out_filename, index=0)
if 'target_test' in locals():
    print out_filename, score, gn
# (the else-branch body continues beyond this fragment)
else:
#Extra Trees Regression from sklearn.ensemble import ExtraTreesRegressor extra_tree = ExtraTreesRegressor(n_estimators=200, random_state=1234) # In[73]: extra_tree.fit(X_train, y_train) # In[74]: extratree_score = extra_tree.score(X_test, y_test) extratree_score # In[75]: extratree_score = extra_tree.score(X_train, y_train) extratree_score # In[76]: extratree_pred = extra_tree.predict(X_test)
# Standardize the full feature matrix with the previously fitted scaler.
X = scaler.transform(X)
timeit("Standardizing the data")
# NOTE(review): the triple-quote below starts a block string in this
# fragment; everything after it (Python-2 prints included) is inert text,
# preserved verbatim.  Confirm where the matching quote sits in the full file.
'''
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.neighbors import KNeighborsRegressor
clf = ExtraTreesRegressor(n_estimators=10)
#clf = KNeighborsRegressor()
clf.fit(X_train, Y_train)
timeit("Training")
print "Validation score: " + str(clf.score(X_test, Y_test))
timeit("Validation")
#score = 0.
#wrong = []
#for i, item in enumerate(X_test):
#    if unconvert(clf.predict(item)[0]) == unconvert(Y_test[i]):
#        score += 1
#    else:
#        wrong.append((unconvert(clf.predict(item)[0]),unconvert(Y_test[i])))
#score /= len(X_test)
#print "Manual validation score: " + str(score)
#timeit("Manual validation")
# Error metrics for the previous model, then an ExtraTrees fit/evaluation.
MSEValue = mean_squared_error(y_test, y_pred, multioutput='uniform_average')
print('Mean Squared Error Value is : ', MSEValue)

MdSEValue = median_absolute_error(y_test, y_pred)
# FIX: label said "Median Squared Error" but the metric is median ABSOLUTE error.
print('Median Absolute Error Value is : ', MdSEValue)
print("-------------------------------------")

# apply ExtraTreesRegressor
regressor = ExtraTreesRegressor(n_estimators=200)
regressor.fit(X_train, y_train)

# Calculating Details
print('ExtraTreesRegressorModel Train Score is : ', regressor.score(X_train, y_train))
print('ExtraTreesRegressorModel Test Score is : ', regressor.score(X_test, y_test))
print('----------------------------------------------------')

# prediction and evaluation
y_pred = regressor.predict(X_test)
print('Predicted Value for ExtraTreesRegressor is : ', y_pred[:5])
# FIX: this line prints the ground-truth values but was labeled as
# predictions — corrected the label.
print('Real Value for ExtraTreesRegressor is : ', y_test[:5])

# Calculating Mean Absolute Error
MAEValue = mean_absolute_error(y_test, y_pred, multioutput='uniform_average')
print('Mean Absolute Error Value is : ', MAEValue)
# NOTE(review): ``re`` shadows the stdlib regex module name — rename if
# regular expressions are needed later in this script.
# criterion='mae' optimizes mean absolute error at each split (much slower
# than the default squared-error criterion on large data).
re = ExtraTreesRegressor(n_estimators=10, criterion='mae', random_state=0)
re.fit(X_train, y_train)

# In[17]:
y_pred = re.predict(X_test)

# In[18]:
# Bare expression — displayed the test R^2 in the original notebook cell.
re.score(X_test, y_test)

# In[ ]:
# (plotting code was commented out in the original notebook)
# plt.figure(figsize=(10, 6))
# #plt.plot(X_test, f(X_test), "b")
# plt.scatter(X_train, y_train, c="b", s=20)
# plt.plot(X_test, regr.predict(X_test), "r", lw=2)
# plt.xlim([-5, 5])

# In[53]:
class mixmodels:
    """Four-way ensemble: GBR and ExtraTrees, each fitted both on
    importance-selected features and on the 'var*' (categorical-like) columns;
    predictions are averaged.

    NOTE(review): relies on the deprecated ``estimator.transform(threshold=...)``
    feature-selection API, removed from scikit-learn long ago — modern code
    would use sklearn.feature_selection.SelectFromModel.  Kept byte-identical
    because the class depends on that exact behavior.
    """
    def __init__(self, nest=10):
        # number of boosting rounds / trees for every sub-model
        self.nest = nest

    def fit(self, data_train, target):
        """Fit all four sub-models; returns self for chaining."""
        self.target_train = target
        # columns whose name contains 'var' form the "categorical" subset
        self.catcol = data_train.filter(like='var').columns.tolist()
        # GBR on features selected by a first GBR (importance >= 0.35*mean).
        self.gbr = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
        self.gbr.fit(data_train, self.target_train)
        self.transformed_train_gbr = self.gbr.transform(data_train, threshold="0.35*mean")
        self.gbr_tr_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
        self.gbr_tr_fit.fit(self.transformed_train_gbr, self.target_train)
        # ExtraTrees on features selected by a first ExtraTrees.
        self.xfr = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
        self.xfr.fit(data_train, self.target_train)
        self.transformed_train_xfr = self.xfr.transform(data_train, threshold="0.35*mean")
        self.xfr_tr_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
        self.xfr_tr_fit.fit(self.transformed_train_xfr, self.target_train)
        # GBR and ExtraTrees restricted to the 'var*' columns.
        self.gbr_cat_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
        self.gbr_cat_fit.fit(data_train[self.catcol], self.target_train)
        self.xfr_cat_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
        self.xfr_cat_fit.fit(data_train[self.catcol], self.target_train)
        return self

    def predict(self, data_test):
        """Average the four sub-model predictions into a Series named 'target'."""
        mix_test_list = []
        transformed_test_gbr = self.gbr.transform(data_test, threshold="0.35*mean")
        mix_test_list += [pd.Series(self.gbr_tr_fit.predict(transformed_test_gbr))]
        transformed_test_xfr = self.xfr.transform(data_test, threshold="0.35*mean")
        mix_test_list += [pd.Series(self.xfr_tr_fit.predict(transformed_test_xfr))]
        mix_test_list += [pd.Series(self.gbr_cat_fit.predict(data_test[self.catcol]))]
        mix_test_list += [pd.Series(self.xfr_cat_fit.predict(data_test[self.catcol]))]
        # concat along columns, then row-wise mean = equal-weight blend
        mix_test = pd.concat(mix_test_list, 1)
        mix_ave = mix_test.mean(1)
        mix_ave.name = 'target'
        return mix_ave

    def score(self, data_test, target_test):
        """Unweighted mean of the four sub-models' R^2 scores."""
        total_score = []
        transformed_test_gbr = self.gbr.transform(data_test, threshold="0.35*mean")
        total_score += [self.gbr_tr_fit.score(transformed_test_gbr, target_test)]
        transformed_test_xfr = self.xfr.transform(data_test, threshold="0.35*mean")
        total_score += [self.xfr_tr_fit.score(transformed_test_xfr, target_test)]
        total_score += [self.gbr_cat_fit.score(data_test[self.catcol], target_test)]
        total_score += [self.xfr_cat_fit.score(data_test[self.catcol], target_test)]
        return sum(total_score) / float(len(total_score))

    def gini(self, data_test, target_test):
        """Mean normalized weighted gini of the four sub-models (weights: var11)."""
        weight = data_test.var11
        gns = []
        transformed_test_gbr = self.gbr.transform(data_test, threshold="0.35*mean")
        gns += [normalized_weighted_gini(target_test.tolist(), self.gbr_tr_fit.predict(transformed_test_gbr).tolist(), weight.tolist())]
        transformed_test_xfr = self.xfr.transform(data_test, threshold="0.35*mean")
        gns += [normalized_weighted_gini(target_test.tolist(), self.xfr_tr_fit.predict(transformed_test_xfr).tolist(), weight.tolist())]
        gns += [normalized_weighted_gini(target_test.tolist(), self.gbr_cat_fit.predict(data_test[self.catcol]).tolist(), weight.tolist())]
        gns += [normalized_weighted_gini(target_test.tolist(), self.xfr_cat_fit.predict(data_test[self.catcol]).tolist(), weight.tolist())]
        return sum(gns) / float(len(gns))
# Gradient-boosting test score (gbr is fitted earlier in the script).
score = gbr.score(X_test, Y_test)
print('Problem 2 part 4 Test score : {}'.format(score))

# Extra-trees model for problem 2 part 5.
etr = ExtraTreesRegressor(n_estimators=100, max_depth=8, min_samples_leaf=2)
etr.fit(X_train, Y_train)
Y_etr = etr.predict(X_test)
# 5a and 5b are the same R^2 computed two ways: r2_score on predictions,
# and estimator.score on the raw features.
score = r2_score(Y_test.values, Y_etr)
print('Problem 2 part 5a Test score : {}'.format(score))
score = etr.score(X_test, Y_test)
print('Problem 2 part 5b Test score : {}'.format(score))

if (runProblem3):
    # Keras imports are deferred so problems 1-2 run without TensorFlow.
    from keras.models import Sequential
    from keras.layers.core import Activation, Dense, Dropout
    from keras.callbacks import EarlyStopping
    # (feature-normalization code was commented out in the original)
    #X = dataset[['Feature_5', 'Feature_7','Ret_MinusTwo', 'Ret_MinusOne']+['Ret_{}'.format(i) for i in range(2,121)]]
    #Y = dataset['Ret_MinusZero']
    #X['Feature_5'] = (X['Feature_5'] - np.mean(X['Feature_5']))/np.std(X['Feature_5'])
# 使用R-squared、MSE以及MAE指标对默认配置的随机回归森林在测试集上进行性能评估 print('R-squared value of RandoomForestRegressor:', rfr.score(X_test, y_test)) print( 'The mean squared error of RandomForestRegressor:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict))) print( 'The mean absoluate error of RandomForestRegressor:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict))) print( '---------------------------------------分界---------------------------------------' ) # 使用R-squared、MSE以及MAE指标对默认配置的极端回归森林在测试集上进行性能评估 print('R-squared value of Exc', etr.score(X_test, y_test)) print( 'The mean squared error of ExtraTreeRegressor:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict))) print( 'The mean absoluate error of ExtraTreeRegressor:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict))) print( '---------------------------------------分界---------------------------------------' ) # 利用训练好的极端回归森林模型,输出每种特征对预测目标的贡献度 print(np.sort(zip(etr.feature_importances_, boston.feature_names), axis=-0)) print(
# 使用GradientBoostingRegressor训练模型,并对测试数据做出预测,结果存储在变量gbr_y_predict中。 gbr = GradientBoostingRegressor() gbr.fit(X_train, y_train) gbr_y_predict = gbr.predict(X_test) from sklearn.metrics import mean_absolute_error, mean_squared_error # 使用R-squared、MSE以及MAE指标对默认配置的随机回归森林在测试集上进行性能评估。 print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test)) print('The mean squared error of RandomForestRegressor:', mean_squared_error(y_test, rfr_y_predict)) print('The mean absoluate error of RandomForestRegressor:', mean_absolute_error(y_test, rfr_y_predict)) # 使用R-squared、MSE以及MAE指标对默认配置的极端回归森林在测试集上进行性能评估。 print('R-squared value of ExtraTreesRegessor:', etr.score(X_test, y_test)) print('The mean squared error of ExtraTreesRegessor:', mean_squared_error(y_test, etr_y_predict)) print('The mean absoluate error of ExtraTreesRegessor:', mean_absolute_error(y_test, etr_y_predict)) # 利用训练好的极端回归森林模型,输出每种特征对预测目标的贡献度。 print(zip(etr.feature_importances_, boston.feature_names)) featrue_importance = zip(etr.feature_importances_, boston.feature_names) print(np.sort(list(featrue_importance), axis=0)) # 使用R-squared、MSE以及MAE指标对默认配置的梯度提升回归树在测试集上进行性能评估。 print('R-squared value of GradientBoostingRegressor:', gbr.score(X_test, y_test)) print('The mean squared error of GradientBoostingRegressor:', mean_squared_error(y_test, gbr_y_predict)) print('The mean absoluate error of GradientBoostingRegressor:',
# (continuation of a RandomForestRegressor(...) constructor call started
# before this fragment)
max_depth=14, max_features="log2")
rfr.fit(X_train, y_train)
# ExtraTrees and a single decision tree with the same depth/feature caps.
ext = ExtraTreesRegressor(n_estimators=300, max_depth=14, max_features="log2")
ext.fit(X_train, y_train)
dtr = DecisionTreeRegressor(max_depth=14, max_features="log2")
dtr.fit(X_train, y_train)
# Train/test R^2 for all three models.
print("Random Forest Model")
print("Train Score {}".format(rfr.score(X_train, y_train)))
print("Test Score {}".format(rfr.score(X_test, y_test)))
print("\n")
print("ExtraTreesRegressor Model")
print("Train Score {}".format(ext.score(X_train, y_train)))
print("Test Score {}".format(ext.score(X_test, y_test)))
print("\n")
print("DecisonTree Model")
print("Train Score {}".format(dtr.score(X_train, y_train)))
print("Test Score {}".format(dtr.score(X_test, y_test)))
print("\n")
print("\n")
# SMAPE comparison (symmetric_... is a project helper defined elsewhere).
print("Random Forest Model")
print("SMAPE Score {}".format(
    symmetric_mean_absolute_percentage_error(y_test, rfr.predict(X_test))))
print("ExtraTreesRegressor Model")
print("SMAPE Score {}".format(
    symmetric_mean_absolute_percentage_error(y_test, ext.predict(X_test))))
etr_y_predict = etr.predict(X_test) # 使用GradientBoostingRegressor训练模型,并对测试数据做出预测,结果存储在变量gbr_y_predict中。 gbr = GradientBoostingRegressor() gbr.fit(X_train, y_train) gbr_y_predict = gbr.predict(X_test) from sklearn.metrics import mean_absolute_error,mean_squared_error # 使用R-squared、MSE以及MAE指标对默认配置的随机回归森林在测试集上进行性能评估。 print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test)) print( 'The mean squared error of RandomForestRegressor:', mean_squared_error(y_test, rfr_y_predict)) print( 'The mean absoluate error of RandomForestRegressor:', mean_absolute_error(y_test, rfr_y_predict)) # 使用R-squared、MSE以及MAE指标对默认配置的极端回归森林在测试集上进行性能评估。 print('R-squared value of ExtraTreesRegessor:', etr.score(X_test, y_test)) print('The mean squared error of ExtraTreesRegessor:', mean_squared_error(y_test,etr_y_predict)) print('The mean absoluate error of ExtraTreesRegessor:', mean_absolute_error(y_test, etr_y_predict)) # 利用训练好的极端回归森林模型,输出每种特征对预测目标的贡献度。 print(zip(etr.feature_importances_, boston.feature_names)) featrue_importance = zip(etr.feature_importances_, boston.feature_names) print(np.sort(list(featrue_importance), axis= 0)) # 使用R-squared、MSE以及MAE指标对默认配置的梯度提升回归树在测试集上进行性能评估。 print('R-squared value of GradientBoostingRegressor:', gbr.score(X_test, y_test)) print('The mean squared error of GradientBoostingRegressor:', mean_squared_error(y_test, gbr_y_predict)) print('The mean absoluate error of GradientBoostingRegressor:', mean_absolute_error(y_test, gbr_y_predict)) # 许多业界从事商业分析系统开发和搭建的工作者更加青睐于集成模型, #并经常以这些模型的性能表现为基准,与新设计的其他模型性能进行比对。
# Train an extra-trees regressor on the concrete-strength dataset and
# report test-set R^2 plus a ranked list of feature importances.
import pandas as pd

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

# Load the data and keep columns 1..9 (drops the leading index column).
conc = pd.read_csv('concrete.csv')
conc = conc[conc.columns[1:10]]

# Target is compressive strength; every remaining column is a feature.
y = conc['strength']
X = conc.drop(['strength'], axis=1)

# Hold out 20% of the rows for testing, with a fixed shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=54, shuffle=True)

# Fit the forest and print its R^2 on the held-out split.
clf = ExtraTreesRegressor(
    n_estimators=382, max_depth=None, min_samples_split=2, random_state=7)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# Rank features by importance, highest first, rounded to 4 decimals.
dic = dict(zip(X.columns, clf.feature_importances_))
for feature, importance in sorted(dic.items(), key=lambda kv: kv[1], reverse=True):
    print(feature, round(importance, 4))
# One iteration of a hyper-parameter sweep: fit an ExtraTreesRegressor with
# the parameters supplied by the enclosing loop (nestt / maxff / maxdd /
# minss / sample / indx / writer / scores_list come from scope outside this
# view), log its feature importances and test score, write a per-run Excel
# sheet, and append a summary row for the final overview sheet.
forest = ExtraTreesRegressor(n_estimators=nestt, max_features = maxff, max_depth = maxdd, min_samples_leaf = minss, n_jobs = processorsIn, random_state=int(random.random()*200))
forest.fit(x_train, y_train)
importances = forest.feature_importances_
# Feature indices sorted by decreasing importance (not used further below).
indices = np.argsort(importances)[::-1]
#featureNames
feat_df = pd.DataFrame({'FeatureNm':x_train.columns})
feat_df['Importance'] = forest.feature_importances_
feat_df = pd.DataFrame(feat_df.sort_values(by='Importance', ascending=False))
# Show only the 20 most important features.
print(feat_df[:20])
scoress = forest.score(x_test,y_test)
# Sheet name encodes the run parameters plus R^2 scaled to an integer
# (score*1000) so each run gets a unique, sortable tab name.
shtname = 'F' +str(indx) + 'e' +str(nestt) + '_' +str(maxff)+ '_' +str(maxdd) + '_' +str(minss) + '_' + str(len(x_train)) + '_' + str(int(scoress*1000))
print('Runparams:' + str(shtname))
print('Number of observations in set:' + str(len(x_train)))
print("Score: ", int(scoress*1000))
# NOTE(review): in the overview columns below, 'scores' holds R^2*1000 and
# 'rsqr' actually holds the test MSE — the labels look misnamed/swapped;
# confirm before relying on the overview sheet.
scores_list.append([indx,nestt,maxff,maxdd,minss,len(x_train),int(scoress*1000),mean_squared_error(y_test,forest.predict(x_test)),sample])
print('----------------------')
print('----------------------')
# Convert the dataframe to an XlsxWriter Excel object.
feat_df.to_excel(writer, sheet_name=shtname)
indx += 1
score_df = pd.DataFrame(scores_list, columns = ['index','nest','maxfeatures','maxdepth','minss','obs','scores','rsqr','tableName'])
score_df.to_excel(writer, sheet_name='overview')
# Close the Pandas Excel writer and output the Excel file.
# Evaluate the three fitted ensemble regressors (rfr / etr / gbr) on the
# test split with R-squared, MSE and MAE.  Errors are reported in original
# target units by undoing the target scaler `ss_y` once up front.
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

_y_true = ss_y.inverse_transform(y_test)

# RandomForestRegressor (RFR)
_rfr_pred = ss_y.inverse_transform(rfr_y_predict)
print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test))
print('The mean squared error of RandomForestRegressor:',
      mean_squared_error(_y_true, _rfr_pred))
print('The mean absolute error of RandomForestRegressor:',
      mean_absolute_error(_y_true, _rfr_pred))

# ExtraTreesRegressor (ETR)
_etr_pred = ss_y.inverse_transform(etr_y_predict)
print('R-squared value of ExtraForestRegressor:', etr.score(X_test, y_test))
print('The mean squared error of ExtraForestRegressor:',
      mean_squared_error(_y_true, _etr_pred))
print('The mean absolute error of ExtraForestRegressor:',
      mean_absolute_error(_y_true, _etr_pred))

# Feature contributions of the extra-trees model, ascending by importance.
# Bare expression kept as in the original notebook export: it displays in
# Jupyter and is a harmless no-op in a plain script.
sorted(zip(etr.feature_importances_, boston.feature_names))

# GradientBoostingRegressor (GBR)
_gbr_pred = ss_y.inverse_transform(gbr_y_predict)
print('R-squared value of GradientBoostingRegressor:', gbr.score(X_test, y_test))
print('The mean squared error of GradientBoostingRegressor:',
      mean_squared_error(_y_true, _gbr_pred))
###随机森林回归模型### # 使用RandomForestRegressor模型自带的评估模块,并输出评估结果 print '随机森林回归模型预测准确度: ', rfr.score(X_test, y_test) # 使用R-squared、MSE和MAE指标对三种配置的支持向量机(回归)模型在相同测试集上进行性能评估 # 从sklearn.metrics依次导入r2_score、mean_squared_error以及mean_absoluate_error用于回归性能的评估 from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error # 使用r2_score模块,并输出评估结果 print '确定系数 R-squared : ', r2_score(y_test, rfr_y_predict) # 使用mean_squared_error模块,并输出评估结果 print '均值平方误差 Mean Squared Error: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)) # 使用mean_absolute_error模块,并输出评估结果 print '均值绝对误差 Mean Absoluate Error: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)) ###极端随机森林回归模型### # 使用ExtraTreesRegressor模型自带的评估模块,并输出评估结果 print '极端随机森林回归模型预测准确度: ', etr.score(X_test, y_test) # 使用R-squared、MSE和MAE指标对三种配置的支持向量机(回归)模型在相同测试集上进行性能评估 # 从sklearn.metrics依次导入r2_score、mean_squared_error以及mean_absoluate_error用于回归性能的评估 from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error # 使用r2_score模块,并输出评估结果 print '确定系数 R-squared : ', r2_score(y_test, etr_y_predict) # 使用mean_squared_error模块,并输出评估结果 print '均值平方误差 Mean Squared Error: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)) # 使用mean_absolute_error模块,并输出评估结果 print '均值绝对误差 Mean Absoluate Error: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)) ###梯度提升回归树模型### # 使用ExtraTreesRegressor模型自带的评估模块,并输出评估结果 print '梯度提升回归树模型预测准确度: ', gbr.score(X_test, y_test) # 使用R-squared、MSE和MAE指标对三种配置的支持向量机(回归)模型在相同测试集上进行性能评估 # 从sklearn.metrics依次导入r2_score、mean_squared_error以及mean_absoluate_error用于回归性能的评估
# Fit every candidate regressor on the training split, then report each
# one's R^2 on the held-out test split in a fixed order.
for _model in (reg8, reg1, reg2, reg3, ereg, reg4, reg5, reg6):
    _model.fit(X_train, y_train)
# reg7 (StackingRegressor) is intentionally left unfitted, as in the
# original run.

for _label, _reg in (
        ("GradientBoostingRegressor", reg1),
        ("RandomForestRegressor", reg2),
        ("LinearRegression", reg3),
        ("VotingRegressor", ereg),
        ("AdaBoostRegressor", reg4),
        ("BaggingRegressor", reg5),
        ("ExtraTreesRegressor", reg6),
        # ("StackingRegressor", reg7),
        ("XGBRegressor", reg8),
):
    print(_label + ":", _reg.score(X_test, y_test))

# Mean absolute error of the XGBoost model on the test split.
XGBpredictions = reg8.predict(X_test)
MAE = mean_absolute_error(y_test, XGBpredictions)
print('XGBoost validation MAE = ', MAE)

# Accumulator for (test, prediction) pairs; the CSV export below is
# currently disabled.
xx = []
# try:
#     file = open('regression.csv', 'w', newline='')
#     file_w = csv.writer(file)
# except Exception:
#     print('regression.csv open faild')
#     exit()
# names = ['test', 'prediction']
# file_w.writerow(names)