from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, r2_score
import RegscorePy

# Read the training table and slice it positionally: the first 51 columns
# become the feature matrix, column 51 becomes the target vector.
df = pd.read_csv('training_dataset.csv')
data = df.values
x, y = data[:, :51], data[:, 51]

# Forest configuration. An earlier search printed:
# ('Best parameters:', {'bootstrap': False, 'min_samples_leaf': 1,
#  'n_estimators': 1000, 'max_features': 'sqrt', 'min_samples_split': 2,
#  'max_depth': None})
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=None,
    max_features=9,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42,
)

print(df.columns.tolist())

# Target values to predict, taken by column name.
labels = np.array(df['Sum_NOK'])

# Everything except the target column is a feature.
df = df.drop(columns='Sum_NOK')

# Keep the feature names before the frame is turned into a bare array.
feature_list = df.columns.tolist()
df = np.array(df)
for m in models: m.fit(train.ix[:, [2,3,4,5] ] ,train['Target']) preds = m.predict(test.ix[:, [2,3,4,5]]) mae_amt = mean_absolute_error(preds,test['Target']) mae_amts.append(mae_amt) m.fit(train.ix[:, [1,3,4,5] ] ,train['DaysSinceLast']) preds = m.predict(test.ix[:, [1,3,4,5]]) mae_gap = mean_absolute_error(preds,test['DaysSinceLast']) mae_gaps.append(mae_gap) mae_gaps.append(mean_absolute_error(test['DaysSinceLast2'],test['DaysSinceLast'])) mae_amts.append(mean_absolute_error(test['LastPayment'],test['Target'])) return (mae_gaps,mae_amts) models = [ Ridge(alpha=0.1),GradientBoostingRegressor(),RandomForestRegressor()] model_names = ['Ridge Regression','Gradient Boosted Tree','Random Forest', 'Benchmark'] train,test = get_data() mae_gaps,mae_amts = do_estimation(models,train,test) fig1 = pl.figure('Payment Amount MAE') ax1 = pl.subplot(111) ax1.bar(range(len(model_names)),mae_amts,width=0.5) ax1.set_xticks(np.arange(len(model_names))+0.25) ax1.set_xticklabels(model_names) ax1.set_title('Payment Amount MAE') fig1.savefig('MAE_Payment_Amount.png')
# In[791]:

# Shallow decision tree restricted to four hand-picked predictors.
predictors = ['Item_MRP', 'Outlet_Type_0', 'Outlet_5', 'Years_of_operation']
alg4 = DecisionTreeRegressor(min_samples_leaf=150, max_depth=8)
modelfit(alg4, traindf, testdf, predictors, target, IDcol, 'alg4.csv')
coef4 = pd.Series(alg4.feature_importances_, index=predictors)
coef4 = coef4.sort_values(ascending=False)
coef4.plot(title='Feature importances', kind='bar')

# In[797]:

from sklearn.ensemble import RandomForestRegressor

# Random forest on every column that is neither the target nor an ID column.
predictors = [col for col in traindf.columns if col not in [target] + IDcol]
alg5 = RandomForestRegressor(
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=100,
    n_jobs=4,
)
modelfit(alg5, traindf, testdf, predictors, target, IDcol, 'alg5.csv')
coef5 = pd.Series(alg5.feature_importances_, index=predictors)
coef5 = coef5.sort_values(ascending=False)
coef5.plot(title='Feature Importances', kind='bar')

# In[799]:

# Same predictor set, larger and slightly deeper forest.
predictors = [col for col in traindf.columns if col not in [target] + IDcol]
alg6 = RandomForestRegressor(
    n_estimators=400,
    max_depth=6,
    min_samples_leaf=100,
    n_jobs=4,
)
modelfit(alg6, traindf, testdf, predictors, target, IDcol, 'alg6.csv')
coef6 = pd.Series(alg6.feature_importances_, index=predictors)
coef6 = coef6.sort_values(ascending=False)
coef6.plot(title='Feature Importances', kind='bar')
'n_estimators': [300, 350, 400, 450], 'learning_rate': [0.5, 1, 2, 4, 6] } clf = GridSearchCV(model, para_dict, cv=4, scoring='r2') clf.fit(X, y1) clf.best_params_ model = AdaBoostRegressor(n_estimators=350, learning_rate=2) model.fit(X_train, y_train) y_pred = model.predict(X_test) print('MSE of Adaboost: ', mean_squared_error(y_test, y_pred)) print('Cross validation score (cv=4) of Adaboost:', cross_val_score(model, X, y1, cv=4).mean()) #Random Forest model = RandomForestRegressor(max_depth=3) para_dict = {'n_estimators': [20, 50, 80]} clf = GridSearchCV(model, para_dict, cv=4, scoring='r2') clf.fit(X, y1) clf.best_params_ clf.best_score_ model = RandomForestRegressor(n_estimators=50, max_depth=3) model.fit(X_train, y_train) y_pred = model.predict(X_test) print('MSE of Random Forest: ', mean_squared_error(y_test, y_pred)) print('Cross validation score (cv=4) of Random Forest:', cross_val_score(model, X, y1, cv=4).mean()) #SVR para_dict = {
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

# Synthetic regression problem: four features, two of them informative.
X, y = make_regression(
    n_features=4,
    n_informative=2,
    shuffle=False,
    random_state=0,
)

# Depth-limited forest; `fit` returns the estimator itself, so the fitted
# model can be bound in a single statement.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=2,
    random_state=0,
).fit(X, y)

# Show the per-feature importances and the prediction at the origin.
print(regr.feature_importances_)
print(regr.predict([[0, 0, 0, 0]]))
def reconstructRF(start=462, stop=463):
    """
    Reconstruct surge at tide gauges with a random forest regression.

    For every file index in ``range(start, stop)`` of the lagged-predictor
    directory the function:

    1. loads the predictors and appends squared and cubed wind terms,
    2. standardizes the predictors and merges them with the observed surge
       on the common dates,
    3. fits a PCA (95 % explained variance) and a 50-tree random forest on
       the overlapping period,
    4. projects *all* complete predictor rows with the SAME fitted scaler
       and PCA and predicts surge for them,
    5. derives a per-row interval from the spread of the individual trees,
    6. writes the result to ``dir_out`` under the tide gauge file name.

    Parameters
    ----------
    start, stop : int
        Half-open range of positions in ``os.listdir(dir_in)`` to process.
        The defaults reproduce the previously hard-coded 462..463 range.

    Returns
    -------
    None
        Results are written as CSV files; progress is printed.
    """
    #import packages
    import os
    import numpy as np
    import pandas as pd
    from datetime import datetime
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/rfReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    def pred_ints(model, X_pca_recon, percentile=95):
        """
        Build an interval per row from the individual trees of `model`.

        Returns the lower and upper percentile (1-D arrays) of the
        per-tree predictions for every row of `X_pca_recon`.
        """
        tree_preds = np.vstack(
            [tree.predict(X_pca_recon) for tree in model.estimators_]).T
        tail = (100 - percentile) / 2.
        err_down = np.percentile(tree_preds, tail, axis=1)
        err_up = np.percentile(tree_preds, 100 - tail, axis=1)
        return err_down, err_up

    #looping through the requested tide gauges
    for tg in range(start, stop):
        os.chdir(dir_in)
        # NOTE(review): os.listdir() order is arbitrary, so the index ->
        # file mapping is only stable on an unchanged directory.
        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model); suffixes keep
        #the column labels unique (values are unchanged)
        wndTerms = [col for col in pred.columns if col.startswith('wnd')]
        wnd_sqr = (pred[wndTerms]**2).add_suffix('_sqr')
        wnd_cbd = (pred[wndTerms]**3).add_suffix('_cbd')
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data (first column is the date)
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized],
                                      axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows (first occurrence is kept)
        surge = surge.drop_duplicates(subset='ymd').reset_index(drop=True)

        #adjust surge time format to match that of pred
        surge_time = pd.DataFrame(
            [str(datetime.strptime(day, '%Y-%m-%d')) for day in surge['ymd']],
            columns=['date'])
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2],
                              on='date', how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #remove rows that contain nans
        pred_surge = pred_surge.dropna().reset_index(drop=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue

        #prepare data for training: drop the date (first) and the surge
        #(last) columns from the feature matrix; the target is 1-D
        X = pred_surge.iloc[:, 1:-1]
        surge_obs = pred_surge['surge'].to_numpy()

        #apply PCA keeping 95% of the variance
        pca = PCA(0.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        longitude = surge['lon'].iloc[0]
        latitude = surge['lat'].iloc[0]

        #%%
        #surge reconstruction: every predictor row without nans
        pred_for_recon = pred.dropna().reset_index(drop=True)

        #reuse the scaler and PCA fitted above so the reconstruction
        #features live in the same space the forest is trained on
        dat_recon = pred_for_recon.iloc[:, 1:]
        X_recon = pd.DataFrame(scaler.transform(dat_recon),
                               columns=dat_recon.columns)
        X_pca_recon = pca.transform(X_recon)

        #%%
        #model preparation
        #defining the rf model with number of trees and minimum leaves
        rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=1,
                                   random_state=29)
        rf.fit(X_pca, surge_obs)

        #compute 95% prediction intervals
        err_down, err_up = pred_ints(rf, X_pca_recon, percentile=95)

        #reconstructed surge goes here
        truth = rf.predict(X_pca_recon)

        #share (in %) of forest predictions inside their own tree interval
        inside = (err_down <= truth) & (truth <= err_up)
        print(float(inside.sum()) * 100 / len(truth), '\n')

        #final dataframe; the misspelled 'surge_reconsturcted' label is
        #kept because existing output files use it
        final_dat = pd.DataFrame({
            'date': pred_for_recon['date'],
            'surge_reconsturcted': truth,
            'pred_int_lower': err_down,
            'pred_int_upper': err_up,
            'lon': longitude,
            'lat': latitude
        })

        #save df as csv - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)

        #cd to dir_in
        os.chdir(dir_in)
def rf_r_test(n=10):
    """Fit an ensemble of `n` single-tree forests on Friedman #1 data.

    The first 200 of 1200 generated samples are used for fitting, the
    remaining 1000 for evaluation.

    Parameters
    ----------
    n : int
        Number of single-tree `RandomForestRegressor` members; member `i`
        is seeded with ``random_state=i``.

    Returns
    -------
    Whatever ``RMSE(X_test, y_test, ens)`` returns (helper defined
    elsewhere in the project; presumably the test RMSE).
    """
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]

    # min_samples_split must be >= 2: current scikit-learn rejects 1, and a
    # node holding a single sample could never be split anyway.
    members = [
        RandomForestRegressor(n_estimators=1,
                              max_depth=None,
                              min_samples_split=2,
                              random_state=i) for i in range(n)
    ]
    ens = EnsembleRegressor(members).fit(X_train, y_train)
    return RMSE(X_test, y_test, ens)
# Hold out 20% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(X_opt,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature scaling (disabled):
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# sc_Y = StandardScaler()
# y_train = sc_Y.fit_transform(y_train.reshape(-1,1))

# Fitting Random Forest Regression to the training split only. Fitting on
# the full (X_opt, y) would let the model see the held-out rows and make
# the test-set predictions below meaningless.
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(X_train, y_train)

# Applying k-fold cross validation (10 folds) on the training split;
# cross_val_score works on clones, so `regressor` itself stays fitted.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=regressor,
                             X=X_train,
                             y=y_train,
                             cv=10)
accuracies.mean()
accuracies.std()

# Predicting the training and test set results
y_train_pred = regressor.predict(X_train)
y_pred = regressor.predict(X_test)
#y_pred = sc_Y.inverse_transform(y_pred)

#accuracy measurement
test_size=0.25, random_state=42) # Create a position map position_map = {} positions = X_observed.position.unique() for i in range(len(positions)): position_map[i] = positions[i] # Declare the position feature importance map position_feature_importance_map = {} # Create a list of models to compare and select the best model to use for imputing price with values of 0. models = [ KNeighborsRegressor(n_neighbors=knr_n_neighbours), RandomForestRegressor(n_estimators=rf_xgb_n_estimators), XGBRegressor(n_estimators=rf_xgb_n_estimators, max_depth=7) ] # Declare the subset models map sub_models_map = {} # Get all the imputations predicted by each regressor. all_imputations = [] reordered_y_train = [] reordered_y_test = [] for i in range(len(models)): model_imputations = [] for position in positions: # Create subsets by using position as price varies by the player's playing position. sub_X_observed_train = X_observed_train[X_observed_train.position ==
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict

# Work on the first 150 samples of the diabetes data set.
diabetes = datasets.load_diabetes()
X, y = diabetes.data[:150], diabetes.target[:150]

# Four regressors to compare.
model1 = LinearRegression()
model2 = SVR(gamma='auto')
model3 = DecisionTreeRegressor()
model4 = RandomForestRegressor(n_estimators=20)
models = [model1, model2, model3, model4]

# For each model (numbered from 1) print the out-of-fold predictions for
# 2, 3 and 4 folds.
# NOTE(review): loop nesting of the separator prints is reconstructed from
# a whitespace-collapsed source - confirm against the original layout.
for x, m in enumerate(models, start=1):
    for n in (2, 3, 4):
        fold_predictions = cross_val_predict(m, X, y, cv=n)
        print('result of model number : ', x, ' for cv value ', n, ' is \n',
              fold_predictions)
        print('-----------------------------------')
    print('=====================================')
    print('=====================================')
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

#Example
# The split criterion is left at its default (mean squared error): the
# explicit spelling 'mse' is rejected by recent scikit-learn releases while
# the default means the same thing in every release.
_models = [
    RandomForestRegressor(n_estimators=200, max_depth=20, random_state=42),
    DecisionTreeRegressor(max_depth=11, random_state=42),
    GradientBoostingRegressor(n_estimators=200, max_depth=12)
]

learning_mods = pd.DataFrame()
temp = {}


def plot_different_models(models):
    """Fit each model, report its scores and plot the test R2 as bars.

    Relies on the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` being defined before the call.

    Parameters
    ----------
    models : iterable of estimators
        Objects exposing ``fit``, ``predict`` and ``score``.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by the model class name, with the
        column ``R2_Price`` holding the test-set R2.
    """
    rows = []
    for model in models:
        print(model)
        model.fit(X_train, y_train)
        # Predict once and reuse the value for both the table and the log.
        test_r2 = r2_score(y_test, model.predict(X_test))
        print('score on training', model.score(X_train, y_train))
        print('r2 score', test_r2)
        rows.append({'Model': type(model).__name__, 'R2_Price': test_r2})

    # Build the table in one go; DataFrame.append no longer exists in
    # pandas 2, and rebinding the module-level frame from inside the
    # function would raise UnboundLocalError.
    results = pd.DataFrame(rows, columns=['Model', 'R2_Price'])
    results.set_index('Model', inplace=True)

    fig, axes = plt.subplots(ncols=1, figsize=(10, 4))
    results.R2_Price.plot(ax=axes, kind='bar', title='R2_Price')
    plt.show()
    return results
# Line plot: true values in red, decision-tree predictions in blue.
plt.plot(range(len(test_y)), test_y, 'r', label='DTTrue Data')
plt.plot(range(len(predictDT)), predictDT, 'b', label='DTPredict Data')
plt.legend()

# Visualization (scatter plot) with the y = x reference line.
plt.subplot(122)
plt.scatter(test_y, predictDT)
_diagonal = [test_y.min(), test_y.max()]
plt.plot(_diagonal, _diagonal, 'k--')
plt.xlabel('DTTrue')
plt.ylabel('DTPredict')
plt.show()

#########################################################
# Random forest regression
from sklearn.ensemble import RandomForestRegressor

randomForest = RandomForestRegressor()
randomForest.fit(train_x, train_y)

# Predict on the test features
predictRF = randomForest.predict(test_x)
# print("prediction")
# print(predictRF)
# print("ground truth")
# print(test_y)

# Evaluate: mean squared error and its square root
MSE = metrics.mean_squared_error(predictRF, test_y)
RMSE = np.sqrt(MSE)
print("RandomForestRegressor 模型MSE: %.5f" % MSE)
print("RandomForestRegressor 模型RMSE: %.5f\n" % RMSE)

plt.figure(figsize=(15, 5))
x_test_rf1 = x_test.loc[:, pred_cols_rf1]
y_test_rf1 = y_test.loc[:, 'fulfill_duration']

# Sweep the tree depth to find a value that does not overfit:
# depths 1..10 in steps of one, then 12..30 in steps of two.
train_scores, test_scores = [], []
train_rmse, test_rmse = [], []
max_depths = [*range(1, 11), *range(12, 32, 2)]

for dpth in max_depths:
    print(f'Calculating results for max depth of {dpth}')
    mdl_rf1 = RandomForestRegressor(max_depth=dpth,
                                    n_estimators=20,
                                    random_state=RANDOM_SEED)
    mdl_rf1.fit(x_train_rf1, y_train_rf1)

    # Score (as returned by the estimator) on both splits.
    train_scores.append(mdl_rf1.score(x_train_rf1, y_train_rf1))
    test_scores.append(mdl_rf1.score(x_test_rf1, y_test_rf1))

    # Root mean squared error on both splits.
    mse_train = mean_squared_error(y_train_rf1, mdl_rf1.predict(x_train_rf1))
    train_rmse.append(np.sqrt(mse_train))
    mse_test = mean_squared_error(y_test_rf1, mdl_rf1.predict(x_test_rf1))
    test_rmse.append(np.sqrt(mse_test))

from matplotlib.legend_handler import HandlerLine2D

# Plot the RMSEs for training and test to see whether the training error
# keeps falling while the test error levels off.
[line1] = plt.plot(max_depths, train_rmse, 'b', label='Training Data RMSE')
test_data = pd.merge(test_data, new_series[[ 'shop_id', 'item_id', 'date_block_num', 'item_cnt_prev' + str(i) ]], how='left') test_data['item_cnt_prev' + str(i)] = series_agg['item_cnt_prev' + str(i)].fillna(0) def rmse(y, y_hat): return np.sqrt(np.mean((y_hat - y)**2)) #without any parameter model = RandomForestRegressor() model.fit(train_features, train_targets) res = model.predict(test_data) res_train = model.predict(train_features) res_train = model.predict(train_features) train_error = rmse(res_train, train_targets) print(train_error) #Train error= 0.754 #Test error= 4.825 #with maxdepth = 15 model_2 = RandomForestRegressor(max_depth=15) model_2.fit(train_features, train_targets) res_2 = model_2.predict(test_data) res_2_train = model_2.predict(train_features)
labels_iq = labels_iq.tail(300)

### 4. Execute the regressor and make predictions

## San Juan
data_features_test_sj = data_features_test.loc[data_features_test['city'] ==
                                               'sj']

# Parametrization
n_estimators = 50
max_depth = None
max_features = len(features_selected_sj)

# Random Forest regressor
# NOTE(review): criterion='mae' is the spelling of older scikit-learn
# releases (newer ones call it 'absolute_error'); left untouched because the
# installed version is not visible from here.
regressor_sj = RandomForestRegressor(n_estimators=n_estimators,
                                     max_depth=max_depth,
                                     max_features=max_features,
                                     criterion='mae',
                                     random_state=0)
regressor_sj.fit(features_sj, labels_sj)

# Prediction, rounded to whole case counts
pred_sj = [
    int(round(x)) for x in regressor_sj.predict(
        data_features_test_sj[features_selected_sj])
]
data_features_test_sj = data_features_test_sj.assign(total_cases=pred_sj)

## Iquitos
data_features_test_iq = data_features_test.loc[data_features_test['city'] ==
                                               'iq']

# Normalization of the data: learn the scale from the TRAINING features and
# apply that same scale to the test features. Fitting the scaler once per
# data set would map train and test values onto different scales.
# NOTE(review): assumes features_iq holds exactly the features_selected_iq
# columns (as features_sj does for San Juan) - confirm.
max_abs_scaler = preprocessing.MaxAbsScaler()
features_iq_norm = max_abs_scaler.fit_transform(features_iq)
data_features_test_iq_norm = max_abs_scaler.transform(
    data_features_test_iq[features_selected_iq])
df = pd.concat([X_split, connectomes], axis=1) return df, y_split df, y_train = load_combine_data(X_train, merged_data, dmri) X_train_post_hoc = df df_test, y_test = load_combine_data(X_test, merged_data, dmri) X_test_post_hoc = df_test df = df.drop(columns=['eid', '20016-2.0'], axis=1) df_test = df_test.drop(columns=['eid', '20016-2.0'], axis=1) estimator = RandomForestRegressor(n_estimators=250, criterion='mse', n_jobs=-1, verbose=1, random_state=0) pipeline = Pipeline([('imputation', make_union(SimpleImputer(strategy="median"), MissingIndicator())), ('estimator', estimator)]) cv = ShuffleSplit(n_splits=100, test_size=0.1, random_state=0) param_grid = { 'estimator__max_depth': [5, 10, 20, 40, None], 'estimator__max_features': [1, 5, 'log2', 'sqrt', 'auto', None] } grid_search = GridSearchCV(pipeline,
ExtraTreesClassifier) from sklearn.linear_model import (BayesianRidge, RidgeClassifier, SGDRegressor, SGDClassifier, LinearRegression, LogisticRegression, Lasso, ElasticNet) regression_options = { 'MLPRegressor': { 'model': MLPRegressor(learning_rate='adaptive', max_iter=500, learning_rate_init=.005), 'name': 'MLP NN' }, 'RandomForestRegressor': { 'model': RandomForestRegressor(n_estimators=20, max_features=2), 'name': 'Random Forest' }, 'BayesianRidge': { 'model': BayesianRidge(), 'name': 'Bayesian Ridge' }, 'Lasso': { 'model': Lasso(), 'name': 'Lasso Regressor' }, 'GradientBoostingRegressor': { 'model': GradientBoostingRegressor(max_features=2), 'name': 'Gradient Boost' }, 'ElasticNet': {
# Depth-2 regression tree fitted on feature columns 6 and 7 only.
two_feature_data = hoseing["data"][:, [6, 7]]
dtr = tree.DecisionTreeRegressor(max_depth=2)
dtr.fit(two_feature_data, hoseing["target"])

# Export the fitted tree as Graphviz dot source (returned as a string).
dot_data = tree.export_graphviz(
    dtr,
    out_file=None,
    feature_names=hoseing["feature_names"][6:8],
    filled=True,
    rounded=True,
    impurity=False,
)

import os

# Make the Graphviz binaries reachable for pydotplus.
os.environ["PATH"] = (os.environ["PATH"] + os.pathsep +
                      'D:/Program Files (x86)/Graphviz2.38/bin/')

import pydotplus

# Render the tree to a PNG, recolouring the node at position 7.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.get_nodes()[7].set_fillcolor("#FFF2DD")
graph.write_png("./res.png")

from sklearn.model_selection import train_test_split

# Split into training and test sets (10% held out).
data_train, data_test, target_train, target_test = train_test_split(
    hoseing["data"],
    hoseing["target"],
    test_size=0.1,
    random_state=42,
)

# Unrestricted decision tree on all features.
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(data_train, target_train)
print(dtr.score(data_test, target_test))

from sklearn.ensemble import RandomForestRegressor

# Random forest with library-default hyper-parameters.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(data_train, target_train)
print(rfr.score(data_test, target_test))
# Scale features with the already-created sc_x and targets with a new
# MinMaxScaler; both are fitted on the training split only.
X_train = sc_x.fit_transform(x_train)
X_test = sc_x.transform(x_test)
sc_y = MinMaxScaler()
Y_train = sc_y.fit_transform(y_train)
Y_test = sc_y.transform(y_test)

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Fully connected network: 8 inputs -> 10 -> 500 -> 300 -> 1 (sigmoid).
model = Sequential()
model.add(Dense(units=10, activation='relu', input_shape=(8,)))
model.add(Dense(units=500, activation='relu'))
model.add(Dense(units=300, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))
# This is a regression target, so track mean absolute error; classification
# accuracy is meaningless for continuous values.
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mae'])
model.fit(X_train, Y_train, batch_size=5, epochs=1000)

# NOTE(review): `model` is rebound here, so the network trained above is
# discarded and only the random forest is used for the predictions below.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=300, random_state=0)
model.fit(X_train, Y_train)

# Predict for a second file (its first row is skipped) and map the
# predictions back to the original target scale.
dataset1 = pd.read_csv('maths - Copy.csv')
z = dataset1.iloc[1:, :].values
y_pred = model.predict(sc_x.transform(z))
# The forest returns a 1-D array for a single target, while
# inverse_transform requires a 2-D (n_samples, n_targets) array.
Y_pred = sc_y.inverse_transform(y_pred.reshape(len(y_pred), -1))
Y_pred
## label scaling: divide every price by the largest price
MaxPrice = max(Prices)
Prices = Prices / MaxPrice

# Hold out 5% of the samples for testing.
xtrain, xtest, ytrain, ytest = train_test_split(TrainVector,
                                                Prices,
                                                test_size=0.05,
                                                random_state=42)

from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
gbr.fit(xtrain, ytrain)

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(xtrain, ytrain)


def AccuracyPlotter(trueLabels, predictedLabels):
    """Show a scatter plot of predicted labels against true labels.

    Parameters
    ----------
    trueLabels : array-like
        Values plotted on the x axis.
    predictedLabels : array-like
        Values plotted on the y axis.
    """
    plt.scatter(trueLabels, predictedLabels)
    plt.show()


# Only the random forest is visualised here; `gbr` is fitted but not
# plotted in this section.
AccuracyPlotter(ytest, rfr.predict(xtest))
对基于CART的随机森林的调参,主要有: 1,树的个数 2,树的最大深度 3,内部节点最少样本数与叶节点最少样本数 4,特征个数 此外,调参过程中选择的误差函数是均值误差,5倍折叠 ''' X, y = trainData[numFeatures2], trainData['rec_rate'] ''' 网格搜索参数 ''' param_test1 = {'n_estimators': range(10, 80, 5)} #从10-80每5格取一个值 gsearch1 = GridSearchCV(estimator=RandomForestRegressor(min_samples_split=50, min_samples_leaf=10, max_depth=8, max_features='sqrt', random_state=10), param_grid=param_test1, scoring='neg_mean_squared_error', cv=5) gsearch1.fit(X, y) print(gsearch1.best_params_, gsearch1.best_score_) best_n_estimators = gsearch1.best_params_['n_estimators'] #估计出的最佳数个数 param_test2 = { 'max_depth': range(3, 21), 'min_samples_split': range(10, 100, 10) } gsearch2 = GridSearchCV(estimator=RandomForestRegressor( n_estimators=best_n_estimators,
################################################## Random Forest Regressor #####################################################
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Search space; only the number of trees is currently tuned.
param_grid = {
    "n_estimators": [100, 200, 300],  # default=100
    # "max_features" : ["auto", "sqrt", "log2"], #default=auto
    # "min_samples_split" : [2,4,8], #default=2
    # "bootstrap": [True, False], #default=True
}

# 5-fold grid search scored by negative mean squared error.
RFR = RandomForestRegressor()
RFR_cv = GridSearchCV(RFR,
                      param_grid,
                      scoring="neg_mean_squared_error",
                      cv=5)
RFR_cv.fit(X_train, Y_train)
print(RFR_cv.best_score_, RFR_cv.best_params_)

# Feature Importance of the best estimator, largest first
feat_labels = X.columns.values
importances = RFR_cv.best_estimator_.feature_importances_
indices = np.argsort(importances)
rf_importance = pd.DataFrame({
    "features": feat_labels,
    "importances": importances,
}).sort_values("importances", ascending=False)

plt.title('RF Feature Importance')
] x = dataframe.iloc[:, :-1].values y = dataframe[['MEDV']].values x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=1) from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error from sklearn.metrics import r2_score forest = RandomForestRegressor(n_estimators=1000, criterion='mse', random_state=1, n_jobs=-1) forest.fit(x_train, y_train) y_train_pred = forest.predict(x_train)[:, np.newaxis] y_test_pred = forest.predict(x_test)[:, np.newaxis] print('训练集的均方误差:', mean_squared_error(y_train, y_train_pred)) print('测试集的均方误差:', mean_squared_error(y_test, y_test_pred)) print('训练集的决定系数:', r2_score(y_train, y_train_pred)) print('测试集的决定系数:', r2_score(y_test, y_test_pred)) plt.scatter(y_train_pred, y_train_pred - y_train, color='black',
from sklearn.ensemble import AdaBoostRegressor
#from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
# svm regressor
from sklearn.svm import SVR

print("Done ...")

# (display name, estimator) pairs for every regressor to be compared
print("\n*** Init Models Lists ***")
lModels = [
    ("LinearRegression ", LinearRegression()),
    ("RidgeRegression ", Ridge(alpha=10)),
    ("LassoRegression ", Lasso(alpha=1)),
    ("ElasticNet ", ElasticNet(alpha=1)),
    ("Random Forest ", RandomForestRegressor(random_state=707)),
    ("SVM Regressor ", SVR(C=1.0, epsilon=0.2)),
    ("DecTree Regressor ", DecisionTreeRegressor(random_state=707)),
    ("GradientBoostingRegressor ",
     GradientBoostingRegressor(random_state=707)),
    ("AdaBoostRegressor ",
     AdaBoostRegressor(random_state=707, n_estimators=100)),
]

# Echo each pair so the configuration shows up in the log.
for vModel in lModels:
    print(vModel)
print("Done ...")

################################
# Regression - Cross Validation
###############################
def randomforest(data, random_state=None):
    """
    Implement RandomForest and report graphical representation of useful
    results.

    Random forests with 1, 2, 3, 5, 10, 30, 60 and 100 trees are trained to
    predict the "kills" column. For every forest size the test MSE, the
    train/test scores and the mean absolute error are collected and written
    to a CSV file together with a mean-prediction baseline. Feature
    importances and predictions of the last (100-tree) forest and the
    per-size metrics are saved as PNG charts.

    :param data: modified and clean dataset for implementing RandomForest;
        must contain the columns "kills", "player_slot" and "match_id"
    :param random_state: optional seed forwarded to the train/test split
        and to every forest; the default (None) keeps the run unseeded
    :return: None; results are written below ``user_files/``
    """
    # create dummy variables for categorical variables
    data = pd.get_dummies(data)
    # convert and get the label in Numpy array as required to implement
    # randomforest
    labels = np.array(data["kills"])
    # get features except for labels and unrelated ones
    features = data.drop(["kills", "player_slot", "match_id"], axis=1)
    # store the column names of features
    feature_list = list(features.columns)
    # convert features in Numpy array as required to implement randomforest
    features = np.array(features)
    # split the data into testing and training groups for once
    train_features, test_features, train_labels, test_labels = \
        train_test_split(features, labels, test_size=0.2,
                         random_state=random_state)

    # The baseline prediction is the historical average, i.e. the mean of
    # the training labels only, so the baseline never sees the test rows
    baseline_preds = train_labels.mean()
    # get absolute mean of baseline error
    baseline_errors = round(np.mean(abs(baseline_preds - test_labels)), 2)

    # create list to store values and are used to have an output dataframe
    # in CSV
    num_tree = [1, 2, 3, 5, 10, 30, 60, 100]
    mse_list = []
    train_accuracy = []
    test_accuracy = []
    mean_absolute_error_list = []

    # create RandomForest with different number of estimators
    for tree in num_tree:
        # build `tree` decision trees for this random forest model
        rf_model = RandomForestRegressor(n_estimators=tree,
                                         random_state=random_state)
        # train the model using randomforest
        rf_model.fit(train_features, train_labels)
        # get predictions of kills from the model created
        predictions = rf_model.predict(test_features)
        # store MSE, train accuracy, test_accuracy to the lists
        mse_list.append(mean_squared_error(test_labels, predictions))
        train_accuracy.append(rf_model.score(train_features, train_labels))
        test_accuracy.append(rf_model.score(test_features, test_labels))
        # get the absolute errors of the prediction and store to list
        errors = abs(predictions - test_labels)
        mean_absolute_errors = round(np.mean(errors), 3)
        mean_absolute_error_list.append(mean_absolute_errors)

    # store the relevant values from RandomForests into dataframe
    tree_estimator_data = pd.DataFrame(
        data={
            "n_estimator": num_tree,
            "MSE": mse_list,
            "train_accuracy": train_accuracy,
            "test_accuracy": test_accuracy,
            "mean_absolute_error": mean_absolute_error_list
        })
    # add baseline error into dataframe to compare with the mean
    # absolute errors
    tree_estimator_data["Baseline Error"] = baseline_errors
    # output data collected from RandomForests into dataframe for
    # easy access
    tree_estimator_data.to_csv("user_files/csv_files/randomforest_trees.csv",
                               index=False)

    # store the feature importance in series with indexes indicating the
    # name of features; rf_model is the last forest of the loop (100 trees)
    feature_imp = pd.Series(rf_model.feature_importances_,
                            index=feature_list).sort_values(ascending=False)
    # keep exactly the ten most important features
    top_feature_imp = feature_imp.iloc[:10]

    # plot importance feature graph using horizontal bar chart
    num_feature = np.arange(len(top_feature_imp.index))
    performance = np.array(list(top_feature_imp))
    fig, ax = plt.subplots()
    ax.barh(num_feature, performance, align="center")
    ax.set_yticks(num_feature)
    ax.set_yticklabels(top_feature_imp.index)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel("Feature Importance")
    ax.set_title("Important Features in Predicting Number of Kills in Dota2")
    fig.savefig("user_files/image_files/Important_Features.png",
                bbox_inches="tight")

    # plot prediction vs actual kill graph
    # create a dataframe with predictions of and actual data of # of kills
    # (predictions come from the last forest of the loop)
    predictions_vs_actual = pd.DataFrame(data={
        "prediction": predictions,
        "label": test_labels
    })
    sns.relplot(x="label", y="prediction", data=predictions_vs_actual)
    x = np.linspace(data["kills"].min(), data["kills"].max(), 100)
    y = x
    plt.plot(x, y, "-r", label="45-degree line")
    plt.xlabel("Actual Number of Kills")
    plt.ylabel("Predicted Number of Kills")
    plt.title("Actual data vs. Prediction on Number of Kills")
    plt.savefig("user_files/image_files/prediction_actual_kills.png",
                bbox_inches="tight")

    # plot tree number vs test accuracy graph
    sns.relplot(x="n_estimator",
                y="test_accuracy",
                kind="line",
                data=tree_estimator_data)
    plt.xlabel("Number of Trees in RandomForest")
    plt.ylabel("Test Accuracy")
    plt.title("Test Accuracy vs. Number of Estimators in RandomForest")
    plt.savefig("user_files/image_files/tree_test_accuracy.png",
                bbox_inches="tight")

    # plot tree number vs MSE graph
    sns.relplot(x="n_estimator", y="MSE", kind="line",
                data=tree_estimator_data)
    plt.xlabel("Number of Trees in RandomForest")
    plt.ylabel("Mean Squared Error")
    plt.title("MSE vs. Number of Estimators in RandomForest")
    plt.savefig("user_files/image_files/MSE.png", bbox_inches="tight")

    # plot error difference vs number of estimators graph
    tree_estimator_data["dif_errors"] = (
        tree_estimator_data["Baseline Error"] -
        tree_estimator_data["mean_absolute_error"])
    sns.relplot(x="n_estimator",
                y="dif_errors",
                kind="line",
                data=tree_estimator_data)
    plt.xlabel("Number of Trees in RandomForest")
    plt.ylabel("Error Difference")
    plt.title(" Error Difference vs. Number of Estimators in RandomForest")
    plt.savefig("user_files/image_files/error_diff.png", bbox_inches="tight")
def rf_tuning(n_estimators=[10, 11, 1],
              k=5,
              train_data_path='../data/training_data.csv',
              save_model=False,
              tracking_uri="http://0.0.0.0:5000"):
    """Grid-search the tree count of a scaled random-forest regressor.

    A k-fold ``GridSearchCV`` is run over the candidate tree counts, then
    one pipeline per candidate is refit on the training split and scored
    on the test split. Error curves are plotted, parameters and metrics
    are logged to mlflow and a textual summary is printed.

    Args:
        n_estimators: ``[start, stop, step]`` describing the candidate
            tree counts (``stop`` is exclusive, as in ``range``).
        k: Number of cross-validation folds.
        train_data_path: Path passed to ``MiningDataReader``.
        save_model: When true, refit the selected configuration on
            train + test data, plot its feature importances and log the
            plots and the model with mlflow.
        tracking_uri: Not referenced inside this function.

    Returns:
        None.
    """
    # Record the run configuration in mlflow
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.set_tag("k", k)

    # Seed both random generators for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    reader = MiningDataReader(path=train_data_path)
    variable_names, X_train, X_test, y_train, y_test = \
        reader.get_splitted_data()

    search_pipeline = Pipeline(steps=[
        ('scaling', StandardScaler()),
        ('regression', RandomForestRegressor(random_state=RANDOM_SEED)),
    ])

    ### TRAINING ###
    ################
    # Candidate tree counts for the grid search
    start, stop, step = n_estimators[0], n_estimators[1], n_estimators[2]
    grid = {'regression__n_estimators': np.arange(start, stop, step)}

    print("Training started...\n")

    # Cross-validated search over the grid, using all processors
    search = GridSearchCV(estimator=search_pipeline,
                          param_grid=grid,
                          cv=k,
                          scoring='neg_mean_squared_error',
                          n_jobs=-1)
    with ProgressBar():
        search.fit(X_train, y_train)

    # Per-candidate CV error; the scorer is NEGATIVE MSE, hence the sign flip
    cv_results = search.cv_results_
    param_list = list(cv_results['params'])
    training_err_list = [
        -1 * mean_score for mean_score in cv_results['mean_test_score']
    ]
    training_dev_list = list(cv_results['std_test_score'])

    best_params = search.best_params_
    print(f"\nBest parameter set found for the training set:\n{best_params}")

    # Position of the best combination and its tree count
    best_index = param_list.index(best_params)
    best_nestimators = best_params['regression__n_estimators']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################
    # Criteria is the number of trees
    criteria = 'n_estimators'
    mlflow.set_tag("criteria", criteria)
    param_values = range(start, stop, step)

    # Refit one model per candidate and score it on the test split
    training_err_by_criteria = []
    training_dev_by_criteria = []
    test_err_list = []
    best_rmse = best_mae = best_r2 = -1
    for tree_count in tqdm(param_values):
        candidate = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('regression',
             RandomForestRegressor(n_estimators=tree_count,
                                   random_state=RANDOM_SEED,
                                   n_jobs=-1)),
        ])
        candidate.fit(X_train, y_train)
        prediction = candidate.predict(X_test)

        # Look up the CV statistics recorded for this candidate
        index = param_list.index({'regression__n_estimators': tree_count})
        training_mse, test_mse, rmse, mae, r2 = get_test_metrics(
            training_err_list[index], y_test, prediction)

        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev_list[index])
        test_err_list.append(test_mse)

        # Keep the additional metrics of the best combination only
        if index == best_index:
            best_rmse, best_mae, best_r2 = rmse, mae, r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print(
            "\nEvaluation finished. Training final model with train + test data with the best hyperparameters..."
        )
        final_model = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('regression',
             RandomForestRegressor(n_estimators=best_nestimators, n_jobs=-1)),
        ])

        # Train the best model with all the data (training + test)
        final_model.fit(np.vstack((X_train, X_test)),
                        np.concatenate((y_train, y_test)))

        # Bar plot of the fitted forest's feature importances
        forest = final_model.named_steps['regression']
        plot_feature_importances(forest.feature_importances_, variable_names)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    # NOTE(review): "test_mse" is the minimum test MSE over all candidates,
    # which is not necessarily the value of the CV-selected candidate that
    # rmse / mae / r2 refer to - confirm this is intended.
    best_train_mse = training_err_list[best_index]
    lowest_test_mse = min(test_err_list)
    mlflow.log_metric("train_mse", best_train_mse)
    mlflow.log_metric("test_mse", lowest_test_mse)
    mlflow.log_metric("rmse", best_rmse)
    mlflow.log_metric("mae", best_mae)
    mlflow.log_metric("r2", best_r2)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {best_train_mse}
Test MSE: {lowest_test_mse}
RMSE: {best_rmse}
MAE: {best_mae}
R2: {best_r2}
-----------------------------------------------------------------------------------------------------------------------
''')
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)

# Every column except 'target' is a feature; 'target' is the value to predict.
features = tpot_data.drop(columns='target').to_numpy()
target = tpot_data['target'].to_numpy()
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
     features, target, random_state=None)

# Average CV score on the training set was:-74.90881962449828
# Stage 1: k-nearest-neighbours regressor wrapped in a StackingEstimator.
knn_stage = StackingEstimator(
    estimator=KNeighborsRegressor(n_neighbors=47, p=1, weights="uniform"))
# Stage 2: the final random-forest regressor.
forest_stage = RandomForestRegressor(bootstrap=True,
                                     max_features=0.25,
                                     min_samples_leaf=16,
                                     min_samples_split=4,
                                     n_estimators=100)
exported_pipeline = make_pipeline(knn_stage, forest_stage)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def _combine_period(temp_csv, target_temp_csv, valve_csv, serial_number,
                    valve_last_fill, valve_gt_fill):
    """Load one measurement period and derive the valve-model columns.

    Args:
        temp_csv: Path of the temperature CSV.
        target_temp_csv: Path of the target-temperature CSV.
        valve_csv: Path of the valve-level CSV.
        serial_number: Only temperature rows whose ``serialNumber`` equals
            this value are kept.
        valve_last_fill: Fill value for the first ``valve_last`` entry.
        valve_gt_fill: Fill value for the last ``valve_gt`` entry.

    Returns:
        The combined frame resampled to 3-minute bins, with ``valve_last``
        (previous bin's valve), ``valve_gt`` (next bin's valve) and
        ``diff_temp`` (``target_temp - temp``) added.
    """
    df_temp = modify_df(pd.read_csv(temp_csv), 'temp')
    df_temp = df_temp[df_temp['serialNumber'] == serial_number]
    df_target_temp = modify_df(pd.read_csv(target_temp_csv), 'target_temp')
    df_valve = modify_df(pd.read_csv(valve_csv), 'valve')

    df_combined = pd.concat([df_temp, df_target_temp, df_valve])
    # ffill() is the direct equivalent of the deprecated
    # fillna(method='ffill').
    df_combined = df_combined.resample(pd.Timedelta(minutes=3),
                                       label='right').mean().ffill()
    df_combined['valve_last'] = df_combined['valve'].shift(
        1, fill_value=valve_last_fill)
    df_combined['valve_gt'] = df_combined['valve'].shift(
        -1, fill_value=valve_gt_fill)
    df_combined['diff_temp'] = (df_combined['target_temp']
                                - df_combined['temp'])
    return df_combined


def project_check_data():
    """Fit the valve-level random forest, pickle it and print its MAE.

    Two measurement periods are loaded from ``data/``, combined and used
    to fit a ``RandomForestRegressor`` that predicts the next valve level
    (``valve_gt``) from ``valve``, ``temp``, ``diff_temp`` and
    ``valve_last``. The model is written to ``valve_model.p``. The rows
    after 2020-10-29 are then used to print the MAE of a "repeat the
    previous valve level" baseline and of the model.

    Returns:
        None.
    """
    sn_temp_mid = read_temp_mid_sn()

    # First dataset
    df_combined_first = _combine_period(
        'data/office_1_temperature_supply_points_data_2020-03-05_2020-03-19.csv',
        'data/office_1_targetTemperature_supply_points_data_2020-03-05_2020-03-19.csv',
        'data/office_1_valveLevel_supply_points_data_2020-03-05_2020-03-19.csv',
        sn_temp_mid,
        valve_last_fill=40,
        valve_gt_fill=0)

    # Second dataset
    df_combined_second = _combine_period(
        'data/office_1_temperature_supply_points_data_2020-10-13_2020-11-02.csv',
        'data/office_1_targetTemperature_supply_points_data_2020-10-13_2020-11-01.csv',
        'data/office_1_valveLevel_supply_points_data_2020-10-13_2020-11-01.csv',
        sn_temp_mid,
        valve_last_fill=30,
        valve_gt_fill=98.00)

    # Drop the first and last row of each period: they are the only rows
    # whose shifted columns hold the fill constants instead of real data.
    df_combined_first = df_combined_first[1:-1]
    df_combined_second = df_combined_second[1:-1]

    df_combined = pd.concat([df_combined_first, df_combined_second])

    feature_columns = ['valve', 'temp', 'diff_temp', 'valve_last']

    # NOTE(review): the model is fit on every row, including the rows
    # evaluated below, so the printed model MAE is an in-sample figure.
    df_train = df_combined
    X_train = df_train[feature_columns].to_numpy()
    y_train = df_train['valve_gt'].to_numpy()

    mask = (df_combined.index > '2020-10-29')
    df_test = df_combined.loc[mask]
    X_test = df_test[feature_columns].to_numpy()

    # NOTE: 'mae' is the pre-1.0 scikit-learn name of this criterion; newer
    # releases call it 'absolute_error'.
    # A previous run recorded 0.337767500434254 (presumably the model MAE).
    model = RandomForestRegressor(criterion='mae')
    model.fit(X_train, y_train)

    # Close the file deterministically instead of leaking the handle.
    valve_file = 'valve_model.p'
    with open(valve_file, 'wb') as model_file:
        pickle.dump(model, model_file)

    y_predicted = model.predict(X_test)
    y_test = df_test['valve_gt'].to_numpy()
    y_last = df_test['valve_last'].to_numpy()

    print(f'mae base: {metrics.mean_absolute_error(y_test, y_last)}')
    print(f'mae model: {metrics.mean_absolute_error(y_test, y_predicted)}')
def run_stacked(data, stacked_keys, repeat_idx, drop_na):
    """Fit and score one stacked random-forest model per feature group.

    For every ``key -> columns`` entry of ``stacked_keys`` the selected
    columns are used to predict ``data['age']`` with leave-one-group-out
    cross-validation over ``data['fold_idx']``. Out-of-fold predictions
    are written to a ``stacked_<key>`` column and the per-fold scores to
    a ``<key>`` column.

    Args:
        data: DataFrame holding ``age``, ``fold_idx`` and every column
            named in ``stacked_keys``.
        stacked_keys: Mapping from model name to the list of input columns.
        repeat_idx: Value stored in the ``repeat_idx`` column of both
            returned frames.
        drop_na: ``'local'`` drops rows with missing values in the selected
            columns, ``'global'`` drops rows with missing values anywhere
            in ``data``; any other value keeps all rows. When it is
            exactly ``False`` missing inputs are additionally coded as
            -1000 / +1000 in two concatenated copies of the inputs.

    Returns:
        Tuple ``(out_scores, out_predictions)``: the scores frame and a
        copy of ``data`` extended with the prediction columns.
    """
    out_scores = pd.DataFrame()
    out_predictions = data.copy()
    for key, sel in stacked_keys.items():
        this_data = data[sel]
        if drop_na == 'local':
            mask = this_data.dropna().index
        elif drop_na == 'global':
            mask = data.dropna().index
        else:
            mask = this_data.index
        X = this_data.loc[mask].values
        y = data['age'].loc[mask].values
        fold_idx = data.loc[mask]['fold_idx'].values

        if drop_na is False:
            # code missings to make the trees learn from it.
            missing = this_data.isna().values
            X_left = X.copy()
            X_left[missing] = -1000
            X_right = X.copy()
            X_right[missing] = 1000
            # Sanity checks kept from the original: no NaN may remain and
            # at least one value must have been coded.
            assert np.sum(np.isnan(X_left)) == 0
            assert np.sum(np.isnan(X_right)) == 0
            assert np.min(X_left) == -1000
            assert np.max(X_right) == 1000
            X = np.concatenate([X_left, X_right], axis=1)

        # Scores of the individual (unstacked) input columns. When a column
        # was already scored for an earlier key, the entry with the higher
        # mean is kept.
        for column in sel:
            score = get_mae(data.loc[mask], column)
            if column not in out_scores:
                out_scores[column] = score
            elif out_scores[column].mean() < np.mean(score):
                out_scores[column] = score

        unstacked = out_scores[sel].values
        idx = unstacked.mean(axis=0).argmin()
        unstacked_mean = unstacked[:, idx].mean()
        unstacked_std = unstacked[:, idx].std()
        # The closing parenthesis was missing from this message.
        print(f'{key} | best unstacked MAE: {unstacked_mean} '
              f'(+/- {unstacked_std})')
        print('n =', len(X))

        param_grid = {'max_depth': [4, 6, 8, None]}
        if X.shape[1] > 10:
            param_grid['max_features'] = (['log2', 'sqrt', None])

        # NOTE: the `iid` argument only exists in older scikit-learn
        # releases (it was removed in 0.24).
        reg = GridSearchCV(RandomForestRegressor(n_estimators=1000,
                                                 random_state=42),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',
                           iid=False,
                           cv=5)
        if DEBUG:
            # Fixed hyper-parameters instead of the grid search.
            reg = RandomForestRegressor(n_estimators=1000,
                                        max_features='log2',
                                        max_depth=6,
                                        random_state=42)

        cv = LeaveOneGroupOut()
        # Index labels of the rows in X, used to map fold positions back
        # to rows of `data`.
        masked_index = this_data.loc[mask].index
        out_cv = Parallel(n_jobs=1)(
            delayed(fit_predict_score)(estimator=reg,
                                       X=X,
                                       y=y,
                                       train=train,
                                       test=test,
                                       test_index=masked_index[test])
            for train, test in cv.split(X, y, fold_idx))

        # Transpose the per-fold results into (predictions, scores).
        out_cv = zip(*out_cv)
        predictions = next(out_cv)

        prediction_column = f'stacked_{key}'
        out_predictions[prediction_column] = np.nan
        for pred in predictions:
            # The returned targets must line up with the original ages.
            assert np.all(out_predictions.loc[pred.index]['age'] == pred['y'])
            out_predictions.loc[pred.index, prediction_column] = \
                pred['prediction'].values

        scores = np.array(next(out_cv))
        # Pure f-string: mixing it with %-formatting broke whenever `key`
        # contained a '%' character.
        print(f'{key} | MAE : {np.mean(scores):.3f} '
              f'(+/- {np.std(scores):.3f})')
        out_scores[key] = scores

    out_scores['repeat_idx'] = repeat_idx
    out_predictions['repeat_idx'] = repeat_idx
    return out_scores, out_predictions
clf2 = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.7, max_depth=8, alpha=20, n_estimators=10, verbose=False) clf1 = RandomForestRegressor(n_estimators=50, criterion='mse', max_depth=None, min_samples_split=3, min_samples_leaf=15, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=5, random_state=None, verbose=5, warm_start=False) clf3 = RandomForestClassifier(n_estimators='warn', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=2, min_weight_fraction_leaf=0.0, max_features='auto',