# NOTE(review): the next three statements look like the tail of a per-model
# evaluation loop / comparison plot whose beginning is outside this excerpt
# (`name`, `scores`, `results` and `names` are defined there) -- confirm nesting.
print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

# define dataset
X, y = make_regression(n_samples=1000,
                       n_features=20,
                       n_informative=15,
                       noise=0.1,
                       random_state=1)
# define the base models
level0 = [
    ('knn', KNeighborsRegressor()),
    ('cart', DecisionTreeRegressor()),
    ('svm', SVR()),
]
# define meta learner model
level1 = LinearRegression()
# define the stacking ensemble
model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
# fit the model on all available data
model.fit(X, y)
# make a prediction for one example
data = [[
    0.59332206, -0.56637507, 1.34808718, -0.57054047, -0.72480487,
    1.05648449, 0.77744852, 0.07361796, 0.88398267, 2.02843157, 1.01902732,
    0.11227799, 0.94218853, 0.26741783, 0.91458143, -0.72759572, 1.08842814,
    -0.61450942, -0.69387293, 1.69169009
]]
yhat = model.predict(data)
# predict() returns a one-element array for the single input row; index it
# explicitly instead of relying on implicit array-to-scalar conversion.
print('Predicted Value: %.3f' % yhat[0])
regr_mlp: MLPRegressor(random_state=random_state), regr_lin_svr: LinearSVR(epsilon=1.5, random_state=random_state), regr_ridge: Ridge(alpha=1, solver='cholesky'), # regr_lasso: Lasso(alpha=0.1), # regr_elastic_net: ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=random_state), regr_adaboost: AdaBoostRegressor(DecisionTreeRegressor(random_state=random_state, max_depth=10), n_estimators=200, learning_rate=0.5, random_state=random_state), regr_bagging: BaggingRegressor(DecisionTreeRegressor(random_state=random_state, max_depth=10), n_estimators=100, max_samples=1.0, bootstrap=True, n_jobs=-1), regr_voting: VotingRegressor( estimators=[(regr_ridge, Ridge(alpha=1, solver='cholesky')), (regr_forest, RandomForestRegressor(n_jobs=-1)), (regr_mlp, MLPRegressor())], n_jobs=-1), regr_grad_boost: GradientBoostingRegressor(n_estimators=150, random_state=random_state), regr_stacking: StackingRegressor( estimators=[(regr_ridge, Ridge(alpha=1, solver='cholesky')), (regr_lin_svr, LinearSVR(epsilon=1.5)), (regr_mlp, MLPRegressor())], final_estimator=RandomForestRegressor(n_jobs=-1, n_estimators=10, random_state=11), n_jobs=-1) } regr_dict_regularized = { regr_forest: RandomForestRegressor(n_jobs=-1, random_state=random_state), regr_dtree: DecisionTreeRegressor(random_state=random_state, max_depth=3), regr_lin: LinearRegression(n_jobs=-1), regr_mlp: MLPRegressor(random_state=random_state), regr_lin_svr: LinearSVR(epsilon=1.5, random_state=random_state), regr_ridge: Ridge(alpha=1, solver='cholesky'), # regr_lasso: Lasso(alpha=0.1), # regr_elastic_net: ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=random_state), regr_adaboost: AdaBoostRegressor(DecisionTreeRegressor(random_state=random_state, max_depth=3), n_estimators=200, learning_rate=0.5, random_state=random_state), regr_bagging: BaggingRegressor(DecisionTreeRegressor(random_state=random_state, max_depth=3), n_estimators=100, max_samples=1.0, bootstrap=True,
def baseline(showPlot):
    """Fit and compare baseline COVID-19 case forecasters, state by state.

    The function runs in two phases:

    1. For every state, shift the case counts by 0..29 days and keep the
       offset whose correlation vector (cases vs. mobility columns) has the
       largest norm, both on raw and on log-transformed data. The per-state
       best offsets are averaged (floor division) into one linear and one
       logarithmic offset.
    2. For every state, slide a 60-day training / 30-day test window over the
       shifted data (one window every ``stride`` days) and record the
       relative test error of: ridge regression on mobility (linear and log),
       ridge regression on time only (linear and log), a fitted
       ``logisticDerivative`` curve, a stacking ensemble, a seasonal ARIMA
       and a Gaussian-process meta-learner over the logistic/ARIMA outputs.

    Progress and summary statistics are printed; nothing is returned.

    Args:
        showPlot: when truthy, show the diagnostic and summary matplotlib
            figures (each ``plt.show()`` call blocks in interactive
            backends).
    """
    # Local import: the stacking ensemble needs a *regressor* forest and the
    # module-level imports are not visible from this block.
    from sklearn.ensemble import RandomForestRegressor

    np.set_printoptions(precision=3, suppress=True)

    full_df = pd.read_csv(
        '../data/COVID-19_Combined_Mobility_And_Infection_Data_Moving_Avg_updated_lin_int.csv',
        infer_datetime_format=True,
        parse_dates=True)

    #=========================FIND BEST OFFSET========================================
    by_state = full_df['sub_region_1'].unique()
    linear_scores_by_state = {}
    lin_avg = 0
    log_scores_by_state = {}
    log_avg = 0
    lin_corr_avg = 0
    log_corr_avg = 0
    for region in by_state:
        temp = full_df.loc[(full_df['sub_region_1'] == region)]
        # The mobility columns do not depend on the offset, so build them once.
        mobility_dataframe = temp.drop(
            columns=['date', 'sub_region_1', 'num_cases'])
        bestLinearCorr = 0
        bestLogCorr = 0
        bestLinearOffset = -1
        bestLogOffset = -1
        correlationScores = []
        correlationLogScores = []
        for offset in range(30):
            #Shift CDC data by offset value (has to be done per state)
            cdc_dataframe = temp['num_cases'].shift(periods=offset,
                                                    fill_value=0)

            #Build new full data array: column 0 is cases, the rest mobility
            full_dataframe = pd.concat([cdc_dataframe, mobility_dataframe],
                                       axis=1)
            full_dataframe = full_dataframe.loc[(
                full_dataframe['num_cases'] != 0)]  #remove rows with zero cases

            #Compute linear and logarithmic correlations
            linearCorr = full_dataframe.corr()
            linearCorr = linearCorr.to_numpy()[
                0, 1:]  #Take only correlations between 'cases' and mobility data
            # Shift so the smallest value maps to log(1) = 0 before taking logs.
            logData = np.log(full_dataframe + 1 -
                             np.min(full_dataframe.to_numpy()))
            logCorr = logData.corr()
            logCorr = logCorr.to_numpy()[
                0, 1:]  #Take only correlations between 'cases' and mobility data

            #Save best values
            if np.linalg.norm(linearCorr) > np.linalg.norm(bestLinearCorr):
                bestLinearCorr = linearCorr
                bestLinearOffset = offset
            if np.linalg.norm(logCorr) > np.linalg.norm(bestLogCorr):
                bestLogCorr = logCorr
                bestLogOffset = offset
            correlationScores.append(np.linalg.norm(linearCorr))
            correlationLogScores.append(np.linalg.norm(logCorr))

        if showPlot:
            plt.plot(correlationScores)
            plt.xlabel("Cases offset (days)")
            plt.ylabel("Norm of correlation vector")
            plt.title("Linear correlation vs. data offset")
            plt.show()

            plt.plot(correlationLogScores)
            plt.xlabel("Cases offset (days)")
            plt.ylabel("Norm of correlation vector")
            plt.title("Logarithmic correlation vs. data offset")
            plt.show()

        print("Best Full Correlation:", bestLinearCorr)
        print("Best Full Correlation Norm:", np.linalg.norm(bestLinearCorr))
        print("Best Full Offset:", bestLinearOffset)
        print("Best Log Correlation:", bestLogCorr)
        print("Best Log Correlation Norm:", np.linalg.norm(bestLogCorr))
        print("Best Log Offset:", bestLogOffset)

        linear_scores_by_state[region] = bestLinearOffset
        log_scores_by_state[region] = bestLogOffset
        lin_avg += bestLinearOffset
        log_avg += bestLogOffset
        lin_corr_avg += np.linalg.norm(bestLinearCorr)
        log_corr_avg += np.linalg.norm(bestLogCorr)

    print(linear_scores_by_state)
    print(log_scores_by_state)
    print(lin_avg / len(by_state))
    print(log_avg / len(by_state))
    print(lin_corr_avg / len(by_state))
    print(log_corr_avg / len(by_state))

    # Use one common (floored average) offset for all states from here on.
    bestLinearOffset = lin_avg // len(by_state)
    bestLogOffset = log_avg // len(by_state)

    linearMSE_by_state = []
    logMSEAdj_by_state = []
    linearCasesMSE_by_state = []
    logCasesMSE_by_state = []
    logisticMSE_by_state = []
    stackingMSE_by_state = []
    dataNoise_by_state = []
    arimaMSE_by_state = []
    gaussMSE_by_state = []

    stride = 3  #trains a new model every {stride} days

    for state in by_state:
        #=========================BEGIN MODEL FITTING========================================
        #Get the data for that state and shift it
        temp = full_df.loc[(full_df['sub_region_1'] == state)]
        temp = temp.loc[(temp['date'] < '2020-11-30')]
        mobility_dataframe = temp.drop(
            columns=['date', 'sub_region_1', 'num_cases'])

        #Shift CDC data by the linear offset
        cdc_lin_dataframe = temp['num_cases'].shift(periods=bestLinearOffset,
                                                    fill_value=0)
        bestLinearData = pd.concat([cdc_lin_dataframe, mobility_dataframe],
                                   axis=1)
        bestLinearData = bestLinearData.loc[(
            bestLinearData['num_cases'] > 0)]  #remove rows with zero cases

        #Shift CDC data by the logarithmic offset
        cdc_log_dataframe = temp['num_cases'].shift(periods=bestLogOffset,
                                                    fill_value=0)
        bestLogDf = pd.concat([cdc_log_dataframe, mobility_dataframe], axis=1)
        bestLogDf = bestLogDf.loc[(
            bestLogDf['num_cases'] > 0)]  #remove rows with zero cases

        # Each window needs 60 training + 30 test rows; skip states that
        # cannot supply a single window instead of failing on empty arrays.
        numWindows = (min(bestLinearData.shape[0], bestLogDf.shape[0]) -
                      90) // stride
        if numWindows <= 0:
            print("Skipping state (fewer than 90 usable rows):", state)
            continue

        # Remember the shift so predictions can be mapped back with the exact
        # inverse of this transform.
        logShift = np.min(bestLogDf.to_numpy())
        bestLogData = np.log(bestLogDf + 1 - logShift)

        linearMSE = []
        logMSEAdj = []
        linearCasesMSE = []
        logCasesMSE = []
        logisticMSE = []
        stackingMSE = []
        dataNoise = []
        arimaMSE = []
        gaussMSE = []

        #Convert data to numpy
        linearCasesOnly = bestLinearData['num_cases'].to_numpy()
        logCasesOnly = np.log(linearCasesOnly + 1)
        bestLinearData = bestLinearData.to_numpy()
        bestLogData = bestLogData.to_numpy()

        for t in range(numWindows):
            print("Training model:", t)
            print("State:", state)
            start = t * stride
            trainRows = slice(start, start + 60)
            testRows = slice(start + 60, start + 90)

            #Linear Mobility Data (column 0 = cases, remaining = mobility)
            linearTrainX = bestLinearData[trainRows, 1:]
            linearTrainy = bestLinearData[trainRows, :1]
            linearTestX = bestLinearData[testRows, 1:]
            linearTesty = bestLinearData[testRows, :1]

            #Logarithmic Mobility Data
            logTrainX = bestLogData[trainRows, 1:]
            logTrainy = bestLogData[trainRows, :1]
            logTestX = bestLogData[testRows, 1:]

            #Cases-only data
            linearCasesTrainX = linearCasesOnly[trainRows]
            logCasesTrainX = logCasesOnly[trainRows]
            linearCasesTestX = linearCasesOnly[testRows]
            logCasesTestX = logCasesOnly[testRows]
            timeTrain = np.arange(1, 61).reshape(-1, 1)
            timeTest = np.arange(61, 91).reshape(-1, 1)

            #Uncomment to add time data to mobility dataset
            #linearTrainX = np.hstack((linearTrainX, timeTrain))
            #logTrainX = np.hstack((logTrainX, timeTrain))
            #linearTestX = np.hstack((linearTestX, timeTest))
            #logTestX = np.hstack((logTestX, timeTest))

            #fit linear model
            linear_model = RidgeCV(cv=3).fit(linearTrainX, linearTrainy)
            predict = linear_model.predict(linearTestX)
            linearMSE.append(np.abs(predict - linearTesty) / linearTesty)

            #fit log model
            linear_model = RidgeCV(cv=3).fit(logTrainX, logTrainy)
            predict = linear_model.predict(logTestX)
            #convert from log back to raw case number (inverse of the
            #transform applied to bestLogDf above)
            predictAdj = np.exp(predict) - 1 + logShift
            logMSEAdj.append(np.abs(predictAdj - linearTesty) / linearTesty)

            #fit linear cases only model
            cases_model = RidgeCV(cv=3).fit(timeTrain, linearCasesTrainX)
            if showPlot:
                visualize_cases(cases_model, timeTrain, linearCasesTrainX,
                                timeTest, linearCasesTestX)
            predict = cases_model.predict(timeTest)
            linearCasesMSE.append(
                np.abs(predict - linearCasesTestX) / linearCasesTestX)

            #fit log cases only model
            cases_model = RidgeCV(cv=3).fit(np.log(timeTrain), logCasesTrainX)
            if showPlot:
                visualize_cases(cases_model, np.log(timeTrain), logCasesTrainX,
                                np.log(timeTest), logCasesTestX)
            predict = cases_model.predict(np.log(timeTest))
            predictAdj = np.exp(
                predict) - 1  #convert from log back to raw case number
            logCasesMSE.append(
                np.abs(predictAdj - linearCasesTestX) / linearCasesTestX)

            #fit logistic model
            logistic_model, _ = optimize.curve_fit(
                logisticDerivative,
                timeTrain.reshape(linearCasesTrainX.shape),
                linearCasesTrainX,
                p0=[4 * np.max(linearCasesTrainX), 60, 1 / 30],
                maxfev=10000,
                bounds=(np.array([1, 0, 0]), np.array([20000, np.inf,
                                                       np.inf])))
            if showPlot:
                visualize_logistic(logistic_model, timeTrain,
                                   linearCasesTrainX, timeTest,
                                   linearCasesTestX)
            testLogistic = logisticDerivative(
                timeTest.reshape(linearCasesTestX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])
            logisticMSE.append(
                np.abs(testLogistic - linearCasesTestX) / linearCasesTestX)
            # The in-sample relative error of the logistic fit is reported as
            # "data noise"; the same fitted curve also feeds the meta-learner.
            trainLogistic = logisticDerivative(
                timeTrain.reshape(linearCasesTrainX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])
            dataNoise.append(
                np.mean(
                    np.abs(trainLogistic - linearCasesTrainX) /
                    linearCasesTrainX))

            #fit stacking regressor
            # Each entry must be a (name, estimator) pair, and every base
            # learner must be a regressor because the target is continuous.
            estimators = [('lr', RidgeCV()),
                          ('svr', LinearSVR(random_state=42)),
                          ('rf',
                           RandomForestRegressor(n_estimators=10,
                                                 random_state=42))]
            reg = StackingRegressor(estimators=estimators,
                                    final_estimator=GaussianProcessRegressor(
                                        kernel=DotProduct() + WhiteKernel(),
                                        random_state=0))
            stacking_model = reg.fit(timeTrain, linearCasesTrainX)
            if showPlot:
                visualize_cases(stacking_model, timeTrain, linearCasesTrainX,
                                timeTest, linearCasesTestX)
            predict = stacking_model.predict(timeTest)
            # Kept separate from linearCasesMSE so the two models' errors are
            # not averaged together under one label.
            stackingMSE.append(
                np.abs(predict - linearCasesTestX) / linearCasesTestX)

            #fit ARIMA
            #Perform grid search to determine ARIMA Order
            '''stepwise_fit = auto_arima(linearCasesTrainX, start_p = 1, start_q = 1,
                          max_p = 3, max_q = 3, m = 7,
                          start_P = 0, seasonal = True,
                          d = None, D = 1, trace = True,
                          error_action ='ignore',   # we don't want to know if an order does not work
                          suppress_warnings = True,  # we don't want convergence warnings
                          stepwise = True)           # set to stepwise
            stepwise_fit.summary()'''
            model = SARIMAX(linearCasesTrainX,
                            initialization='approximate_diffuse',
                            order=(2, 0, 0),
                            seasonal_order=(2, 1, 0, 7))
            result = model.fit(disp=False)
            if False:  # ARIMA visualisation deliberately disabled
                visualize_ARIMA(result, timeTrain, linearCasesTrainX, timeTest,
                                linearCasesTestX)
            # NOTE(review): statsmodels prediction indices are zero-based, so
            # with 60 training points the 30 test days would be 60..89 and the
            # in-sample range 0..59; (61, 90) / (1, 60) look shifted by one
            # day. Left unchanged here -- confirm intent before altering.
            predictArima = result.predict(61, 90, typ='levels')
            arimaMSE.append(
                np.abs(predictArima - linearCasesTestX) / linearCasesTestX)

            #Evaluate other models to use as input to gaussian process
            # Same specification as the ARIMA fitted above, so reuse that fit.
            arima1 = result
            arima2 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(2, 0, 0),
                             seasonal_order=(2, 1, 1, 7)).fit(disp=False)
            arima3 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(1, 1, 0),
                             seasonal_order=(1, 1, 1, 7)).fit(disp=False)
            arima4 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(0, 1, 1),
                             seasonal_order=(1, 1, 1, 7)).fit(disp=False)
            arima5 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(0, 1, 1),
                             seasonal_order=(2, 1, 0, 7)).fit(disp=False)
            arimaFits = [arima1, arima2, arima3, arima4, arima5]

            #fit gaussian process meta-learner: one feature column per
            #first-level model (logistic curve + five ARIMA variants)
            gaussTrain = np.array(
                [trainLogistic] +
                [fit.predict(1, 60, typ='levels') for fit in arimaFits]).T
            gaussTest = np.array(
                [testLogistic] +
                [fit.predict(61, 90, typ='levels') for fit in arimaFits]).T

            reg = GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(),
                                           random_state=0)
            stacking_model = reg.fit(gaussTrain, linearCasesTrainX)
            predictTrain = stacking_model.predict(gaussTrain)
            predictTest = stacking_model.predict(gaussTest)
            if showPlot:
                visualize_gauss(
                    np.hstack((predictTrain, predictTest)).T, timeTrain,
                    linearCasesTrainX, timeTest, linearCasesTestX)
            gaussMSE.append(
                np.abs(predictTest - linearCasesTestX) / linearCasesTestX)

        #Append to state totals: average over windows, one value per test day
        linearMSE_by_state.append(
            np.reshape(np.array(linearMSE).mean(axis=0), (30)))
        logMSEAdj_by_state.append(
            np.reshape(np.array(logMSEAdj).mean(axis=0), (30)))
        linearCasesMSE_by_state.append(
            np.reshape(np.array(linearCasesMSE).mean(axis=0), (30)))
        logCasesMSE_by_state.append(
            np.reshape(np.array(logCasesMSE).mean(axis=0), (30)))
        logisticMSE_by_state.append(
            np.reshape(np.array(logisticMSE).mean(axis=0), (30)))
        stackingMSE_by_state.append(
            np.reshape(np.array(stackingMSE).mean(axis=0), (30)))
        dataNoise_by_state.append(np.mean(dataNoise))
        arimaMSE_by_state.append(
            np.reshape(np.array(arimaMSE).mean(axis=0), (30)))
        gaussMSE_by_state.append(
            np.reshape(np.array(gaussMSE).mean(axis=0), (30)))
        print("Average logistic Test error:", np.mean(dataNoise))

    #Plot proof-of-concept graph
    if showPlot:
        plt.plot(np.array(linearMSE_by_state).mean(axis=0),
                 label='Mobility (linear, non-temporal)')
        plt.plot(np.array(logMSEAdj_by_state).mean(axis=0),
                 label='Mobility (logarithmic, non-temporal)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()

        #Plot baseline graph
        # NOTE(review): the original comment on the next curve said "Don't
        # plot because performance is terrible", yet the curve is plotted.
        plt.plot(np.array(linearCasesMSE_by_state).mean(axis=0),
                 label='Cases (linear, temporal)')
        plt.plot(np.array(logCasesMSE_by_state).mean(axis=0),
                 label='Cases (logarithmic temporal)')
        plt.plot(np.array(logisticMSE_by_state).mean(axis=0),
                 label='Cases (logistic temporal)')
        plt.plot(np.array(stackingMSE_by_state).mean(axis=0),
                 label='Cases (stacking ensemble)')
        plt.plot(np.array(arimaMSE_by_state).mean(axis=0),
                 label='Cases (ARIMA)')
        plt.plot(np.array(gaussMSE_by_state).mean(axis=0),
                 label='Cases (Gaussian Process meta)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()

    print("Average logistic test error:", np.mean(dataNoise_by_state))
def test_stacking_regressor_error(y, params, type_err, msg_err):
    """Check that building/fitting a StackingRegressor raises the expected error.

    The regressor is built from ``params`` with ``cv=3`` and fitted on the
    scaled diabetes features with unit sample weights; the call must raise
    ``type_err`` with a message matching ``msg_err``.
    """
    with pytest.raises(type_err, match=msg_err):
        stacker = StackingRegressor(**params, cv=3)
        features = scale(X_diabetes)
        unit_weights = np.ones(X_diabetes.shape[0])
        stacker.fit(features, y, sample_weight=unit_weights)
#X["LON"]=(X["LON"]-mu_lon)/std_lon
# Assemble the feature matrix from the numeric columns and the encoded
# categorical frames, then hold out 30% of the rows for testing.
X = pd.concat([
    X["AGE"], X["LAT"], X["LON"], gender, marital, ethnicity, race,
    reasoncode
],
              axis=1)
train_X, test_X, train_Y, test_Y = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=123)

from sklearn.datasets import load_diabetes
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

#Create an SVR and a RidgeCV model and stack them up to get more accuracy
estimators = [('lr', RidgeCV()), ('svr', LinearSVR(random_state=42))]

#Train the model on training data
reg = StackingRegressor(estimators=estimators,
                        final_estimator=RandomForestRegressor(
                            n_estimators=20, random_state=42))
reg.fit(train_X, train_Y)

#Test the model
y_pred = reg.predict(test_X)
# Bare expression: shows the RMSE when run as a notebook cell.
np.sqrt(MSE(test_Y, y_pred))

#Read the careplan data and clean the data
careplan = pd.read_csv("careplans.csv")
careplan = careplan.dropna(subset=["REASONCODE"])
patients.rename(columns={'Id': 'PATIENT'}, inplace=True)
data_1 = pd.merge(careplan, patients, how='left', on='PATIENT')
# Only the first 10 characters of START (the date part) are parsed.
data_1["START"] = pd.to_datetime(data_1["START"].str[:10])
data_1["BIRTHDATE"] = pd.to_datetime(data_1["BIRTHDATE"])
# Age in years at care-plan start (365-day years).
data_1["AGE"] = (data_1["START"] - data_1["BIRTHDATE"]).dt.days / 365
def get_model(data, target, use_ensemble=True):
    """Tune three scaled+PCA pipelines and return a stacked or single model.

    LightGBM, ElasticNet and RandomForest pipelines are grid-searched (in
    that order) with ``TimeSeriesSplit`` and negative MSE scoring.

    Args:
        data: feature matrix passed to every ``fit`` call.
        target: regression target passed to every ``fit`` call.
        use_ensemble: if true, return a fitted ``StackingRegressor`` over the
            three tuned pipelines; otherwise return the tuned pipeline with
            the highest cross-validation score (ties broken by name).
    """

    def _search(step, estimator, grid, message):
        # Every candidate shares the same preprocessing: scale, then PCA.
        pipeline = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                             (step, estimator)])
        search = GridSearchCV(pipeline,
                              grid,
                              cv=TimeSeriesSplit(),
                              scoring='neg_mean_squared_error',
                              refit=True)
        search.fit(data, target)
        logger.info(message)
        return search

    booster_search = _search(
        'lgb', LGBMRegressor(), {
            'lgb__learning_rate': np.logspace(-6, 0, 5),
            'lgb__n_estimators': range(10, 101, 30),
            'lgb__max_depth': [6, 9, 12],
            'pca__n_components': [2, 5, 10],
            'lgb__num_leaves': [100]
        }, 'Booster params discovered')
    elastic_search = _search(
        'el', ElasticNet(max_iter=5000), {
            'el__alpha': np.logspace(-5, 2, 30),
            'el__l1_ratio': np.linspace(0, 1, 3),
            'pca__n_components': [2, 5, 10]
        }, 'ElasticNet params discovered')
    forest_search = _search(
        'rf', RandomForestRegressor(), {
            'rf__n_estimators': range(10, 101, 30),
            'rf__max_depth': [2, 5, 9],
            'pca__n_components': [2, 5, 10]
        }, 'RandomForest params discovered')

    # Insertion order fixes the order of the stacked base estimators.
    tuned = {
        'elastic': elastic_search,
        'random_forest': forest_search,
        'lgbm': booster_search
    }

    if not use_ensemble:
        scores = {label: found.best_score_ for label, found in tuned.items()}
        # Scores are negated MSE, so the largest one is the best model.
        ranking = sorted(scores, key=lambda label: (-scores[label], label))
        return tuned[ranking[0]].best_estimator_

    base_learners = [(label, found.best_estimator_)
                     for label, found in tuned.items()]
    meta_learner = RandomForestRegressor(n_estimators=100, max_depth=3)
    stacked = StackingRegressor(estimators=base_learners,
                                final_estimator=meta_learner,
                                passthrough=True)
    stacked.fit(data, target)
    logger.info('Ensemble fitted')
    return stacked
def test_stacking_regressor_drop_estimator():
    """A 'drop' entry must behave as if that estimator were never listed."""
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    scaled_features = scale(X_diabetes)
    X_train, X_test, y_train, _ = train_test_split(scaled_features,
                                                   y_diabetes,
                                                   random_state=42)
    final = RandomForestRegressor(n_estimators=10, random_state=42)
    with_drop = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
    without_drop = [('svr', LinearSVR(random_state=0))]

    reference = StackingRegressor(estimators=without_drop,
                                  final_estimator=final,
                                  cv=5)
    dropped = StackingRegressor(estimators=with_drop,
                                final_estimator=final,
                                cv=5)
    for stacker in (reference, dropped):
        stacker.fit(X_train, y_train)

    # Both the final predictions and the first-level features must agree.
    for method in ('predict', 'transform'):
        assert_allclose(
            getattr(reference, method)(X_test),
            getattr(dropped, method)(X_test))
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
import sklearn.linear_model as sl

# Candidate estimators, keyed by the display name used to select them.
models = {
    "LinearRegression": sl.LinearRegression(),
    "LassoCV": sl.LassoCV(),
    "ElasticNetCV": sl.ElasticNetCV(alphas=np.linspace(0.038, 0.1, 100)),
    "StackingRegressor": StackingRegressor(
        [
            ('SGD', sl.SGDRegressor()),
            ('GBR', GradientBoostingRegressor()),
        ],
        verbose=1,
    ),
    "RidgeCV": sl.RidgeCV(),
    "SGDRegressor": sl.SGDRegressor(),
    "Perceptron": sl.Perceptron(),
}


def read_data(train_file, test_file):
    """Load the training and submission CSVs with zero-based ids.

    Both files are read with ``pd.read_csv`` and their "User" and "Movie"
    columns are decremented by one.

    Returns:
        A ``(full_data, data_sub)`` tuple of DataFrames.
    """
    full_data = pd.read_csv(train_file)
    data_sub = pd.read_csv(test_file)
    for frame in (full_data, data_sub):
        for id_column in ("User", "Movie"):
            frame[id_column] = frame[id_column] - 1
    return full_data, data_sub
rf = RandomForestRegressor( n_estimators=400, random_state=SEED) # lasso = Lasso(random_state=SEED) ridge = BayesianRidge() svr = SVR() knn = KNeighborsRegressor(27) dt = DecisionTreeRegressor(max_depth=32) gbdt = GradientBoostingRegressor(n_estimators=400, random_state=SEED) base_models = [("lr", lr), ("rf", rf), ("ridge", ridge), ("dt", dt), ("svr", svr), ("knn", knn), ("gbdt", gbdt)] # , ("mlp", mlp)] meta_model = LinearRegression() stacked = StackingRegressor(estimators=base_models, final_estimator=meta_model, n_jobs=6, verbose=2) try: with open("model.pickle", "rb") as m: stacked = pickle.load(m) except FileNotFoundError: pass print("Stacked model baseline RMSE: ", rmse_cv( stacked, X_train, y_train, cv=N_FOLD)) # %%hyper-parameter optimization # param_grid = { # "rf__min_samples_split": [3, 12], # # "rf__n_estimators": [100, 400], # # "gbdt__n_estimators": [100, 400],
def _tune_and_evaluate(estimator, param_grid, xTrain, yTrain, xTest, yTest):
    """Grid-search ``estimator`` and score the refitted best model on the test set.

    Prints the best parameters and the value returned by ``evaluate``.

    Returns:
        A ``(best_estimator, score)`` tuple.
    """
    search = GridSearchCV(estimator,
                          param_grid=param_grid,
                          cv=3,
                          n_jobs=-1,
                          scoring="neg_mean_squared_error",
                          verbose=2)
    search.fit(xTrain, yTrain)
    print(search.best_params_)
    best = search.best_estimator_
    score = evaluate(best, xTest, yTest)
    print("Accuracy: ", score)
    return best, score


def model(xTrain, yTrain, xTest, yTest):
    """Compare default regressors, tune three of them and score a stacked ensemble.

    Steps:
        1. Cross-validate five default regressors (3 folds, negative MSE)
           and show a bar chart of their mean squared errors.
        2. Grid-search a random forest, a k-nearest-neighbours regressor and
           a gradient-boosting regressor, scoring each best model on the
           test set with ``evaluate``.
        3. Stack the three tuned models under a small random forest and
           print the ensemble's ``score`` on the test set.

    Args:
        xTrain, yTrain: training features and targets.
        xTest, yTest: held-out features and targets.

    Returns:
        None; all results are printed or plotted.
    """
    random_state = 3

    # Display label and estimator are paired so the chart labels cannot
    # drift out of sync with the list of models.
    baselines = [
        ("SVR", SVR(C=1.0, epsilon=0.2)),
        ("Decision Tree", DecisionTreeRegressor(random_state=random_state)),
        ("RandomForest",
         RandomForestRegressor(random_state=random_state, n_estimators=100)),
        ("GradientBoosting",
         GradientBoostingRegressor(random_state=random_state)),
        ("KNeighboors", KNeighborsRegressor()),
    ]

    cv_means = []
    for _, regressor in baselines:
        cv_result = cross_validate(regressor,
                                   xTrain,
                                   yTrain,
                                   scoring='neg_mean_squared_error',
                                   cv=3)
        # Scores are negated MSE; abs() turns the mean back into an MSE.
        cv_means.append(abs(statistics.mean(cv_result['test_score'])))

    cv_res = pd.DataFrame({
        "Mean_Squared_Errors": cv_means,
        "Algorithm": [label for label, _ in baselines]
    })
    cv_res.plot(kind='bar', x='Algorithm', y='Mean_Squared_Errors')
    plt.show()

    scoreArr = []

    #Optimize random forest and predict
    rf_grid = {
        "max_depth": [1, 3, 5, 10, 20, 50],
        "max_features": [5, 10, 20, 25],
        "min_samples_leaf": [1, 5, 10],
        "n_estimators": [100, 250, 500],
    }
    bestRFR, grid_accuracy = _tune_and_evaluate(RandomForestRegressor(),
                                                rf_grid, xTrain, yTrain,
                                                xTest, yTest)
    scoreArr.append(grid_accuracy)

    #Optimize k-nearest neighbours and predict
    knn_grid = {"n_neighbors": [1, 3, 5, 10, 15, 20]}
    bestKNN, grid_accuracy = _tune_and_evaluate(KNeighborsRegressor(),
                                                knn_grid, xTrain, yTrain,
                                                xTest, yTest)
    scoreArr.append(grid_accuracy)

    #Optimize gradient boosting and predict
    gbr_grid = {
        "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
        "max_features": [5, 10, 20, 25],
        "min_samples_leaf": [1, 5, 10],
        "max_depth": [1, 3, 5]
    }
    bestGBR, grid_accuracy = _tune_and_evaluate(GradientBoostingRegressor(),
                                                gbr_grid, xTrain, yTrain,
                                                xTest, yTest)
    scoreArr.append(grid_accuracy)

    # Stacking Ensemble
    estimators = [('RFR', bestRFR), ('KNN', bestKNN), ('GBR', bestGBR)]

    #Stacking using optimized models and sklearn
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=RandomForestRegressor(
                                random_state=random_state, n_estimators=10))

    for x in scoreArr:
        print("Accuracy: ", x)
    print("Ensemble Score: ", reg.fit(xTrain, yTrain).score(xTest, yTest))

    return None
y_train_pred4 = KR.predict(X_train) # The final prediction model is a Stacked Regressor taking the best esstimates of each of the regressors and combining the accuracy of the preictive models into one more accurate model. # In[18]: from sklearn.linear_model import RidgeCV, LassoCV from sklearn.svm import SVR from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import StackingRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split estimators = [('ridge', RidgeCV()), ('lasso', LassoCV(random_state=42)), ('svr', SVR(C=1, gamma=1e-6))] reg = StackingRegressor( estimators=estimators, final_estimator=GradientBoostingRegressor(random_state=42)) reg.fit(X_train, y_train) y_pred6 = reg.predict(X_test) y_train_pred6 = reg.predict(X_train) # This is a output of the accucy of the predictions # In[24]: from sklearn.metrics import r2_score print("Lasso Train accuracy: ", r2_score(y_train, y_train_pred)) print("Test accuracy: ", r2_score(y_test, y_pred)) print("ElasticNet Train accuracy: ", r2_score(y_train, y_train_pred2))
# ## Ensemble method (stacking) # In[21]: X_train = X_train_c MLP = MLPRegressor(activation = 'relu', alpha = 1, hidden_layer_sizes = (100)) estimators = [ ('RandomForest', RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)), ('MLP', MLP) ] stacking_model = StackingRegressor(estimators=estimators, final_estimator=MLP) scores = cross_val_score(stacking_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1) print(scores) print(scores.mean()) print(scores.std()) # In[45]: X_train = X_train_c MLP = MLPRegressor(activation = 'relu', alpha = 1, hidden_layer_sizes = (100), random_state=0) estimators = [
# (1) lasso-final stacking = 0.12349 # (2) ridge-final stacking = 0.12436 # estimators = estimators_list # 4 base models for ridge-final stacking model, = 0.12296 / 0.12264 estimators = [ estimators_list[0], estimators_list[5], estimators_list[6], estimators_list[7] ] # 4 base models for lasso-final stacking model, = 0.12284 # estimators = [estimators_list[0],estimators_list[5], # estimators_list[6],estimators_list[4]] print(estimators) stack = StackingRegressor(estimators=estimators, final_estimator=RidgeCV()) # stack = StackingRegressor(estimators=estimators,final_estimator = LassoCV()) # show CV performance of the selected stacking model y_pred, scores = show_CV_performance(X, y, stack, nfold=nfold, title='stack') scores #%% plot single model vs stacking estimators_all = estimators + [('Stacking model', stack)] # plot all in one fig fig, axs = plt.subplots(3, 2, figsize=(10, 8)) axs = np.ravel(axs) for ax, (name, est) in zip(axs, estimators_all): start_time = time.time()
def test_stacking_regressor_error(y, params, type_err, msg_err):
    """Check that building/fitting a StackingRegressor raises the expected error.

    The regressor is built from ``params`` with ``cv=3`` and fitted on the
    scaled diabetes features with unit sample weights; the call must raise
    ``type_err`` with a message matching ``msg_err``.
    """
    with pytest.raises(type_err, match=msg_err):
        stacker = StackingRegressor(**params, cv=3)
        features = scale(X_diabetes)
        unit_weights = np.ones(X_diabetes.shape[0])
        stacker.fit(features, y, sample_weight=unit_weights)


@pytest.mark.parametrize(
    "estimator, X, y",
    [
        (
            StackingClassifier(estimators=[
                ('lr', LogisticRegression(random_state=0)),
                ('svm', LinearSVC(random_state=0)),
            ]),
            X_iris[:100],
            y_iris[:100],
        ),  # keep only classes 0 and 1
        (
            StackingRegressor(estimators=[
                ('lr', LinearRegression()),
                ('svm', LinearSVR(random_state=0)),
            ]),
            X_diabetes,
            y_diabetes,
        ),
    ],
    ids=['StackingClassifier', 'StackingRegressor'])
def test_stacking_randomness(estimator, X, y):
    """Build a full and an 'lr'-dropped clone that share the same seeded CV."""

    # checking that fixing the random state of the CV will lead to the same
    # results
    def _seeded_cv():
        # A fresh splitter with a fresh RandomState(0) for every clone.
        return KFold(shuffle=True, random_state=np.random.RandomState(0))

    estimator_full = clone(estimator)
    estimator_full.set_params(cv=_seeded_cv())

    estimator_drop = clone(estimator)
    estimator_drop.set_params(lr='drop')
    estimator_drop.set_params(cv=_seeded_cv())
    # NOTE(review): no assertions are visible in this excerpt; the comparison
    # of the two estimators presumably follows outside it.
def train_algs(self):
    """Train the selected algorithms WITHOUT cross validation.

    The rows of ``self.raw_data`` are split at random into a train and a
    test part (each row goes to the train part with probability
    ``self.percent_train / 100``). Every recognised name in
    ``self.algorithms`` is fitted on the train part and its ``score`` on
    the test part is written to the Streamlit page. The fitted models and
    their names are stored in ``self.chosen_models`` and
    ``self.chosen_models_names``.

    When ``self.meta_model_check`` is set, a voting or stacking ensemble is
    built from the chosen models, fitted and scored the same way. An
    unsupported ``self.meta_model`` is reported with a warning instead of
    failing on an unbound variable.
    """
    st.subheader("Results")
    self.chosen_models_names = []
    self.chosen_models = []
    if len(self.algorithms) == 0:
        st.warning('You should select at least one algorithm')
        return

    X = self.raw_data.drop(self.out_col, axis=1)
    y = self.raw_data[self.out_col]
    # Boolean row mask: True rows form the train part, the rest the test part.
    msk = np.random.rand(len(X)) < self.percent_train / 100
    X_train = X[msk]
    X_test = X[~msk]
    Y_train = y[msk]
    Y_test = y[~msk]

    def _make_model(alg):
        """Return (estimator, score label, fit kwargs), or None if unknown."""
        # Imports stay local so optional packages are only needed when the
        # matching algorithm is actually selected.
        if alg == 'LinearSVR':
            from sklearn.svm import LinearSVR
            return LinearSVR(), "LinearSVR score", {}
        if alg == 'RidgeCV':
            from sklearn.linear_model import RidgeCV
            return RidgeCV(), "RidgeCV score", {}
        if alg == 'Random Forest Regressor':
            from sklearn.ensemble import RandomForestRegressor
            return RandomForestRegressor(), "rfc score", {}
        if alg == 'Adaboost':
            from sklearn.ensemble import AdaBoostRegressor
            return AdaBoostRegressor(), "ada score", {}
        if alg == 'XGBoost':
            # Import under its own name so no local rebinds the module alias.
            import xgboost
            return (xgboost.XGBRegressor(n_estimators=300), "xgb score",
                    {'verbose': 0})
        return None

    for alg in self.algorithms:
        spec = _make_model(alg)
        if spec is None:
            # Unrecognised names are silently skipped, as before.
            continue
        model, label, fit_kwargs = spec
        model.fit(X_train, Y_train, **fit_kwargs)
        st.write(label, model.score(X_test, Y_test))
        self.chosen_models_names.append(alg)
        self.chosen_models.append(model)

    if not self.meta_model_check:
        return

    named_models = list(zip(self.chosen_models_names, self.chosen_models))
    if self.meta_model_type == "voting":
        from sklearn.ensemble import VotingRegressor
        stack = VotingRegressor(estimators=named_models)
        label = "voting score"
    else:
        from sklearn.ensemble import StackingRegressor
        if self.meta_model == "GradientBoostingRegressor":
            from sklearn.ensemble import GradientBoostingRegressor
            final_estimator = GradientBoostingRegressor()
        elif self.meta_model == "RandomForestRegressor":
            from sklearn.ensemble import RandomForestRegressor
            final_estimator = RandomForestRegressor()
        else:
            # Previously this path reached `stack.fit` with `stack` unbound.
            st.warning('Unsupported meta model: {}'.format(self.meta_model))
            return
        stack = StackingRegressor(estimators=named_models,
                                  final_estimator=final_estimator)
        label = "stack score"

    stack.fit(X_train, Y_train)
    st.write(label, stack.score(X_test, Y_test))
def run(dataset, config):
    """Fit a scikit-learn stacking ensemble on `dataset` and return its result.

    A StackingClassifier is used when ``config.type == 'classification'``,
    otherwise a StackingRegressor. Framework parameters whose key does not
    start with ``_`` are forwarded to the stacking estimator itself; the
    private keys ``_n_jobs`` and ``_<name>_params`` (for ``rf``, ``gbm``,
    ``linear``, ``svc`` and ``final``) configure parallelism and the
    individual estimators.

    Returns whatever ``result(...)`` builds from the predictions, the
    (classification-only) probabilities and the training duration.
    """
    log.info(
        f"\n**** Stacking Ensemble [sklearn v{sklearn.__version__}] ****\n")
    save_metadata(config, version=sklearn.__version__)

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    # Public framework params go straight to the Stacking* constructor.
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    # useful to disable multicore, regardless of the dataset config
    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    # Per-estimator keyword overrides, e.g. `_rf_params`.
    estimators_params = {
        e: config.framework_params.get(f'_{e}_params', {})
        for e in ['rf', 'gbm', 'linear', 'svc', 'final']
    }

    log.info(
        "Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores."
        .format(config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        # NOTE(review): stack_method='predict_proba' requires every base
        # estimator to expose predict_proba; for SGDClassifier that depends on
        # the `loss` supplied through `_linear_params` - confirm.
        estimator = StackingClassifier(
            estimators=[
                ('rf',
                 RandomForestClassifier(n_jobs=n_jobs,
                                        random_state=config.seed,
                                        **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingClassifier(random_state=config.seed,
                                            **estimators_params['gbm'])),
                ('linear',
                 SGDClassifier(n_jobs=n_jobs,
                               random_state=config.seed,
                               **estimators_params['linear'])),
                # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
            ],
            # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
            final_estimator=LogisticRegression(n_jobs=n_jobs,
                                               random_state=config.seed,
                                               **estimators_params['final']),
            stack_method='predict_proba',
            n_jobs=n_jobs,
            **training_params)
    else:
        estimator = StackingRegressor(
            estimators=[
                ('rf',
                 RandomForestRegressor(n_jobs=n_jobs,
                                       random_state=config.seed,
                                       **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingRegressor(random_state=config.seed,
                                           **estimators_params['gbm'])),
                ('linear',
                 SGDRegressor(random_state=config.seed,
                              **estimators_params['linear'])),
                ('svc',
                 LinearSVR(random_state=config.seed,
                           **estimators_params['svc']))
            ],
            # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
            # LinearRegression has no `random_state` parameter; passing one
            # raised TypeError before the ensemble could even be built.
            final_estimator=LinearRegression(n_jobs=n_jobs,
                                             **estimators_params['final']),
            n_jobs=n_jobs,
            **training_params)

    # Only the fit is timed.
    with utils.Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = estimator.predict_proba(
        X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  # fitted base estimators plus the final estimator
                  models_count=len(estimator.estimators_) + 1,
                  training_duration=training.duration)
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR

# Load the diabetes regression data as (features, target) arrays.
X, y = load_diabetes(return_X_y=True)

# Two base learners whose predictions feed a small random forest.
estimators = [
    ("lr", RidgeCV()),
    ("svr", LinearSVR(random_state=42)),
]
final_forest = RandomForestRegressor(n_estimators=10, random_state=42)
reg = StackingRegressor(estimators=estimators, final_estimator=final_forest)

# Hold out a test split, fit on the rest, and score on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg.fit(X_train, y_train).score(X_test, y_test)
# 0.3...
# One case per ensemble type: a two-estimator StackingClassifier on a slice
# of iris and a two-estimator StackingRegressor on diabetes.
@pytest.mark.parametrize(
    "estimator, X, y",
    [
        (
            StackingClassifier(estimators=[
                ("lr", LogisticRegression(random_state=0)),
                ("svm", LinearSVC(random_state=0)),
            ]),
            X_iris[:100],
            y_iris[:100],
        ),  # keep only classes 0 and 1
        (
            StackingRegressor(estimators=[
                ("lr", LinearRegression()),
                ("svm", LinearSVR(random_state=0)),
            ]),
            X_diabetes,
            y_diabetes,
        ),
    ],
    ids=["StackingClassifier", "StackingRegressor"],
)
def test_stacking_randomness(estimator, X, y):
    # checking that fixing the random state of the CV will lead to the same
    # results
    # Clone so the parametrized instance itself is never mutated.
    estimator_full = clone(estimator)
    estimator_full.set_params(
        cv=KFold(shuffle=True, random_state=np.random.RandomState(0)))

    estimator_drop = clone(estimator)
    # NOTE(review): the remainder of this test is not visible in this excerpt.
'estimators': [('lr', LinearRegression()), ('svm', LinearSVR())], 'final_estimator': RandomForestClassifier() }, ValueError, 'parameter should be a regressor.')]) def test_stacking_regressor_error(y, params, type_err, msg_err): with pytest.raises(type_err, match=msg_err): reg = StackingRegressor(**params, cv=3) reg.fit(scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0])) @pytest.mark.parametrize("stacking_estimator", [ StackingClassifier( estimators=[('lr', LogisticRegression()), ('svm', LinearSVC())]), StackingRegressor( estimators=[('lr', LinearRegression()), ('svm', LinearSVR(max_iter=1e4))]) ]) def test_stacking_named_estimators(stacking_estimator): stacking_estimator.fit(scale(X_iris), y_iris) estimators = stacking_estimator.named_estimators_ assert len(estimators) == 2 assert sorted(list(estimators.keys())) == sorted(['lr', 'svm']) @pytest.mark.parametrize("stacking_estimator", [ StackingClassifier(estimators=[( 'lr', LogisticRegression()), ( 'rf', RandomForestClassifier()), ('svm', LinearSVC())]), StackingRegressor( estimators=[('lr', LinearRegression()), (
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # The data is scaled up front (rather than inside a pipeline) to avoid
    # convergence warnings while keeping X_test directly comparable with the
    # transformed output below.
    X_scaled = scale(X_diabetes)
    X_train, X_test, y_train, _ = train_test_split(X_scaled,
                                                   y_diabetes,
                                                   random_state=42)
    reg = StackingRegressor(
        estimators=[("lr", LinearRegression()), ("svr", LinearSVR())],
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    reg.fit(X_train, y_train)

    result = reg.predict(X_test, **predict_params)
    if predict_params:
        # With extra predict parameters two outputs are expected.
        assert len(result) == 2

    # Two base estimators -> 2 columns (12 when the inputs are passed through).
    X_trans = reg.transform(X_test)
    if passthrough:
        assert X_trans.shape[1] == 12
        assert_allclose(X_test, X_trans[:, -10:])
    else:
        assert X_trans.shape[1] == 2

    # Dropping 'lr' leaves a single base estimator -> one column fewer.
    reg.set_params(lr="drop")
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    X_trans = reg.transform(X_test)
    if passthrough:
        assert X_trans.shape[1] == 11
        assert_allclose(X_test, X_trans[:, -10:])
    else:
        assert X_trans.shape[1] == 1
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params):
    # The data is scaled up front (rather than inside a pipeline) to avoid
    # convergence warnings and keep the later assertions simple.
    X_scaled = scale(X_diabetes)
    X_train, X_test, y_train, _ = train_test_split(X_scaled,
                                                   y_diabetes,
                                                   random_state=42)
    reg = StackingRegressor(
        estimators=[('lr', LinearRegression()), ('svr', LinearSVR())],
        final_estimator=final_estimator,
        cv=cv,
    )
    reg.fit(X_train, y_train)

    result = reg.predict(X_test, **predict_params)
    if predict_params:
        # With extra predict parameters two outputs are expected.
        assert len(result) == 2

    # One transformed column per active base estimator.
    n_columns = reg.transform(X_test).shape[1]
    assert n_columns == 2

    # Dropping 'lr' leaves a single base estimator.
    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    n_columns = reg.transform(X_test).shape[1]
    assert n_columns == 1
def model_to_test_reg():
    """Return an unfitted StackingRegressor used as the model under test.

    The stack combines a decision tree ('dt') and a linear regression
    (registered under the name 'las') with a linear regression as the final
    estimator.
    """
    return StackingRegressor(
        estimators=[
            ('dt', DecisionTreeRegressor()),
            ('las', LinearRegression()),
        ],
        final_estimator=LinearRegression(),
    )
# Gradient-boosting pipeline: tree-oriented preprocessing followed by a
# histogram-based gradient boosting regressor.
gbdt_pipeline = make_pipeline(
    tree_preprocessor, HistGradientBoostingRegressor(random_state=0)
)
gbdt_pipeline

# %%
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

# The three pipelines are the base estimators; a RidgeCV model combines
# their predictions.
estimators = [
    ("Random Forest", rf_pipeline),
    ("Lasso", lasso_pipeline),
    ("Gradient Boosting", gbdt_pipeline),
]

stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
stacking_regressor

# %%
# Measure and plot the results
##############################################################################
#
# Now we can use the Ames Housing dataset to make the predictions. We check
# the performance of each individual predictor as well as that of the stack
# of regressors.
#
# The function ``plot_regression_results`` is used to plot the predicted and
# true targets.

import time
fit_param = { 'eval_set': [(X_train, y_train), (X_test, y_test)], 'early_stopping_rounds': 200, 'verbose': False } BT = xgb.XGBRegressor(**param) SVM = svm.SVR() RF = ensemble.RandomForestRegressor(random_state=42) NN = neural_network.MLPRegressor(hidden_layer_sizes=(100, ), random_state=1, max_iter=100, alpha=0.001) estimators = [('dt', DT), ('bt', BT), ('lgb', LGB), ('rf', RF), ('gb', GBoost)] reg = StackingRegressor(estimators=estimators, final_estimator=LinearRegression(), n_jobs=-1) stack = Regressor(reg) y_pred = stack.run(X_train, y_train, X_test, y_test) DT = tree.DecisionTreeClassifier() import xgboost as xgb param = { 'n_estimators': 10000, 'learning_rate': 0.1, 'objective': 'reg:squarederror', 'verbosity': 0, 'n_jobs': -1 } fit_param = {
# Scatter the raw data and overlay the rescaled prediction line computed
# earlier in the script.
plt.scatter(X, y)
plt.plot(x_rescaled, y_pred_rescaled, color='red', label='predictions')
plt.xlabel("LotArea in m$^2$")
plt.ylabel("SalePrice in ZAR")
plt.title("Voting Ensemble Regression")
plt.legend()
plt.show()

# Heterogeneous Ensembles(Stacking)
models = [("LR", lr), ("DT", regr_tree), ("SVR", svr)]

# Instead of choosing model weights, stacking uses a meta learner.
# Model training happens twice: once for the base models, once for the
# meta learner.
meta_learner_reg = LinearRegression()
s_reg = StackingRegressor(estimators=models, final_estimator=meta_learner_reg)

# y_train is indexed as a 2-D array; the estimator is fitted on its first
# column as a 1-D target.
s_reg.fit(x_train, y_train[:, 0])
y_pred = s_reg.predict(x_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Plot stacking regression prediction line over data
x_domain = np.linspace(min(x_train), max(x_train), 100)
# `predict` returns a 1-D array, while scikit-learn scalers only accept 2-D
# input in `inverse_transform`, so the predictions are reshaped to a single
# column first.
# NOTE(review): assumes y_scaler / x_scaler are scikit-learn scalers - confirm.
y_pred_rescaled = y_scaler.inverse_transform(
    s_reg.predict(x_domain).reshape(-1, 1))
x_rescaled = x_scaler.inverse_transform(x_domain)

plt.figure()
plt.scatter(X, y)
def train(seeds=(1,), k=5, datafilepath='./data/HRB95.txt', test_size=5):
    """Train, evaluate, save and plot several regressors plus a stacking model.

    For every seed in `seeds` the data returned by ``loadXY(datafilepath)``
    is split with ``train_test_split``; each model is then evaluated and its
    test predictions are plotted against the true labels. The stacking model
    is fitted once on the training split (it cross-validates internally);
    all other models are fitted fold by fold and saved with ``joblib``.
    Finally the MAE / MSE / R2 averaged over all seeds is printed per model.

    Args:
        seeds: seeds used for the train/test split and the repeated K-fold.
            The default is an immutable tuple rather than a shared list.
        k: number of CV folds; ``k == 1`` switches to leave-one-out.
        datafilepath: path handed to ``loadXY``.
        test_size: forwarded to ``train_test_split`` and also used as the
            number of points on the x axis of the plot (assumed to be an
            int - TODO confirm).
    """
    seed = 2
    random.seed(seed)
    np.random.seed(seed)
    # data = np.loadtxt('./data/HRB95.txt', dtype=float, delimiter=',', skiprows=1)
    # x = data[:,1:data.shape[1]]
    # y = data[:,0]
    cv = k
    if cv == 1:
        cv = LeaveOneOut()
    models = [
        KNeighborsRegressor(leaf_size=3,
                            n_neighbors=2,
                            p=1,
                            weights='distance'),
        GridSearchCV(SVR(),
                     param_grid={
                         "C": np.logspace(0, 2, 4),
                         "gamma": np.logspace(-2, 2, 7)
                     },
                     n_jobs=-1),
        RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0)),
        MLPRegressor(hidden_layer_sizes=(50, 100, 50),
                     max_iter=700,
                     random_state=seed),
        RandomForestRegressor(random_state=seed),
        GradientBoostingRegressor(random_state=seed),
        StackingRegressor(estimators=[
            ('KNN',
             KNeighborsRegressor(leaf_size=3,
                                 n_neighbors=2,
                                 p=1,
                                 weights='distance')),
            ("ridge", RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))),
            ("gbdt", GradientBoostingRegressor(random_state=seed)),
            ("RandomForest", RandomForestRegressor(random_state=seed)),
            ("mlp",
             MLPRegressor(hidden_layer_sizes=(50, 100, 50),
                          max_iter=700,
                          random_state=seed)),
            ("svr",
             GridSearchCV(SVR(),
                          n_jobs=-1,
                          param_grid={
                              "C": np.logspace(0, 2, 4),
                              "gamma": np.logspace(-2, 2, 7)
                          })),
        ],
                          final_estimator=RidgeCV(alphas=(0.1, 1.0, 10.0,
                                                          100.0)),
                          n_jobs=-1,
                          cv=cv),
    ]
    models_str = [
        'KNeighborsRegressor',
        'SVR',
        'RidgeCV',
        'MLP',
        'RF',
        'GBDT',
        'Stacking',
    ]
    # Per-model test scores, one entry per run (seed); averaged at the end.
    MAE, MSE, R2 = {}, {}, {}
    # `run_idx` replaces the former loop variable `time`, which shadowed the
    # name of the standard `time` module.
    for run_idx, seed in enumerate(seeds):
        print("-----第%d次(seed=%s)-----" % (run_idx + 1, seed))
        print("{:20s}{:10s}{:10s}{:10s}".format("方法", "MAE", "MSE", "R2"))
        x, y = loadXY(datafilepath)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=seed, shuffle=True)
        plt.figure(run_idx, figsize=(10, 10))
        plt.tick_params(labelsize=18)
        # plt.xlim(0, 6)
        # plt.ylim(3, 7, 0.3)
        # plt.plot([x for x in range(1, test_size + 1)],scale_y.inverse_transform(y_test),label='True Label')
        sample_idx = list(range(1, test_size + 1))
        plt.scatter(sample_idx,
                    scale_y.inverse_transform(y_test),
                    marker='*',
                    label='True Label',
                    s=250)
        for name, m in zip(models_str, models):
            MAE.setdefault(name, [])
            MSE.setdefault(name, [])
            R2.setdefault(name, [])
            print("%18s" % name)
            y_vals, y_val_p_s, mae_test, mse_test, r2_test = [], [], [], [], []
            # Work on a fresh clone so runs do not share fitted state.
            model = clone(m)
            # Stacking model: cross validation is already built in.
            if isinstance(model, StackingRegressor):
                model.fit(x_train, y_train)
                train_pred = model.predict(x_train)
                test_pred = model.predict(x_test)
                MAE[name] = np.append(MAE[name], mae(test_pred, y_test))
                MSE[name] = np.append(MSE[name], mse(test_pred, y_test))
                R2[name] = np.append(R2[name], model.score(x_test, y_test))
                print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(
                    "train", mae(train_pred, y_train),
                    mse(train_pred, y_train), model.score(x_train, y_train)))
                print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(
                    "test", MAE[name][-1], MSE[name][-1], R2[name][-1]))
            else:
                # Cross validation.
                if k > 1:
                    kf = RepeatedKFold(n_splits=k,
                                       n_repeats=10,
                                       random_state=seed)
                else:
                    kf = LeaveOneOut()
                for t, v in kf.split(x_train):
                    model.fit(x_train[t], y_train[t])  # fitting
                    y_val_p = model.predict(x_train[v])
                    # Pool validation targets/predictions over all folds.
                    y_vals = np.append(y_vals, y_train[v])
                    y_val_p_s = np.append(y_val_p_s, y_val_p)
                    # Test metrics are recorded per fold, averaged below.
                    test_pred = model.predict(x_test)
                    mse_test = np.append(mse_test, mse(y_test, test_pred))
                    mae_test = np.append(mae_test, mae(y_test, test_pred))
                    r2_test = np.append(r2_test, model.score(x_test, y_test))
                matrix = {
                    'val': {
                        'mae': mae(y_vals, y_val_p_s),
                        'mse': mse(y_vals, y_val_p_s),
                        'r2': r2_score(y_vals, y_val_p_s)
                    },
                    'test': {
                        'mae': mae_test.mean(),
                        'mse': mse_test.mean(),
                        'r2': r2_test.mean()
                    },
                }
                print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(
                    "val",
                    matrix['val']['mae'],
                    matrix['val']['mse'],
                    matrix['val']['r2'],
                ))
                print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(
                    "test", matrix['test']['mae'], matrix['test']['mse'],
                    matrix['test']['r2']))
                # The saved model is the one fitted on the last fold.
                joblib.dump(model, 'save/%s%d.model' % (name, run_idx))
                MAE[name] = np.append(MAE[name], matrix['test']['mae'])
                MSE[name] = np.append(MSE[name], matrix['test']['mse'])
                R2[name] = np.append(R2[name], matrix['test']['r2'])
            plt.plot(sample_idx,
                     scale_y.inverse_transform(model.predict(x_test)),
                     marker='o',
                     linestyle=':',
                     label=name,
                     c=colors.pop())
            # plt.scatter([x+i*0.2 for x in range(1, test_size + 1)], scale_y.inverse_transform(model.predict(x_test)),
            #             label=name,c=randomcolor())
        plt.legend(edgecolor='black', loc=1, prop=font2, ncol=2)  # show legend
        plt.xlabel(u"Test Data", fontdict=font1)  # x-axis label
        plt.ylabel('Density (g/cm3)', fontdict=font1)  # y-axis label
        plt.title('Prediction on GI20', fontdict=font1)  # title
        plt.ioff()
        # All models are done for this run; every run uses a different split.
        print()
        plt.show()
    print("---------%d次训练测试平均得分----------" % len(seeds))
    print("{:20s}{:10s}{:10s}{:10s}".format("方法", "MAE", "MSE", "R2"))
    for name in MAE:
        print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(name, np.mean(MAE[name]),
                                                     np.mean(MSE[name]),
                                                     np.mean(R2[name])))
# Candidate base learners.
glm = TweedieGLM(power=0, max_iter=1000)
mars = Earth()
gb = HistGradientBoostingRegressor()

# Only MARS and gradient boosting are stacked.
estimators = [('mars', mars), ('gb', gb)]

# Meta learner: polynomial expansion -> scaling -> PCA -> linear regression.
final_estimator = Pipeline([
    ('poly', PolynomialFeatures(2)),
    ('scale', StandardScaler()),
    ('pca', PCA()),
    ('regressor', LinearRegression()),
])

stack = StackingRegressor(estimators=estimators,
                          final_estimator=final_estimator)
model = Pipeline([
    ('transformer', transformer),
    ('model', stack),
])

# Small shift applied before taking the log in `add`.
offset = 1e-9


def add(y):
    """Forward transform: natural log of `y` shifted by `offset`."""
    shifted = y + offset
    return np.log(shifted)


def subtract(y):
    """Inverse transform: exponentiate `y`, then remove `offset`."""
    restored = np.exp(y)
    return (restored - offset)


link = Pipeline([
    ('function', FunctionTransformer(add, subtract, validate=True)),
])
scorer = get_scorer('neg_root_mean_squared_error')
def train_algs_cv(self):
    """Evaluate the selected algorithms USING cross validation.

    Every recognised name in ``self.algorithms`` is scored with
    ``cross_val_score`` (R2, ``self.k_cv`` folds) on the whole of
    ``self.raw_data`` and the mean score is written to the Streamlit page.
    The (unfitted) estimators and their names are stored in
    ``self.chosen_models`` and ``self.chosen_models_names``.

    When ``self.meta_model_check`` is set, a voting or stacking ensemble
    built from the chosen models is cross-validated the same way. An
    unsupported ``self.meta_model`` is reported with a warning instead of
    failing on an unbound variable.
    """
    st.subheader("Results using cross validation")
    self.chosen_models_names = []
    self.chosen_models = []
    if len(self.algorithms) == 0:
        st.warning('You should select at least one algorithm')
        return

    X_train = self.raw_data.drop(self.out_col, axis=1)
    Y_train = self.raw_data[self.out_col]

    def _make_model(alg):
        """Return (estimator, score label), or None for an unknown name."""
        # Imports stay local so optional packages are only needed when the
        # matching algorithm is actually selected.
        if alg == 'LinearSVR':
            from sklearn.svm import LinearSVR
            return LinearSVR(), 'LinearSVR score :'
        if alg == 'RidgeCV':
            from sklearn.linear_model import RidgeCV
            return RidgeCV(), 'RidgeCV score :'
        if alg == 'Random Forest Regressor':
            from sklearn.ensemble import RandomForestRegressor
            return RandomForestRegressor(), 'Random Forest Regressor score :'
        if alg == 'Adaboost':
            from sklearn.ensemble import AdaBoostRegressor
            return AdaBoostRegressor(), 'Adaboost score :'
        if alg == 'XGBoost':
            # Import under its own name so no local rebinds the module alias.
            import xgboost
            return xgboost.XGBRegressor(n_estimators=300), 'xgb score :'
        return None

    def _cv_mean(estimator):
        """Mean R2 of `estimator` over the configured CV folds."""
        return cross_val_score(estimator,
                               X_train,
                               Y_train,
                               scoring='r2',
                               cv=self.k_cv).mean()

    for alg in self.algorithms:
        spec = _make_model(alg)
        if spec is None:
            # Unrecognised names are silently skipped, as before.
            continue
        model, label = spec
        st.write(label, _cv_mean(model))
        self.chosen_models_names.append(alg)
        self.chosen_models.append(model)

    if not self.meta_model_check:
        return

    named_models = list(zip(self.chosen_models_names, self.chosen_models))
    if self.meta_model_type == "voting":
        from sklearn.ensemble import VotingRegressor
        stack = VotingRegressor(estimators=named_models)
        st.write('voting score :', _cv_mean(stack))
        return

    from sklearn.ensemble import StackingRegressor
    if self.meta_model == "GradientBoostingRegressor":
        from sklearn.ensemble import GradientBoostingRegressor
        final_estimator = GradientBoostingRegressor()
    elif self.meta_model == "RandomForestRegressor":
        from sklearn.ensemble import RandomForestRegressor
        final_estimator = RandomForestRegressor()
    else:
        # Previously this path reached cross_val_score with `stack` unbound.
        st.warning('Unsupported meta model: {}'.format(self.meta_model))
        return
    stack = StackingRegressor(estimators=named_models,
                              final_estimator=final_estimator)
    st.write(self.meta_model + ' stack score using cv :', _cv_mean(stack))
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR @pytest.mark.parametrize( "X, y, estimator", [(*make_classification(n_samples=10), StackingClassifier( estimators=[('lr', LogisticRegression()), ( 'svm', LinearSVC()), ('rf', RandomForestClassifier())])), (*make_classification(n_samples=10), VotingClassifier( estimators=[('lr', LogisticRegression()), ( 'svm', LinearSVC()), ('rf', RandomForestClassifier())])), (*make_regression(n_samples=10), StackingRegressor( estimators=[('lr', LinearRegression()), ( 'svm', LinearSVR()), ('rf', RandomForestRegressor())])), (*make_regression(n_samples=10), VotingRegressor( estimators=[('lr', LinearRegression()), ( 'svm', LinearSVR()), ('rf', RandomForestRegressor())]))], ids=[ 'stacking-classifier', 'voting-classifier', 'stacking-regressor', 'voting-regressor' ]) def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator): # check that the behavior of `estimators`, `estimators_`, # `named_estimators`, `named_estimators_` is consistent across all # ensemble classes and when using `set_params()`. # before fit
def learner(X, y, tes, X_train, X_test, y_train, y_test):
    """Stack, blend and repeatedly retrain models; return predictions for `tes`.

    Two stacking regressors and a plain CatBoost model are fitted on
    ``(X_train, y_train)`` and used to predict `tes`. Their blended output
    then serves as a pseudo-target for several rounds of retraining on `tes`
    itself.

    Args:
        X, y, X_test, y_test: accepted for interface compatibility; the
            incoming values are not read (`X` and `y` are rebound locally).
        tes: test features to predict; it is copied before columns are
            added (indexed by column name, so presumably a pandas
            DataFrame - confirm).
        X_train, y_train: training features and target.

    Returns:
        A list with one final blended prediction per row of `tes`.
    """
    # In stacking, the most important thing is model diversification: linear
    # models, SVM, KNN, decision trees and many variations of them. The
    # variations are different values of the key parameters of each model.
    # While there was no time to tune the parameters of each model (except the
    # meta learner, CatBoost), educated guesses were made to get as much
    # variability as possible.
    estimators_1 = [
        ('xgb',
         XGBRegressor(random_state=2020,
                      objective='reg:squarederror',
                      learning_rate=0.05)),
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(random_state=2020)),
        ('lgb', LGBMRegressor(learning_rate=0.2, random_state=2020)),
        ('svr', SVR(degree=2)),
        ('lasso', Lasso(random_state=2020)),
        ('RGF', RGFRegressor()),
        ('kneiba', KNeighborsRegressor(n_neighbors=4)),
        ('cat', CatBoostRegressor(logging_level='Silent', random_state=2020))
    ]
    predictions_1 = StackingRegressor(estimators=estimators_1,
                                      final_estimator=CatBoostRegressor(
                                          logging_level='Silent',
                                          depth=6,
                                          bagging_temperature=5,
                                          random_state=2020)).fit(
                                              X_train, y_train).predict(tes)

    estimators_2 = [('xgb',
                     XGBRegressor(objective='reg:squarederror',
                                  learning_rate=0.2,
                                  random_state=2020)),
                    ('lr', LinearRegression()),
                    ('rf', RandomForestRegressor(random_state=2020)),
                    ('lgb', LGBMRegressor(learning_rate=0.05,
                                          random_state=2020)),
                    ('svr', SVR(degree=5)),
                    ('RGF', RGFRegressor()),
                    ('lasso', Lasso(random_state=2020)),
                    ('kneiba', KNeighborsRegressor(n_neighbors=6)),
                    ('cat',
                     CatBoostRegressor(logging_level='Silent',
                                       random_state=2020))]
    predictions_2 = StackingRegressor(estimators=estimators_2,
                                      final_estimator=CatBoostRegressor(
                                          logging_level='Silent',
                                          depth=6,
                                          bagging_temperature=5,
                                          random_state=2020)).fit(
                                              X_train, y_train).predict(tes)

    predictions_cat_1 = CatBoostRegressor(logging_level='Silent',
                                          depth=6,
                                          bagging_temperature=5,
                                          random_state=2020).fit(
                                              X_train, y_train).predict(tes)

    # Further averaging, blending and retraining to generalise well.
    # The weights sum to more than one, yet it still works a treat. This is
    # definitely one of the parameters to tune to achieve great results.
    stack = [x * 0.56 + y * 0.51 for x, y in zip(predictions_1, predictions_2)]
    stack_2 = [x * 0.56 + y * 0.51 for x, y in zip(stack, predictions_cat_1)]

    # From here on the blended predictions are used as the target on `tes`.
    X, y = tes.copy(), stack_2
    preds_ridge = Ridge(random_state=2020).fit(X, y).predict(X)

    # Add a new feature to the test dataset: cluster the wards into 150
    # clusters, then let CatBoost's encoder encode the clusters.
    X['cluster'] = KMeans(150, random_state=2020).fit(X).predict(X)
    preds_cat = CatBoostRegressor(random_state=2020,
                                  verbose=False,
                                  depth=6,
                                  bagging_temperature=5,
                                  cat_features=['cluster']).fit(X,
                                                                y).predict(X)

    # Blend the Ridge and CatBoost predictions.
    final_blend_2 = [x * 0.2 + y * 0.8 for x, y in zip(preds_ridge, preds_cat)]
    # Clipping the values to the range 0 - 90 was also important, as the
    # target variable is known to lie between 0 and 100.
    final_blend_2 = np.clip(final_blend_2, a_min=0, a_max=90)

    # Regularise the final blend by subtracting a constant from the
    # predictions and clipping again.
    exp = final_blend_2 - 0.48
    exp = np.clip(exp, a_min=0, a_max=90)

    ## Retraining predictions
    # Retrain on the test data, using the prediction of the stacked regressors
    # as the target. The clusters are added again, but mean-encoded manually
    # because LinearRegression cannot encode categorical variables.
    X = tes.copy()
    X['cluster'] = KMeans(150, random_state=2020).fit(X).predict(X)
    X['target'] = exp
    X['encoded'] = X['cluster'].map(X.groupby('cluster')['target'].mean())
    y = X.target
    # `axis` must be passed by keyword: pandas >= 2.0 no longer accepts it
    # positionally in DataFrame.drop.
    X = X.drop(['cluster', 'target'], axis=1)

    preds_1 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.7 + LinearRegression().fit(X, y).predict(X) * 0.3
    preds_2 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.5 + LinearRegression().fit(X, y).predict(X) * 0.5
    preds_3 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.6 + LinearRegression().fit(X, y).predict(X) * 0.4
    final = [
        x * 0.3 + y * 0.3 + z * 0.4
        for x, y, z in zip(preds_1, preds_2, preds_3)
    ]

    ## Further retraining of predictions
    # Retrain again, this time using Regularized Greedy Forests and CatBoost.
    X['final'] = final
    y = X.final
    X = X.drop('final', axis=1)

    preds_1 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.7 + RGFRegressor().fit(X, y).predict(X) * 0.3
    preds_2 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.5 + RGFRegressor().fit(X, y).predict(X) * 0.5
    preds_3 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.6 + RGFRegressor().fit(X, y).predict(X) * 0.4
    final2 = [
        x * 0.3 + y * 0.3 + z * 0.4
        for x, y, z in zip(preds_1, preds_2, preds_3)
    ]

    return final2