import numpy as np
import xgboost as xgb
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from linxgb import linxgb  # repo-local LinXGBoost implementation (module path assumed)

def compute(train_X, train_Y, test_X, test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=10, shuffle=True, random_state=1)

    # CV for XGBoost
    param_grid = {
        "n_estimators": np.arange(30, 59, 2),       # 54, 88
        "learning_rate": np.linspace(0.2, 0.4, 3),  # 0.3, 0.1
        "min_child_weight": np.arange(2, 5),        # 3, 3
        "max_depth": np.arange(4, 13, 2),           # 8, 12
        "subsample": np.linspace(0.6, 0.8, 3),      # 0.7, 0.7
        "gamma": [0.01, 0.03, 0.1, 0.3],            # 0.1, 0.3
    }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', reg_lambda=0., nthread=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    xgb_pred_Y = grid_cv.predict(test_X)  # GridSearchCV refits the best estimator by default

    # CV for LinXGBoost
    param_grid = {
        "learning_rate": [0.4, 0.5, 0.6],           # 0.7, 0.6
        "gamma": [0.3, 1, 3],                       # 0.3, 0.6
        #"lbda": np.logspace(-4, -2, num=3),        # -2, -3
        "min_samples_leaf": [4, 6, 8],              # 2, 2
    }
    grid_cv = GridSearchCV(linxgb(max_depth=200, n_estimators=5, lbda=0.),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    lin_pred_Y = grid_cv.predict(test_X)

    # CV for Random Forest
    param_grid = {
        "n_estimators": np.arange(20, 60, 3),       # 22, 65
        "min_samples_leaf": np.arange(1, 3),        # 1, 1
        "min_samples_split": np.arange(2, 4),       # 2, 2
        "max_depth": np.arange(10, 40, 3),          # 14, 26
    }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    rf_pred_Y = grid_cv.predict(test_X)

    return nmse(test_Y, xgb_pred_Y), nmse(test_Y, lin_pred_Y), nmse(test_Y, rf_pred_Y)
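# Every snippet here scores predictions with an nmse helper that is not shown
# in the source. A minimal sketch, assuming it computes the normalized mean
# squared error (MSE divided by the variance of the test targets, so that a
# constant mean predictor scores roughly 1):
import numpy as np

def nmse(y_true, y_pred):
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    # Normalize by the target variance so scores are comparable across datasets.
    return np.mean((y_true - y_pred) ** 2) / np.var(y_true)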
def compute(train_X, train_Y, test_X, test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=3, shuffle=True, random_state=1)

    # Least-squares fit as a linear baseline
    reg = linear_model.LinearRegression()
    reg.fit(train_X, train_Y)
    lsf_pred_Y = reg.predict(test_X)

    # CV for XGBoost
    param_grid = {
        "n_estimators": np.arange(170, 226, 5),     # 48
        "learning_rate": [0.1, 0.15, 0.2],          # 0.2
        "min_child_weight": np.arange(2, 4),        # 5
        "max_depth": np.arange(4, 13, 2),           # 2
        "subsample": np.linspace(0.7, 0.9, 3),      # 0.6
        "gamma": [0.03, 0.1, 0.3],                  # 0.1
    }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', reg_lambda=0., nthread=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    xgb_pred_Y = grid_cv.predict(test_X)

    # CV for LinXGBoost
    param_grid = {
        #"learning_rate": [0.8, 0.9],
        "gamma": [1, 100, 10000, 1e6],
        #"lbda": np.logspace(-11, -4, num=2),
        "min_samples_leaf": [32, 48],
    }
    grid_cv = GridSearchCV(linxgb(max_depth=200, n_estimators=4, learning_rate=1.0, lbda=1e-11),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    lin_pred_Y = grid_cv.predict(test_X)

    # CV for Random Forest
    param_grid = {
        "n_estimators": np.arange(110, 151, 5),     # 69 or 78
        "min_samples_leaf": np.arange(1, 3),        # 1
        "min_samples_split": np.arange(2, 5),       # 4 or 3
        "max_depth": np.arange(16, 29, 2),          # 24
    }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    rf_pred_Y = grid_cv.predict(test_X)

    return nmse(test_Y, lsf_pred_Y), nmse(test_Y, xgb_pred_Y), nmse(test_Y, lin_pred_Y), nmse(test_Y, rf_pred_Y)
def compute(train_X, train_Y, test_X, test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=3, shuffle=True, random_state=1)

    # Least-squares fit as a linear baseline
    reg = linear_model.LinearRegression()
    reg.fit(train_X, train_Y)
    lsf_pred_Y = reg.predict(test_X)

    # CV for XGBoost
    param_grid = {
        "n_estimators": np.arange(130, 166, 5),     # 48
        "learning_rate": [0.05, 0.10, 0.15],        # 0.2
        "min_child_weight": np.arange(3, 6),        # 5
        "max_depth": np.arange(4, 11, 2),           # 2
        "subsample": np.linspace(0.6, 0.8, 3),      # 0.6
        "gamma": [0.001, 0.003, 0.01],              # 0.1
    }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', reg_lambda=0., nthread=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    xgb_pred_Y = grid_cv.predict(test_X)

    # CV for LinXGBoost
    param_grid = {
        "learning_rate": [0.4, 0.5],                # 0.8
        "gamma": [1, 30, 100],                      # 3 or 10
        #"lbda": np.logspace(-13, -2, num=3),       # -2
        "min_samples_leaf": [16, 24, 32],           # 50
    }
    grid_cv = GridSearchCV(linxgb(n_estimators=3, max_depth=200, lbda=0.),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    lin_pred_Y = grid_cv.predict(test_X)

    # CV for Random Forest
    param_grid = {
        "n_estimators": np.arange(80, 121, 5),      # 69 or 78
        "min_samples_leaf": np.arange(1, 4),        # 1
        "min_samples_split": np.arange(2, 5),       # 4 or 3
        "max_depth": np.arange(12, 27, 2),          # 24
    }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    rf_pred_Y = grid_cv.predict(test_X)

    return nmse(test_Y, lsf_pred_Y), nmse(test_Y, xgb_pred_Y), nmse(test_Y, lin_pred_Y), nmse(test_Y, rf_pred_Y)
def compute(train_X, train_Y, test_X, test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=10, shuffle=True, random_state=1)

    # CV for XGBoost
    param_grid = {
        "n_estimators": np.arange(20, 52, 3),       # 28, 35
        "learning_rate": np.linspace(0.3, 0.6, 4),  # 0.2, 0.2
        "min_child_weight": np.arange(1, 6),        # 2, 4
        "max_depth": np.arange(2, 9, 2),            # 8, 4
        "subsample": np.linspace(0.7, 1, 4),        # 1, 0.9
        "gamma": [0.1, 0.3, 1],                     # 0.1, 0.3
    }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', reg_lambda=0., nthread=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    # GridSearchCV already refits the best estimator on the full training set,
    # so the explicit refit below only reproduces grid_cv.predict(test_X).
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    xgb_pred_Y = reg.predict(test_X)

    # CV for LinXGBoost
    param_grid = {
        "n_estimators": [2, 3],
        "learning_rate": [0.8, 0.9],
        "gamma": [0.1, 0.3, 1, 3],
        "lbda": np.logspace(-7, -1, num=4),
        "min_samples_leaf": [3, 4, 8, 16],
    }
    grid_cv = GridSearchCV(linxgb(max_depth=200),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    lin_pred_Y = reg.predict(test_X)

    # CV for Random Forest
    param_grid = {
        "n_estimators": np.arange(10, 40, 4),       # 53, 44
        "min_samples_leaf": np.arange(1, 4),        # 1, 1
        "min_samples_split": np.arange(2, 5),       # 2, 5
        "max_depth": np.arange(2, 13, 2),           # 16, 6
    }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    rf_pred_Y = reg.predict(test_X)

    return nmse(test_Y, xgb_pred_Y), nmse(test_Y, lin_pred_Y), nmse(test_Y, rf_pred_Y)
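# The next snippet begins mid-way through an xgb.train parameter dict and uses
# dtrain, dtest, a first booster bst1, and several hyperparameter variables
# defined earlier in the source. A minimal sketch of the assumed setup follows;
# every value and the synthetic data here are hypothetical stand-ins, not taken
# from the source.
import numpy as np
import xgboost as xgb

# Hypothetical hyperparameters standing in for the values defined earlier.
num_trees = 50
learning_rate = 0.3
min_samples_leaf = 8
max_depth = 6
subsample = 0.8
gamma = 0.1

# Hypothetical data so the sketch runs stand-alone.
rng = np.random.default_rng(1)
train_X = rng.normal(size=(200, 5))
train_Y = rng.normal(size=200)
test_X = rng.normal(size=(50, 5))
test_Y = rng.normal(size=50)

# XGBoost's native training API consumes DMatrix containers.
dtrain = xgb.DMatrix(train_X, label=train_Y)
dtest = xgb.DMatrix(test_X)

# A first reference booster; its parameter dict is not shown in the source.
bst1 = xgb.train({'eta': learning_rate, 'max_depth': max_depth, 'nthread': 1,
                  'objective': 'reg:squarederror'}, dtrain, num_trees)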
param = {
    # (earlier parameters elided in the source)
    'lambda_bias': 0.0,       # L2 regularization term on bias, default 0
    'save_period': 0,         # 0 means do not save any model except the final round model
    'nthread': 1,
    'subsample': subsample,
    'objective': 'reg:squarederror'  # regression objective; 'binary:logistic' for classification
    # 'eval_metric': the evaluation metric
}
num_round = num_trees  # the number of boosting rounds, i.e. the number of trees
bst2 = xgb.train(param, dtrain, num_round)

# LinXGBoost training
linbst = linxgb(n_estimators=num_trees,
                learning_rate=learning_rate,
                min_samples_leaf=min_samples_leaf,
                max_samples_linear_model=10000,
                max_depth=max_depth,
                subsample=subsample,
                lbda=0,
                gamma=gamma,
                prune=True,
                verbose=1)
linbst.fit(train_X, train_Y)

# Make predictions
xgb1_pred_Y = bst1.predict(dtest)
xgb2_pred_Y = bst2.predict(dtest)
lin_pred_Y = linbst.predict(test_X)

# Print scores
print("NMSE: XGBoost1 {:12.5f}, XGBoost2 {:12.5f}, LinXGBoost {:12.5f}".
      format(nmse(test_Y, xgb1_pred_Y), nmse(test_Y, xgb2_pred_Y),
             nmse(test_Y, lin_pred_Y)))
def compute(train_X, train_Y, test_X, test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=10, shuffle=True, random_state=1)

    # CV for XGBoost
    param_grid = {
        "n_estimators": np.arange(30, 55, 3),       # 48
        "learning_rate": np.linspace(0.2, 0.4, 3),  # 0.2
        "min_child_weight": np.arange(2, 6),        # 5
        "max_depth": np.arange(2, 9, 2),            # 2
        "subsample": np.linspace(0.6, 0.8, 3),      # 0.6
        "gamma": [0.03, 0.1, 0.3, 1],               # 0.1
    }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', reg_lambda=0., nthread=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    xgb_pred_Y = reg.predict(test_X)

    # CV for LinXGBoost
    param_grid = {
        "learning_rate": [0.6, 0.7, 0.8],           # 0.8
        "gamma": [1, 3, 10],                        # 3 or 10
        #"lbda": np.logspace(-3, -1, num=3),        # -2
        "min_samples_leaf": np.arange(40, 61, 5),   # 50
        "n_estimators": [2, 3],
    }
    grid_cv = GridSearchCV(linxgb(max_depth=500, lbda=0.),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    lin_pred_Y = reg.predict(test_X)

    # CV for Random Forest
    param_grid = {
        "n_estimators": np.arange(30, 100, 4),      # 69 or 78
        "min_samples_leaf": np.arange(1, 3),        # 1
        "min_samples_split": np.arange(2, 7),       # 4 or 3
        "max_depth": np.arange(10, 31, 2),          # 24
    }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid, scoring='neg_mean_squared_error', cv=cv_sets, n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    rf_pred_Y = reg.predict(test_X)

    return nmse(test_Y, xgb_pred_Y), nmse(test_Y, lin_pred_Y), nmse(test_Y, rf_pred_Y)
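# For reference, a sketch of how one of these compute functions might be driven
# end to end; the dataset and split below are illustrative choices, not taken
# from the source.
from sklearn.datasets import make_friedman1
from sklearn.model_selection import train_test_split

# Synthetic benchmark data; any regression dataset with a train/test split works.
X, Y = make_friedman1(n_samples=1000, noise=1.0, random_state=1)
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=1)

xgb_nmse, lin_nmse, rf_nmse = compute(train_X, train_Y, test_X, test_Y)
print("NMSE: XGBoost {:.5f}, LinXGBoost {:.5f}, Random Forest {:.5f}"
      .format(xgb_nmse, lin_nmse, rf_nmse))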