Example 1
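The examples on this page assume a common set of imports. A minimal header that would make them runnable is sketched below; the linxgb estimator and the nmse metric are assumed to come from the LinXGBoost code base (its linxgb and metrics modules), which is an assumption about the project layout rather than a documented package.

# Assumed imports for the examples on this page. The linxgb estimator and
# the nmse metric are taken from the LinXGBoost code base (assumption).
import numpy as np
import xgboost as xgb
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

from linxgb import linxgb
from metrics import nmse
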
def compute(train_X, train_Y, test_X, test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=10, shuffle=True, random_state=1)

    # CV for XGBoost
    param_grid = {
        "n_estimators": np.arange(30, 59, 2),  # 54, 88
        "learning_rate": np.linspace(0.2, 0.4, 3),  # 0.3, 0.1
        "min_child_weight": np.arange(2, 5),  # 3,3
        "max_depth": np.arange(4, 13, 2),  # 8,12
        "subsample": np.linspace(0.6, 0.8, 3),  # 0.7, 0.7
        "gamma": [0.01, 0.03, 0.1, 0.3]  # 0.1, 0.3
    }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror',
                                            reg_lambda=0.,
                                            nthread=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    xgb_pred_Y = grid_cv.predict(test_X)

    # CV for LinXGBoost
    param_grid = {
        "learning_rate": [0.4, 0.5, 0.6],  # 0.7, 0.6
        "gamma": [0.3, 1, 3],  # 0.3, 0.6
        #"lbda": np.logspace(-4,-2,num=3), # -2, -3
        "min_samples_leaf": [4, 6, 8],  # 2, 2
    }
    grid_cv = GridSearchCV(linxgb(max_depth=200, n_estimators=5, lbda=0.),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    lin_pred_Y = grid_cv.predict(test_X)

    # CV for Random Forest
    param_grid = {
        "n_estimators": np.arange(20, 60, 3),  # 22, 65
        "min_samples_leaf": np.arange(1, 3),  # 1, 1
        "min_samples_split": np.arange(2, 4),  # 2, 2
        "max_depth": np.arange(10, 40, 3),  # 14, 26
    }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    rf_pred_Y = grid_cv.predict(test_X)

    return (nmse(test_Y, xgb_pred_Y),
            nmse(test_Y, lin_pred_Y),
            nmse(test_Y, rf_pred_Y))
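
To see how compute is meant to be called, a hypothetical driver is sketched below; the dataset (make_friedman1) and the train/test split are illustrative choices, not part of the original example.

# Hypothetical driver for compute(); the dataset choice is illustrative.
from sklearn.datasets import make_friedman1
from sklearn.model_selection import train_test_split

X, Y = make_friedman1(n_samples=500, noise=1.0, random_state=1)
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2,
                                                    random_state=1)
xgb_nmse, lin_nmse, rf_nmse = compute(train_X, train_Y, test_X, test_Y)
print("NMSE: XGBoost {:.5f}, LinXGBoost {:.5f}, Random Forest {:.5f}"
      .format(xgb_nmse, lin_nmse, rf_nmse))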
Example 2
def compute(train_X,train_Y,test_X,test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=3, shuffle=True, random_state=1)

    # least-square fit
    reg = linear_model.LinearRegression()
    reg.fit(train_X,train_Y)
    lsf_pred_Y = reg.predict(test_X)

    # CV for XGBoost
    param_grid = { "n_estimators": np.arange(170,226,5), # 48
                   "learning_rate": [0.1,0.15,0.2], # 0.2
                   "min_child_weight": np.arange(2,4), # 5
                   "max_depth": np.arange(4,13,2), # 2
                   "subsample": np.linspace(0.7,0.9,3), # 0.6
                   "gamma": [ 0.03, 0.1, 0.3 ] # 0.1
                  }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror',
                                            reg_lambda=0.,
                                            nthread=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)  # iid= was removed in scikit-learn 0.24
    grid_cv.fit(train_X, train_Y)
    xgb_pred_Y = grid_cv.predict(test_X)  # GridSearchCV refits the best estimator by default

    # CV for LinXGBoost
    param_grid = { #"learning_rate": [0.8,0.9],
                   "gamma": [ 1, 100, 10000, 1e6 ],
                   #"lbda": np.logspace(-11,-4,num=2),
                   "min_samples_leaf": [32,48],
                  }
    grid_cv = GridSearchCV(linxgb(max_depth=200, n_estimators=4,
                                  learning_rate=1.0, lbda=1e-11),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    lin_pred_Y = grid_cv.predict(test_X)

    # CV for Random Forest
    param_grid = { "n_estimators": np.arange(110,151,5), # 69 or 78
                   "min_samples_leaf": np.arange(1,3), # 1
                   "min_samples_split": np.arange(2,5), # 4 or 3
                   "max_depth": np.arange(16,29,2), # 24
                  }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    rf_pred_Y = grid_cv.predict(test_X)

    return (nmse(test_Y, lsf_pred_Y), nmse(test_Y, xgb_pred_Y),
            nmse(test_Y, lin_pred_Y), nmse(test_Y, rf_pred_Y))
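
Every example scores predictions with nmse, which is defined in the LinXGBoost code base rather than here. A common definition of normalized MSE, and an assumption about what nmse computes in these snippets, is the mean squared error divided by the variance of the true targets:

def nmse(y_true, y_pred):
    # Normalized MSE (assumed definition): MSE divided by the variance of
    # the true targets, so predicting the mean of y scores about 1.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    return np.mean((y_true - y_pred) ** 2) / np.var(y_true)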
Example 3
def compute(train_X,train_Y,test_X,test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=3, shuffle=True, random_state=1)

    # least-square fit
    reg = linear_model.LinearRegression()
    reg.fit(train_X,train_Y)
    lsf_pred_Y = reg.predict(test_X)

    # CV for XGBoost
    param_grid = { "n_estimators": np.arange(130,166,5), # 48
                   "learning_rate": [0.05,0.10,0.15], # 0.2
                   "min_child_weight": np.arange(3,6), # 5
                   "max_depth": np.arange(4,11,2), # 2
                   "subsample": np.linspace(0.6,0.8,3), # 0.6
                   "gamma": [ 0.001, 0.003, 0.01 ] # 0.1
                  }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror',
                                            reg_lambda=0.,
                                            nthread=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    xgb_pred_Y = grid_cv.predict(test_X)  # GridSearchCV refits the best estimator by default

    # CV for LinXGBoost
    param_grid = { "learning_rate": [0.4,0.5], # 0.8
                   "gamma": [ 1, 30, 100 ], # 3 or 10
                   #"lbda": np.logspace(-13,-2,num=3), # -2
                   "min_samples_leaf": [16,24,32], #50
                  }
    grid_cv = GridSearchCV(linxgb(n_estimators=3, max_depth=200, lbda=0.),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    lin_pred_Y = grid_cv.predict(test_X)

    # CV for Random Forest
    param_grid = { "n_estimators": np.arange(80,121,5), # 69 or 78
                   "min_samples_leaf": np.arange(1,4), # 1
                   "min_samples_split": np.arange(2,5), # 4 or 3
                   "max_depth": np.arange(12,27,2), # 24
                  }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    rf_pred_Y = grid_cv.predict(test_X)

    return (nmse(test_Y, lsf_pred_Y), nmse(test_Y, xgb_pred_Y),
            nmse(test_Y, lin_pred_Y), nmse(test_Y, rf_pred_Y))
Example 4
def compute(train_X,train_Y,test_X,test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=10, shuffle=True, random_state=1)

    # CV for XGBoost
    param_grid = { "n_estimators": np.arange(20,52,3), # 28, 35
                   "learning_rate": np.linspace(0.3,0.6,4), # 0.2, 0.2
                   "min_child_weight": np.arange(1,6), # 2,4
                   "max_depth": np.arange(2,9,2), # 8,4
                   "subsample": np.linspace(0.7,1,4), # 1, 0.9
                   "gamma": [ 0.1, 0.3, 1 ] # 0.1, 0.3
                  }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror',  # 'reg:linear' is deprecated
                                            reg_lambda=0.,
                                            nthread=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)  # redundant: GridSearchCV already refits the best estimator
    xgb_pred_Y = reg.predict(test_X)

    # CV for LinXGBoost
    param_grid = { "n_estimators": [2,3],
                   "learning_rate": [0.8,0.9],
                   "gamma": [ 0.1, 0.3, 1, 3 ],
                   "lbda": np.logspace(-7,-1,num=4),
                   "min_samples_leaf": [3,4,8,16],
                  }
    grid_cv = GridSearchCV(linxgb(max_depth=200),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    lin_pred_Y = reg.predict(test_X)

    # CV for Random Forest
    param_grid = { "n_estimators": np.arange(10,40,4), # 53, 44
                   "min_samples_leaf": np.arange(1,4), # 1, 1
                   "min_samples_split": np.arange(2,5), # 2, 5
                   "max_depth": np.arange(2,13,2), # 16, 6
                  }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    rf_pred_Y = reg.predict(test_X)

    return (nmse(test_Y, xgb_pred_Y), nmse(test_Y, lin_pred_Y),
            nmse(test_Y, rf_pred_Y))
Example 5
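This snippet begins mid-way through the parameter dict passed to xgb.train, and it references dtrain, dtest and a first booster bst1 that the elided preamble must define (along with learning_rate, max_depth, subsample, gamma, min_samples_leaf and num_trees). A hypothetical reconstruction of that setup is sketched here; the names and values are illustrative, not the original code.

# Hypothetical preamble for the fragment below; the original code is elided.
# The fragment resumes mid-way through the `param` dict passed to xgb.train.
dtrain = xgb.DMatrix(train_X, label=train_Y)   # training fold as a DMatrix
dtest = xgb.DMatrix(test_X)                    # test inputs for predict()
bst1 = xgb.train(param1, dtrain, num_trees)    # first booster scored at the
                                               # end; its settings are not shown
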
         'lambda_bias': 0.0, # L2 regularization term on bias, default 0
         'save_period': 0, # 0 means do not save any model except the final round model
         'nthread': 1,
         'subsample': subsample,
         'objective': 'reg:squarederror' # e.g. binary:logistic; 'reg:linear' is the deprecated alias
         # 'eval_metric': the evaluation metric
         }
num_round = num_trees # number of boosting rounds, i.e. the number of trees
bst2 = xgb.train(param, dtrain, num_round)

# LinXGBoost training
linbst = linxgb(n_estimators=num_trees,
                learning_rate=learning_rate,
                min_samples_leaf=min_samples_leaf,
                max_samples_linear_model=10000,
                max_depth=max_depth,
                subsample=subsample,
                lbda=0,
                gamma=gamma,
                prune=True,
                verbose=1)
linbst.fit(train_X, train_Y)

# Make predictions
xgb1_pred_Y = bst1.predict(dtest)
xgb2_pred_Y = bst2.predict(dtest)
lin_pred_Y = linbst.predict(test_X)

# Print scores
print("NMSE: XGBoost1 {:12.5f}, XGBoost2 {:12.5f}, LinXGBoost {:12.5f}". \
       format(nmse(test_Y,xgb1_pred_Y),
              nmse(test_Y,xgb2_pred_Y),
Example 6
def compute(train_X, train_Y, test_X, test_Y):
    # CV parameters
    cv_sets = KFold(n_splits=10, shuffle=True, random_state=1)

    # CV for XGBoost
    param_grid = {
        "n_estimators": np.arange(30, 55, 3),  # 48
        "learning_rate": np.linspace(0.2, 0.4, 3),  # 0.2
        "min_child_weight": np.arange(2, 6),  # 5
        "max_depth": np.arange(2, 9, 2),  # 2
        "subsample": np.linspace(0.6, 0.8, 3),  # 0.6
        "gamma": [0.03, 0.1, 0.3, 1]  # 0.1
    }
    grid_cv = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror',  # 'reg:linear' is deprecated
                                            reg_lambda=0.,
                                            nthread=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)  # redundant: GridSearchCV already refits the best estimator
    xgb_pred_Y = reg.predict(test_X)

    # CV for LinXGBoost
    param_grid = {
        "learning_rate": [0.6, 0.7, 0.8],  # 0.8
        "gamma": [1, 3, 10],  # 3 or 10
        #"lbda": np.logspace(-3,-1,num=3), # -2
        "min_samples_leaf": np.arange(40, 61, 5),  #50
        "n_estimators": [2, 3]
    }
    grid_cv = GridSearchCV(linxgb(max_depth=500, lbda=0.),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    lin_pred_Y = reg.predict(test_X)

    # CV for Random Forest
    param_grid = {
        "n_estimators": np.arange(30, 100, 4),  # 69 or 78
        "min_samples_leaf": np.arange(1, 3),  # 1
        "min_samples_split": np.arange(2, 7),  # 4 or 3
        "max_depth": np.arange(10, 31, 2),  # 24
    }
    grid_cv = GridSearchCV(RandomForestRegressor(random_state=1),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=cv_sets,
                           n_jobs=-1)
    grid_cv.fit(train_X, train_Y)
    reg = grid_cv.best_estimator_
    reg.fit(train_X, train_Y)
    rf_pred_Y = reg.predict(test_X)

    return (nmse(test_Y, xgb_pred_Y),
            nmse(test_Y, lin_pred_Y),
            nmse(test_Y, rf_pred_Y))