# Excerpt from a model-wrapper class: `obj` (a custom objective), `n_group`,
# `loss`, and `grad` are defined elsewhere in the source module. The
# `silent`/`missing=None` arguments target the older pre-1.0 xgboost API.
import numpy as np
from xgboost import sklearn  # assumed alias: xgboost.sklearn exposes XGBRegressor

def __init__(self):
     self.clf = sklearn.XGBRegressor(max_depth=3,
                                     learning_rate=0.1,
                                     n_estimators=300,
                                     silent=True,
                                     objective=obj,
                                     gamma=0,
                                     min_child_weight=1,
                                     max_delta_step=0,
                                     subsample=1,
                                     colsample_bytree=1,
                                     colsample_bylevel=0.25,
                                     reg_alpha=0,
                                     reg_lambda=0.5,
                                     scale_pos_weight=1,
                                     base_score=0.5,
                                     seed=0,
                                     missing=None)
     self.w = np.array([
         np.array([
             1.7029e-02, 1.3079e-01, 6.1581e-02, -1.6783e-02, 3.3474e-02,
             -2.2277e-02, -2.1690e-01, 1.1374e-01, 7.1316e-02, 3.6111e-02,
             -1.9211e-01, 8.9843e-02, 1.0525e-02, -8.8967e-02, -1.6134e-01,
             -1.0343e-01, 3.8159e-02, 1.2840e-02, 1.4358e-01, -1.2254e-01,
             1.4967e-01, 3.8851e-02, 8.4922e-02, 2.1995e-02, -1.7713e-01,
             4.5296e-02, 5.0263e-02, 3.5791e-05, -1.4180e-01, 1.5155e-01,
             -7.8438e-02, -1.0855e-01, -1.0028e-01, -5.2810e-02, 7.0936e-02,
             8.6607e-02, 6.8758e-02, -1.7710e-01, 3.1382e-02, 2.7970e-01,
             3.8615e-01, 2.0975e-01, 1.1192e-02, -3.1998e-01, 1.9952e-01,
             4.5477e-01, -6.7926e-02, -1.2770e-01, 8.1820e-02, 1.7651e-01,
             3.3767e-02, 3.8274e-01, 8.7390e-03, -4.5134e-02, -5.6199e-02,
             -8.8637e-02, 7.9332e-02, -1.0147e-01, 1.7228e-01, -6.2791e-02,
             2.2888e-03, 5.2206e-02, 1.0851e-01, 3.7676e-02, 1.0128e-01,
             1.0922e-02, -1.9359e-01, 6.2475e-02, -5.5140e-02, 2.9518e-02,
             -2.3585e-02, -1.1021e-01, 1.2358e-01, 3.9869e-03, -3.0878e-02,
             -2.9022e-02, -2.5127e-02, -5.1951e-02, 6.4713e-02, 6.3186e-02,
             4.3845e-02, -3.2788e-02, 8.0593e-03, 6.9834e-02, -5.3207e-02,
             8.0649e-02, -7.0133e-02, -1.1874e-01, -2.0268e-01, 3.6341e-02,
             -2.8456e-02, 2.5505e-01, -5.9185e-02, -1.6351e-01, 2.0862e-01,
             3.9112e-01, -1.7588e-02, 3.9111e-02, 2.9766e-01, 5.3394e-01,
             -4.8566e-03, 6.3414e-02, 2.7350e-01, -1.6731e-01, -2.6914e-02,
             -1.9693e-01, 1.4585e-01, 4.4899e-02, -3.2440e-02, 4.4213e-02,
             1.1280e-01, 2.1263e-01, 1.1246e-01, -5.3757e-02, -1.4070e-01,
             8.6012e-02, -1.2140e-01, 7.1008e-04, 1.3947e-02, -2.5169e-02,
             1.7305e-01, -3.6080e-02, -6.7890e-02, 9.9060e-02, 4.4189e-02,
             -1.1350e-01, 1.4912e-01, 3.4591e-02, 5.1782e-02, 1.5098e-02,
             8.5624e-03, -1.0366e-01, -6.0745e-02, 1.7117e-01, -5.4439e-02,
             -1.2122e-01, -2.8721e-01, -2.1258e-01, 3.5069e-02, 8.1284e-02,
             -2.1620e-01, -3.0161e-01
          ]) for _ in range(n_group)  # same initial weight vector for every group
     ])
     self.x = None
     self.y = None
     self.loss = lambda w: loss(w, self.x, self.y)
     self.grad = lambda w: grad(w, self.x, self.y)
     # Bind a group index i, returning loss/grad as functions of w alone.
     self.select_loss = lambda i: (lambda w: loss(w, self.x[i], self.y[i]))
     self.select_grad = lambda i: (lambda w: grad(w, self.x[i], self.y[i]))
Example #2

# Same wrapper pattern as above; `obj`, `loss`, and `grad` are again
# defined elsewhere in the source module.
def __init__(self):
     self.clf = sklearn.XGBRegressor(max_depth=3,
                                     learning_rate=0.1,
                                     n_estimators=300,
                                     silent=True,
                                     objective=obj,
                                     gamma=0,
                                     min_child_weight=1,
                                     max_delta_step=0,
                                     subsample=1,
                                     colsample_bytree=1,
                                     colsample_bylevel=0.25,
                                     reg_alpha=1,
                                     reg_lambda=0.5,
                                     scale_pos_weight=1,
                                     base_score=0.5,
                                     seed=0,
                                     missing=None)
     self.w = np.array([
         3.0521e-02, 3.3850e-03, -2.7892e-02, 9.4246e-02, 1.2712e-01,
         5.6794e-02, 1.9702e-01, 3.0102e-02, 8.1020e-02, 2.2443e-03,
         -3.6303e-02, -9.9930e-03, -7.2356e-03, -6.6374e-03, 7.2554e-02,
         -1.0639e-02, -8.9164e-02, -7.6698e-02, -7.3221e-02, -2.6325e-02,
         1.5297e-02, -6.1099e-03, -1.6564e-02, 1.1742e-03, -7.7687e-03,
         -4.0734e-02, 3.5347e-02, -8.9857e-03, -1.0205e-02, -3.5139e-02,
         8.7736e-03, -2.6164e-02, -7.4057e-04, 6.9800e-02, 5.1630e-02,
         8.2260e-02, -4.3334e-02, 9.5439e-02, 3.8949e-02, 2.7576e-02,
         -2.7300e-02, -1.9236e-02, 1.3960e-02, -9.1715e-02, -8.0246e-02,
         1.6001e-01, -1.4912e-01, -1.1418e-01, -1.3520e-01, 5.8030e-02,
         1.8183e-01, -3.1726e-02, -7.4795e-02, -5.3430e-02, -4.1667e-02,
         2.4433e-02, -1.5640e-02, -2.0981e-02, 4.8331e-03, -2.2744e-02,
         2.1778e-02, -9.1474e-03, -2.7065e-02, -1.3960e-03, 3.1320e-02,
         2.4609e-02, 2.7434e-02, 1.4061e-02, -3.9493e-03, 1.7370e-02,
         5.4428e-03, 4.9994e-03, 1.1100e-02, 1.3571e-02, 2.6117e-03,
         3.6254e-03, 1.2581e-02, 2.2057e-02, -1.5871e-02, 1.3411e-02,
         -1.6218e-02, -4.9300e-02, -4.8487e-02, -6.6901e-02, -1.9708e-02,
         -3.6207e-02, 2.7848e-02, 3.3245e-02, -2.5913e-02, 4.8864e-02,
         1.7982e-02, 7.2035e-02, 9.8399e-03, -1.2854e-01, 1.2498e-01,
         2.5496e-01, 4.8815e-01, 1.2856e-02, 2.7124e-02, -1.1177e-01,
         -6.9739e-02, -7.9357e-02, -1.3767e-01, -3.4607e-02, -9.0663e-02,
         2.0239e-03, 6.8687e-02, -2.8339e-02, -2.3041e-02, 7.7071e-03,
         -4.1781e-02, 3.0516e-02, 3.4045e-02, 5.5087e-02, 5.4454e-02,
         1.6309e-02, 1.5335e-03, 1.3867e-02, 1.8400e-02, 3.6903e-03,
         2.1292e-02, 3.8298e-02, -3.4507e-02, 2.0960e-03, 3.4506e-03,
         1.3975e-02, -2.4490e-02, 2.9441e-02, -2.5951e-02, 1.5139e-02,
         -4.7242e-02, -1.0273e-01, 8.0461e-03, -6.2661e-02, 2.7466e-02,
         -4.3963e-03, -4.4565e-02, 1.3144e-02, -7.3661e-02, 5.3355e-02,
         -3.5869e-03, -5.7825e-02, 1.8184e-01, 3.0521e-01, 4.2624e-01
     ])
     self.x = None
     self.y = None
     self.loss = lambda w: loss(w, self.x, self.y)
     self.grad = lambda w: grad(w, self.x, self.y)
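These wrappers bind data at call time: once self.x and self.y are set, self.loss and self.grad become plain functions of w. A minimal sketch of how the closures could drive gradient descent (the loop below is an illustration, not part of the original):

# Hypothetical helper: `model` is an instance of the wrapper class above;
# loss/grad are supplied by the source module and are assumed to return a
# scalar and a gradient array of the same shape as w, respectively.
def fit_weights(model, x, y, lr=0.01, steps=100):
    model.x, model.y = x, y          # bind the data the closures capture
    w = model.w.copy()
    for _ in range(steps):
        w -= lr * model.grad(w)      # plain gradient-descent step
    model.w = w
    return model.loss(w)             # final training loss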
Example #3
# `simple_proc_for_tree_algoritms` is a preprocessing-pipeline helper defined
# elsewhere in the source project.
from scipy.stats import randint, uniform
from sklearn.pipeline import Pipeline
import xgboost.sklearn as xgbsk


def _xgboost_gridsearch_model(
    task,
    numeric_features,
    categoric_features,
    learning_rate,
    use_dask,
    n_iter,
    scoring,
):
    # scipy.stats: randint(a, b) draws integers from [a, b); uniform(loc, scale)
    # draws floats from [loc, loc + scale].
    param_space = {
        'clf__max_depth': randint(2, 11),
        'clf__min_child_weight': randint(1, 11),
        'clf__subsample': uniform(0.5, 0.5),
        'clf__colsample_bytree': uniform(0.5, 0.5),
        'clf__colsample_bylevel': uniform(0.5, 0.5),
        'clf__gamma': uniform(0, 1),
        'clf__reg_alpha': uniform(0, 1),
        'clf__reg_lambda': uniform(0, 10),
        'clf__base_score': uniform(0.1, 0.9),
        'clf__scale_pos_weight': uniform(0.1, 9.9),
    }

    model = (xgbsk.XGBClassifier(learning_rate=learning_rate)
             if task == 'classification' else xgbsk.XGBRegressor(
                 learning_rate=learning_rate))

    pipe = Pipeline([
        (
            'preprocessing',
            simple_proc_for_tree_algoritms(numeric_features,
                                           categoric_features),
        ),
        ('clf', model),
    ])

    # Only the RandomizedSearchCV implementation differs between the two backends.
    if use_dask:
        from dask_ml.model_selection import RandomizedSearchCV
    else:
        from sklearn.model_selection import RandomizedSearchCV

    return RandomizedSearchCV(pipe,
                              param_space,
                              n_iter=n_iter,
                              scoring=scoring,
                              cv=5)
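A hypothetical call to the factory above (feature names, data, and scoring choice are illustrative, not from the original):

search = _xgboost_gridsearch_model(
    task='regression',
    numeric_features=['age', 'income'],      # assumed column names
    categoric_features=['region'],
    learning_rate=0.1,
    use_dask=False,
    n_iter=20,
    scoring='neg_mean_squared_error',
)
search.fit(X_train, y_train)                  # training data assumed to exist
print(search.best_params_)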
Example #4
# Constructor-only excerpt; note that xgboost later renamed the
# 'reg:linear' objective to 'reg:squarederror'.
def __init__(self):
     self.clf = sklearn.XGBRegressor(max_depth=3,
                                     learning_rate=0.1,
                                     n_estimators=200,
                                     silent=True,
                                     objective='reg:linear',
                                     gamma=0,
                                     min_child_weight=1,
                                     max_delta_step=0,
                                     subsample=1,
                                     colsample_bytree=1,
                                     colsample_bylevel=0.25,
                                     reg_alpha=0,
                                     reg_lambda=0.5,
                                     scale_pos_weight=1,
                                     base_score=0.5,
                                     seed=0,
                                     missing=None)
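Since the wrapper just configures an XGBRegressor, it follows the usual scikit-learn fit/predict API; a hypothetical usage (the enclosing class name `Model` is assumed):

wrapper = Model()                      # hypothetical name of the enclosing class
wrapper.clf.fit(X_train, y_train)      # training data assumed to exist
preds = wrapper.clf.predict(X_test)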
Example #5
# `gen_dataset`, `train`, and `save_model` are helpers defined elsewhere in
# the source project.
import lightgbm as lgbm
import xgboost as xgb


def train_save(pred_period=20, is_high=True, is_clf=False):

    data = gen_dataset(is_high=is_high, is_clf=is_clf, pred_period=pred_period)

    if is_clf:
        _, y_train = data["train"]
        # Offset class imbalance with the negative-to-positive ratio.
        scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)

    if not is_clf:
        models = [
            lgbm.LGBMRegressor(n_estimators=300,
                               num_leaves=100,
                               max_depth=8,
                               random_state=0),
            xgb.XGBRegressor(n_estimators=300, max_depth=5, random_state=0)
        ]
    else:
        models = [
            lgbm.LGBMClassifier(n_estimators=300,
                                scale_pos_weight=0.1,
                                num_leaves=100,
                                max_depth=8,
                                random_state=0),
            xgb.XGBClassifier(
                n_estimators=300,
                scale_pos_weight=0.1,
                max_depth=5,
                random_state=0,
            )
        ]
    y_pred_list = train(data, models, is_clf=is_clf)

    # save model
    for model in models:
        save_model(model, pred_period, is_high)

    return y_pred_list
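A hypothetical invocation, assuming the project's helpers are importable:

y_preds = train_save(pred_period=20, is_high=True, is_clf=True)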
Example #6
# Tail of a truncated training helper from the original snippet:
#     smape = metric(pred, test_y)
#     return model, smape

import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost import sklearn  # assumed alias: xgboost.sklearn exposes XGBRegressor

# param = {"learning_rate": 0.1, "n_estimators": 1000, "max_depth": 5,
#          "min_child_weight": 1, "gamma": 0, "subsample": 1, "colsample_bytree": 1,
#          "objective": 'reg:linear', "nthread": 4, "scale_pos_weight": 1, "seed": 27}
param = {
    "learning_rate": 0.8,
    "gamma": 0,
    "subsample": 1,
    "colsample_bytree": 1,
    "max_depth": 5,
    "objective": 'reg:linear',
    "seed": 27
}
model = sklearn.XGBRegressor(**param)
param_cv_1 = {"learning_rate": [round(0.01 * i, 2) for i in range(1, 16)]}
param_cv_2 = {"n_estimators": [int(x) for x in np.linspace(100, 2000, 20)]}
param_cv_3 = {"max_depth": [3, 4, 5, 6, 7, 8]}
param_cv_4 = {"min_child_weight": [round(0.5 + 0.1 * i, 1) for i in range(11)]}
param_cv_5 = {"scale_pos_weight": [round(0.5 + 0.1 * i, 1) for i in range(11)]}

def choose_best_param(model, param_cv, data_x, data_y):
    # The metric belongs in `scoring`; GridSearchCV's `error_score` is only the
    # value assigned when a fit fails. SMAPE is assumed to be a scorer defined
    # elsewhere (e.g. built with sklearn.metrics.make_scorer).
    clf = GridSearchCV(estimator=model, param_grid=param_cv, scoring=SMAPE)
    clf.fit(data_x, data_y)
    return clf.best_params_


param_all = [param_cv_1, param_cv_2, param_cv_3, param_cv_4, param_cv_5]
best_param = param.copy()
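The snippet stops before the tuning loop these definitions imply; a minimal sketch of it (data_x and data_y are assumed to hold the training set):

# Hypothetical stage-wise tuning loop implied by param_all/best_param above.
for param_cv in param_all:
    model = sklearn.XGBRegressor(**best_param)
    best_param.update(choose_best_param(model, param_cv, data_x, data_y))
print(best_param)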
Example #7

import time
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.ensemble import (GradientBoostingRegressor, AdaBoostRegressor,
                              RandomForestRegressor)
from sklearn.model_selection import train_test_split

# load_boston was removed in scikit-learn 1.2; this snippet targets older releases.
boston = load_boston()
X = boston.data
y = boston.target

# Make a validation set
X_train, X_validation, y_train, y_validation = train_test_split(X,
                                                                y,
                                                                random_state=1848)
# scikit-learn's out-of-the-box gradient boosting implementation
sklearn_boost = GradientBoostingRegressor(random_state=1849)
t1 = time.time()
sklearn_boost.fit(X_train, y_train.ravel())
print('Training Error: {:.3f}'.format(1 - sklearn_boost.score(X_train,
                                                              y_train)))
print('Validation Error: {:.3f}'.format(1 - sklearn_boost.score(X_validation,
                                                                y_validation)))
# %timeit sklearn_boost.fit(X_train, y_train.ravel())  # IPython magic for timing this statement
# XGBoost
xgb_boost = xgb.XGBRegressor(seed=1850)
xgb_boost.fit(X_train, y_train.ravel())
print('Training Error: {:.3f}'.format(1 - xgb_boost.score(X_train,
                                                          y_train)))
print('Validation Error: {:.3f}'.format(1 - xgb_boost.score(X_validation,
                                                            y_validation)))
# %timeit xgb_boost.fit(X_train, y_train.ravel())
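On scikit-learn 1.2+, where load_boston is gone, the same comparison can be run on another built-in regression dataset; a sketch using fetch_california_housing (a substitution, not from the original):

# Swap in the California housing data on modern scikit-learn.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
X, y = housing.data, housing.target
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, random_state=1848)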
Example #8
import pandas as pd
import xgboost as xgb
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# Inspect data: X_test is a pandas DataFrame defined earlier in the source.
percent_missing = X_test.isnull().sum() * 100 / len(X_test)
missing_value_df = pd.DataFrame({
    'column_name': X_test.columns,
    'percent_missing': percent_missing
})
missing_value_df.sort_values('percent_missing', inplace=True)
# The columns are missing 3%-10% of their values

# Create pipeline
pipe = Pipeline([
    # the scale stage is populated by the param_grid
    ('impute', SimpleImputer()),
    ('scale', 'passthrough'),
    ('selection', SelectKBest(f_regression)),
    ('estimation', xgb.XGBRegressor())
])

# Specify parameters to be searched over
param_grid = [{
    'scale': [RobustScaler()],  # StandardScaler(),Normalizer()
    'impute__strategy': ['mean'],  # , 'median'
    'selection__k': [100],
    'estimation__max_depth': [5],
    'estimation__min_child_weight': [1],
    'estimation__gamma': [0],
    'estimation__subsample': [0.8],
    'estimation__colsample_bytree': [0.8]
}]
# Gridsearch
search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, scoring='r2')
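A hypothetical follow-up, assuming X_train and y_train were prepared earlier in the source:

search.fit(X_train, y_train)
print(search.best_score_)    # mean cross-validated R^2 of the best pipeline
print(search.best_params_)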