Example no. 1
    def fit_xgboost_regression(self):
        if self.X_val is not None:
            # Fold the validation split back into the training data before the search.
            X_train_aux = pd.concat([pd.DataFrame(self.X_train), pd.DataFrame(self.X_val)])
            y_train_aux = pd.concat([pd.Series(self.y_train), pd.Series(self.y_val)],
                                    ignore_index=True)
        else:
            X_train_aux = self.X_train
            y_train_aux = self.y_train

        xgbreg = XGBRegressor(n_jobs=-1)  # the original passed `nthreads`, which XGBoost ignores
        params = {
            "max_depth": [i for i in range(5, 55, 5)],
            "learning_rate": [0.001, 0.01, 0.1],
            "gamma": [i for i in range(1, 20)],
            "n_estimators": [i * 10 for i in range(5, 55, 5)]
        }
        self.gs_xgboost = RandomizedSearchCV(xgbreg, params, n_jobs=-1, verbose=2)
        self.gs_xgboost.fit(X_train_aux, y_train_aux)

        # best_estimator_ is already refit on the full data, so no second fit is needed
        self.xgboost_reg_model = self.gs_xgboost.best_estimator_

        return self.xgboost_reg_model
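
A brief usage sketch for the method above, hypothetical since the surrounding class is not shown (the attribute names follow the snippet; the rest is illustrative):

# `reg` stands for an instance of the (unshown) host class.
model = reg.fit_xgboost_regression()
print(reg.gs_xgboost.best_params_)    # combination chosen by RandomizedSearchCV
y_hat = model.predict(reg.X_train)    # best_estimator_ is a fitted XGBRegressor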
Example no. 2
def generate_XGB_model(train_df):
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print('Train And Fix Missing App Count Value...')
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    # print('Train And Fix Missing Age Value...')
    # train_df, xgb_age = train_model_for_age(train_df)
    # joblib.dump(xgb_age, 'XGB_age.model')
    train_df.drop(['marriageStatus', 'haveBaby', 'sitesetID', 'positionType'],
                  axis=1,
                  inplace=True)
    print('Done')
    print(train_df.info())
    print(train_df.describe())
    print(train_df.isnull().sum())
    train_np = train_df.to_numpy()  # as_matrix() was removed in pandas 1.0
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Train Xgboost Model...')
    start_time = datetime.datetime.now()
    xgb_clf = XGBRegressor(n_estimators=100,
                           max_depth=6,
                           objective="binary:logistic")
    xgb_clf.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: %d' % (end_time - start_time).seconds)
    model_df = pd.DataFrame({
        'columns': list(train_df.columns)[1:],
        'values': xgb_clf.feature_importances_
    })
    print(model_df)
    return xgb_clf
Example no. 3
def FI_xgb_sklearn():
    X, y = load_traindata(encodetype='le')
    cols = list(X.columns)

    rndcol = np.random.randn(X.shape[0])
    X = np.column_stack((X, rndcol))
    cols.append('random')

    xgb1 = XGBRegressor(learning_rate=0.01,
                        n_estimators=3320,
                        max_depth=3,
                        min_child_weight=4,
                        colsample_bytree=0.8,
                        subsample=0.8,
                        importance_type='total_gain',
                        objective='reg:linear',
                        n_jobs=-1,
                        # the deprecated `seed` and `silent` arguments are
                        # dropped; random_state controls the RNG
                        random_state=0)

    xgb1.fit(X, y)

    imp = sorted(list(zip(cols, xgb1.feature_importances_)),
                 key=lambda t: abs(t[1]),
                 reverse=True)
    imp = pd.DataFrame(imp, columns=['Feature', 'Importance'])
    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
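
The random column above acts as a noise probe: any feature ranked at or below it is indistinguishable from noise. A minimal follow-up sketch (the selection step is an assumption, not part of the original):

imp = FI_xgb_sklearn()
rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
selected = imp['Feature'].iloc[:rnd_idx].tolist()  # features that outrank the probe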
Example no. 4
    def skl_cv(self):
        logging.info("{0}: running grid search".format(self.now_time()))
        if self.model == 'C':
            grid_search = GridSearchCV(estimator=self.rf,
                                       param_grid=self.cv_param,
                                       scoring='accuracy')
            grid_search.fit(self.X_train, self.Y_train)
            logging.info("{0}: best parameters: {1}".format(
                self.now_time(), grid_search.best_params_))
            logging.info("{0}: best accuracy: {1}".format(
                self.now_time(), grid_search.best_score_))
            self.rf = XGBClassifier(
                n_estimators=grid_search.best_params_['n_estimators'],
                max_depth=grid_search.best_params_['max_depth'],
                min_child_weight=grid_search.best_params_['min_child_weight'],
                gamma=grid_search.best_params_['gamma'],
                learning_rate=grid_search.best_params_['learning_rate'])

        elif self.model == 'R':
            grid_search = GridSearchCV(estimator=self.rf,
                                       param_grid=self.cv_param,
                                       scoring='neg_mean_absolute_error')

            grid_search.fit(self.X_train, self.Y_train)
            logging.info("{0}: best parameters: {1}".format(
                self.now_time(), grid_search.best_params_))
            # the original log line said "R squared", but the scorer is negative MAE
            logging.info("{0}: best score (neg MAE): {1}".format(
                self.now_time(), grid_search.best_score_))
            self.rf = XGBRegressor(
                n_estimators=grid_search.best_params_['n_estimators'],
                max_depth=grid_search.best_params_['max_depth'],
                min_child_weight=grid_search.best_params_['min_child_weight'],
                gamma=grid_search.best_params_['gamma'],
                learning_rate=grid_search.best_params_['learning_rate'])
Example no. 5
    def fit_model(self, data, target, test):
        clf = XGBRegressor(learning_rate=self.learning_rate,
                           n_estimators=self.n_estimators,
                           max_depth=self.max_depth,
                           min_child_weight=self.min_child_weight,
                           gamma=self.gamma,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           objective=self.objective,
                           nthread=self.nthread,
                           scale_pos_weight=self.scale_pos_weight,
                           reg_alpha=self.reg_alpha,
                           reg_lambda=self.reg_lambda,
                           seed=self.seed)
        data = np.array(data).astype(float)
        scaler = MinMaxScaler()
        data = scaler.fit_transform(data)
        test = scaler.transform(test)
        # scale the target with its own scaler (the original refit the feature
        # scaler on the target); sklearn expects a 2-D array here
        target_scaler = MinMaxScaler()
        target = target_scaler.fit_transform(np.asarray(target).reshape(-1, 1)).ravel()

        clf.fit(data, target)
        # leaf indices from the trained booster become extra features
        new_feature = clf.apply(data)
        new_test = clf.apply(test)
        X_train_new = pd.DataFrame(self.mergeToOne(pd.DataFrame(data), new_feature))
        X_test_new = pd.DataFrame(self.mergeToOne(pd.DataFrame(test), new_test))
        return X_train_new, target, X_test_new
Example no. 6
    def __init__(self,
                 mean_model_params=None,
                 upper_quantile_params=None,
                 lower_quantile_params=None):
        # None defaults avoid sharing mutable dicts between instances
        mean_model_params = dict(mean_model_params or {})
        upper_quantile_params = upper_quantile_params or {
            'alpha': 0.95, 'delta': 1.0, 'thresh': 1.0, 'variance': 1.0
        }
        lower_quantile_params = lower_quantile_params or {
            'alpha': 0.05, 'delta': 1.0, 'thresh': 1.0, 'variance': 1.0
        }
        self.mean_model_params = mean_model_params
        self.upper_quantile_params = upper_quantile_params
        self.lower_quantile_params = lower_quantile_params
        self.gb = XGBRegressor(**mean_model_params)
        mean_model_params.pop('alpha', None)  # 'alpha' belongs to the quantile models only
        upper_quantile_params_combined = {**mean_model_params, **upper_quantile_params}
        lower_quantile_params_combined = {**mean_model_params, **lower_quantile_params}
        self.gb_quantile_upper = XGBQuantileRegressor(
            **upper_quantile_params_combined)
        self.gb_quantile_lower = XGBQuantileRegressor(
            **lower_quantile_params_combined)
        self.upper_alpha = upper_quantile_params['alpha']
        self.lower_alpha = lower_quantile_params['alpha']
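
A hypothetical usage sketch for the wrapper above. XGBQuantileRegressor is a custom class from the source project, and the fit/predict interface is assumed to mirror scikit-learn:

ens = QuantileGB()  # hypothetical name for the class defining __init__ above
for m in (ens.gb, ens.gb_quantile_lower, ens.gb_quantile_upper):
    m.fit(X_train, y_train)
y_mean = ens.gb.predict(X_test)
y_low = ens.gb_quantile_lower.predict(X_test)   # ~5th percentile (alpha=0.05)
y_high = ens.gb_quantile_upper.predict(X_test)  # ~95th percentile (alpha=0.95)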
Example no. 7
File: base.py Project: sbkhosh/dax
def set_grid_search(regrs, X_train, y_train, reg):
    if regrs == 'tree':
        random_grid = build_grid_tree()
        prms = grid_search(reg, X_train, y_train, random_grid)
        reg_prms = DecisionTreeRegressor(max_features=prms['max_features'],
                                         max_depth=prms['max_depth'],
                                         min_samples_split=prms['min_samples_split'],
                                         max_leaf_nodes=prms['max_leaf_nodes'],
                                         min_samples_leaf=prms['min_samples_leaf'])
    elif regrs == 'forest':
        random_grid = build_grid_rf()
        prms = grid_search(reg, X_train, y_train, random_grid)
        reg_prms = RandomForestRegressor(n_estimators=prms['n_estimators'],
                                         max_features=prms['max_features'],
                                         max_depth=prms['max_depth'],
                                         min_samples_split=prms['min_samples_split'],
                                         min_samples_leaf=prms['min_samples_leaf'],
                                         n_jobs=-1)
    elif regrs == 'xgbr':
        random_grid = build_grid_xgbr()
        prms = grid_search(reg, X_train, y_train, random_grid)
        reg_prms = XGBRegressor(learning_rate=prms['learning_rate'],
                                max_depth=prms['max_depth'],
                                min_child_weight=prms['min_child_weight'],
                                n_estimators=prms['n_estimators'],
                                subsample=prms['subsample'],
                                n_jobs=-1)
    elif regrs == 'nn':
        random_grid = build_grid_nn()
        prms = grid_search(reg, X_train, y_train, random_grid)
        reg_prms = MLPRegressor(hidden_layer_sizes=prms['hidden_layer_sizes'],
                                activation=prms['activation'],
                                solver=prms['solver'],
                                alpha=prms['alpha'],
                                learning_rate_init=prms['learning_rate_init'],
                                learning_rate=prms['learning_rate'],
                                max_iter=prms['max_iter'],
                                tol=prms['tol'],
                                momentum=prms['momentum'],
                                beta_1=prms['beta_1'],
                                beta_2=prms['beta_2'],
                                n_iter_no_change=prms['n_iter_no_change'])
    else:
        raise ValueError("unknown regressor type: %s" % regrs)
    return reg_prms
Example no. 8
    def train(self, X_train, X_test, y_train, y_test):
        '''
        Trains the XGBoost model on the given train/test split.
        The fitted model is pickled to generated/gxboost_model.pickle,
        and evaluation metrics are reported via self.evaluate.
        :param X_train, X_test, y_train, y_test: train/test split
        :return: None
        '''
        print('Training is starting...')
        eval_set = [(X_train, y_train), (X_test, y_test)]

        self.model = XGBRegressor(max_depth=7,
                                  objective='reg:squarederror',
                                  gamma=0,
                                  learning_rate=0.03,
                                  subsample=1,
                                  colsample_bytree=0.9,
                                  min_child_weight=10)

        self.model.fit(X_train,
                       y_train,
                       eval_set=eval_set,
                       eval_metric="rmse",
                       early_stopping_rounds=500)

        with open('generated/gxboost_model.pickle', 'wb') as file:
            pickle.dump(self.model, file)

        self.evaluate(y_test, X_test)
Example no. 9
    def grid_search(self, X_train, X_test, y_train, y_test):
        grid_param = {
            'max_depth': [n for n in range(2, 10)],
            'gamma': np.arange(0, 0.5, 0.1),
            'learning_rate': [0.0001, 0.001, 0.01, 0.1],
            'subsample': np.arange(0.5, 0.9, 0.1),
            'colsample_bytree': np.arange(0.5, 0.9, 0.1),
            'min_child_weight': [1, 3, 5, 7]
        }

        model = XGBRegressor(max_depth=7,
                             objective='reg:squarederror',
                             gamma=0,
                             learning_rate=0.03,
                             subsample=1,
                             colsample_bytree=0.9,
                             min_child_weight=10)

        gd_sr = GridSearchCV(estimator=model,
                             param_grid=grid_param,
                             scoring='neg_mean_squared_error',
                             cv=5,
                             n_jobs=-1)

        gd_sr.fit(X_train, y_train)

        best_parameters = gd_sr.best_params_
        print(best_parameters)
Example no. 10
def xgbregressor(x_train, y_train, x_test):

    xgb_reg = XGBRegressor()
    parameters = {'nthread': [4],
                  'objective': ['reg:squarederror'],  # 'reg:linear' is deprecated
                  'learning_rate': [.07],
                  'max_depth': [7],
                  'min_child_weight': [4],
                  'subsample': [0.7],
                  'colsample_bytree': [0.7],
                  'n_estimators': [12]}

    clf_xgbreg = GridSearchCV(xgb_reg,
                              parameters,
                              n_jobs=5,
                              cv=2,
                              verbose=True)

    clf_xgbreg.fit(x_train, y_train)

    # print(clf_xgbreg.best_params_)

    preds = clf_xgbreg.predict(x_train)  # the original called an undefined clf_RF here

    y_test_pred = clf_xgbreg.predict(x_test)

    print(y_test_pred)

    print(pd.DataFrame(y_test_pred).describe())

    return preds
Example no. 11
def over_sample(train, test, feat):
    predictors = [x for x in train.columns if x not in ['ID', 'y']]
    groups = list(train[feat].unique())
    result = None
    for name in groups:
        train_temp = pd.concat([train, train[train[feat] == name]])
        test_temp = test[test[feat] == name]
        model = XGBRegressor(max_depth=4,
                             learning_rate=0.0045,
                             n_estimators=1250,
                             silent=True,
                             objective='reg:linear',
                             nthread=-1,
                             min_child_weight=1,
                             max_delta_step=0,
                             subsample=0.93,
                             seed=27)
        model.fit(train_temp[predictors], train_temp['y'])
        pred = model.predict(test_temp[predictors])
        if result is None:
            result = pd.DataFrame({'ID': test_temp['ID'].values, 'y': pred})
        else:
            result = pd.concat([
                result,
                pd.DataFrame({
                    'ID': test_temp['ID'].values,
                    'y': pred
                })
            ])
    result.sort_values('ID', inplace=True)

    return result
Example no. 12
def get_estimator():

    drop_cols = [
        'CODGEO', 'LIBGEO', 'REG', 'DEP', 'Code Nuance', 'Code du département'
    ]
    base_cols = [
        'Orientation Economique', 'SEG Croissance POP', 'Urbanité Ruralité',
        'Dynamique Démographique BV', 'Environnement Démographique',
        'Fidélité', 'SYN MEDICAL', 'Seg Dyn Entre',
        'SEG Environnement Démographique Obsolète', 'Seg Cap Fiscale',
        'DYN SetC', 'CP', 'MED14', 'Nb Femme', 'Nb Homme'
    ]

    base_transformer = FunctionTransformer(_preprocessor, validate=False)

    base_transformer = make_pipeline(base_transformer,
                                     SimpleImputer(strategy='most_frequent'))

    preprocessor = ColumnTransformer(
        transformers=[
            ('base', base_transformer, base_cols),
            ('drop cols', 'drop', drop_cols),
        ],
        remainder='passthrough')  # remainder='drop' or 'passthrough'

    regressor = XGBRegressor()

    pipeline = Pipeline(steps=[('preprocessing',
                                preprocessor), ('Regressor', regressor)])

    return pipeline
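
A minimal usage sketch for the pipeline above, with X a DataFrame containing the listed columns and y the target (both placeholders):

pipeline = get_estimator()
pipeline.fit(X, y)            # imputation, column dropping and boosting in one call
y_pred = pipeline.predict(X)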
Example no. 13
    def model_xgb_search(self, X, Y):
        # train_x, valid_x, train_y, valid_y = train_test_split(X, Y, test_size=0.1, random_state=0)  # split into train and validation sets
        print('model_xgb_search start')
        xgb_model = XGBRegressor()

        # cv_split = ShuffleSplit(n_splits=5, train_size=0.7, test_size=0.2)
        # param_grid = dict(
        #     max_depth=[2],
        #     min_child_weight=[1, 2, 3, 4, 5, 6],
        #     gamma=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
        #     learning_rate=np.linspace(0.03, 1, 10),
        #     n_estimators=[50, 100, 200, 400],
        #     num_class=[2],
        #     objective=['multi:softmax']
        # )
        param_grid = dict(
            max_depth=[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],  # best found: 3
            learning_rate=np.linspace(0.03, 0.1, 5),
            n_estimators=[100, 200, 300],  # best found: 200
        )
        start = time.time()
        # KFold rather than StratifiedKFold: stratification needs discrete
        # class labels, which a regression target does not have
        cv_split = KFold(n_splits=5, shuffle=True)
        grid = GridSearchCV(xgb_model, param_grid, cv=cv_split)
        grid_result = grid.fit(X, Y)
        print("Best: %f using params: %s estimator: %s" % (
            grid_result.best_score_, grid_result.best_params_, grid_result.best_estimator_))
        print('GridSearchCV process use %.2f seconds' % (time.time() - start))
        print("Save model to " + self.model_path)
        dump(grid_result, self.model_path)
        print('end=======')
Example no. 14
    def xgboostmodel(self):
        df = pd.read_csv(datafile, encoding='utf-8', index_col=0)
        print(df.shape)
        traindata = df.iloc[:, :].values
        x = traindata[:, :-1]
        y = traindata[:, -1]
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)
        if self.params is None:
            params = {'max_depth': 80, 'n_estimators': 512, 'learning_rate': 0.01}
        else:
            params = self.params
        # the original built `params` but then hard-coded different values
        # (max_depth=128, n_estimators=768) and passed an invalid `silence`
        # keyword; here the dict is actually used
        raw_model = XGBRegressor(**params)
        raw_model.fit(x_train, y_train)
        raw_model.save_model(self.model_file)
        pred = raw_model.predict(x_test)
        self.true = y_test
        self.pred = pred
        self.show_save_figure(fig_path=self.fig_path,
                              modelname=self.job_name,
                              detal_idx=500)
        t_mean = self.cal_mean(self.true)
        p_mean = self.cal_mean(self.pred)
        self.save_result(self.result_path, true_mean=t_mean, pred_mean=p_mean)
Example no. 15
    def fit_model_split(self, X_train, y_train, X_test, y_test):
        # X_train_1 trains the booster; X_train_2 is merged with the booster's
        # leaf-index features to form the new training set
        X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
            X_train, y_train, test_size=0.6, random_state=0)
        clf = XGBRegressor(learning_rate=self.learning_rate,
                           n_estimators=self.n_estimators,
                           max_depth=self.max_depth,
                           min_child_weight=self.min_child_weight,
                           gamma=self.gamma,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           objective=self.objective,
                           nthread=self.nthread,
                           scale_pos_weight=self.scale_pos_weight,
                           reg_alpha=self.reg_alpha,
                           reg_lambda=self.reg_lambda,
                           seed=self.seed)
        clf.fit(X_train_1, y_train_1)
        new_feature = clf.apply(X_train_2)
        X_train_new2 = self.mergeToOne(X_train_2, new_feature)
        new_feature_test = clf.apply(X_test)
        X_test_new = self.mergeToOne(X_test, new_feature_test)
        print("Training set is 40% smaller than before")
        return X_train_new2, y_train_2, X_test_new, y_test
Example no. 16
def model_intrv3(Y_train, X_train, Y_test, X_test, Targ):
    global reslts
    global metrs
    import pandas as pd
    import numpy as np
    from sklearn.metrics import mean_squared_error
    from xgboost.sklearn import XGBRegressor
    # `loss='ls'` in the original is a sklearn GradientBoosting argument,
    # not an XGBoost one, so it is dropped here
    model = XGBRegressor(n_estimators=200,
                         learning_rate=0.05,
                         max_depth=4,
                         random_state=0,
                         subsample=0.9,
                         colsample_bytree=1.0).fit(X_train, Y_train)
    print(model.score(X_test, Y_test))  # R^2 on the test set

    pred_Yxgb = model.predict(X_test)
    mse = mean_squared_error(Y_test, pred_Yxgb)
    nRMSE = np.sqrt(mse) / Targ.mean()
    # nRMSE = np.sqrt(mse) / max(Targ)
    Yts_pd = pd.DataFrame({'Yts': Y_test, 'Ypd': pred_Yxgb})
    print(mse, nRMSE)
    metrs = {'mse': mse, 'nRMSE': nRMSE}
    reslts = {'Ypred': pred_Yxgb, 'Yts_pd': Yts_pd}
    return {'Yts_pd': Yts_pd, 'mse': mse, 'nRMSE': nRMSE}
Example no. 17
    def fit_model(self, X_train, y_train, X_test, y_test):
        clf = XGBRegressor(learning_rate=self.learning_rate,
                           n_estimators=self.n_estimators,
                           max_depth=self.max_depth,
                           min_child_weight=self.min_child_weight,
                           gamma=self.gamma,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           objective=self.objective,
                           nthread=self.nthread,
                           scale_pos_weight=self.scale_pos_weight,
                           reg_alpha=self.reg_alpha,
                           reg_lambda=self.reg_lambda,
                           seed=self.seed)
        clf.fit(X_train, y_train)
        # append the booster's leaf indices as extra features
        new_feature = clf.apply(X_train)
        X_train_new = self.mergeToOne(X_train, new_feature)
        new_feature_test = clf.apply(X_test)
        X_test_new = self.mergeToOne(X_test, new_feature_test)
        print("Training set sample number remains the same")
        return X_train_new, y_train, X_test_new, y_test
Example no. 18
def train_xgb(df_preprocessed, df_target):
    """
    Train an XGBoost Regressor on the data
    :param df_preprocessed: features
    :param df_target: target
    :return: a tuple of best estimator and best estimator score
    """
    xgb_reg = XGBRegressor(
        nthread=4,
        objective='reg:linear',
        learning_rate=0.02,  # so called `eta` value
        max_depth=10,
        min_child_weight=1,
        gamma=3,
        subsample=1.0,
        colsample_bytree=0.35)

    param_grid = {'n_estimators': [1000]}

    gridsearch_xgb = GridSearchCV(xgb_reg,
                                  param_grid,
                                  cv=3,
                                  verbose=1,
                                  n_jobs=-1,
                                  scoring='neg_mean_squared_error')
    gridsearch_xgb.fit(df_preprocessed, df_target)

    # save the model to disk
    # xgb_filename = r'models\xgboost_model.sav'
    # pickle.dump(gridsearch_xgb, open(xgb_filename, 'wb'))
    print(np.sqrt(-gridsearch_xgb.best_score_))

    return gridsearch_xgb.best_estimator_, np.sqrt(-gridsearch_xgb.best_score_)
Example no. 19
    def def_model(self, parameters: dict = None):
        model = XGBRegressor()

        if parameters is not None:
            model.set_params(**parameters)

        self._model = model
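
For reference, set_params is equivalent to passing the same keyword arguments to the constructor, which is what the pattern above relies on; a short illustrative sketch:

model = XGBRegressor()
model.set_params(n_estimators=300, max_depth=4)  # same effect as XGBRegressor(n_estimators=300, max_depth=4)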
Example no. 20
def XGB_reg_evaluation(individual, evaluation_method='roll_win'):
    '''
    evaluation_method : can be 'roll_win' or 'mse'
    '''

    if evaluation_method == 'roll_win':
        trainNumber = individual[6]  # number of training samples
        param = {
            'eta': individual[0],
            'silent': True,
            'objective': "reg:linear",
            'nthread': -1,
            'min_child_weight': individual[1],
            'max_depth': individual[2],
            'subsample': individual[3],
            'colsample_bylevel': individual[4],
            'seed': 0
        }
        roll_win_mseValue = 0
        for i in range(N_validation):  # xrange in the original (Python 2)
            trainingX = trainX[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window), :]
            trainingY = trainY[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window)]
            testingX = trainX[(trainNum - (i + 1) * window):(trainNum - i * window), :]
            testingY = trainY[(trainNum - (i + 1) * window):(trainNum - i * window)]
            dtrain = xgb.DMatrix(data=trainingX, label=trainingY)
            bst = xgb.train(params=param,
                            dtrain=dtrain,
                            num_boost_round=individual[5])
            testingX = xgb.DMatrix(testingX)
            roll_win_mseValue += sum(
                (testingY - bst.predict(testingX))**2) / window
        roll_win_mseValue /= N_validation
        return (roll_win_mseValue, )

    if evaluation_method == 'mse':
        # cross-validation evaluation
        kf = KFold(n_splits=N_splits)
        cv_mseValue = 0
        fc = XGBRegressor(learning_rate=individual[0],
                          n_estimators=individual[5],
                          silent=True,
                          objective="reg:linear",
                          nthread=-1,
                          gamma=0,
                          min_child_weight=individual[1],
                          max_depth=individual[2],
                          subsample=individual[3],
                          colsample_bylevel=individual[4],
                          seed=0)
        for train, test in kf.split(trainX):
            fc.fit(trainX[train, :], trainY[train])
            cv_mseValue += sum(
                (trainY[test] - fc.predict(trainX[test, :]))**2) / len(test)
        cv_mseValue = cv_mseValue / N_splits
        return (cv_mseValue, )

    raise ValueError("There is no evaluation method for %s" % evaluation_method)
Example no. 21
    def xgboost_single_pred(self):

        x_train = self.x_train
        y_train = self.y_train

        x_test = self.x_test
        y_test = self.y_test

        self.y_pred_all_xgb = []
        y_train = list(y_train)
        xgboost_clf = XGBRegressor(learning_rate=0.1, n_estimators=75)

        # expanding-window one-step-ahead forecast: refit, predict one row,
        # then fold the observed row back into the training set
        for i in range(len(x_test)):
            xgboost_clf.fit(x_train, y_train)
            x_test_one = x_test.iloc[i:i + 1]
            y_test_one = xgboost_clf.predict(x_test_one)
            self.y_pred_all_xgb.append(list(y_test_one)[0])
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            x_train = pd.concat([x_train, x_test_one])
            y_train.append(y_test[i])

        xgboost_mse = mean_squared_error(self.y_test, self.y_pred_all_xgb)
        xgboost_rmse = np.sqrt(xgboost_mse)
        y_pred_all_xgb = pd.DataFrame(list(self.y_pred_all_xgb))
        ratio_single_xgb = pd.DataFrame(list(self.y_test)) / y_pred_all_xgb
        return xgboost_rmse, y_pred_all_xgb, ratio_single_xgb
Example no. 22
    def __train_model(self, features):
        combo_list = [
            ['available_year_avg', 'min_nights_year_avg', 'price_year_avg']
        #     ['available_winter_avg', 'min_nights_winter_avg', 'price_winter_avg'],
        #     ['available_spring_avg', 'min_nights_spring_avg', 'price_spring_avg'],
        #     ['available_summer_avg', 'min_nights_summer_avg', 'price_summer_avg']
        ]
        for combo in combo_list:
            X_base = features.drop([
                'price_year_avg', 'price_winter_avg', 'price_spring_avg', 'price_summer_avg', 'price_fall_avg',
                'available_year_avg', 'available_winter_avg', 'available_spring_avg', 'available_summer_avg', 'available_fall_avg',
                'min_nights_year_avg', 'min_nights_winter_avg', 'min_nights_spring_avg', 'min_nights_summer_avg', 'min_nights_fall_avg'
            ], axis=1)
            X_base[combo[0]] = features[combo[0]]
            X_base[combo[1]] = features[combo[1]]
            y = features[combo[2]]
            X_train, X_test, y_train, y_test = train_test_split(X_base, y, test_size=.25, random_state=42, shuffle=True)

            model = XGBRegressor(
                objective='reg:squarederror',
                learning_rate=0.1,
                max_depth=8,
                n_estimators=200,
                n_jobs=-1
            )  # `cv` is not an XGBRegressor parameter, so it is dropped here
            model.fit(X_train, y_train)
            self.logger.info('Gradient boost model:')
            self.logger.info(f'Target label: {combo[2]}')
            self.logger.info(f'R^2: {model.score(X_test, y_test)}')
            self.logger.info(f'MAE: {mean_absolute_error(y_test, model.predict(X_test))}')
        # return after the loop so every combo in combo_list gets trained
        return model
Example no. 23
    def _build(self, **config):
        """
        build the models and initialize.
        :param config: hyper parameters for building the model
        :return:
        """
        self.set_params(**config)
        if self.model_type == "regressor":
            self.model = XGBRegressor(n_estimators=self.n_estimators, max_depth=self.max_depth,
                                      n_jobs=self.n_jobs, tree_method=self.tree_method,
                                      random_state=self.random_state,
                                      learning_rate=self.learning_rate,
                                      min_child_weight=self.min_child_weight, seed=self.seed,
                                      subsample=self.subsample,
                                      colsample_bytree=self.colsample_bytree,
                                      gamma=self.gamma, reg_alpha=self.reg_alpha,
                                      reg_lambda=self.reg_lambda, verbosity=self.verbosity)
        elif self.model_type == "classifier":
            self.model = XGBClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth,
                                       n_jobs=self.n_jobs, tree_method=self.tree_method,
                                       random_state=self.random_state,
                                       learning_rate=self.learning_rate,
                                       min_child_weight=self.min_child_weight, seed=self.seed,
                                       subsample=self.subsample,
                                       colsample_bytree=self.colsample_bytree,
                                       gamma=self.gamma, reg_alpha=self.reg_alpha,
                                       objective='binary:logistic',
                                       reg_lambda=self.reg_lambda, verbosity=self.verbosity)
        else:
            raise ValueError("model_type can only be \"regressor\" or \"classifier\"")

        self.model_init = True
Example no. 24
def cbd_model(cbd_df, cbd_finalinput):
    '''
    function that creates model from the cbd dataframe and returns the predicted
    number of crimes for the next three days
    '''

    X_cbd=cbd_df[['year', 'month', 'day', 'tmax', 'tmin', 'consumer_price_index',
       'gdp_millions_2007', 'seasonally_adjusted_unemployment',
       'unadjusted_unemployment', 'Possession, cocaine ',
       'Heroin, possession ', 'Heroin Price Canada',
       'day_segment_1200pm-1159pm', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'day_of_week_Wednesday']]
    y_cbd=cbd_df['number_of_crimes']


    scaler = StandardScaler()
    scaler.fit(X_cbd)  # Don't cheat - fit only on training data
    X_cbd = scaler.transform(X_cbd)
    cbd_input_scaled = scaler.transform(cbd_finalinput)
    # The original pasted a full repr with every default spelled out; only
    # the values that matter are kept here.
    xgb = XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100,
                       objective='reg:linear', random_state=0)
    xgb.fit(X_cbd,y_cbd)
    predict_cbd=xgb.predict(cbd_input_scaled)

    return predict_cbd
Example no. 25
def get_ntree():
    rmse_t_total, rmse_v_total = [], []
    for ntree in range(10, 500, 10):
        xgb_base = XGBRegressor(objective='reg:linear',
                                n_estimators=ntree,
                                random_state=1234,
                                silent=0,
                                booster='gbtree',
                                eval_metric='rmse')
        rmse_t_1, rmse_v_1 = [], []
        print('current ntree = %s' % ntree)
        for train, test in get_cv(y=y_train, n_splits=5, random_state=42):
            X_t, y_t = X_train[train], y_train[train]
            X_v, y_v = X_train[test], y_train[test]
            xgb_base.fit(X_t, y_t)
            y_t_pre = xgb_base.predict(X_t)
            y_v_pre = xgb_base.predict(X_v)
            rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
            rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
            rmse_t_1.append(rmse_t_each)
            rmse_v_1.append(rmse_v_each)
        rmse_t = np.mean(rmse_t_1)
        rmse_v = np.mean(rmse_v_1)
        rmse_t_total.append(rmse_t)
        rmse_v_total.append(rmse_v)

    return rmse_t_total, rmse_v_total
Example no. 26
def train_first_test(experiment_name, x_train, y_train, features):
    global file_loc
    file_loc = 'data/' + experiment_name + '/'
    from xgboost.sklearn import XGBRegressor
    import scipy.stats as st

    one_to_left = st.beta(10, 1)
    from_zero_positive = st.expon(0, 50)

    params = {
        "n_estimators": st.randint(3, 15),
        "max_depth": st.randint(3, 40),
        "learning_rate": st.uniform(0.05, 0.4),
        "colsample_bytree": one_to_left,
        "subsample": one_to_left,
        "gamma": st.uniform(0, 10),
        'reg_alpha': from_zero_positive,
        "min_child_weight": from_zero_positive,
    }
    #xgbreg = XGBRegressor(nthreads=-1)
    xgbreg = XGBRegressor()

    from sklearn.model_selection import RandomizedSearchCV
    gs = RandomizedSearchCV(xgbreg, params, n_jobs=1)
    gs.fit(x_train, y_train)

    joblib.dump(gs.best_estimator_, file_loc + 'clf_bestmodel.pkl')
    return gs.best_estimator_
Example no. 27
    def __init__(self, nb_classes, bags=1, param=None):
        import xgboost as xgb
        from xgboost.sklearn import XGBRegressor

        param = param or {}  # avoid sharing a mutable default dict
        self.nb_classes = nb_classes
        self.objective = param.get('objective', 'reg:linear')
        self.nthread = param.get('nthread',-1)
        self.n_estimators = param.get('n_estimators',10)
        self.max_depth = param.get('max_depth', 6)
        self.learning_rate = param.get('learning_rate', 0.3)
        self.colsample_bytree = param.get('colsample_bytree', 1.0)
        self.subsample = param.get('subsample', 1.0)
        self.missing = param.get('missing', None)
        self.seed = param.get('seed', 0)
        self.bags = bags
        self.bags_models = tuple()
        self.train_y = None
        for bag in range(self.bags):
            models = tuple()
            for k in range(self.nb_classes):
                model = XGBRegressor(objective = self.objective, nthread = self.nthread, seed = self.seed + bag,
                                     n_estimators = self.n_estimators, missing = self.missing,
                                     max_depth = self.max_depth, learning_rate = self.learning_rate,
                                     colsample_bytree = self.colsample_bytree, subsample = self.subsample)
                models = models + (model,)
            self.bags_models = self.bags_models + (models, )
Example no. 28
def learn_model(X_train, y_train, X_valid, y_valid):
    t1 = time()
    model = XGBRegressor(max_depth=7, n_estimators=500)
    model.fit(X_train, y_train, eval_metric="rmse", eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=True, early_stopping_rounds=10)
    t2 = time()
    print('Total of training time: ', t2 - t1)
    return model
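
A hypothetical call for learn_model above, assuming arrays X and y and scikit-learn for the split:

from sklearn.model_selection import train_test_split

X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)
model = learn_model(X_tr, y_tr, X_va, y_va)
print(model.best_iteration)  # set when early stopping triggers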
Example no. 29
def tun_reg_alpha(reg_alpha_range, param_data_path, train_x, train_y):
    '''
    tune the reg_alpha param in xgboost;
    get the best value and save it to the file for further tuning

    :param reg_alpha_range: the range of reg_alpha you want to test

    :param param_data_path: default './../data/param_data.pkl'

    :return: void
    '''
    # get the newest params first
    param_dict = get_param_data(param_data_path=param_data_path)

    print("Tuning regularisation parameter reg_alpha")
    param_test1 = {'reg_alpha': reg_alpha_range}
    # `iid` was removed from GridSearchCV in scikit-learn 0.24, so it is not passed here
    gsearch1 = GridSearchCV(estimator=XGBRegressor(**param_dict),
                            param_grid=param_test1,
                            scoring='neg_mean_squared_error',
                            cv=5)
    gsearch1.fit(X=train_x, y=train_y)

    # show the results (grid_scores_ was replaced by cv_results_ in scikit-learn 0.20)
    for mean, params in zip(gsearch1.cv_results_['mean_test_score'],
                            gsearch1.cv_results_['params']):
        print(mean, params)
    print("best_params_ and best_score_:")
    print(gsearch1.best_params_, gsearch1.best_score_)

    # store the tuned value and save
    param_dict['reg_alpha'] = gsearch1.best_params_['reg_alpha']

    save_param_data(param_dict=param_dict, param_data_path=param_data_path)
Example no. 30
def search_best_parameters(X, y):

    xgb_grid = {
        'n_estimators': [80, 100, 120],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.2, 0.5],
        'booster': ['gbtree', 'gblinear', 'dart'],
        'gamma': [0, 0.2, 0.5],
        'subsample': [0.5, 0.8],
        'reg_alpha': [0.2, 0.3, 0.5],
        'reg_lambda': [0.5, 0.8, 1],
        'colsample_bytree': [1, 0.8, 0.5],
        'colsample_bylevel': [1, 0.8, 0.5],
        'colsample_bynode': [1, 0.8, 0.5],
        'random_state': [77]
    }

    xgb_gridsearch = GridSearchCV(XGBRegressor(),
                                  xgb_grid,
                                  n_jobs=-1,
                                  verbose=True,
                                  scoring='r2')

    xgb_gridsearch.fit(X, y)
    print(f"best parameters: {xgb_gridsearch.best_params_}")