Example #1

# (snippet begins mid-call; the imports, data loading, and the start of the
#  split are reconstructed here for context -- the dataset is assumed to be
#  Boston housing based on the printed scores below)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

x, y = load_boston(return_X_y=True)

#1. Data split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=99)

#2. Model definition
model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
# n_estimators plays the role of epochs in deep learning

#3. Training
model.fit(x_train,
          y_train,
          verbose=False,
          eval_metric="rmse",
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)
# Deep learning had `metrics`; the usual eval metrics here are rmse, mae, logloss,
# error (the complement of accuracy), and auc (closely related to accuracy).
# If error is 0.8, accuracy is 0.2.

#4. Evaluation
result = model.evals_result()
print("evals_result : \n", result)
# evals_result :
#  {'validation_0': {'rmse': [22.09964, 20.094713, 18.289314]}, 'validation_1': {'rmse': [21.539825, 19.548641, 17.804596]}}
# validation_0 == results on (x_train, y_train)
# validation_1 == results on (x_test, y_test)

#5. Prediction
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)  # ground truth goes first in r2_score
print("R2 : ", r2)
# R2 :  0.823625251495531
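
Since this run uses early_stopping_rounds=20, it can help to check where training actually stopped. A minimal follow-on sketch reusing `model` from above (attribute names per the xgboost scikit-learn wrapper):

print("best_iteration :", model.best_iteration)  # boosting round with the best validation metric
print("best_score :", model.best_score)          # metric value at that round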
Example #2

# (imports reconstructed for context)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# x, y = load_boston(return_X_y=True)
datasets = load_boston()
x = datasets.data
y = datasets['target']

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

# 2. Model
model = XGBRegressor(n_estimators=100, learning_rate=0.01, n_jobs=8)

# 3. Training
model.fit(x_train,
          y_train,
          verbose=1,
          eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)])

aaa = model.score(x_test, y_test)
print('aaa :', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print('r2 :', r2)

print('====================================')
results = model.evals_result()
print(results)
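
A quick sanity check, reusing `aaa` and `r2` from above: for scikit-learn regressors, `score` returns the coefficient of determination, so the two printed values should agree.

import numpy as np
assert np.isclose(aaa, r2)  # XGBRegressor.score(x, y) is R^2, i.e. r2_score(y, model.predict(x))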
Example #3

# (snippet begins mid-loop; the enclosing SelectFromModel loop and its imports
#  are reconstructed here for context, assuming a fitted `model` and the usual
#  x_train/x_test/y_train/y_test split)
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

thresholds = np.sort(model.feature_importances_)
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    selection_x_train = selection.transform(x_train)
    selection_x_test = selection.transform(x_test)

    print(selection_x_train.shape)

    selection_model = XGBRegressor(n_estimators=3, n_jobs=-1)

    selection_model.fit(selection_x_train,
                        y_train,
                        verbose=False,
                        eval_metric=["rmse", "mae"],
                        eval_set=[(selection_x_train, y_train),
                                  (selection_x_test, y_test)],
                        early_stopping_rounds=3)

    y_pred = selection_model.predict(selection_x_test)

    results = selection_model.evals_result()

    print("evals_result : \n", results)

    score = r2_score(y_test, y_pred)

    print("Thresh=%.3f, n=%d, R2: %.2f%%" %
          (thresh, selection_x_train.shape[1], score * 100.0))

# (404, 3)
# evals_result :
#  {'validation_0': {'rmse': [17.212723, 12.439525, 9.133449], 'mae': [15.650868, 11.090322, 7.872841]},
#   'validation_1': {'rmse': [16.532173, 11.86516, 8.631524], 'mae': [15.215144, 10.711357, 7.452145]}}
Example #4

# (snippet begins without its setup; `xgb` is assumed to be an XGBRegressor
#  instance and `r2` sklearn's r2_score under an alias; plotting imports
#  reconstructed for context)
from matplotlib import pyplot
import numpy as np
import pandas as pd

eval_set = [(X_train, y_train), (X_val, y_val)]
xgb.fit(X_train,
        y_train,
        early_stopping_rounds=10,
        eval_metric=["rmse"],
        eval_set=eval_set,
        verbose=True)
# make predictions for test data
y_pred = xgb.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions (a distinct variable name avoids shadowing sklearn's r2_score)
score = r2(y_test, predictions)
print("r2_score: %.2f%%" % (score * 100.0))
# retrieve performance metrics
results = xgb.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)
# plot RMSE over boosting rounds
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['rmse'], label='Train')
ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
ax.legend()
pyplot.ylabel('Root Mean Squared Error')
pyplot.title('XGBoost Root Mean Squared Error')
pyplot.show()

importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.round(xgb.feature_importances_, 3)
})
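
The `importances` table is easier to read sorted; a small follow-on sketch:

# rank features by importance, most important first
importances = importances.sort_values('importance', ascending=False).reset_index(drop=True)
print(importances.head(10))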
Example #5

# (a new snippet begins here; x and y are assumed to be loaded as in Example #2)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

#2. Model
model = XGBRegressor(n_estimators=100, learning_rate=0.01, n_jobs=-1)

#3. Training
model.fit(x_train,
          y_train,
          verbose=1,
          eval_metric=['rmse', 'logloss', 'mae'],
          eval_set=[(x_train, y_train), (x_test, y_test)])

#4. Evaluation
aaa = model.score(x_test, y_test)
print("aaa : ", aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)  # ground truth goes first in r2_score
print("r2 : ", r2)

# aaa :  0.9329663244922279
# r2 :  0.9329663244922279

print("=======================")
results = model.evals_result()  # per-round eval metrics (rmse etc.) as printed during training
print(results)
Example #6

def xgboost_regress(X_train,
                    y_train,
                    X_test,
                    y_test,
                    early_stopping_rounds=None,
                    plot=True):
    # Build and fit the model
    XG = XGBRegressor(objective='reg:squarederror',
                      n_estimators=200,
                      min_child_weight=1,
                      max_depth=3,
                      subsample=0.7,
                      colsample_bytree=0.5,
                      learning_rate=0.1)

    eval_set = [(X_train, y_train), (X_test, y_test)]
    XG.fit(X_train,
           y_train,
           eval_metric="rmse",
           early_stopping_rounds=early_stopping_rounds,
           eval_set=eval_set,
           verbose=False)

    # Make predictions and evaluate
    preds_train = XG.predict(X_train)
    preds_test = XG.predict(X_test)
    rms_train = (mean_squared_error(y_train, preds_train))**0.5
    rms_test = (mean_squared_error(y_test, preds_test))**0.5
    r2_train = r2_score(y_train, preds_train)
    r2_test = r2_score(y_test, preds_test)
    mae_train = mean_absolute_error(y_train, preds_train)
    mae_test = mean_absolute_error(y_test, preds_test)
    results = XG.evals_result()
    epochs = len(results['validation_0']['rmse'])

    # Plot progress over epochs and final true vs predicted age
    if plot:
        fig, ax = plt.subplots(1, 3, figsize=(16, 3.5))
        ax[0].scatter(y_train, preds_train, alpha=0.5)
        ax[0].plot(range(20, 100), range(20, 100), c='red')
        ax[0].set_xlabel('True Age')
        ax[0].set_ylabel('Predicted Age')
        ax[0].grid(True, lw=1.5, ls='--', alpha=0.75)
        ax[0].set_title('XGBoost on training data')

        ax[1].scatter(y_test, preds_test, alpha=0.5)
        ax[1].plot(range(20, 100), range(20, 100), c='red')
        ax[1].set_xlabel('True Age')
        ax[1].set_ylabel('Predicted Age')
        ax[1].grid(True, lw=1.5, ls='--', alpha=0.75)
        ax[1].set_title('XGBoost on testing data')

        x_axis = range(0, epochs)
        ax[2].plot(x_axis, results['validation_0']['rmse'], label='Train')
        ax[2].plot(x_axis, results['validation_1']['rmse'], label='Test')
        ax[2].legend()
        ax[2].set_ylabel('rms')
        ax[2].set_xlabel('epoch')
        ax[2].set_title('XGBoost rms')
        plt.show()

    # print metric
    print(f'The number of training epochs was {epochs}')
    print(f'The rms on the training data is {rms_train:.3f} years')
    print(f'The rms on the testing data is {rms_test:.3f} years')
    print(f'The r^2 on the training data is {r2_train:.3f}')
    print(f'The r^2 on the testing data is {r2_test:.3f}')
    print(f'The MAE on the training data is {mae_train:.3f} years')
    print(f'The MAE on the testing data is {mae_test:.3f} years')
    return XG, rms_train, rms_test, r2_train, r2_test, XG.feature_importances_
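
A hypothetical call (the data names are placeholders; any train/test split of feature matrices and ages works):

XG, rms_train, rms_test, r2_train, r2_test, importances = xgboost_regress(
    X_train, y_train, X_test, y_test,
    early_stopping_rounds=10,  # stop once validation rmse stalls for 10 rounds
    plot=False)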
Example #7

def runXGBRegressorTuning(X_train,
                          X_test,
                          y_train,
                          y_test,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          initial_max_depth=[3, 5, 7, 9],
                          initial_min_child_weight=[1, 3, 5],
                          objective='reg:linear',  # deprecated alias of 'reg:squarederror' in newer XGBoost
                          learning_rate=0.1,
                          n_estimators=140,
                          max_depth=5,
                          min_child_weight=1,
                          reg_alpha=0,
                          reg_lambda=0,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8):
    # Tune max depth and min child weight - strongest bearing on model tuning
    best_score = 1000000000
    xgb_param_dict = dict(learning_rate=learning_rate,
                          n_estimators=n_estimators,
                          max_depth=max_depth,
                          min_child_weight=min_child_weight,
                          reg_alpha=reg_alpha,
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          objective=objective,
                          reg_lambda=reg_lambda,
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27)
    xgb_model = XGBRegressor(**xgb_param_dict)

    param_test1 = {
        'max_depth': initial_max_depth,
        'min_child_weight': initial_min_child_weight
    }

    gsearch = GridSearchCV(estimator=XGBRegressor(**xgb_param_dict),
                           param_grid=param_test1,
                           scoring=scoring,
                           n_jobs=4,
                           iid=False,  # note: `iid` was removed in scikit-learn 0.24; drop it on newer versions
                           cv=cv)
    gsearch.fit(X_train, y_train)
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))

    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['max_depth'] = gsearch.best_params_['max_depth']
    xgb_param_dict['min_child_weight'] = gsearch.best_params_['min_child_weight']
    xgb_model = XGBRegressor(**xgb_param_dict)

    # Decision tree to determine new search ranges if optimal solution found at limit of initial range
    if gsearch.best_params_['max_depth'] == max(initial_max_depth):
        print('Best max_depth at max limit of initial range...')
        new_initial_max_depth = range(max(initial_max_depth),
                                      max(initial_max_depth) + 6, 2)
    elif gsearch.best_params_['max_depth'] == min(initial_max_depth):
        print('Best max_depth at min limit of initial range...')
        new_initial_max_depth = range(
            min(initial_max_depth) - 6, min(initial_max_depth), 2)
    else:
        new_initial_max_depth = initial_max_depth

    if gsearch.best_params_['min_child_weight'] == max(
            initial_min_child_weight):
        print('Best min_child_weight at max limit of initial range...')
        new_initial_min_child_weight = range(max(initial_min_child_weight),
                                             max(initial_min_child_weight) + 6,
                                             2)
    elif gsearch.best_params_['min_child_weight'] == min(
            initial_min_child_weight):
        print('Best min_child_weight at min limit of initial range...')
        new_initial_min_child_weight = range(
            min(initial_min_child_weight) - 6, min(initial_min_child_weight),
            2)
    else:
        new_initial_min_child_weight = initial_min_child_weight

    # Run various procedures depending on outcome
    if (new_initial_min_child_weight != initial_min_child_weight
            or new_initial_max_depth != initial_max_depth):
        param_test = {
            'max_depth': new_initial_max_depth,
            'min_child_weight': new_initial_min_child_weight
        }
        gsearch = GridSearchCV(estimator=xgb_model,
                               param_grid=param_test,
                               scoring=scoring,
                               n_jobs=4,
                               iid=False,
                               cv=cv)
        gsearch.fit(X_train, y_train)
        print('Best params: {}'.format(gsearch.best_params_))
        print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
        best_score = np.sqrt(-gsearch.best_score_)
        xgb_param_dict['max_depth'] = gsearch.best_params_['max_depth']
        xgb_param_dict['min_child_weight'] = gsearch.best_params_[
            'min_child_weight']
        xgb_model = XGBRegressor(**xgb_param_dict)

    else:
        # Check either side of best variables to check
        param_test = {
            'max_depth': [
                xgb_param_dict['max_depth'] - 1, xgb_param_dict['max_depth'],
                xgb_param_dict['max_depth'] + 1
            ],
            'min_child_weight': [
                xgb_param_dict['min_child_weight'] - 1,
                xgb_param_dict['min_child_weight'],
                xgb_param_dict['min_child_weight'] + 1
            ]
        }
        gsearch = GridSearchCV(estimator=xgb_model,
                               param_grid=param_test,
                               scoring=scoring,
                               n_jobs=4,
                               iid=False,
                               cv=cv)
        gsearch.fit(X_train, y_train)
        # Fine-tuned max_depth and min_child_weight parameters
        print('Fine-tuned max_depth and min_child_weight parameters...\n')
        print('Best params: {}'.format(gsearch.best_params_))
        print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
        best_score = np.sqrt(-gsearch.best_score_)
        xgb_param_dict['max_depth'] = gsearch.best_params_['max_depth']
        xgb_param_dict['min_child_weight'] = gsearch.best_params_[
            'min_child_weight']
        xgb_model = XGBRegressor(**xgb_param_dict)

    warnings = {}
    # Tune gamma
    param_test3 = {'gamma': [i / 10.0 for i in range(0, 5)]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_test3,
                           scoring=scoring,
                           n_jobs=4,
                           iid=False,
                           cv=cv)

    gsearch.fit(X_train, y_train)
    # Fine-tuned gamma parameters
    print('Fine-tuned gamma parameters...\n')
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['gamma'] = gsearch.best_params_['gamma']
    xgb_model = XGBRegressor(**xgb_param_dict)

    if xgb_param_dict['gamma'] == max(param_test3['gamma']):
        warnings[
            'gamma'] = 'gamma: Optimal parameter {} at max of search range'.format(
                xgb_param_dict['gamma'])

    # Tune subsample and colsample_bytree
    param_test4 = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)]
    }
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_test4,
                           scoring=scoring,
                           n_jobs=4,
                           iid=False,
                           cv=cv)

    gsearch.fit(X_train, y_train)
    # Fine-tuned subsample and colsample_bytree parameters
    print('Tuned subsample and colsample_bytree parameters...\n')
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['subsample'] = gsearch.best_params_['subsample']
    xgb_param_dict['colsample_bytree'] = gsearch.best_params_[
        'colsample_bytree']
    xgb_model = XGBRegressor(**xgb_param_dict)

    # while (xgb_param_dict['subsample'] == max(param_test4['subsample']) or
    #        xgb_param_dict['colsample_bytree'] == max(param_test4['colsample_bytree']) or
    #        xgb_param_dict['subsample'] == min(param_test4['subsample']) or
    #        xgb_param_dict['colsample_bytree'] == min(param_test4['colsample_bytree'])):

    if xgb_param_dict['subsample'] == max(param_test4['subsample']):
        warnings[
            'subsample'] = 'subsample: Optimal parameter {} at max of search range'.format(
                xgb_param_dict['subsample'])
    elif xgb_param_dict['subsample'] == min(param_test4['subsample']):
        warnings[
            'subsample'] = 'subsample: Optimal parameter {} at min of search range'.format(
                xgb_param_dict['subsample'])

    if xgb_param_dict['colsample_bytree'] == max(
            param_test4['colsample_bytree']):
        warnings[
            'colsample_bytree'] = 'colsample_bytree: Optimal parameter {} at max of search range'.format(
                xgb_param_dict['colsample_bytree'])
    elif xgb_param_dict['colsample_bytree'] == min(
            param_test4['colsample_bytree']):
        warnings[
            'colsample_bytree'] = 'colsample_bytree: Optimal parameter {} at min of search range'.format(
                xgb_param_dict['colsample_bytree'])

    # Tune regularisation parameters
    param_test6 = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_test6,
                           scoring=scoring,
                           n_jobs=4,
                           iid=False,
                           cv=cv)

    gsearch.fit(X_train, y_train)
    # Fine-tuned regularisation parameters
    print('Tuned regularisation parameters...\n')
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['reg_alpha'] = gsearch.best_params_['reg_alpha']
    xgb_model = XGBRegressor(**xgb_param_dict)

    # Fine-tune regularisation parameters

    param_test7 = {
        'reg_alpha': [
            float(xgb_param_dict['reg_alpha']) / 10,
            float(xgb_param_dict['reg_alpha']) / 2,
            float(xgb_param_dict['reg_alpha']),
            float(xgb_param_dict['reg_alpha']) * 5,
            float(xgb_param_dict['reg_alpha']) * 2
        ]
    }

    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_test7,
                           scoring=scoring,
                           n_jobs=4,
                           iid=False,
                           cv=cv)

    gsearch.fit(X_train, y_train)
    # Fine-tuned regularisation parameters
    print('Tuned regularisation parameters...\n')
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['reg_alpha'] = gsearch.best_params_['reg_alpha']
    xgb_model = XGBRegressor(**xgb_param_dict)

    # Tune the learning rate of the model

    # xgb.cv expects booster params and a DMatrix (assumes `import xgboost as xgb`);
    # sklearn-only keys such as n_estimators are passed via num_boost_round instead
    dtrain = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(xgb_model.get_xgb_params(),
                      dtrain,
                      num_boost_round=xgb_model.get_params()['n_estimators'],
                      nfold=cv,
                      metrics='rmse',
                      early_stopping_rounds=50,
                      verbose_eval=False)

    # Set the model to the optimal number of estimators wrt early stopping round limit
    xgb_param_dict['n_estimators'] = cvresult.shape[0]

    # Learn final XGBoost model
    xgb_model = XGBRegressor(**xgb_param_dict)
    xgb_model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='rmse',
                  verbose=True)

    return (xgb_model, xgb_model.get_params(), xgb_model.evals_result(),
            warnings)
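
A hypothetical invocation (the splits are placeholders; the returned `warnings` dict flags parameters that landed on the edge of their search range):

best_model, best_params, eval_history, warns = runXGBRegressorTuning(
    X_train, X_test, y_train, y_test, scoring='neg_mean_squared_error', cv=5)
for message in warns.values():
    print(message)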
Example #8

# (a new snippet begins here; `import xgboost as xgb`, matplotlib.pyplot as plt,
#  pandas as pd, and `from math import sqrt` are assumed)
xg_model = XGBRegressor(n_estimators=500,
                        learning_rate=0.075,
                        max_depth=7,
                        min_child_weight=5,
                        eval_metric='rmse',
                        seed=1337,
                        objective='reg:squarederror')
xg_model.fit(X_train,
             y_train,
             early_stopping_rounds=10,
             eval_set=[(X_test, y_test)],
             verbose=False)
predictions = xg_model.predict(X_test)

max_estimators = len(xg_model.evals_result()['validation_0']['rmse'])
print(max_estimators)
max_estim_rmse = pd.DataFrame(xg_model.evals_result()['validation_0']['rmse'],
                              columns=['rmse'])
plt.plot(max_estim_rmse)
plt.ylabel("RMSE")
plt.xlabel("Max Estimators")
xgb.plot_importance(xg_model)
plt.show()


rmse_rf = sqrt(mean_squared_error(predictions, y_test))
print("RMSE:", round(rmse_rf, 2))

Example #9

File: Model.py  Project: dlont/kbc
class VanillaModelRegression(Model):
    def __init__(self, configuration):
        self._configuration = configuration
        self._objects = {}
        self._annotation = 'Performance comparison of different MVA discriminants'
        if 'annotation' in self._configuration:
            self._annotation = self._configuration['annotation']
        self.my_model = None
        self.fit_results = None
        self.Initialize()

    @log_with()
    def Initialize(self):
        self.build_best_prediction()
        pass

    @log_with()
    def get(self, name):
        """
                Factory method
                """
        if name in self._objects:
            return self._objects[name]
        else:
            return None  #provide factory method implementation here
        return self._objects[name]

    @log_with()
    def get_data_provider(self, provider_name):
        """
                Factory method for data providers
                """
        from dataprovider import PandasDataProviderFromCSV_original
        if provider_name in self._objects:
            return self._objects[provider_name]
        else:
            if '.csv' in self._configuration[provider_name]['input_file']:
                provider = PandasDataProviderFromCSV_original(
                    self._configuration[provider_name]['input_file'])
                self._objects[provider_name] = provider
            else:
                raise NotImplementedError
        return self._objects[provider_name]

    @log_with()
    def build_best_prediction(self):
        print("Dummy building vanilla model!")

        from matplotlib import pyplot
        from xgboost import XGBRegressor, plot_importance
        # from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error
        from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error

        target_variable_names = self._configuration['model']['target'][0]
        data_provider = self.get_data_provider(
            self._configuration['model']['data_provider'])

        input_features_names = self._configuration['model']['input_features']
        X_train = data_provider.train[input_features_names]
        y_train = data_provider.train[target_variable_names]

        X_test = data_provider.test[input_features_names]
        y_test = data_provider.test[target_variable_names]

        # print X_train.dtypes
        # print X_train.head()
        # print X_test.dtypes
        # print X_test.head()

        # print y_train.dtypes
        # print y_train.head()
        # print y_test.dtypes
        # print y_test.head()

        eval_set = [(X_train, y_train), (X_test, y_test)]

        self.my_model = XGBRegressor(
            n_estimators=self._configuration['model']['n_estimators'],
            max_depth=self._configuration['model']['max_depth'],
            learning_rate=self._configuration['model']['learning_rate'],
            verbosity=0)
        self.my_model.fit(X_train,
                          y_train,
                          eval_metric=["rmse", "mae"],
                          eval_set=eval_set,
                          verbose=False)

        y_pred = self.my_model.predict(X_test)
        # print "Max error: ", max_error(y_test,y_pred)
        print("Explained variance score: ",
              explained_variance_score(y_test, y_pred))
        print("Mean absolute error: ", mean_absolute_error(y_test, y_pred))
        print("Mean squared error: ", mean_squared_error(y_test, y_pred))

        self.fit_results = self.my_model.evals_result()
        # print 'YO importance'
        # plot_importance(my_model)
        pickle.dump(
            self.my_model,
            open(self._configuration['model']['output_filename'], 'wb'))

        pass
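
The configuration keys the class reads can be inferred from the code above; a hypothetical minimal config (file names are placeholders):

configuration = {
    'annotation': 'Performance comparison of different MVA discriminants',
    'model': {
        'target': ['target_column'],        # only the first entry is used
        'data_provider': 'provider_train',  # name of the provider section below
        'input_features': ['feat1', 'feat2'],
        'n_estimators': 100,
        'max_depth': 3,
        'learning_rate': 0.1,
        'output_filename': 'model.pkl',
    },
    'provider_train': {'input_file': 'data.csv'},  # must point at a .csv file
}
model = VanillaModelRegression(configuration)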
Example #10

def xgb_train_and_predict(column_to_predict, train_data, evaluation_data,
                          data_path):
    """
    Train an XGBoost model on column_to_predict from train_data
    and generate predictions for evaluation_data, stored in a column named `output`.
    data_path specifies the path to the data used to compute external features.
    """
    logger.info("----------- check training data -------------")
    for resolution, dtf in train_data.groupby(['cantine_nom', 'cantine_type']):
        logger.info(
            "canteen %s has %s days of history to train on starting on %s and ending on %s",
            resolution,
            len(dtf),
            dtf["date_str"].min(),
            dtf['date_str'].max(),
        )

    features = [
        "site_id",
        # "date_str",
        # "cantine_nom",
        # "site_type_cat",
        "secteur_cat",
        # "year",
        # "month",
        # "day",
        "week",
        "wednesday",  # this feature is only used if the dedicated parameter include_wednesday is set to True
        # "weekday",  # weekday is not used here because redundant with meal composition
        "holidays_in",
        "non_working_in",
        "effectif",
        "frequentation_prevue",
        "Events.RAMADAN_ago",  # "Events.AID_ago"
    ]

    with open(os.path.join(data_path, "calculators/menus.json")) as f_in:
        dict_special_dishes = json.load(f_in)
    features = features + list(dict_special_dishes.keys())

    # prepare training dataset
    # .copy() so the dropna below does not trigger SettingWithCopyWarning
    train_data_reduced = train_data[features + [column_to_predict]].copy()
    before_dropping_na = len(train_data_reduced)
    train_data_reduced.dropna(inplace=True)
    after_dropping_na = len(train_data_reduced)
    percent_dropped = round(100 * (before_dropping_na - after_dropping_na) /
                            before_dropping_na)
    logger.info("Dropping %s percent of training data due to NANs",
                percent_dropped)

    train_data_x = train_data_reduced[features]
    train_data_y = train_data_reduced[column_to_predict]
    if len(train_data_x) == 0:
        raise EmptyTrainingSet("")
    # prepare test_dataset to control overfitting
    train_data_x, train_data_y, test_data_x, test_data_y = ratio_split(
        train_data_x, train_data_y, 0.1)
    eval_set = [(train_data_x, train_data_y), (test_data_x, test_data_y)]

    # prepare prediction dataset
    evaluation_data_x = evaluation_data[features]

    params = {
        'base_score': train_data_y.mean(),
        "objective": 'reg:squarederror',
        "n_estimators": 5000,
        "learning_rate": 0.09,
        "max_depth": 5,
        "booster": 'gbtree',
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 1,
        "gamma": 0,
        "importance_type": 'gain',
        "max_delta_step": 0,
        "min_child_weight": 1,
        "missing": None,
        "n_jobs": mp.cpu_count(),
        "nthread": None,
        "random_state": 0,
        "reg_alpha": 0,
        "reg_lambda": 1,
        "scale_pos_weight": 1,
        "seed": None,
        "subsample": 1,
        "verbosity": 0,
    }
    # define model
    model = XGBRegressor(**params)
    # train model
    model.fit(train_data_x,
              train_data_y,
              early_stopping_rounds=100,
              eval_set=eval_set,
              eval_metric=multi_custom_metrics,
              verbose=False)
    # predict values
    evaluation_data['output'] = np.ceil(model.predict(evaluation_data_x))

    logger.info("----------- check predictions -------------")
    for resolution, dtf in evaluation_data.groupby(
        ['cantine_nom', 'cantine_type']):
        logger.info(
            "canteen %s has predictions for %s days starting on %s and ending on %s",
            resolution,
            len(dtf),
            dtf["date_str"].min(),
            dtf['date_str'].max(),
        )

    logger.info("----------- evaluate model -------------")

    feature_importance_list = evaluate_feature_importance(
        evaluation_data_x, model)

    plot_curve(model.evals_result(), "nantes_metropole_xgb")

    return evaluation_data, feature_importance_list
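
`multi_custom_metrics` is a project helper not shown in the snippet. With the legacy xgboost scikit-learn API used here, a callable passed as eval_metric receives the predictions and the evaluation DMatrix and returns a (name, value) pair; a hypothetical stand-in:

import numpy as np

def multi_custom_metrics(y_pred, dtrain):
    # hypothetical stand-in: report mean absolute error on the eval set
    y_true = dtrain.get_label()
    return 'mae', float(np.mean(np.abs(y_true - y_pred)))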
Example #11

# (imports reconstructed for context)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as tts
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

boston = load_boston()

x = boston.data
y = boston.target

x_train, x_test, y_train, y_test = tts(x, y, train_size=0.8, random_state=66)

xgb = XGBRegressor(n_estimators=10, learning_rate=0.1)

xgb.fit(x_train,
        y_train,
        verbose=True,
        eval_metric=["rmse", "logloss"],
        eval_set=[(x_train, y_train), (x_test, y_test)],
        early_stopping_rounds=20)
# rmse, mae, logloss, error, auc
# note: with only n_estimators=10, the early_stopping_rounds=20 window can never trigger

y_pre = xgb.predict(x_test)

r2 = r2_score(y_test, y_pre)
score = xgb.score(x_test, y_test)
result = xgb.evals_result()
print(__file__)
print(result)
print("r2")
print(r2)
print("score")
print(score)
Example #12
# # train the model
# print("[INFO] training model...")
# model.fit(Xtrain, Ytrain, validation_data=(Xvalid, Yvalid),
# 	epochs=10, batch_size=20)
#dtrain=xgb.DMatrix(Xtrain,label=Ytrain)
#dvalid=xgb.DMatrix(Xvalid,label=Yvalid)
# (`bst` is assumed to be a pre-configured XGBRegressor and `seed` an integer
#  defined earlier in the script)
kf = KFold(n_splits=10, shuffle=True, random_state=seed)
vali = cross_val_score(bst, Xvalid, Yvalid, cv=kf, verbose=1, n_jobs=-1)
#print(bst.get_params())
print("####################Xgboost")
trainbst = bst.fit(Xtrain,
                   Ytrain,
                   eval_set=[(Xtrain, Ytrain), (Xvalid, Yvalid)],
                   eval_metric=['rmse', 'mae'],
                   verbose=True)
evres = bst.evals_result()  # See MAE metric
print(vali.mean())

plt.plot(list(evres['validation_0']['rmse']))
plt.plot(list(evres['validation_1']['rmse']))
plt.title('Model rmse')
plt.ylabel('rmse')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
#plt.savefig("Keras_NN_Accuracy.png")
plt.show()
plt.clf()

plt.plot(list(evres['validation_0']['mae']))
plt.plot(list(evres['validation_1']['mae']))
plt.title('Model mae')
# (snippet truncated here; completed to mirror the rmse plot above)
plt.ylabel('mae')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()
Example #13

def xgb_interval_train_and_predict(column_to_predict, train_data, evaluation_data, confidence_interval, data_path):
    """
    Train an XGBoost model on column_to_predict from train_data
    and generate predictions for evaluation_data, stored in a column named `output`.
    data_path specifies the path to the data used to compute external features.
    Note: the model does not learn column_to_predict directly but the bounds of a
    confidence interval, via quantile objectives (see the log_cosh_quantile sketch
    after this function); for details see:
    https://towardsdatascience.com/confidence-intervals-for-xgboost-cac2955a8fde
    """
    features = [
        "site_id",
        "secteur_cat",
        "week",
        "wednesday",  # this feature is only used if the dedicated parameter include_wednesday is set to True
        "non_working_in",
        "holidays_in",
        "effectif",
        "frequentation_prevue",
        "Events.RAMADAN_ago",  # "Events.AID_ago"
    ]

    logger.info("----------- check training data -------------")
    for resolution, dtf in train_data.groupby(['cantine_nom', 'cantine_type']):
        logger.info("canteen %s has %s days of history to train on starting on %s and ending on %s",
                    resolution,
                    len(dtf),
                    dtf["date_str"].min(),
                    dtf['date_str'].max(),
                    )

    with open(os.path.join(data_path, "calculators/menus.json")) as f_in:
        dict_special_dishes = json.load(f_in)
    features = features + list(dict_special_dishes.keys())

    # prepare training dataset
    train_data_reduced = train_data[features + [column_to_predict]]
    before_dropping_na = len(train_data_reduced)
    train_data_reduced.dropna(inplace=True)
    after_dropping_na = len(train_data_reduced)
    percent_dropped = round(100 * (before_dropping_na - after_dropping_na) / before_dropping_na)
    logger.info("Dropping %s percent of training data due to NANs", percent_dropped)

    train_data_y = train_data_reduced[column_to_predict]
    train_data_x = train_data_reduced[features]
    if len(train_data_x) == 0:
        raise EmptyTrainingSet("")
    # prepare test_dataset to control overfitting
    train_data_x, train_data_y, test_data_x, test_data_y = ratio_split(train_data_x, train_data_y, 0.1)
    eval_set = [(train_data_x, train_data_y), (test_data_x, test_data_y)]

    # prepare prediction dataset
    evaluation_data_x = evaluation_data[features]

    params = {
        "n_jobs": mp.cpu_count(),
        'base_score': train_data_y.mean(),
        "objective": 'reg:squarederror',
        "n_estimators": 5000,
        "learning_rate": 0.09,
        "max_depth": 5,
        "booster": 'gbtree',
        "importance_type": 'gain',
        "max_delta_step": 0,
        "min_child_weight": 1,
        "random_state": 0,
        "reg_alpha": 0,
        "reg_lambda": 1,
        "scale_pos_weight": 1,
        "subsample": 1,
        "verbosity": 0,
    }

    confidence_step = (1 - confidence_interval) / 2
    # upper bound: quantile 1 - confidence_step (over-predicts)
    params.update({"objective": log_cosh_quantile(1 - confidence_step)})

    confidence_upper_bound_model = XGBRegressor(**params)
    confidence_upper_bound_model.fit(
        train_data_x,
        train_data_y,
        early_stopping_rounds=100,
        eval_set=eval_set,
        eval_metric=multi_custom_metrics,
        verbose=False)
    y_upper_smooth = np.ceil(confidence_upper_bound_model.predict(evaluation_data_x))

    # lower bound: quantile confidence_step (under-predicts)
    params.update({"objective": log_cosh_quantile(confidence_step)})
    confidence_lower_bound_model = XGBRegressor(**params)
    confidence_lower_bound_model.fit(train_data_x, train_data_y, verbose=False)
    y_lower_smooth = np.ceil(confidence_lower_bound_model.predict(evaluation_data_x))

    evaluation_data['pred_lower_bound'] = y_lower_smooth
    evaluation_data['pred_upper_bound'] = y_upper_smooth
    evaluation_data['output'] = np.maximum.reduce([y_upper_smooth, y_lower_smooth])

    logger.info("----------- check predictions -------------")
    for resolution, dtf in evaluation_data.groupby(['cantine_nom', 'cantine_type']):
        logger.info("canteen %s has predictions for %s days starting on %s and ending on %s",
                    resolution,
                    len(dtf),
                    dtf["date_str"].min(),
                    dtf['date_str'].max(),
                    )

    logger.info("----------- evaluate model -------------")

    feature_importance_list = evaluate_feature_importance(evaluation_data_x, confidence_upper_bound_model)
    
    ## Generates errors on Windows with Reticulate
    plot_curve(confidence_upper_bound_model.evals_result(), "nantes_metropole_xgb")

    return evaluation_data, feature_importance_list
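
`log_cosh_quantile` is defined elsewhere in the project; a sketch consistent with the article linked in the docstring, returning a custom XGBoost objective (gradient and hessian of a log-cosh-smoothed quantile loss at quantile `alpha`):

import numpy as np

def log_cosh_quantile(alpha):
    def _log_cosh_quantile(y_true, y_pred):
        # asymmetric residual weighting: alpha on under-prediction, 1 - alpha on over-prediction
        err = y_pred - y_true
        err = np.where(err < 0, alpha * err, (1 - alpha) * err)
        grad = np.tanh(err)             # first derivative of log(cosh(err))
        hess = 1.0 / np.cosh(err) ** 2  # second derivative
        return grad, hess
    return _log_cosh_quantile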
Example #14

# (imports reconstructed for context)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

x, y = load_boston(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True, random_state=66)

model = XGBRegressor(n_estimators=100, learning_rate=0.1)  # the number of trees (n_estimators) plays the role of epochs

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric=["logloss", "rmse"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=100)
# in eval_set, validation_0 is (x_train, y_train) and validation_1 is (x_test, y_test)
# {'validation_0': {'rmse': [21.584942, 19.552324, 17.718475]}, 'validation_1': {'rmse': [21.684599, 19.621567, 17.763321]}}

# the train/test/validation metrics are what matter here

# rmse, mae, logloss, error (the complement of accuracy), auc (closely related to accuracy)

results = model.evals_result()  # available on XGBoost models
print("eval's results : ", results)

y_pred = model.predict(x_test)

r2 = r2_score(y_test, y_pred)  # ground truth goes first in r2_score
# print("r2 Score : %.2f%%:" %(r2*100.0))
print("r2 : ", r2)

import matplotlib.pyplot as plt

epochs = len(results['validation_0']['logloss'])    # number of boosting rounds actually run
x_axis = range(0, epochs)

fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
# (snippet truncated here; completed following the pattern of Example #4)
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
plt.ylabel('Log Loss')
plt.title('XGBoost Log Loss')
plt.show()
Example #15

# (snippet begins at the tail of get_feature_importance_data, whose body is truncated)
    return (X_train, y_train), (X_test, y_test)


# Get training and test data
target_colnames = ['Open_x', 'High_x', 'Low_x', 'Close_x']
Path('./Feature_Engineering').mkdir(parents=True, exist_ok=True)


for colname in target_colnames:
    (X_train, y_train), (X_test, y_test) = get_feature_importance_data(data, column=colname, include_targets=False)

    regressor = XGBRegressor(gamma=0.0, n_estimators=200, learning_rate=0.05)
    xgbModel = regressor.fit(X_train, y_train,
                             eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

    eval_result = regressor.evals_result()
    training_rounds = range(len(eval_result['validation_0']['rmse']))


    #########################
    # Train Validation Plot #
    #########################
    # plt.scatter(x=training_rounds, y=eval_result['validation_0']['rmse'], label='Training Error')
    # plt.scatter(x=training_rounds, y=eval_result['validation_1']['rmse'], label='Validation Error')
    # plt.xlabel('Iterations')
    # plt.ylabel('RMSE')
    # plt.title('Training Vs. Validation Error')
    # plt.legend()
    # plt.savefig(f'./Feature_Engineering/{colname}_train_val_history.png')