Example #1
import logging

import numpy as np
import pandas as pd
import gpboost as gpb
def get_booster_model(data_train, groups_train):
    """Gets model and define its parameters. For finding the optimal number
    of iterations, cross-validation is applied.

    Parameters
    ----------
    data_train: Train data readable for the package gpbooster,
    should contain the information about X_train and y_train
    groups_train: Group indices

    Returns
    -------
    gp_model
        Instance of the Gradient Tree boosting model with random effects
    params
        Parameters with which the model should be trained
    opt_num_boost_rounds
        Optimal number of boosting rounds for the training, found with cross-
        validation
        """
    logging.info('Getting booster model')
    gp_model = gpb.GPModel(group_data=groups_train)
    gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
    params = {
        'objective': 'regression_l2',
        'learning_rate': 0.05,
        'max_depth': 6,
        'min_data_in_leaf': 5,
        'verbose': 0
    }
    logging.info('Calculating optimal number of boost rounds '
                 'via cross-validation')
    cvbst = gpb.cv(params=params,
                   train_set=data_train,
                   gp_model=gp_model,
                   use_gp_model_for_validation=True,
                   num_boost_round=300,
                   early_stopping_rounds=5,
                   nfold=3,
                   verbose_eval=False,
                   show_stdv=False,
                   seed=1)
    # Iteration with the lowest mean cross-validated l2 loss
    opt_num_boost_rounds = np.argmin(cvbst['l2-mean'])
    return gp_model, params, opt_num_boost_rounds
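
# A minimal, self-contained sketch of input data (illustrative assumptions
# only, not part of the original example): 500 samples, 9 features named
# 'variable_1' ... 'variable_9', and 50 groups with a random intercept each.
rng = np.random.default_rng(1)
n, n_groups = 500, 50
X_train = pd.DataFrame(rng.standard_normal((n, 9)),
                       columns=['variable_%d' % i for i in range(1, 10)])
groups_train = rng.integers(0, n_groups, n)      # group index per sample
group_effects = rng.standard_normal(n_groups)    # simulated random intercepts
y_train = X_train['variable_2'] + group_effects[groups_train] + rng.standard_normal(n)
data_train = gpb.Dataset(X_train, y_train)

# Hypothetical held-out test split, generated the same way
X_test = pd.DataFrame(rng.standard_normal((n, 9)), columns=X_train.columns)
groups_test = rng.integers(0, n_groups, n)
y_test = X_test['variable_2'] + group_effects[groups_test] + rng.standard_normal(n)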
# Train model with the optimal number of boosting rounds found via cross-validation
gp_model, params, opt_num_boost_rounds = get_booster_model(data_train, groups_train)
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model,
                num_boost_round=opt_num_boost_rounds)
gp_model.summary() # estimated covariance parameters
# Covariance parameters in the following order:
# ['Error_term', 'Group_1']
# [0.9183072 1.013057 ]

# Make predictions
pred = bst.predict(data=X_test, group_data_pred=groups_test)
y_pred = pred['fixed_effect'] + pred['random_effect_mean'] # sum predictions of fixed effect and random effect
np.sqrt(np.mean((y_test - y_pred) ** 2)) # root mean square error (RMSE) on test data. Approx. = 1.25
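
# Sketch (assumption, not in the original example): for group labels unseen
# during training, the predicted random effect mean is zero, so the
# prediction reduces to the fixed (tree ensemble) effect.
groups_unseen = np.full(len(X_test), n_groups + 1)  # hypothetical new group label
pred_new = bst.predict(data=X_test, group_data_pred=groups_unseen)
y_pred_new = pred_new['fixed_effect'] + pred_new['random_effect_mean']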

# Parameter tuning using cross-validation (only number of boosting iterations)
gp_model = gpb.GPModel(group_data=groups_train)
cvbst = gpb.cv(params=params, train_set=data_train,
               gp_model=gp_model, use_gp_model_for_validation=False,
               num_boost_round=100, early_stopping_rounds=5,
               nfold=4, verbose_eval=True, show_stdv=False, seed=1)
best_iter = np.argmin(cvbst['l2-mean'])
print("Best number of iterations: " + str(best_iter))
# Best number of iterations: 32
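
# Sketch of tuning further hyperparameters with gpboost's grid search helper
# (assumption: the call to gpb.grid_search_tune_parameters and the grid below
# are illustrative, not part of the original example).
param_grid = {'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [3, 6, 10],
              'min_data_in_leaf': [5, 20, 100]}
opt_params = gpb.grid_search_tune_parameters(
    param_grid=param_grid, params=params, num_try_random=None, nfold=4,
    train_set=data_train, gp_model=gp_model, use_gp_model_for_validation=True,
    verbose_eval=1, num_boost_round=300, early_stopping_rounds=5, seed=1)
print(opt_params)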

# --------------------Model interpretation----------------
# Plotting feature importances
gpb.plot_importance(bst)
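
# SHAP values as an alternative view on feature effects (assumption: shap's
# TreeExplainer handles the GPBoost booster like a LightGBM model; this
# block is illustrative).
import shap
shap_values = shap.TreeExplainer(bst).shap_values(X_test)
shap.summary_plot(shap_values, X_test)
shap.dependence_plot('variable_2', shap_values, X_test)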

# Partial dependence plots
from pdpbox import pdp
# Single variable plots (takes a few seconds to compute)
pdp_dist = pdp.pdp_isolate(model=bst, dataset=X_train, model_features=X_train.columns,
                           feature='variable_2', num_grid_points=50)
pdp.pdp_plot(pdp_dist, 'variable_2', plot_lines=True)
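
# Two-variable interaction plot (assumption: same legacy pdpbox API as above;
# the feature pair is illustrative).
pdp_interact = pdp.pdp_interact(model=bst, dataset=X_train,
                                model_features=X_train.columns,
                                features=['variable_1', 'variable_2'])
pdp.pdp_interact_plot(pdp_interact, ['variable_1', 'variable_2'],
                      plot_type='contour')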