Example 1
def test_gpboost():
    try:
        import gpboost
    except ImportError:
        print("Skipping test_gpboost!")
        return
    import numpy as np  # used in the additivity assertion below
    import shap

    # train gpboost model
    X, y = shap.datasets.boston()
    data_train = gpboost.Dataset(X, y, categorical_feature=[8])
    model = gpboost.train(params={'objective': 'regression_l2', 'learning_rate': 0.1, 'verbose': 0},
                          train_set=data_train, num_boost_round=10)

    # explain the model's predictions using SHAP values
    ex = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
    shap_values = ex.shap_values(X)

    predicted = model.predict(X, raw_score=True)

    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-4, \
        "SHAP values don't sum to model output!"
Example 2
def train(X):
    """Trains the Gradient tree boosting model with random effects.
    It automatically reads and processes the data from the database.
    Parameters
    ----------
    X: Covariable-cleaned database

    Returns
    -------
    bst
        The trained Booster model.
    X_train
        Train data
    y_train
        Response train data
    data_train
        Train data readable for the package gpbooster, contains the information
        about X_train and y_train
    groups_train
        Group indices
    gp_model
        Instance of the Gradient Tree boosting model with random effects
    opt_num_boost_rounds
        Optimal number of boosting rounds for the training, found with cross-
        validation
    """
    logging.info('Starting training')
    X_train, y_train, data_train, groups_train = retrieve_training_dataset(X)
    gp_model, params, opt_num_boost_rounds = get_booster_model(
        data_train, groups_train)
    bst = gpb.train(params=params,
                    train_set=data_train,
                    gp_model=gp_model,
                    num_boost_round=opt_num_boost_rounds)
    return (bst, X_train, y_train, data_train, groups_train, gp_model,
            opt_num_boost_rounds)
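A minimal usage sketch for the function above (hedged: retrieve_training_dataset
and get_booster_model are project-specific helpers, and X stands for the
cleaned covariate data):

(bst, X_train, y_train, data_train, groups_train, gp_model,
 opt_num_boost_rounds) = train(X)
pred = bst.predict(data=X_train, group_data_pred=groups_train)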
Example 3
# split train and test data
y_train = y[0:ntrain]
y_test = y[ntrain:n]
X_train = X.iloc[0:ntrain, :]
X_test = X.iloc[ntrain:n, :]

# --------------------Learning and prediction----------------
# Define and train GPModel
gp_model = gpb.GPModel(group_data=group_train)
# create dataset for gpb.train function
data_train = gpb.Dataset(X_train, y_train)
# specify tree-boosting parameters as a dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}
# train model
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=32)
gp_model.summary() # estimated covariance parameters
# Covariance parameters in the following order:
# ['Error_term', 'Group_1']
# [0.9183072 1.013057 ]

# Make predictions
pred = bst.predict(data=X_test, group_data_pred=group_test)
y_pred = pred['fixed_effect'] + pred['random_effect_mean'] # sum predictions of fixed effect and random effect
np.sqrt(np.mean((y_test - y_pred) ** 2)) # root mean square error (RMSE) on test data. Approx. = 1.25

# Parameter tuning using cross-validation (only number of boosting iterations)
gp_model = gpb.GPModel(group_data=group_train)
cvbst = gpb.cv(params=params, train_set=data_train,
               gp_model=gp_model, use_gp_model_for_validation=False,
               num_boost_round=100, early_stopping_rounds=5)
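# A possible follow-up sketch: gpb.cv returns a dict of metric histories in the
# LightGBM style; the 'l2-mean' key is an assumption based on the
# 'regression_l2' objective.
best_iter = np.argmin(cvbst['l2-mean'])
print("Best number of iterations: " + str(best_iter))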
Example 4
gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}

print('Starting training...')
# train
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=16)
print("Estimated random effects model")
gp_model.summary()

print('Starting predicting...')
# predict
group_test = np.arange(m)
Xtest = np.zeros((m, 2))
Xtest[:, 0] = np.linspace(0, 1, m)
pred = bst.predict(data=Xtest, group_data_pred=group_test)
# Compare true and predicted random effects
plt.figure("Comparison of true and predicted random effects")
plt.scatter(b1, pred['random_effect_mean'])
plt.title("Comparison of true and predicted random effects")
plt.xlabel("truth")
Example 5
# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'metric': {'l2', 'l1'},
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}

print('Starting training...')
# train
evals_result = {}  # record eval results for plotting
bst = gpb.train(params=params,
                train_set=data_train,
                num_boost_round=100,
                valid_sets=data_eval,
                early_stopping_rounds=5,
                evals_result=evals_result)

# plot validation scores
gpb.plot_metric(evals_result, metric='l1', figsize=(10, 5))
plt.show()

print('Saving model...')
# save model to file
bst.save_model('model.txt')
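# A possible follow-up sketch: reload the saved model. gpboost mirrors the
# LightGBM Booster API, so Booster(model_file=...) is assumed to be available.
bst_loaded = gpb.Booster(model_file='model.txt')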

print('Starting predicting...')
# predict
y_pred = bst.predict(Xtest, num_iteration=bst.best_iteration)
# eval
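# A minimal eval sketch (assumption: y_test and np are defined elsewhere in
# this example; they are not shown in the snippet above):
print('RMSE of prediction: ' + str(np.sqrt(np.mean((y_pred - y_test) ** 2))))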
Example 6
# Define random effects model (assuming firm and year random effects)
gp_model = gpb.GPModel(group_data=data[['firm', 'year']])
# Create dataset for gpb.train
data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest'])
# Specify boosting parameters as dict
# Note: no attempt has been made to optimally choose the tuning parameters
params = {
    'objective': 'regression_l2',
    'learning_rate': 1,
    'max_depth': 6,
    'min_data_in_leaf': 1,
    'verbose': 0
}
# Train GPBoost model
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=1800)
# Estimated random effects model (variances of random effects)
gp_model.summary()

# Cross-validation for determining number of boosting iterations
gp_model = gpb.GPModel(group_data=data[['firm', 'year']])
data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest'])
cvbst = gpb.cv(params=params,
               train_set=data_train,
               gp_model=gp_model,
               use_gp_model_for_validation=True,
               num_boost_round=5000,
               early_stopping_rounds=5,
               nfold=2,
               verbose_eval=True)
Example 7
if likelihood == "poisson":
    mu = np.exp(f + eps)
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)
fig1, ax1 = plt.subplots()
ax1.hist(y, bins=50)  # visualize response variable

#--------------------Training----------------
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# Train model
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
# Use the option "trace": True to monitor convergence of hyperparameter estimation of the gp_model. E.g.:
# gp_model.set_optim_params(params={"trace": True})
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=num_boost_round)
gp_model.summary()  # Trained random effects model (true variance = 0.5)

# Showing training loss
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=num_boost_round,
                valid_sets=data_train)

#--------------------Prediction----------------
nplot = 200  # number of predictions
X_test_plot = np.column_stack((np.linspace(0, 1, nplot), np.zeros(nplot)))
group_data_pred = -np.ones(nplot) # only new / unobserved groups
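# A hedged completion sketch: predict the response for these new groups,
# mirroring the predict call used in Example 9 (argument names assumed from
# that example).
pred_resp = bst.predict(data=X_test_plot,
                        group_data_pred=group_data_pred,
                        raw_score=False)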
Example 8
plt.scatter(x_test[:, 1], y_test)
plt.scatter(x_test[:, 0], y_test)
# %% training
gp_model = gpb.GPModel(gp_coords=coords_train, cov_function="exponential")
data_train = gpb.Dataset(x_train, y)
params = {
    'objective': 'rmse',
    'learning_rate': 0.01,
    'max_depth': 3,
    'min_data_in_leaf': 10,
    'num_leaves': 2**10,
    'verbose': -1
}

bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=247)
print('estimated covariance parameters')
gp_model.summary()
# %% predict
pred = bst.predict(data=x_test, gp_coords_pred=coords_test, predict_var=True)
y_pred = pred['fixed_effect'] + pred['random_effect_mean']
print("Mean square error (MSE): " + str(np.mean((y_pred - y_test)**2)))
# Mean square error (MSE): 0.3942885572834001
# In my environment: Mean square error (MSE): 0.6205247584612723
# %%
plt.scatter(y_pred, y_test)
# %%

shap_values = shap.TreeExplainer(bst).shap_values(x_train)
shap.summary_plot(shap_values, x_train)
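# A possible follow-up using the standard shap API: a dependence plot for a
# single feature (index 0 here is only an illustration).
shap.dependence_plot(0, shap_values, x_train)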
Example 9
if likelihood == "poisson":
    mu = np.exp(f + eps)
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)
fig1, ax1 = plt.subplots()
ax1.hist(y, bins=50)  # visualize response variable

# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# Train model
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
# Use the option "trace": True to monitor convergence of hyperparameter estimation of the gp_model. E.g.:
# gp_model.set_optim_params(params={"trace": True})
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=num_boost_round)
gp_model.summary()  # Trained random effects model

# Make predictions
nplot = 200  # number of predictions
X_test_plot = np.column_stack((np.linspace(0, 1, nplot), np.zeros(nplot)))
group_data_pred = -np.ones(nplot)
# Predict response variable
pred_resp = bst.predict(data=X_test_plot,
                        group_data_pred=group_data_pred,
                        raw_score=False)
# Predict latent variable including variance
pred = bst.predict(data=X_test_plot,
                   group_data_pred=group_data_pred,
                   predict_var=True,
                   raw_score=True)
Example 10
lp_test = lp_test * 5 + 0.2
y = np.random.normal(loc=lp, scale=1)
y_test = np.random.normal(loc=lp_test, scale=1)
# apply censoring
yu = 8
yl = 5
y[y >= yu] = yu
y[y <= yl] = yl
# censoring fractions
print(np.sum(y == yu) / n)
print(np.sum(y == yl) / n)

# train model and make predictions
params = {'objective': 'tobit', 'verbose': 0, 'yl': yl, 'yu': yu}
dtrain = gpb.Dataset(X, y)
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
y_pred = bst.predict(X_test)
# mean square error (approx. 1.1 for n=10'000)
print("Test error of Grabit: " + str(((y_pred - y_test)**2).mean()))
# compare to standard least squares gradient boosting (approx. 1.8 for n=10'000)
params = {'objective': 'regression_l2', 'verbose': 0}
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
y_pred_ls = bst.predict(X_test)
print("Test error of standard least squares gradient boosting: " +
      str(((y_pred_ls - y_test)**2).mean()))

# measure time
import time
params = {'objective': 'tobit', 'verbose': 0, 'yl': yl, 'yu': yu}
dtrain = gpb.Dataset(X, y)
start = time.time()
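# A sketch of how this timing pattern presumably continues (assumption: Grabit
# training time is what is measured).
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
end = time.time()
print("Training time: " + str(end - start) + " seconds")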