def test_gpboost():
    try:
        import gpboost
    except ImportError:
        print("Skipping test_gpboost!")
        return
    import numpy as np  # needed for the assertion below
    import shap

    # train gpboost model
    X, y = shap.datasets.boston()
    data_train = gpboost.Dataset(X, y, categorical_feature=[8])
    model = gpboost.train(params={'objective': 'regression_l2',
                                  'learning_rate': 0.1, 'verbose': 0},
                          train_set=data_train, num_boost_round=10)

    # explain the model's predictions using SHAP values
    ex = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
    shap_values = ex.shap_values(X)
    predicted = model.predict(X, raw_score=True)

    # the SHAP values plus the expected value must reproduce the raw model output
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-4, \
        "SHAP values don't sum to model output!"
def train(X):
    """Trains the gradient tree boosting model with random effects.

    It automatically reads and processes the data from the database.

    Parameters
    ----------
    X : Covariable-cleaned database

    Returns
    -------
    bst
        The trained Booster model.
    X_train
        Train data.
    y_train
        Response train data.
    data_train
        Train data readable by the gpboost package; contains the information
        about X_train and y_train.
    groups_train
        Group indices.
    gp_model
        Instance of the gradient tree boosting model with random effects.
    opt_num_boost_rounds
        Optimal number of boosting rounds for the training, found with
        cross-validation.
    """
    logging.info('Starting the train')
    X_train, y_train, data_train, groups_train = retrieve_training_dataset(X)
    gp_model, params, opt_num_boost_rounds = get_booster_model(data_train,
                                                               groups_train)
    bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model,
                    num_boost_round=opt_num_boost_rounds)
    return (bst, X_train, y_train, data_train, groups_train, gp_model,
            opt_num_boost_rounds)
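The helpers retrieve_training_dataset and get_booster_model are not shown here. A minimal sketch of what get_booster_model might look like, assuming the number of boosting rounds is chosen with gpb.cv under the default l2 metric; the helper body and parameter values are hypothetical, only the gpb calls are the library's actual API:

import numpy as np
import gpboost as gpb

def get_booster_model(data_train, groups_train):
    """Hypothetical helper: define the random effects model and pick the
    number of boosting rounds by cross-validation."""
    gp_model = gpb.GPModel(group_data=groups_train)
    params = {'objective': 'regression_l2', 'learning_rate': 0.1,
              'max_depth': 6, 'min_data_in_leaf': 5, 'verbose': 0}
    cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model,
                   use_gp_model_for_validation=True, num_boost_round=1000,
                   early_stopping_rounds=5, nfold=4, verbose_eval=False)
    # pick the iteration with the lowest mean validation l2
    opt_num_boost_rounds = int(np.argmin(cvbst['l2-mean']))
    return gp_model, params, opt_num_boost_rounds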
# split train and test data
y_train = y[0:ntrain]
y_test = y[ntrain:n]
X_train = X.iloc[0:ntrain, :]
X_test = X.iloc[ntrain:n, :]

# --------------------Learning and prediction----------------
# Define and train GPModel
gp_model = gpb.GPModel(group_data=group_train)
# create dataset for gpb.train function
data_train = gpb.Dataset(X_train, y_train)
# specify tree-boosting parameters as a dict
params = {'objective': 'regression_l2', 'learning_rate': 0.1,
          'max_depth': 6, 'min_data_in_leaf': 5, 'verbose': 0}
# train model
bst = gpb.train(params=params, train_set=data_train,
                gp_model=gp_model, num_boost_round=32)
gp_model.summary()  # estimated covariance parameters
# Covariance parameters in the following order:
# ['Error_term', 'Group_1']
# [0.9183072 1.013057 ]

# Make predictions: sum the predictions of the fixed effect and the random effect
pred = bst.predict(data=X_test, group_data_pred=group_test)
y_pred = pred['fixed_effect'] + pred['random_effect_mean']
np.sqrt(np.mean((y_test - y_pred) ** 2))  # root mean squared error (RMSE) on test data, approx. 1.25

# Parameter tuning using cross-validation (only the number of boosting iterations)
gp_model = gpb.GPModel(group_data=group_train)
cvbst = gpb.cv(params=params, train_set=data_train,
               gp_model=gp_model, use_gp_model_for_validation=False,
               num_boost_round=100, early_stopping_rounds=5)
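gpb.cv returns a dict of per-iteration validation metrics, so the optimal number of rounds can be read off afterwards. A minimal usage sketch, assuming the default l2 metric (hence the 'l2-mean' key):

best_iter = int(np.argmin(cvbst['l2-mean']))  # iteration with the lowest mean validation l2
print("Best number of iterations: " + str(best_iter))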
gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# specify your configurations as a dict
params = {'objective': 'regression_l2', 'learning_rate': 0.05,
          'max_depth': 6, 'min_data_in_leaf': 5, 'verbose': 0}

print('Starting training...')
# train
bst = gpb.train(params=params, train_set=data_train,
                gp_model=gp_model, num_boost_round=16)
print("Estimated random effects model")
gp_model.summary()

print('Starting predicting...')
# predict
group_test = np.arange(m)
Xtest = np.zeros((m, 2))
Xtest[:, 0] = np.linspace(0, 1, m)
pred = bst.predict(data=Xtest, group_data_pred=group_test)

# Compare true and predicted random effects
plt.figure("Comparison of true and predicted random effects")
plt.scatter(b1, pred['random_effect_mean'])
plt.title("Comparison of true and predicted random effects")
plt.xlabel("truth")
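The snippet breaks off after the x-axis label; presumably the figure is completed with the matching y-axis label along these lines:

plt.ylabel("predicted")  # assumed completion of the truncated plotting block
plt.show()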
# specify your configurations as a dict
params = {'objective': 'regression_l2', 'metric': {'l2', 'l1'},
          'learning_rate': 0.1, 'max_depth': 6,
          'min_data_in_leaf': 5, 'verbose': 0}

print('Starting training...')
# train, recording evaluation results for plotting
evals_result = {}
bst = gpb.train(params=params, train_set=data_train,
                num_boost_round=100, valid_sets=data_eval,
                early_stopping_rounds=5, evals_result=evals_result)
# plot validation scores
gpb.plot_metric(evals_result, metric='l1', figsize=(10, 5))
plt.show()

print('Saving model...')
# save model to file
bst.save_model('model.txt')

print('Starting predicting...')
# predict
y_pred = bst.predict(Xtest, num_iteration=bst.best_iteration)
# eval
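A model saved with save_model can later be reloaded without retraining; a minimal sketch using the Booster constructor's model_file argument:

# reload the saved model from disk and predict with it
bst_loaded = gpb.Booster(model_file='model.txt')
y_pred_loaded = bst_loaded.predict(Xtest)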
# Define random effects model (assuming firm and year random effects)
gp_model = gpb.GPModel(group_data=data[['firm', 'year']])
# Create dataset for gpb.train
data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest'])
# Specify boosting parameters as dict
# Note: no attempt has been made to optimally choose the tuning parameters
params = {'objective': 'regression_l2', 'learning_rate': 1,
          'max_depth': 6, 'min_data_in_leaf': 1, 'verbose': 0}

# Train GPBoost model
bst = gpb.train(params=params, train_set=data_train,
                gp_model=gp_model, num_boost_round=1800)
# Estimated random effects model (variances of random effects)
gp_model.summary()

# Cross-validation for determining the number of boosting iterations
gp_model = gpb.GPModel(group_data=data[['firm', 'year']])
data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest'])
cvbst = gpb.cv(params=params, train_set=data_train,
               gp_model=gp_model, use_gp_model_for_validation=True,
               num_boost_round=5000, early_stopping_rounds=5,
               nfold=2, verbose_eval=True)
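For completeness, predictions with this two-level random effects model sum the fixed-effect and random-effect components, with the firm and year columns passed as group_data_pred. A hedged in-sample sketch:

# in-sample prediction: tree ensemble (fixed effect) plus firm/year random effects
pred = bst.predict(data=data[['value', 'capital']],
                   group_data_pred=data[['firm', 'year']])
y_pred = pred['fixed_effect'] + pred['random_effect_mean']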
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)
fig1, ax1 = plt.subplots()
ax1.hist(y, bins=50)  # visualize response variable

# --------------------Training----------------
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# Train model
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
# Use the option "trace": True to monitor convergence of the hyperparameter
# estimation of the gp_model, e.g.:
# gp_model.set_optim_params(params={"trace": True})
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model,
                num_boost_round=num_boost_round)
gp_model.summary()  # Trained random effects model (true variance = 0.5)

# Showing training loss
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model,
                num_boost_round=num_boost_round, valid_sets=data_train)

# --------------------Prediction----------------
nplot = 200  # number of predictions
X_test_plot = np.column_stack((np.linspace(0, 1, nplot), np.zeros(nplot)))
group_data_pred = -np.ones(nplot)  # only new / unobserved groups
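The snippet is cut off before the prediction itself; presumably it continues like the closely related example further below, predicting the response variable for the new groups:

# predict the response variable for the unobserved groups (mirrors the related snippet below)
pred_resp = bst.predict(data=X_test_plot, group_data_pred=group_data_pred,
                        raw_score=False)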
plt.scatter(x_test[:, 1], y_test)
plt.scatter(x_test[:, 0], y_test)

# %% training
gp_model = gpb.GPModel(gp_coords=coords_train, cov_function="exponential")
data_train = gpb.Dataset(x_train, y)
params = {'objective': 'rmse', 'learning_rate': 0.01, 'max_depth': 3,
          'min_data_in_leaf': 10, 'num_leaves': 2 ** 10, 'verbose': -1}
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model,
                num_boost_round=247)
print('estimated covariance parameters')
gp_model.summary()

# %% predict
pred = bst.predict(data=x_test, gp_coords_pred=coords_test, predict_var=True)
y_pred = pred['fixed_effect'] + pred['random_effect_mean']
print("Mean square error (MSE): " + str(np.mean((y_pred - y_test) ** 2)))
# Mean square error (MSE): 0.3942885572834001
# my env: Mean square error (MSE): 0.6205247584612723

# %%
plt.scatter(y_pred, y_test)

# %%
shap_values = shap.TreeExplainer(bst).shap_values(x_train)
shap.summary_plot(shap_values, x_train)
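Beyond the summary plot, SHAP also offers per-feature dependence plots; a small optional sketch (feature index 0 is an arbitrary choice):

# %% dependence plot for a single feature (index 0 chosen arbitrarily)
shap.dependence_plot(0, shap_values, x_train)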
    mu = np.exp(f + eps)
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)  # scale (not shift) by mu
fig1, ax1 = plt.subplots()
ax1.hist(y, bins=50)  # visualize response variable

# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# Train model
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
# Use the option "trace": True to monitor convergence of the hyperparameter
# estimation of the gp_model, e.g.:
# gp_model.set_optim_params(params={"trace": True})
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model,
                num_boost_round=num_boost_round)
gp_model.summary()  # Trained random effects model

# Make predictions
nplot = 200  # number of predictions
X_test_plot = np.column_stack((np.linspace(0, 1, nplot), np.zeros(nplot)))
group_data_pred = -np.ones(nplot)
# Predict response variable
pred_resp = bst.predict(data=X_test_plot, group_data_pred=group_data_pred,
                        raw_score=False)
# Predict latent variable including variance
pred = bst.predict(data=X_test_plot, group_data_pred=group_data_pred,
                   predict_var=True, raw_score=True)  # raw_score=True (assumed) for the latent scale
lp_test = lp_test * 5 + 0.2
y = np.random.normal(loc=lp, scale=1)
y_test = np.random.normal(loc=lp_test, scale=1)
# apply censoring
yu = 8
yl = 5
y[y >= yu] = yu
y[y <= yl] = yl
# censoring fractions
print(np.sum(y == yu) / n)
print(np.sum(y == yl) / n)

# train model and make predictions
params = {'objective': 'tobit', 'verbose': 0, 'yl': yl, 'yu': yu}
dtrain = gpb.Dataset(X, y)
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
y_pred = bst.predict(X_test)
# mean squared error (approx. 1.1 for n=10'000)
print("Test error of Grabit: " + str(((y_pred - y_test) ** 2).mean()))

# compare to standard least squares gradient boosting (approx. 1.8 for n=10'000)
params = {'objective': 'regression_l2', 'verbose': 0}
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
y_pred_ls = bst.predict(X_test)
print("Test error of standard least squares gradient boosting: "
      + str(((y_pred_ls - y_test) ** 2).mean()))

# measure time
import time
params = {'objective': 'tobit', 'verbose': 0, 'yl': yl, 'yu': yu}
dtrain = gpb.Dataset(X, y)
start = time.time()
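The timing block breaks off after start = time.time(); it would presumably be completed along these lines, reusing the training call from above:

# train once more and report the elapsed wall-clock time (assumed completion)
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
print("Training time: " + str(time.time() - start) + " seconds")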