Example #1
def get_booster_model(data_train, groups_train):
    """Gets model and define its parameters. For finding the optimal number
    of iterations, cross-validation is applied.

    Parameters
    ----------
    data_train: Training data in a format readable by the gpboost package;
        should contain the information from X_train and y_train
    groups_train: Group indices for the random effects

    Returns
    -------
    gp_model
        Instance of the random effects model (gpb.GPModel) used together with tree boosting
    params
        Parameters with which the model should be trained
    opt_num_boost_rounds
        Optimal number of boosting rounds for the training, found with cross-
        validation
        """
    logging.info('Getting booster model')
    gp_model = gpb.GPModel(group_data=groups_train)
    gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
    params = {
        'objective': 'regression_l2',
        'learning_rate': 0.05,
        'max_depth': 6,
        'min_data_in_leaf': 5,
        'verbose': 0
    }
    logging.info('Calculating optimal number of boosting rounds '
                 'via cross-validation')
    cvbst = gpb.cv(params=params,
                   train_set=data_train,
                   gp_model=gp_model,
                   use_gp_model_for_validation=True,
                   num_boost_round=300,
                   early_stopping_rounds=5,
                   nfold=3,
                   verbose_eval=False,
                   show_stdv=False,
                   seed=1)
    opt_num_boost_rounds = np.argmin(cvbst['l2-mean'])
    return gp_model, params, opt_num_boost_rounds
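# A possible way to use the values returned by get_booster_model (a sketch; it assumes
# data_train and groups_train are defined as described in the docstring above):
gp_model, params, opt_num_boost_rounds = get_booster_model(data_train, groups_train)
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=int(opt_num_boost_rounds))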
Example #2
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
# incidence matrix relating grouped random effects to samples
Z1 = np.zeros((n, m))
for i in range(m):
    Z1[np.where(group == i), i] = 1
sigma2_1 = 1**2  # random effect variance
sigma2 = 0.5**2  # error variance
np.random.seed(1)
b1 = np.sqrt(sigma2_1) * np.random.normal(size=m)  # simulate random effects
eps = Z1.dot(b1)
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = eps + xi  # observed data

# Define and fit model
gp_model = gpb.GPModel(group_data=group)
gp_model.fit(y=y, std_dev=True)
gp_model.summary()

# Make predictions
group_test = np.arange(m)
pred = gp_model.predict(group_data_pred=group_test)
# Compare true and predicted random effects
plt.scatter(b1, pred['mu'])
plt.title("Comparison of true and predicted random effects")
plt.xlabel("truth")
plt.ylabel("predicted")
plt.show()
# Also predict covariance matrix
pred = gp_model.predict(group_data_pred=np.array([1, 1, 2, 2, -1, -1]),
                        predict_cov_mat=True)
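# The dict returned by predict() contains the predicted mean under 'mu' (used above);
# with predict_cov_mat=True it also contains the predictive covariance matrix. A small
# sketch, assuming the covariance is stored under the key 'cov':
print(pred['mu'])   # predicted means for the six requested group entries
print(pred['cov'])  # 6 x 6 predictive covariance matrix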
Example #3
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
b1 = np.sqrt(0.5) * np.random.normal(size=m)  # simulate random effects
eps = b1[group]
eps = eps - np.mean(eps)
# simulate fixed effects
X = np.random.rand(n, 2)
f = f1d(X[:, 0])
# simulate response variable
probs = stats.norm.cdf(f + eps)
y = np.random.uniform(size=n) < probs
y = y.astype(np.float64)

# --------------------Parameter tuning using cross-validation: deterministic and random grid search----------------
# Create random effects model and Dataset
gp_model = gpb.GPModel(group_data=group, likelihood="bernoulli_probit")
data_train = gpb.Dataset(X, y)
# Other parameters not contained in the grid of tuning parameters
params = {'objective': 'binary', 'verbose': 0, 'num_leaves': 2**10}
# Small grid and deterministic grid search
param_grid_small = {
    'learning_rate': [0.1, 0.01],
    'min_data_in_leaf': [20, 100],
    'max_depth': [5, 10],
    'max_bin': [255, 1000]
}
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid_small,
                                             params=params,
                                             num_try_random=None,
                                             nfold=4,
                                             gp_model=gp_model,
                                             train_set=data_train)
Example #4
# (the beginning of this example, defining n, ntrain, m_test, sigma2, sigma2_1, X, F
#  and group_train, is assumed to precede this snippet)
group_test = np.arange(ntrain)  # grouping variable for the test data
for i in range(m_test):
    group_test[int(i * ntrain / m_test):int((i + 1) * ntrain / m_test)] = i
group = np.concatenate((group_train,group_test))
b = np.sqrt(sigma2_1) * np.random.normal(size=m_test)  # simulate random effects
Zb = b[group]
# Put everything together
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = F + Zb + xi  # observed data
# split train and test data
y_train = y[0:ntrain]
y_test = y[ntrain:n]
X_train = X.iloc[0:ntrain,]
X_test = X.iloc[ntrain:n,]

# --------------------Learning and prediction----------------
# Define and train GPModel
gp_model = gpb.GPModel(group_data=group_train)
# create dataset for gpb.train function
data_train = gpb.Dataset(X_train, y_train)
# specify tree-boosting parameters as a dict
params = { 'objective': 'regression_l2', 'learning_rate': 0.1,
    'max_depth': 6, 'min_data_in_leaf': 5, 'verbose': 0 }
# train model
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=32)
gp_model.summary() # estimated covariance parameters
# Covariance parameters in the following order:
# ['Error_term', 'Group_1']
# [0.9183072 1.013057 ]

# Make predictions
pred = bst.predict(data=X_test, group_data_pred=group_test)
y_pred = pred['fixed_effect'] + pred['random_effect_mean'] # sum predictions of fixed effect and random effect
Example #5
# (earlier parts of this example, defining n, m, group, b, b_crossed, group_crossed,
#  x and xi, are assumed to precede this snippet)
b_random_slope = 0.75 * np.random.normal(size=m)  # simulate random slopes
y_crossed_random_slope = (b[group] + b_crossed[group_crossed] +
                          x * b_random_slope[group] + xi)  # observed data
# Simulate data for two nested random effects
m_nested = 200  # number of categories / levels for the second nested grouping variable
group_nested = np.arange(
    n)  # grouping variable for nested lower level random effects
for i in range(m_nested):
    group_nested[int(i * n / m_nested):int((i + 1) * n / m_nested)] = i
b_nested = 1. * np.random.normal(
    size=m_nested)  # nested lower level random effects
y_nested = b[group] + b_nested[group_nested] + xi  # observed data

# --------------------Grouped random effects model: single-level random effect----------------
# --------------------Training----------------
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
gp_model.fit(y=y, X=X, params={"std_dev": True})
gp_model.summary()
# Use other optimization specifications (gradient descent with Nesterov acceleration)
# and monitor convergence of optimization ("trace": True)
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
gp_model.fit(y=y,
             X=X,
             params={
                 "optimizer_cov": "gradient_descent",
                 "lr_cov": 0.1,
                 "std_dev": True,
                 "use_nesterov_acc": True,
                 "maxit": 100,
                 "trace": True
             })
Example #6
group = np.arange(n)  # grouping variable
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
# incidence matrix relating grouped random effects to samples
Z1 = np.zeros((n, m))
for i in range(m):
    Z1[np.where(group == i), i] = 1
sigma2_1 = 1 ** 2  # random effect variance
sigma2 = 0.1 ** 2  # error variance
b1 = np.sqrt(sigma2_1) * np.random.normal(size=m)  # simulate random effects
eps = Z1.dot(b1)
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = F + eps + xi  # observed data

# define GPModel
gp_model = gpb.GPModel(group_data=group)
gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}

print('Starting training...')
# train
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model)  # further options (e.g. num_boost_round) omitted here
Example #7
# (setup of n, m, group, f, eps, X and the `likelihood` variable is assumed to precede this snippet)
if likelihood == "bernoulli_probit":
    probs = stats.norm.cdf(f + eps)
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "bernoulli_logit":
    probs = 1 / (1 + np.exp(-(f + eps)))
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "poisson":
    mu = np.exp(f + eps)
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)  # gamma with shape 1 and mean mu
plt.hist(y, bins=50)  # visualize response variable

# Train model
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
gp_model.fit(
    y=y, X=X)  # use option params={"trace": True} for monitoring convergence
gp_model.summary()

# Make predictions
group_test = np.arange(m)
X_test = np.column_stack((np.ones(m), np.zeros(m)))
# Predict latent variable
pred_lin = gp_model.predict(X_pred=X_test,
                            group_data_pred=group_test,
                            predict_var=True,
                            predict_response=False)
print(pred_lin['mu'][0:5])  # Predicted latent mean
print(pred_lin['var'][0:5])  # Predicted latent variance
# Predict response variable
pred_resp = gp_model.predict(X_pred=X_test, group_data_pred=group_test,
                             predict_var=True, predict_response=True)
print(pred_resp['mu'][0:5])  # Predicted mean of the response variable
Example #8
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
b1 = np.random.normal(size=m)  # simulate random effects
eps = b1[group]
# simulate fixed effects
def f1d(x):
    """Non-linear function for simulation"""
    return (1.7 * (1 / (1 + np.exp(-(x - 0.5) * 20)) + 0.75 * x))
X = np.random.rand(n, 2)
f = f1d(X[:, 0])
xi = np.sqrt(0.01) * np.random.normal(size=n)  # simulate error term
y = f + eps + xi  # observed data

print('Starting training...')
# define GPModel
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
# train
bst = gpb.GPBoostRegressor(max_depth=6,
                           learning_rate=0.05,
                           min_data_in_leaf=5,
                           n_estimators=15)
bst.fit(X, y, gp_model=gp_model)
print("Estimated random effects model")
gp_model.summary()

print('Starting predicting...')
# predict
group_test = np.arange(m)
Xtest = np.zeros((m, 2))
Xtest[:, 0] = np.linspace(0, 1, m)
pred = bst.predict(X=Xtest, group_data_pred=group_test)
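# A possible visual check of the fitted fixed-effect function against the true f1d used
# in the simulation (a sketch; it assumes that, as in the gpb.train examples above, the
# prediction is returned as a dict with a 'fixed_effect' entry):
plt.plot(Xtest[:, 0], f1d(Xtest[:, 0]), label="true F(X)")
plt.plot(Xtest[:, 0], pred['fixed_effect'], label="predicted F(X)")
plt.legend()
plt.title("Comparison of true and fitted fixed-effect function")
plt.show()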
Example #9
group = np.arange(n)  # grouping variable
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
# incidence matrix relating grouped random effects to samples
Z1 = np.zeros((n, m))
for i in range(m):
    Z1[np.where(group == i), i] = 1
sigma2_1 = 1**2  # random effect variance
sigma2 = 0.1**2  # error variance
b1 = np.sqrt(sigma2_1) * np.random.normal(size=m)  # simulate random effects
eps = Z1.dot(b1)
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = F + eps + xi  # observed data

# define GPModel
gp_model = gpb.GPModel(group_data=group)
gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}

print('Starting cross-validation...')
# do cross-validation
cvbst = gpb.cv(params=params, train_set=data_train,
               gp_model=gp_model)  # further gpb.cv options (nfold, num_boost_round, ...) as in Example #1
Example #10
import gpboost as gpb
from statsmodels.datasets import grunfeld
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Load data
data = grunfeld.load_pandas().data
# Visualize response variable
plt.hist(data['invest'], bins=50)
plt.title("Histogram of response variable")
"""
Boosting with two crossed firm and year grouped random effects
"""
# Define random effects model (assuming firm and year random effects)
gp_model = gpb.GPModel(group_data=data[['firm', 'year']])
# Create dataset for gpb.train
data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest'])
# Specify boosting parameters as dict
# Note: no attempt has been made to optimally choose the tuning parameters
params = {
    'objective': 'regression_l2',
    'learning_rate': 1,
    'max_depth': 6,
    'min_data_in_leaf': 1,
    'verbose': 0
}
# Train GPBoost model
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model)  # further options (e.g. num_boost_round) omitted here
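# A sketch of how predictions on the training data could then be obtained, following the
# bst.predict pattern used in the earlier examples (firm and year columns as group data):
pred = bst.predict(data=data[['value', 'capital']],
                   group_data_pred=data[['firm', 'year']])
y_pred = pred['fixed_effect'] + pred['random_effect_mean']  # combined prediction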
Example #11
# %% test data (generated in the same way as the training data)
x_test = np.random.rand(n_test * n_test, 2)
F_x_test = f1d(x_test[:, 0])
xi_test = np.sqrt(sigma2) * np.random.normal(size=n_test * n_test)
y_test = F_x_test + b_test + xi_test

# %% check: visualize training data
plt.scatter(x_train[:, 1], y)
plt.scatter(x_train[:, 0], y)

# %% visualize test data
plt.scatter(x_test[:, 1], y_test)
plt.scatter(x_test[:, 0], y_test)
# %% training
gp_model = gpb.GPModel(gp_coords=coords_train, cov_function="exponential")
data_train = gpb.Dataset(x_train, y)
params = {
    'objective': 'rmse',
    'learning_rate': 0.01,
    'max_depth': 3,
    'min_data_in_leaf': 10,
    'num_leaves': 2**10,
    'verbose': -1
}

bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=247)
print('estimated covariance parameters')
gp_model.summary()
Example #12
# (setup of n, m, group, f, eps, X and the `likelihood` variable is assumed to precede this snippet)
if likelihood == "bernoulli_probit":
    probs = stats.norm.cdf(f + eps)
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "bernoulli_logit":
    probs = 1 / (1 + np.exp(-(f + eps)))
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "poisson":
    mu = np.exp(f + eps)
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)
plt.hist(y, bins=50)  # visualize response variable

# --------------------Train model----------------
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
gp_model.fit(y=y, X=X)  # use params={"trace": True} for monitoring convergence
gp_model.summary()

# --------------------Make predictions----------------
group_test = np.arange(m)
X_test = np.column_stack((np.ones(m), np.zeros(m)))
# Predict latent variable
pred = gp_model.predict(X_pred=X_test, group_data_pred=group_test,
                        predict_var=True, predict_response=False)
print(pred['mu'][0:5])  # Predicted latent mean
print(pred['var'][0:5])  # Predicted latent variance
# Predict response variable
pred_resp = gp_model.predict(X_pred=X_test, group_data_pred=group_test,
                             predict_var=True, predict_response=True)
print(pred_resp['mu'][0:5])  # Predicted response variable (label)
Example #13
np.random.seed(1)
# Simulate grouped random effects
group = np.arange(n)  # grouping variable
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
b1 = np.random.normal(size=m)  # simulate random effects
eps = b1[group]
# Simulate fixed effects
X = np.random.rand(n, 2)
f = f1d(X[:, 0])
xi = np.sqrt(0.01) * np.random.normal(size=n)  # simulate error term
y = f + eps + xi  # observed data

# --------------------Training----------------
# Define GPModel
gp_model = gpb.GPModel(group_data=group)
# The default optimizer for covariance parameters (hyperparameters) is Fisher scoring.
# This can be changed as follows:
# gp_model.set_optim_params(params={"optimizer_cov": "gradient_descent", "lr_cov": 0.05,
#                                   "use_nesterov_acc": True, "acc_rate_cov": 0.5})
# Use the option "trace": True to monitor convergence of the hyperparameter estimation of the gp_model, e.g.:
# gp_model.set_optim_params(params={"trace": True})
# Create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# Specify boosting parameters as dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0