Beispiel #1
0
def test_sklearn_poisson_regression(nps_app_inst: ArrayApplication):
    """Print D^2 for the model under test and for scikit-learn's regressor.

    Both models are fitted on the same five-sample, single-feature data set
    whose target is an exact exponential of a linear function of the feature.
    The test only prints the two scores; it makes no assertions.
    """
    from sklearn.linear_model import PoissonRegressor as SKPoissonRegressor

    def explained_deviance(deviance, targets, predictions):
        # D^2 = 1 - deviance(predictions) / deviance(mean-only predictions).
        model_dev = deviance(targets, predictions)
        targets_mean = nps_app_inst.mean(targets)
        null_dev = deviance(targets, targets_mean)
        return 1 - model_dev / null_dev

    slope, intercept = np.array([0.2, -0.1])
    real_X = np.array([[0, 1, 2, 3, 4]]).T
    real_y = np.exp(np.dot(real_X, slope) + intercept).reshape(-1)

    # Each array is stored as a single block covering its whole shape.
    X = nps_app_inst.array(real_X, block_shape=real_X.shape)
    y = nps_app_inst.array(real_y, block_shape=real_y.shape)

    for kwargs in [{"tol": 1e-4, "max_iter": 100}]:
        model = PoissonRegression(**kwargs)
        model.fit(X, y)
        predicted = model.predict(X).get()
        print("D^2", explained_deviance(model.deviance, y, predicted).get())

        sk_model = SKPoissonRegressor(**kwargs)
        sk_model.fit(real_X, real_y)
        sk_predicted = sk_model.predict(real_X)
        # The scikit-learn predictions are scored with the same deviance
        # function as the model under test.
        print("D^2", explained_deviance(model.deviance, y, sk_predicted).get())
def PoissonReg(X_train, X_test, y_train, y_test):
    """Fit one default PoissonRegressor per target column and log the results.

    The first two columns of ``y_train`` are modelled independently.
    Metrics are printed and collected for the test split first, then for the
    training split, and finally both models and both metric sets are handed
    to ``logSave``.
    """
    targets = [y_train[:, 0], y_train[:, 1]]

    models = []
    for target in targets:
        model = PoissonRegressor()
        model.fit(X_train, target)
        models.append(model)

    def stacked_predictions(features):
        # One prediction column per model, in model order.
        columns = [m.predict(X=features) for m in models]
        return np.hstack([c.reshape(-1, 1) for c in columns])

    test_pred = stacked_predictions(X_test)
    printMetrics(y_true=y_test, y_pred=test_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=test_pred)

    train_pred = stacked_predictions(X_train)
    metrics = getMetrics(y_true=y_train, y_pred=train_pred)
    printMetrics(y_true=y_train, y_pred=train_pred)

    logSave(nameOfModel="PoissonReg",
            reg=models,
            metrics=metrics,
            val_metrics=val_metrics)
Beispiel #3
0
def main(lr, train_path, eval_path, save_path, save_img):
    """Fit a Poisson regression on a CSV data set and evaluate it.

    Args:
        lr: Learning rate. Not used by this implementation, which delegates
            the optimisation to ``PoissonRegressor``.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
        save_img: Passed to ``util.scatter`` together with the true and
            predicted evaluation targets.
    """
    feature_columns = ['x_1', 'x_2', 'x_3', 'x_4']

    def load_xy(csv_path):
        # Features stay a DataFrame; the target is flattened to 1-D.
        frame = pd.read_csv(csv_path)
        return frame[feature_columns], frame[['y']].values.ravel()

    x_train, y_train = load_xy(train_path)
    glm = PoissonRegressor(tol=1e-5, max_iter=10000000)
    glm.fit(x_train, y_train)

    x_eval, y_eval = load_xy(eval_path)
    predictions = glm.predict(x_eval)

    np.savetxt(save_path, predictions)
    util.scatter(y_eval, predictions, save_img)
    print(glm.coef_)
    print(glm.score(x_eval, y_eval))
def PoissonRegGS(X_train, X_test, y_train, y_test):
    """Grid-search ``alpha`` for one PoissonRegressor per target column.

    The first two columns of ``y_train`` are modelled independently. For each
    column a 2-fold ``GridSearchCV`` over ``alpha`` in ``[1, 2]`` is run and
    the estimator that is best by R^2 is kept. Metrics are printed and
    collected for the test split and the training split, the best parameters
    of both searches are saved, and everything is handed to ``logSave``.
    """
    grid_values = {'alpha': list(range(1, 3))}
    scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']
    targets = [y_train[:, 0], y_train[:, 1]]

    searches = []
    for target in targets:
        search = GridSearchCV(
            PoissonRegressor(),
            param_grid=grid_values,
            scoring=scoring,
            refit='r2',
            n_jobs=-1,
            cv=2,
            verbose=100)
        search.fit(X_train, target)
        searches.append(search)
    grid_reg1, grid_reg2 = searches

    # Each model comes from its OWN search. Taking both from the first search
    # would make reg1 and reg2 the same object, so fitting the second target
    # would silently overwrite the first model.
    # With refit='r2' the best estimator is already refitted on the full
    # training data, so no additional fit() call is required.
    reg1 = grid_reg1.best_estimator_
    reg2 = grid_reg2.best_estimator_

    def stacked_predictions(features):
        # One prediction column per target, in target order.
        first = reg1.predict(X=features)
        second = reg2.predict(X=features)
        return np.hstack((first.reshape(-1, 1), second.reshape(-1, 1)))

    y_pred = stacked_predictions(X_test)
    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

    y_pred = stacked_predictions(X_train)
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)
    printMetrics(y_true=y_train, y_pred=y_pred)

    # Store the winning value of every tuned parameter as [target 1, target 2].
    best_params1: dict = grid_reg1.best_params_
    best_params2: dict = grid_reg2.best_params_
    best_params = {
        key: [best_params1[key], best_params2[key]] for key in best_params1
    }
    saveBestParams(nameOfModel="PoissonRegGS", best_params=best_params)
    logSave(nameOfModel="PoissonRegGS",
            reg=[reg1, reg2],
            metrics=metrics,
            val_metrics=val_metrics)
Beispiel #5
0
def get_trained_model(X, y):
    """Train a default PoissonRegressor on a 75% split of ``(X, y)``.

    The data is split 75/25 with a fixed random state; only the training
    part is used here and the held-out part is discarded. Returns the fitted
    model.
    """
    # Hold out a quarter of the data; the split is reproducible.
    X_fit, _, y_fit, _ = train_test_split(
        X, y, test_size=0.25, random_state=1)
    print('got here')

    print('Training:\n')
    # Author's note: switching from LinearRegression to PoissonRegressor
    # brought RMSE down from 2614.92 to 2281.12.
    poisson_model = PoissonRegressor()
    # The target is flattened to 1-D before fitting.
    poisson_model.fit(X_fit, y_fit.values.ravel())
    return poisson_model
def sk_poisson_regression(X_train, X_test, y_train, y_test):
    """Fit an intercept-free Poisson GLM with ``alpha=0``, score and plot it.

    The model is fitted on the training split and its score on the test
    split is printed. The plot shows observed points and model predictions.
    """
    model = PoissonRegressor(alpha=0, fit_intercept=False, max_iter=300)
    model.fit(X_train, y_train)
    print('score: ', model.score(X_test, y_test))

    # NOTE(review): `X` and `y` are not parameters of this function; they are
    # read from the enclosing (module) scope. Presumably the full data set --
    # confirm against the caller.
    predicted = model.predict(X)

    plt.figure(figsize=(6.0, 6.0))
    plt.plot(X, y, 'o')
    plt.plot(X, predicted, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
def test_poisson_glmnet():
    """Check an L2-penalised Poisson fit (log link) against glmnet output."""
    # Reference values were produced with the following R session:
    #
    # library("glmnet")
    # options(digits=10)
    # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2))
    # x <- data.matrix(df[,c("a", "b")])
    # y <- df$y
    # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson",
    #               standardize=F, thresh=1e-10, nlambda=10000)
    # coef(fit, s=1)
    # (Intercept) -0.12889386979
    # a            0.29019207995
    # b            0.03741173122
    expected_intercept = -0.12889386979
    expected_coef = [0.29019207995, 0.03741173122]

    features = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
    target = np.array([0, 1, 1, 2])

    model = PoissonRegressor(
        alpha=1, fit_intercept=True, tol=1e-7, max_iter=300)
    model.fit(features, target)

    assert_allclose(model.intercept_, expected_intercept, rtol=1e-5)
    assert_allclose(model.coef_, expected_coef, rtol=1e-5)
Beispiel #8
0
 def poisson_regression(self, df, split=0.7):
     """Fit a default PoissonRegressor on a random subset of ``df``.

     Each row is assigned to the training side with probability ``split``.
     Only ``self.select_cols`` are kept and ``self.dummy_cols`` are one-hot
     encoded before the frame is handed to ``self.get_split``. The training
     features are also written to ``x_train.csv`` as a side effect.

     Returns a dict with the fitted model, its score on the training data,
     the intercept, and a column-name -> coefficient mapping.
     """
     # Boolean mask over rows, drawn before any column filtering.
     train_mask = np.random.rand(len(df)) < split
     encoded = pd.get_dummies(df[self.select_cols],
                              columns=self.dummy_cols,
                              drop_first=False)
     y_train, x_train, y_test, x_test = self.get_split(encoded, train_mask)

     estimator = PoissonRegressor()
     fitted = estimator.fit(x_train, y_train)
     x_train.to_csv('x_train.csv')

     train_score = fitted.score(x_train, y_train)
     coefficients = dict(zip(x_train.columns, fitted.coef_))
     return {
         'model': fitted,
         'score': train_score,
         'intercept': fitted.intercept_,
         'parameters': coefficients,
     }
Beispiel #9
0
#!/usr/bin/env python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

# Synthetic benchmark: compare a Poisson GLM with a gradient-boosted model
# trained with a Poisson loss on the same count data.
n_samples, n_features = 1000, 20
# A single seeded RandomState drives feature generation, target sampling and
# the train/test split, so the statement order below matters for
# reproducibility.
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# Non-negative integer (count) target correlated with X[:, 5], with many
# zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
# Print each model's score on the held-out split (GLM first, then GBDT).
print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))
def test_warm_start(solver, fit_intercept, global_random_seed):
    """Check that warm-starting reaches the same solution as a cold start.

    A reference model is fitted to convergence. A second, warm-start model is
    first stopped after one iteration (it must warn and have a worse
    objective) and is then allowed to continue until it matches the
    reference model.
    """
    n_samples, n_features = 100, 10
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features - 2,
        bias=fit_intercept * 1.0,
        noise=1.0,
        random_state=global_random_seed,
    )
    y = np.abs(y)  # Poisson requires non-negative targets.
    alpha = 1
    # "solver" is intentionally not forwarded: only lbfgs is available.
    shared_params = {"fit_intercept": fit_intercept, "tol": 1e-10}

    reference = PoissonRegressor(
        warm_start=False, max_iter=1000, alpha=alpha, **shared_params)
    reference.fit(X, y)

    warm = PoissonRegressor(
        warm_start=True, max_iter=1, alpha=alpha, **shared_params)
    # max_iter=1 is deliberate, so the solver is expected to raise a
    # ConvergenceWarning.
    with pytest.warns(ConvergenceWarning):
        warm.fit(X, y)

    linear_loss = LinearModelLoss(
        base_loss=reference._get_loss(),
        fit_intercept=fit_intercept,
    )
    sw = np.full_like(y, fill_value=1 / n_samples)

    def penalized_objective(model):
        # The loss expects the intercept appended to the coefficients
        # whenever an intercept is fitted.
        if fit_intercept:
            packed = np.r_[model.coef_, model.intercept_]
        else:
            packed = model.coef_
        return linear_loss.loss(
            coef=packed,
            X=X,
            y=y,
            sample_weight=sw,
            l2_reg_strength=alpha,
        )

    objective_reference = penalized_objective(reference)
    objective_warm = penalized_objective(warm)
    assert objective_reference < objective_warm

    warm.set_params(max_iter=1000)
    warm.fit(X, y)
    # The two models are not exactly identical: lbfgs builds its approximate
    # hessian from previous iterations, and that history differs after a
    # warm start.
    assert_allclose(reference.coef_, warm.coef_, rtol=2e-4)
    assert_allclose(reference.score(X, y), warm.score(X, y), rtol=1e-5)
#
# The number of claims (``ClaimNb``) is a non-negative integer (0 included).
# Thus, this target can be modelled by a Poisson distribution.
# It is then assumed to be the number of discrete events occurring with a
# constant rate in a given time interval (``Exposure``, in units of years).
# Here we model the frequency ``y = ClaimNb / Exposure``, which is still a
# (scaled) Poisson distribution, and use ``Exposure`` as `sample_weight`.

# Split the raw frame and the feature matrix with the same random state so
# that rows stay aligned between `df_*` and `X_*`.
# NOTE(review): `df`, `X` and `score_estimator` are defined outside this
# section -- confirm their definitions before changing this code.
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)

# The parameters of the model are estimated by minimizing the Poisson deviance
# on the training set via a quasi-Newton solver: l-BFGS. Some of the features
# are collinear, we use a weak penalization to avoid numerical issues.
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_freq.fit(X_train,
             df_train["Frequency"],
             sample_weight=df_train["Exposure"])

# Evaluate on both splits, using the same target and weight columns as the
# fit above.
scores = score_estimator(
    glm_freq,
    X_train,
    X_test,
    df_train,
    df_test,
    target="Frequency",
    weights="Exposure",
)
print("Evaluation of PoissonRegressor on target Frequency")
print(scores)

# %%
Beispiel #12
0
 def poissonregressor(self,X_train,X_test,y_train,y_test):
     """Fit a default PoissonRegressor and return its test predictions.

     NOTE(review): the four parameters are not used; the data is read from
     ``self.X_train``, ``self.y_train`` and ``self.X_test`` instead. Confirm
     with the callers whether that is intended.
     """
     model = PoissonRegressor()
     model.fit(self.X_train, self.y_train)
     return model.predict(self.X_test)
Beispiel #13
0
    def test_poisson(self):
        """Exercise abessPoisson and compare it with PoissonRegressor.

        Two estimators are fitted on the same generated data: one with the
        approximate Newton method on sparse input, one with the exact method.
        The second must recover the true support, and its non-zero
        coefficients must agree with a PoissonRegressor (alpha=0) fitted on
        the selected columns.
        """
        # to do
        n, p, k = 100, 20, 3
        family = "poisson"
        rho = 0.5
        sigma = 1
        M = 1
        np.random.seed(3)
        data = gen_data(n, p, family=family, k=k, rho=rho, sigma=sigma)
        # The result is not used below; the call is kept so the generator is
        # still exercised.
        gen_data_splicing(family=family, n=n, p=p, k=k, rho=rho, M=M)
        support_size = range(0, 20)

        # Settings shared by both estimators.
        common_kwargs = dict(
            path_type="seq",
            support_size=support_size,
            ic_type='ebic',
            is_screening=True,
            screening_size=20,
            K_max=10,
            epsilon=10,
            powell_path=2,
            s_min=1,
            s_max=p,
            lambda_min=0.01,
            lambda_max=100,
            is_cv=True,
            K=5,
            exchange_num=2,
            tau=0.1 * np.log(n * p) / n,
            primary_model_fit_epsilon=1e-6,
            early_stop=False,
            ic_coef=1.,
            thread=5,
        )

        model = abessPoisson(primary_model_fit_max_iter=10,
                             approximate_Newton=True,
                             sparse_matrix=True,
                             **common_kwargs)
        group = np.linspace(1, p, p)
        model.fit(data.x, data.y, group=group)

        model2 = abessPoisson(primary_model_fit_max_iter=80,
                              approximate_Newton=False,
                              **common_kwargs)
        group = np.linspace(1, p, p)
        model2.fit(data.x, data.y, group=group)
        model2.predict(data.x)

        # Support recovery: the fitted non-zero positions must match the
        # positions used to generate the data.
        true_support = np.nonzero(data.coef_)[0]
        fitted_support = np.nonzero(model2.coef_)[0]
        print(true_support)
        print(fitted_support)
        assert (true_support == fitted_support).all()

        if sys.version_info[1] >= 6:
            selected_x = data.x[:, fitted_support]
            reg = PoissonRegressor(alpha=0, tol=1e-6, max_iter=200)
            reg.fit(selected_x, data.y)
            print(model2.coef_[fitted_support])
            print(reg.coef_)
            assert model2.coef_[fitted_support] == approx(reg.coef_,
                                                          rel=1e-2,
                                                          abs=1e-2)
Beispiel #14
0
# Ridge (L2-penalised) regression with alpha = 100.
# NOTE(review): `linear_model`, `cross_val_score`, `X_train_std` and `y_train`
# are defined outside this section.
regr_l2_100 = linear_model.Ridge(alpha=100)
# 5-fold cross-validated R^2 on the training data.
scores_length_l2_100_reg = cross_val_score(regr_l2_100, X_train_std, y_train, cv=5, scoring='r2') 
regr_l2_100.fit(X_train_std, y_train)
#print(scores_length_l2_100_reg)
# Report the mean and standard deviation of the cross-validation scores.
# The message calls the value "accuracy", but the metric is R^2.
print("%0.2f (with L2 alpha = 100) accuracy with a standard deviation of %0.2f" % (scores_length_l2_100_reg.mean(), scores_length_l2_100_reg.std()))
#print(patient)

# Modeling with Poisson Regressor
#
# This section was exported from a notebook. The exporter treated the
# "% ..." continuation lines of the two print() calls below as IPython magic
# and commented them out, which left the calls unclosed (a SyntaxError).
# The continuation lines are restored here.

import sklearn
from sklearn.linear_model import PoissonRegressor
regr = PoissonRegressor(alpha=1.0, fit_intercept=True, max_iter=100, tol=0.0001, warm_start=False, verbose=0)
regr.fit(X_train_std, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test_std)

from sklearn.metrics import mean_squared_error, r2_score
print(r2_score(y_test, y_pred))

# The coefficients
# print('Coefficients: \n', regr.coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))