Example #1
import numpy as np
import pandas as pd
from sklearn.linear_model import PoissonRegressor

import util  # local helper module assumed to provide scatter()


def main(lr, train_path, eval_path, save_path, save_img):
    """Problem: Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent (unused here: this solution
            delegates to scikit-learn's PoissonRegressor instead of
            hand-rolled gradient ascent).
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
        save_img: Path to save the scatter plot of labels vs. predictions.
    """
    # Load training set
    train = pd.read_csv(train_path)
    x_train, y_train = train[['x_1', 'x_2', 'x_3',
                              'x_4']], train[['y']].values.ravel()
    glm = PoissonRegressor(tol=1e-5, max_iter=10000000)
    glm.fit(x_train, y_train)

    valid = pd.read_csv(eval_path)
    x_eval, y_eval = valid[['x_1', 'x_2', 'x_3',
                            'x_4']], valid[['y']].values.ravel()
    predictions = glm.predict(x_eval)

    np.savetxt(save_path, predictions)
    util.scatter(y_eval, predictions, save_img)
    print(glm.coef_)
    print(glm.score(x_eval, y_eval))
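
A hypothetical driver for the function above; the file names and learning rate are placeholders, and lr is ignored by this scikit-learn-based implementation:

if __name__ == '__main__':
    main(lr=1e-5,
         train_path='train.csv',
         eval_path='valid.csv',
         save_path='poisson_pred.txt',
         save_img='poisson_pred.png')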
Example #2
import matplotlib.pyplot as plt
from sklearn.linear_model import PoissonRegressor


def sk_poisson_regression(X_train, X_test, y_train, y_test):
    glm = PoissonRegressor(alpha=0, fit_intercept=False, max_iter=300)
    glm.fit(X_train, y_train)
    print('score: ', glm.score(X_test, y_test))

    y_hat = glm.predict(X_test)

    plt.figure(figsize=(6.0, 6.0))
    plt.plot(X_test, y_test, 'o')
    plt.plot(X_test, y_hat, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
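
The axis labels suggest the classic tips data; a minimal usage sketch, assuming the seaborn copy of that dataset with total_bill as the single feature:

import seaborn as sns
from sklearn.model_selection import train_test_split

tips = sns.load_dataset('tips')  # columns include total_bill and tip
X = tips[['total_bill']].values
y = tips['tip'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
sk_poisson_regression(X_train, X_test, y_train, y_test)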
Example #3
#!/usr/bin/env python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa (required for scikit-learn < 1.0)
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))
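
Note that the two score values printed above are not directly comparable: PoissonRegressor.score returns D², the fraction of Poisson deviance explained, while HistGradientBoostingRegressor.score returns the usual R². Continuing the script, a like-for-like comparison can use the held-out mean Poisson deviance for both models:

from sklearn.metrics import mean_poisson_deviance

# Poisson deviance requires strictly positive predictions; clip as a safeguard.
print(mean_poisson_deviance(y_test, np.clip(glm.predict(X_test), 1e-8, None)))
print(mean_poisson_deviance(y_test, np.clip(gbdt.predict(X_test), 1e-8, None)))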
Example #4
import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model._linear_loss import LinearModelLoss


# solver, fit_intercept and global_random_seed are pytest fixtures/params.
def test_warm_start(solver, fit_intercept, global_random_seed):
    n_samples, n_features = 100, 10
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features - 2,
        bias=fit_intercept * 1.0,
        noise=1.0,
        random_state=global_random_seed,
    )
    y = np.abs(y)  # Poisson requires non-negative targets.
    alpha = 1
    params = {
        # "solver": solver,  # only lbfgs available
        "fit_intercept": fit_intercept,
        "tol": 1e-10,
    }

    glm1 = PoissonRegressor(warm_start=False,
                            max_iter=1000,
                            alpha=alpha,
                            **params)
    glm1.fit(X, y)

    glm2 = PoissonRegressor(warm_start=True, max_iter=1, alpha=alpha, **params)
    # We intentionally set max_iter=1 so that the solver raises a
    # ConvergenceWarning.
    with pytest.warns(ConvergenceWarning):
        glm2.fit(X, y)

    linear_loss = LinearModelLoss(
        base_loss=glm1._get_loss(),
        fit_intercept=fit_intercept,
    )
    sw = np.full_like(y, fill_value=1 / n_samples)

    objective_glm1 = linear_loss.loss(
        coef=np.r_[glm1.coef_,
                   glm1.intercept_] if fit_intercept else glm1.coef_,
        X=X,
        y=y,
        sample_weight=sw,
        l2_reg_strength=alpha,
    )
    objective_glm2 = linear_loss.loss(
        coef=np.r_[glm2.coef_,
                   glm2.intercept_] if fit_intercept else glm2.coef_,
        X=X,
        y=y,
        sample_weight=sw,
        l2_reg_strength=alpha,
    )
    assert objective_glm1 < objective_glm2

    glm2.set_params(max_iter=1000)
    glm2.fit(X, y)
    # The two models are not exactly identical since the lbfgs solver
    # computes the approximate hessian from previous iterations, which
    # will not be strictly identical in the case of a warm start.
    assert_allclose(glm1.coef_, glm2.coef_, rtol=2e-4)
    assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-5)
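
Outside the test, the same mechanism can resume an interrupted fit; a minimal sketch of the documented warm_start semantics:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import PoissonRegressor

X, y = make_regression(n_samples=100, n_features=10, random_state=0)
y = np.abs(y)  # Poisson requires non-negative targets

glm = PoissonRegressor(warm_start=True, max_iter=50)
glm.fit(X, y)                  # first, iteration-limited pass
glm.set_params(max_iter=1000)
glm.fit(X, y)                  # resumes from the previous coef_, not from zeros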
Example #5
def modeling_compare(X, y):
    import pandas as pd
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import RidgeCV
    from sklearn.model_selection import RepeatedKFold
    from sklearn.linear_model import ElasticNet
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import PoissonRegressor
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa (scikit-learn < 1.0)
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import SGDRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.ensemble import VotingRegressor

    models_lab = [
        'Linear Regression', 'Ridge', 'Ridge with tuned hyperparameters',
        'Elastic Net', 'Random Forest', 'Poisson Regression',
        'Gradient Boosting Regression', 'Lasso', 'Stochastic Gradient Descent',
        'Neural Network', 'Voting Regression'
    ]

    reg1 = LinearRegression().fit(X, y)
    reg2 = Ridge().fit(X, y)
    # Tune the Ridge alpha over a grid via repeated k-fold CV.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    reg3 = RidgeCV(alphas=np.arange(0, 1, 0.01),
                   cv=cv,
                   scoring='neg_mean_absolute_error').fit(X, y)
    reg4 = ElasticNet().fit(X, y)
    reg5 = RandomForestRegressor().fit(X, y)
    reg6 = PoissonRegressor().fit(X, y)
    reg7 = HistGradientBoostingRegressor(loss='poisson',
                                         learning_rate=.01).fit(X, y)
    reg8 = Lasso().fit(X, y)
    # Note: loss='squared_loss' was renamed to 'squared_error' in scikit-learn 1.2.
    reg9 = SGDRegressor(loss='squared_loss', penalty='l2').fit(X, y)
    reg10 = MLPRegressor(solver='lbfgs',
                         alpha=1e-5,
                         hidden_layer_sizes=(17, 10),
                         random_state=1).fit(X, y)

    # VotingRegressor without the neural network
    ereg = VotingRegressor(estimators=[('lr', reg1), ('rd', reg2),
                                       ('rs', reg3), ('en', reg4),
                                       ('rf', reg5), ('pr', reg6),
                                       ('gb', reg7), ('ls', reg8),
                                       ('gd', reg9)]).fit(X, y)

    models_obj = [
        reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10, ereg
    ]

    # In-sample R^2 (D^2 for the Poisson model) for each fitted estimator.
    score = [model.score(X, y) for model in models_obj]

    score_df = pd.DataFrame()
    score_df['models_lab'] = models_lab
    score_df['models_obj'] = models_obj
    score_df['score'] = score

    return score_df
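
A hedged usage sketch with a synthetic all-numeric dataset; the target is made non-negative because the Poisson models require y >= 0:

import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=17, random_state=1)
y = np.abs(y)  # required by PoissonRegressor and the poisson-loss GBDT

score_df = modeling_compare(X, y)
print(score_df[['models_lab', 'score']])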