Exemple #1
0
def test_gamma_regression_family(regression_data):
    """Check that ``GammaRegressor.family`` is pinned to ``"gamma"``.

    The attribute must not be re-assignable to an arbitrary value so that it
    cannot be searched over, e.g. in a grid search. The ``regression_data``
    argument is requested but not used by the test body.
    """
    est = GammaRegressor()
    # Assert the default value: a bare comparison expression would be
    # evaluated and silently discarded, verifying nothing.
    assert est.family == "gamma"

    msg = "GammaRegressor.family must be 'gamma'!"
    with pytest.raises(ValueError, match=msg):
        est.family = 0
    def __init__(self, correct_glm_bounds=True, recursive_forecast=False):
        """Store the options and build the three unfitted model pipelines.

        Parameters
        ----------
        correct_glm_bounds : bool, default=True
            Stored as-is on the instance; not interpreted here.
        recursive_forecast : bool, default=False
            Stored as-is on the instance; not interpreted here.
        """
        # Optional parameters.
        self.correct_glm_bounds = correct_glm_bounds
        self.recursive_forecast = recursive_forecast

        # Every pipeline has the same shape:
        # polynomial expansion -> standard scaling -> regressor.
        # The linear model uses degree-1 features, the two GLMs degree-2.
        # Original author's note: scaling is there for the Poisson and Gamma
        # regressors because of their L2 penalty (alpha=0 is passed here).
        lin_reg_steps = [
            ('poly', PolynomialFeatures(1, include_bias=False)),
            ('scale', StandardScaler()),
            ('reg_lin', LinearRegression()),
        ]
        self.pipe_lin_reg_ar = Pipeline(lin_reg_steps)

        poisson_steps = [
            ('poly', PolynomialFeatures(2, include_bias=False)),
            ('scale', StandardScaler()),
            ('reg_pois', PoissonRegressor(alpha=0, max_iter=5000)),
        ]
        self.pipe_reg_pois = Pipeline(poisson_steps)

        gamma_steps = [
            ('poly', PolynomialFeatures(2, include_bias=False)),
            ('scale', StandardScaler()),
            ('reg_gamm', GammaRegressor(alpha=0, max_iter=5000)),
        ]
        self.pipe_reg_gamm = Pipeline(gamma_steps)

        # Data placeholders, all None until set elsewhere
        # (presumably used to check whether the estimators have been fitted).
        self.x = self.y = None
        self.x_ar = self.y_ar = None

        # Container for results.
        self.results = {}
    def test_gamma_regressor(self):
        """Convert a fitted ``GammaRegressor`` to ONNX and dump model and data.

        The regressor is fitted on a four-sample, two-feature toy dataset,
        converted with ``convert_sklearn`` and the result is checked to be
        non-None before being passed to ``dump_data_and_model``.
        """
        regressor = GammaRegressor()
        train_features = np.array([[1, 2], [2, 3], [3, 4], [4, 3]])
        train_target = np.array([19, 26, 33, 30])
        regressor.fit(train_features, train_target)
        eval_features = np.array([[1, 0], [2, 8]])

        # Single float input; first dimension left unspecified (None), second
        # equal to the number of training features.
        input_spec = [
            ("input", FloatTensorType([None, train_features.shape[1]])),
        ]
        onnx_model = convert_sklearn(
            regressor,
            "scikit-learn Gamma Regressor",
            input_spec,
            target_opset=TARGET_OPSET,
        )
        self.assertIsNotNone(onnx_model)

        dump_data_and_model(
            eval_features.astype(np.float32),
            regressor,
            onnx_model,
            basename="SklearnGammaRegressor",
        )
def get_regressors_generalized(nmodels='all'):
    """Return one or all of the generalized linear regressors.

    Parameters
    ----------
    nmodels : 'all' or int-like, default='all'
        ``'all'`` returns every regressor. Otherwise the 1-based index of a
        single regressor: 1 = PoissonRegressor, 2 = TweedieRegressor,
        3 = GammaRegressor.

    Returns
    -------
    list
        A list of unfitted estimator instances (three for ``'all'``, one
        otherwise).

    Raises
    ------
    ValueError
        If ``nmodels`` is neither ``'all'`` nor a valid index.
    """
    regressors = {
        1: PoissonRegressor(),
        2: TweedieRegressor(),
        3: GammaRegressor(),
    }

    if nmodels == 'all':
        return list(regressors.values())

    # Look the estimator up by index. Building the string 'lr<N>' would hand
    # the caller a variable name rather than the model itself.
    try:
        selected = regressors[int(nmodels)]
    except (KeyError, TypeError, ValueError):
        raise ValueError(
            f"nmodels must be 'all' or one of {sorted(regressors)}, "
            f"got {nmodels!r}"
        ) from None
    return [selected]
###########################################################
############### MODEL FITTING #############################
###########################################################

#######################
#######FIT BASIC MODEL#
#######################
## GLM GAMMA
import statsmodels.api as sm

# Gamma GLM with a log link. Author's note: the data can't have outliers.
# The link is passed as an instance, and the model is fitted exactly once;
# `model` therefore already holds the fitted results.
model = sm.GLM(
    y_train,
    X_train,
    family=sm.families.Gamma(link=sm.genmod.families.links.Log()),
).fit()
model.summary()


from sklearn.linear_model import GammaRegressor
from sklearn.model_selection import cross_val_score
ga = GammaRegressor()
ga.fit(X_train, y_train)

# Cross-validated RMSE: the scorer returns negated MSE values, so flip the
# sign of their mean before taking the square root.
#np.mean(cross_val_score(lm,X_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))
mse = np.mean(cross_val_score(ga, X_train, y_train, scoring='neg_mean_squared_error', cv=5))
rmse = np.sqrt(mse * -1)
print(rmse)



##EXPORT PREDICTIONS
# Predictions come from the statsmodels GLM fitted above, not from `ga`.
y_pred = model.predict(df_dum_test)
y_df = pd.DataFrame(data=y_pred)
df = pd.concat([df_test['Obs_ID'], y_df], axis=1)
df.to_csv('ypredictedgammaloglink.csv')
def regression_data():
    """Return the ``(X, y)`` pair of a fixed synthetic regression problem.

    The data is generated by ``make_regression`` with a fixed
    ``random_state`` so every call uses the same arguments.
    """
    features, target = make_regression(
        n_samples=107,
        n_features=10,
        n_informative=80,
        noise=0.5,
        random_state=2,
    )
    return features, target


@pytest.fixture(
    params=itertools.product(
        ["long", "wide"],
        [
            BinomialRegressor(),
            PoissonRegressor(),
            GammaRegressor(),
            # TweedieRegressor(power=3.0),  # too difficult
            # TweedieRegressor(power=0, link="log"),  # too difficult
            TweedieRegressor(power=1.5),
        ],
    ),
    ids=lambda param: f"{param[0]}-{param[1]}",
)
def glm_dataset(global_random_seed, request):
    """Dataset with GLM solutions, well conditioned X.

    This is inspired by ols_ridge_dataset in test_ridge.py.

    The construction is based on the SVD decomposition of X = U S V'.

    Parameters
Exemple #7
0
    # the power attribute is properly updated
    power = 2.0
    est = TweedieRegressor(power=power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == power
    assert est.power == power

    new_power = 0
    new_family = TweedieDistribution(power=new_power)
    est.family = new_family
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == new_power
    assert est.power == new_power

    msg = "TweedieRegressor.family must be of type TweedieDistribution!"
    with pytest.raises(TypeError, match=msg):
        est.family = None


@pytest.mark.parametrize(
    "estimator, value",
    [
        (PoissonRegressor(), True),
        (GammaRegressor(), True),
        (TweedieRegressor(power=1.5), True),
        (TweedieRegressor(power=0), False),
    ],
)
def test_tags(estimator, value):
    """Check the ``requires_positive_y`` estimator tag for each GLM."""
    estimator_tags = estimator._get_tags()
    # Identity comparison: the tag must be the bool itself, not just truthy.
    assert estimator_tags["requires_positive_y"] is value
Exemple #8
0
		[([bin_column], [CategoricalDomain(), OneHotEncoder()]) for bin_column in ["outwork", "female", "married", "kids", "self"]] +
		[(["age"], ContinuousDomain())] +
		[(["hhninc", "educ"], ContinuousDomain())]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(visit_X, visit_y)
	pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"])
	store_csv(docvis, name)

# Build the Gamma and Poisson regression pipelines for the "Visit" dataset,
# but only when that dataset has been selected.
if "Visit" in datasets:
	build_visit(GammaRegressor(), "GammaRegressionVisit")
	build_visit(PoissonRegressor(), "PoissonRegressionVisit")

#
# Outlier detection
#

def build_iforest_housing(iforest, name, **pmml_options):
	"""Fit a mapper + estimator pipeline on the housing features.

	All columns of ``housing_X`` are mapped through a ``ContinuousDomain``
	and fed to ``iforest``; the pipeline is fitted on ``housing_X`` only
	(no target is passed).

	NOTE(review): ``name`` and ``pmml_options`` are not used in the visible
	body and nothing is returned -- the function may continue beyond this
	excerpt; confirm against the full source.
	"""
	mapper = DataFrameMapper([
		(housing_X.columns.values, ContinuousDomain())
	])
	pipeline = Pipeline([
		("mapper", mapper),
		("estimator", iforest)
	])
	pipeline.fit(housing_X)
# ------------------------------------
# The mean claim amount or severity (`AvgClaimAmount`) can be empirically
# shown to follow approximately a Gamma distribution. We fit a GLM model for
# the severity with the same features as the frequency model.
#
# Note:
#
# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support
#   on :math:`(0, \infty)`, not :math:`[0, \infty)`.
# - We use ``ClaimNb`` as `sample_weight` to account for policies that contain
#   more than one claim.

# Keep only rows with a strictly positive claim amount.
mask_train = df_train["ClaimAmount"] > 0
mask_test = df_test["ClaimAmount"] > 0

glm_sev = GammaRegressor(alpha=10.0, max_iter=10000)

# Fit on the filtered training rows, weighting each row by its claim count.
glm_sev.fit(
    X_train[mask_train.values],
    df_train.loc[mask_train, "AvgClaimAmount"],
    sample_weight=df_train.loc[mask_train, "ClaimNb"],
)

# Score on the same filtered subsets. The call is closed explicitly so that
# it does not swallow the statements that follow.
scores = score_estimator(
    glm_sev,
    X_train[mask_train.values],
    X_test[mask_test.values],
    df_train[mask_train],
    df_test[mask_test],
    target="AvgClaimAmount",
    weights="ClaimNb",
)
def gamma_model(xs, ys):
    """Fit a default ``GammaRegressor`` on ``xs``/``ys`` and return the result of ``fit``."""
    return GammaRegressor().fit(xs, ys)
Exemple #11
0

def test_tweedie_regression_family(regression_data):
    """Check that ``family`` stays a TweedieDistribution in sync with ``power``.

    Assigning a new ``TweedieDistribution`` must update ``est.power``, and
    assigning anything else (here ``None``) must raise ``TypeError``. The
    ``regression_data`` argument is requested but not used by the test body.
    """
    initial_power = 2.0
    est = TweedieRegressor(power=initial_power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == initial_power
    assert est.power == initial_power

    # Replace the family and verify that both views of the power follow.
    updated_power = 0
    est.family = TweedieDistribution(power=updated_power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == updated_power
    assert est.power == updated_power

    with pytest.raises(
        TypeError,
        match="TweedieRegressor.family must be of type TweedieDistribution!",
    ):
        est.family = None


@pytest.mark.parametrize(
    'estimator, value',
    [
        (PoissonRegressor(), True),
        (GammaRegressor(), True),
        (TweedieRegressor(power=1.5), True),
        (TweedieRegressor(power=0), False),
    ],
)
def test_tags(estimator, value):
    """Check the ``requires_positive_y`` estimator tag for each GLM."""
    estimator_tags = estimator._get_tags()
    # Identity comparison: the tag must be the bool itself, not just truthy.
    assert estimator_tags['requires_positive_y'] is value