def test_gamma_regression_family(regression_data):
    """Check that ``GammaRegressor.family`` reads ``"gamma"`` and rejects writes.

    The ``regression_data`` argument is requested but not used by the body.
    """
    # Make sure the family attribute is read-only to prevent searching over it
    # e.g. in a grid search
    est = GammaRegressor()
    # This comparison used to be a bare expression whose result was thrown
    # away, so the value of ``family`` was never actually verified.
    assert est.family == "gamma"

    msg = "GammaRegressor.family must be 'gamma'!"
    with pytest.raises(ValueError, match=msg):
        est.family = 0
def __init__(self, correct_glm_bounds=True, recursive_forecast=False):
    """Store the options and create the unfitted model pipelines.

    Parameters
    ----------
    correct_glm_bounds : bool, default=True
        Kept unchanged on ``self.correct_glm_bounds``.
    recursive_forecast : bool, default=False
        Kept unchanged on ``self.recursive_forecast``.
    """
    # Optional behaviour switches.
    self.correct_glm_bounds = correct_glm_bounds
    self.recursive_forecast = recursive_forecast

    # Every pipeline has the same shape: polynomial expansion without the
    # bias column, standard scaling, then the regressor itself.
    linear_steps = [
        ('poly', PolynomialFeatures(1, include_bias=False)),
        ('scale', StandardScaler()),
        ('reg_lin', LinearRegression()),
    ]
    self.pipe_lin_reg_ar = Pipeline(linear_steps)

    # Scaling matters for the Poisson and Gamma regressors because they use
    # an L2 regularization penalty (alpha is set to 0 here).
    poisson_steps = [
        ('poly', PolynomialFeatures(2, include_bias=False)),
        ('scale', StandardScaler()),
        ('reg_pois', PoissonRegressor(alpha=0, max_iter=5000)),
    ]
    self.pipe_reg_pois = Pipeline(poisson_steps)

    gamma_steps = [
        ('poly', PolynomialFeatures(2, include_bias=False)),
        ('scale', StandardScaler()),
        ('reg_gamm', GammaRegressor(alpha=0, max_iter=5000)),
    ]
    self.pipe_reg_gamm = Pipeline(gamma_steps)

    # Data placeholders; they stay None until something assigns them.
    self.x = self.y = self.x_ar = self.y_ar = None

    # Container for results.
    self.results = dict()
def test_gamma_regressor(self):
    """Fit a GammaRegressor, convert it to ONNX and dump model plus data."""
    train_x = np.array([[1, 2], [2, 3], [3, 4], [4, 3]])
    train_y = np.array([19, 26, 33, 30])
    regressor = GammaRegressor()
    regressor.fit(train_x, train_y)

    # The ONNX input is declared with an open first dimension and one
    # column per training feature.
    feature_count = train_x.shape[1]
    input_spec = [("input", FloatTensorType([None, feature_count]))]
    onnx_model = convert_sklearn(
        regressor,
        "scikit-learn Gamma Regressor",
        input_spec,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(onnx_model)

    probe = np.array([[1, 0], [2, 8]]).astype(np.float32)
    dump_data_and_model(
        probe,
        regressor,
        onnx_model,
        basename="SklearnGammaRegressor",
    )
def get_regressors_generalized(nmodels='all'):
    """Return one or all of the generalized linear regressors.

    The regressors are numbered as follows:

    1. ``PoissonRegressor``
    2. ``TweedieRegressor``
    3. ``GammaRegressor``

    Parameters
    ----------
    nmodels : {'all', 1, 2, 3} or str, default='all'
        ``'all'`` selects every regressor; otherwise the number (given as
        an int or as a numeric string) of the single regressor to return.

    Returns
    -------
    list
        The selected estimator instances, always wrapped in a list.

    Raises
    ------
    ValueError
        If ``nmodels`` is neither ``'all'`` nor a known regressor number.
    """
    regressors = {
        1: PoissonRegressor(),
        2: TweedieRegressor(),
        3: GammaRegressor(),
    }

    if nmodels == 'all':
        return list(regressors.values())

    # The previous implementation returned the *string* 'lr<n>' here rather
    # than the estimator that the local variable of that name referred to.
    try:
        return [regressors[int(nmodels)]]
    except (TypeError, ValueError, KeyError):
        raise ValueError(
            f"nmodels must be 'all' or one of {sorted(regressors)}, "
            f"got {nmodels!r}"
        ) from None
###########################################################
############### MODEL FITTING #############################
###########################################################

#######################
#######FIT BASIC MODEL#
#######################

## GLM GAMMA
import statsmodels.api as sm

# Gamma GLM with a log link, fitted on the training data.
model = sm.GLM(
    y_train,
    X_train,
    family=sm.families.Gamma(link=sm.genmod.families.links.log),
).fit()
# Can't have outliers
# `model` already holds the fitted results (note the `.fit()` above), so it
# is summarised directly instead of calling `.fit()` on it a second time.
print(model.summary())

from sklearn.linear_model import GammaRegressor
from sklearn.model_selection import cross_val_score

ga = GammaRegressor()
# The closing parenthesis was missing here, which made the script a syntax
# error.
ga.fit(X_train, y_train)

#np.mean(cross_val_score(lm,X_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))
# The scorer reports the *negated* MSE, hence the sign flip before the root.
mse = np.mean(
    cross_val_score(ga, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
)
rmse = np.sqrt(mse*-1)
print(rmse)

##EXPORT PREDICTIONS
# Predictions come from the statsmodels GLM (`model`), not from `ga`.
y_pred = model.predict(df_dum_test)
y_df = pd.DataFrame(data=y_pred)
df = pd.concat([df_test['Obs_ID'], y_df], axis=1)
df.to_csv('ypredictedgammaloglink.csv')
# Build a synthetic regression problem with make_regression, using a fixed
# random_state, and return the resulting (X, y) pair.
# NOTE(review): n_informative=80 is larger than n_features=10 -- confirm
# this is intended.
def regression_data():
    X, y = make_regression(
        n_samples=107, n_features=10, n_informative=80, noise=0.5, random_state=2
    )
    return X, y


# Parametrised over every pair from itertools.product of ("long", "wide") and
# the estimator instances listed below; each case id is rendered as
# "<first element>-<second element>" of the pair.
@pytest.fixture(
    params=itertools.product(
        ["long", "wide"],
        [
            BinomialRegressor(),
            PoissonRegressor(),
            GammaRegressor(),
            # TweedieRegressor(power=3.0),  # too difficult
            # TweedieRegressor(power=0, link="log"),  # too difficult
            TweedieRegressor(power=1.5),
        ],
    ),
    ids=lambda param: f"{param[0]}-{param[1]}",
)
def glm_dataset(global_random_seed, request):
    """Dataset with GLM solutions, well conditioned X.

    This is inspired by ols_ridge_dataset in test_ridge.py.

    The construction is based on the SVD decomposition of X = U S V'.

    Parameters
# the power attribute is properly updated power = 2.0 est = TweedieRegressor(power=power) assert isinstance(est.family, TweedieDistribution) assert est.family.power == power assert est.power == power new_power = 0 new_family = TweedieDistribution(power=new_power) est.family = new_family assert isinstance(est.family, TweedieDistribution) assert est.family.power == new_power assert est.power == new_power msg = "TweedieRegressor.family must be of type TweedieDistribution!" with pytest.raises(TypeError, match=msg): est.family = None @pytest.mark.parametrize( "estimator, value", [ (PoissonRegressor(), True), (GammaRegressor(), True), (TweedieRegressor(power=1.5), True), (TweedieRegressor(power=0), False), ], ) def test_tags(estimator, value): assert estimator._get_tags()["requires_positive_y"] is value
[([bin_column], [CategoricalDomain(), OneHotEncoder()]) for bin_column in ["outwork", "female", "married", "kids", "self"]] +
[(["age"], ContinuousDomain())] +
[(["hhninc", "educ"], ContinuousDomain())]
)
    # Chain the column mapper with the regressor that was passed in.
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(visit_X, visit_y)
    # verify() is given a 5% sample of the training frame, drawn with a
    # fixed random_state.
    pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    # Predictions on the training frame are stored under the same name.
    docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"])
    store_csv(docvis, name)

# Build the Visit models only when that dataset was requested.
if "Visit" in datasets:
    build_visit(GammaRegressor(), "GammaRegressionVisit")
    build_visit(PoissonRegressor(), "PoissonRegressionVisit")

#
# Outlier detection
#

def build_iforest_housing(iforest, name, **pmml_options):
    # All columns of housing_X are mapped through a single ContinuousDomain.
    mapper = DataFrameMapper([
        (housing_X.columns.values, ContinuousDomain())
    ])
    pipeline = Pipeline([
        ("mapper", mapper),
        ("estimator", iforest)
    ])
    # Fitted on the features only; no target is passed.
    pipeline.fit(housing_X)
# ------------------------------------ # The mean claim amount or severity (`AvgClaimAmount`) can be empirically # shown to follow approximately a Gamma distribution. We fit a GLM model for # the severity with the same features as the frequency model. # # Note: # # - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support # on :math:`(0, \infty)`, not :math:`[0, \infty)`. # - We use ``ClaimNb`` as `sample_weight` to account for policies that contain # more than one claim. mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 glm_sev = GammaRegressor(alpha=10.0, max_iter=10000) glm_sev.fit( X_train[mask_train.values], df_train.loc[mask_train, "AvgClaimAmount"], sample_weight=df_train.loc[mask_train, "ClaimNb"], ) scores = score_estimator( glm_sev, X_train[mask_train.values], X_test[mask_test.values], df_train[mask_train], df_test[mask_test], target="AvgClaimAmount", weights="ClaimNb",
def gamma_model(xs, ys):
    """Fit a default-configured GammaRegressor on ``xs`` and ``ys``.

    Returns whatever ``GammaRegressor.fit`` returns for those inputs.
    """
    regressor = GammaRegressor()
    return regressor.fit(xs, ys)
def test_tweedie_regression_family(regression_data):
    """Check that ``family`` stays a TweedieDistribution in sync with ``power``.

    The ``regression_data`` argument is requested but not used by the body.
    """
    # State right after construction.
    initial_power = 2.0
    regressor = TweedieRegressor(power=initial_power)
    assert isinstance(regressor.family, TweedieDistribution)
    assert regressor.family.power == initial_power
    assert regressor.power == initial_power

    # Swapping in a new family must update the estimator's power as well.
    replacement_power = 0
    regressor.family = TweedieDistribution(power=replacement_power)
    assert isinstance(regressor.family, TweedieDistribution)
    assert regressor.family.power == replacement_power
    assert regressor.power == replacement_power

    # Anything that is not a TweedieDistribution is rejected.
    expected_message = "TweedieRegressor.family must be of type TweedieDistribution!"
    with pytest.raises(TypeError, match=expected_message):
        regressor.family = None


@pytest.mark.parametrize(
    'estimator, value',
    [
        (PoissonRegressor(), True),
        (GammaRegressor(), True),
        (TweedieRegressor(power=1.5), True),
        (TweedieRegressor(power=0), False),
    ],
)
def test_tags(estimator, value):
    """Check the ``requires_positive_y`` tag of each parametrised estimator."""
    tags = estimator._get_tags()
    assert tags['requires_positive_y'] is value