def test_tweedie_score(regression_data, power, link):
    """Check that ``TweedieRegressor.score`` matches ``d2_tweedie_score``."""
    features, target = regression_data
    # Shift the target so that every value is strictly positive.
    target = np.abs(target) + 1.0
    model = TweedieRegressor(power=power, link=link).fit(features, target)
    expected = d2_tweedie_score(target, model.predict(features), power=power)
    assert model.score(features, target) == pytest.approx(expected)
def regression(linkfunc, x, y, test):
    """Fit a Tweedie GLM with the given link and print a step-by-step report.

    The estimator is built from the module-level ``POWER`` and ``ALPHA``
    settings.  The report shows the estimator, the training and test data,
    the predictions, the fitted coefficients and the linear predictor for
    every test row.  Nothing is returned.

    Parameters
    ----------
    linkfunc : str
        Link function name handed to ``TweedieRegressor``.
    x, y
        Training inputs and targets.  A 1-D ``x`` is treated as one feature.
    test
        Inputs to predict; reshaped to a column together with a 1-D ``x``.
    """
    model = TweedieRegressor(power=POWER, alpha=ALPHA, link=linkfunc)

    # With a single feature the data arrives as a flat vector; samples must
    # be in rows, so both the training and the test inputs become columns.
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
        test = test.reshape(-1, 1)

    n_features = x.shape[1]
    suffix = "s" if n_features > 1 else ""

    print()
    print('generalized linear regression parametrized as')
    print(f" -- link = '{linkfunc}'")
    print(f' -- {n_features} dependent variable{suffix}')
    print(model)
    print()
    print(f'train: {x} -> {y}')
    print(f'test: {test} -> ???')

    model.fit(x, y)
    predicted = model.predict(test)

    print()
    print('predicted:')
    print(predicted)
    print()
    print('y = reg.coef_ * x + reg.intercept_')
    print(f'reg.coef_ = {model.coef_}')
    print(f'reg.intercept_ = {model.intercept_:.2f}')

    # Show the raw linear predictor per test row (before summing over the
    # features and before inverting the link).
    for row in test:
        print(f'{row} -> {model.coef_ * row + model.intercept_}')

    # Tell the reader which steps are still missing from the values above.
    remaining_steps = []
    if n_features > 1:
        remaining_steps.append('sum')
    if linkfunc != 'identity':
        remaining_steps.append(f"inverse of link function '{linkfunc}'")
    if remaining_steps:
        print(' and '.join(remaining_steps), 'to be applied!')

    print()
    print(f'n_iter_ = {model.n_iter_}')
    print()
def tweedie_test(X_train, y_train, X_test, y_test, pwr, alf):
    '''
    Fit a Tweedie GLM on the training split and evaluate it on the test split.

    Returns a tuple ``(mae, model, predictions)``: the mean absolute error on
    the test split, the fitted estimator and its test predictions.
    '''
    # power=0 corresponds to the normal distribution
    model = TweedieRegressor(power=pwr, alpha=alf)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    # Mean absolute error of the test predictions
    mae = mean_absolute_error(y_test, predictions)
    return mae, model, predictions
def tweedie(X_train_scaled, y_train):
    '''
    Fit a Tweedie GLM (power=0, alpha=0.001) and return its in-sample RMSE.
    '''
    # power=0 corresponds to the normal distribution
    model = TweedieRegressor(power=0, alpha=.001)
    model.fit(X_train_scaled, y_train)

    # Score the model on the same data it was trained on
    in_sample_pred = model.predict(X_train_scaled)
    return sqrt(mean_squared_error(y_train, in_sample_pred))
def tweedie05(X_train_scaled, y_train):
    '''
    Fit a Tweedie GLM (power=0, alpha=0.5) and return its in-sample MAE.
    '''
    # power=0 corresponds to the normal distribution
    model = TweedieRegressor(power=0, alpha=.5)
    model.fit(X_train_scaled, y_train)

    # Mean absolute error on the data the model was trained on
    in_sample_pred = model.predict(X_train_scaled)
    return mean_absolute_error(y_train, in_sample_pred)
def tweedie_vt(X_train_scaled, X_validate_scaled, y_train, y_validate):
    '''
    Fit a Tweedie GLM (power=0, alpha=0.001) on the training split and
    return its RMSE on the held-out split passed as ``X_validate_scaled`` /
    ``y_validate``.
    '''
    # power=0 corresponds to the normal distribution
    model = TweedieRegressor(power=0, alpha=0.001)
    model.fit(X_train_scaled, y_train)

    # Out-of-sample predictions and their root mean squared error
    held_out_pred = model.predict(X_validate_scaled)
    return sqrt(mean_squared_error(y_validate, held_out_pred))
def train_model_normalize(document, drop):
    """Standardise ``document``, fit a Tweedie GLM on 'charges' and plot correlations.

    Every column of ``document`` (including the 'charges' target) is
    standardised with ``StandardScaler``.  The data is split 70/30 with
    ``random_state=1``, a ``TweedieRegressor(alpha=0.1)`` is fitted on the
    training part and a heatmap of the training-feature correlations is
    drawn with seaborn.

    Parameters
    ----------
    document : pandas.DataFrame
        Frame that contains a 'charges' column.
    drop : iterable of column labels
        Columns to leave out of the features.  'charges' is always left out
        as well.  The argument itself is not modified.

    Returns
    -------
    tuple
        ``(reg, x_test, y_test)``: the fitted estimator and the held-out
        features and target.
    """
    scaler = StandardScaler()
    scaled = pd.DataFrame(scaler.fit_transform(document),
                          index=document.index,
                          columns=document.columns)

    # Work on a copy so the caller's list does not grow by one 'charges'
    # entry on every call.
    excluded = list(drop)
    if 'charges' not in excluded:
        excluded.append('charges')

    # ``columns=`` replaces the positional axis argument ``drop(labels, 1)``,
    # which pandas 2.0 no longer accepts.
    x = scaled.drop(columns=excluded)
    y = scaled['charges']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.7, random_state=1)

    # Earlier experiments used LinearRegression, Ridge(alpha=100),
    # BayesianRidge and Lasso in place of the Tweedie GLM.
    reg = TweedieRegressor(alpha=0.1).fit(x_train, y_train)

    corr = x_train.corr()
    # An all-False mask hides nothing; the builtin ``bool`` replaces the
    # ``np.bool`` alias that NumPy 1.24 removed.
    sns.heatmap(corr,
                mask=np.zeros_like(corr, dtype=bool),
                cmap=sns.diverging_palette(220, 10, as_cmap=True),
                square=True)
    return reg, x_test, y_test
def __params_pipe(**glm_pars):
    """Build a scaler + Tweedie GLM pipeline.

    Entries of ``self.params`` override the keyword arguments of the same
    name.  ``self.standardize`` switches both centring and scaling of the
    ``StandardScaler`` step.

    NOTE(review): ``self`` is not a parameter here; presumably this function
    is nested inside a method and closes over it - confirm.
    """
    glm_settings = glm_pars
    glm_settings.update(self.params)
    scaler = StandardScaler(with_mean=self.standardize,
                            with_std=self.standardize)
    glm = TweedieRegressor(**glm_settings)
    return Pipeline([('scaler', scaler), ('glm', glm)])
def glm_model(X_tr, y_tr, X_v, y_v, X_te, y_te, d_str, **kwargs):
    '''
    Fit a Tweedie GLM and report its RMSE on train, validate and test.

    ``kwargs`` are forwarded to ``TweedieRegressor``; ``d_str`` is only used
    to label the printed report.  The train RMSE is printed, rounded to six
    decimals.

    Returns ``(train_rmse, validate_rmse, test_rmse, test_predictions)``.
    '''
    model = TweedieRegressor(**kwargs)
    model.fit(X_tr, y_tr)

    def predict_and_score(features, target):
        # Predict one split and compute its root mean squared error.
        predictions = model.predict(features)
        return sqrt(mean_squared_error(target, predictions)), predictions

    train_rmse, _ = predict_and_score(X_tr, y_tr)
    validate_rmse, _ = predict_and_score(X_v, y_v)
    test_rmse, test_predictions = predict_and_score(X_te, y_te)

    print(f'RMSE for GLM using {d_str} Distribution \n')
    print('On train data:\n', round(train_rmse, 6), '\n')
    return train_rmse, validate_rmse, test_rmse, test_predictions
def test_tweedie_regression_family(regression_data):
    """Check that ``family`` stays a TweedieDistribution and tracks ``power``."""
    initial_power = 2.0
    est = TweedieRegressor(power=initial_power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == initial_power
    assert est.power == initial_power

    # Replacing the family must update the estimator's power as well.
    replacement_power = 0
    est.family = TweedieDistribution(power=replacement_power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == replacement_power
    assert est.power == replacement_power

    # Anything that is not a TweedieDistribution is rejected.
    expected_error = (
        "TweedieRegressor.family must be of type TweedieDistribution!"
    )
    with pytest.raises(TypeError, match=expected_error):
        est.family = None
def fit(self, X, y=None, sample_weight=None):
    """Fit the design-matrix + Tweedie GLM pipeline on ``X``.

    The response column is ``self.response`` when set, otherwise the first
    column of ``X``.  ``y`` and ``sample_weight`` are accepted but not read
    by this method; the response and weight are passed to ``DevelopmentML``
    through ``y_ml`` and ``weight_ml``.

    Returns ``self`` with the fitted model stored in ``self.model_``.
    """
    response = self.response if self.response else X.columns[0]

    design_step = ('design_matrix', PatsyFormula(self.design_matrix))
    # The GLM is always created with fit_intercept=False.
    glm_step = ('model', TweedieRegressor(
        link=self.link,
        power=self.power,
        max_iter=self.max_iter,
        tol=self.tol,
        warm_start=self.warm_start,
        verbose=self.verbose,
        fit_intercept=False))

    pipeline = Pipeline(steps=[design_step, glm_step])
    self.model_ = DevelopmentML(
        pipeline, y_ml=response, weight_ml=self.weight).fit(X)
    return self
def hurdle(x, y, log=True, max_iter=1000):
    """Fit a two-part hurdle model: a classifier for ``y > 0`` plus a regressor.

    The classifier is an unpenalised logistic regression fitted on all rows.
    The regressor is fitted on the rows with ``y > 0`` only: a log-link
    Tweedie GLM (power=0, alpha=0) when ``log`` is true, ordinary least
    squares otherwise.  NaNs are removed first via ``remove_nans``.

    Returns a ``HurdleModel`` wrapping both fitted estimators.
    """
    x, y = remove_nans(x, y)
    n_obs = len(x)

    # NOTE(review): newer scikit-learn releases expect ``penalty=None``
    # instead of the string 'none' - confirm the supported version.
    clf = LogisticRegression(fit_intercept=True, penalty='none',
                             max_iter=max_iter)
    reg = (
        TweedieRegressor(fit_intercept=True, power=0, link='log', alpha=0,
                         tol=1e-8, max_iter=max_iter)
        if log
        else LinearRegression(fit_intercept=True)
    )

    positive = y > 0
    clf.fit(x, positive)
    reg.fit(x[positive, :], y[positive])
    return HurdleModel(clf, reg, n_obs, log=log, x=x, y=y)
def get_regressors_generalized(nmodels='all'):
    """Return one or all of the generalized linear regressors.

    Parameters
    ----------
    nmodels : 'all', int or str
        ``'all'`` returns every regressor.  ``1``, ``2`` or ``3`` (or the
        same number as a string) selects PoissonRegressor, TweedieRegressor
        or GammaRegressor respectively.

    Returns
    -------
    list
        Unfitted estimator instances.  The list always holds estimator
        objects; the previous implementation returned the variable name as
        a string (e.g. ``['lr2']``) when a single model was requested.

    Raises
    ------
    ValueError
        If ``nmodels`` is neither ``'all'`` nor one of 1, 2, 3.
    """
    regressors = {
        1: PoissonRegressor(),
        2: TweedieRegressor(),
        3: GammaRegressor(),
    }

    if nmodels == 'all':
        return list(regressors.values())

    try:
        return [regressors[int(nmodels)]]
    except (KeyError, TypeError, ValueError):
        raise ValueError(
            f"nmodels must be 'all' or one of {sorted(regressors)}, "
            f"got {nmodels!r}"
        ) from None
def test_tweedie_link_argument(name, link_class):
    """Check that a link given by name resolves to the expected link class,
    and that an unknown name is rejected when fitting."""
    target = np.array([0.1, 0.5])  # in range of all distributions
    features = np.array([[1], [2]])

    fitted = TweedieRegressor(power=1, link=name).fit(features, target)
    assert isinstance(fitted._base_loss.link, link_class)

    # An unsupported link name is only detected at fit time.
    invalid = TweedieRegressor(power=1, link="not a link")
    expected_message = re.escape(
        "The link must be an element of ['auto', 'identity', 'log']"
    )
    with pytest.raises(ValueError, match=expected_message):
        invalid.fit(features, target)
def cvTweedie(X_train, y_train, pwr, alf):
    """Leave-one-out cross-validate a Tweedie GLM and return its mean MAE.

    The mean and standard deviation of the absolute errors are printed;
    only the mean is returned.
    """
    # power=0 corresponds to the normal distribution
    estimator = TweedieRegressor(power=pwr, alpha=alf)
    neg_mae_scores = cross_val_score(
        estimator, X_train, y_train,
        scoring='neg_mean_absolute_error',
        cv=LeaveOneOut(),
        n_jobs=-1)

    # The scorer reports negated errors; flip them back to positive values.
    abs_errors = absolute(neg_mae_scores)
    mean_mae = mean(abs_errors)
    print('MAE: %.3f (%.3f)' % (mean_mae, std(abs_errors)))
    return mean_mae
def sk_tweedie_regression(X_train, X_test, y_train, y_test, set_model='linear'):
    """Fit an unpenalised Tweedie GLM, print its test score and plot the fit.

    ``set_model`` selects the distribution: ``'Poisson'`` (power=1, log
    link) or ``'linear'`` (power=0, identity link).  Any other value prints
    a hint and returns without fitting.

    NOTE(review): the prediction and both plots use the names ``X`` and
    ``y``, which are not parameters of this function - presumably
    module-level data; confirm this is intended rather than the test split.
    """
    if set_model == 'Poisson':
        # Poisson distribution
        family = {'power': 1, 'link': 'log'}
    elif set_model == 'linear':
        # Normal distribution
        family = {'power': 0, 'link': 'identity'}
    else:
        print('Set the correct name.')
        return

    reg = TweedieRegressor(alpha=0, fit_intercept=False, max_iter=300,
                           **family)
    reg.fit(X_train, y_train)
    print('score: ', reg.score(X_test, y_test))

    y_hat = reg.predict(X)

    # Observations as dots, fitted values as red stars.
    plt.figure(figsize=(6.0, 6.0))
    plt.plot(X, y, 'o')
    plt.plot(X, y_hat, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
def get_pipe(self):
    """Build the grid-searched scaler -> selector -> Tweedie GLM pipeline.

    * ``self.inner_cv`` is the CV splitter for the grid search; when it is
      ``None`` a 10-fold ``RepeatedKFold`` (1 repeat, ``random_state=0``)
      is used.
    * ``self.est_kwargs`` is the parameter grid; when it is ``None`` a
      default grid is created and stored back on the instance.
    * ``self.bestT`` prepends a ``columnBestTransformer`` step.
    * ``self.do_prep`` wraps the grid search in an outer pipeline that
      starts with ``missingValHandler``.

    Returns
    -------
    GridSearchCV or Pipeline, or None
        The assembled estimator.  On any error the exception is logged and
        ``None`` is returned (best-effort behaviour kept from the original).
    """
    try:
        inner_cv = self.inner_cv
        if inner_cv is None:
            inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)

        if self.est_kwargs is None:
            self.est_kwargs = {
                # investigate the ideal range for alpha
                'reg__alpha': np.logspace(-5, 10, self.gridpoints).tolist(),
                # NOTE(review): the original comment says "investigate power
                # values between 2 and 3", but np.logspace(1, 3, ...) spans
                # 10 to 1000 - confirm the intended range.
                'reg__power': [
                    0,
                    *np.logspace(1, 3, self.gridpoints - 1).tolist(),
                ],
                # maybe look to tweak using k_share
                'select__max_k': [4, 8, 32],
            }

        steps = [
            ('scaler', StandardScaler()),
            ('select', shrinkBigKTransformer(max_k=8)),
            ('reg', TweedieRegressor()),
        ]
        if self.bestT:
            # list.insert takes (index, item): the step must be passed as
            # one (name, transformer) tuple.  The former three-argument call
            # raised TypeError, which the handler below swallowed.
            steps.insert(
                0,
                ('xtransform',
                 columnBestTransformer(float_k=len(self.float_idx))))

        outerpipe = GridSearchCV(Pipeline(steps=steps),
                                 param_grid=self.est_kwargs,
                                 cv=inner_cv)
        if self.do_prep:
            steps = [('prep', missingValHandler(prep_dict=self.prep_dict)),
                     ('post', outerpipe)]
            outerpipe = Pipeline(steps=steps)
        return outerpipe
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        self.logger.exception('get_pipe error for flexibleGLM')
        return None
# implementation does not allow for this (yet). # # We will compare the performance of both approaches. # To quantify the performance of both models, one can compute # the mean deviance of the train and test data assuming a Compound # Poisson-Gamma distribution of the total claim amount. This is equivalent to # a Tweedie distribution with a `power` parameter between 1 and 2. # # The :func:`sklearn.metrics.mean_tweedie_deviance` depends on a `power` # parameter. As we do not know the true value of the `power` parameter, we here # compute the mean deviances for a grid of possible values, and compare the # models side by side, i.e. we compare them at identical values of `power`. # Ideally, we hope that one model will be consistently better than the other, # regardless of `power`. glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, max_iter=10000) glm_pure_premium.fit(X_train, df_train["PurePremium"], sample_weight=df_train["Exposure"]) tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999] scores_product_model = score_estimator( (glm_freq, glm_sev), X_train, X_test, df_train, df_test, target="PurePremium", weights="Exposure", tweedie_powers=tweedie_powers,
def main_Calib(filename, output, mode, alg, basis, order, figure, verbose, offset, qt, pre, split):
    '''
    Build a regression design matrix from one data file, fit it with the
    selected backend and write the coefficients to an HDF5 file.

    NOTE(review): the nesting of this function was reconstructed from
    whitespace-collapsed source; branch levels marked below should be
    confirmed against the original file.

    filename: input file, passed to pub.ReadFile (only read when pre != 'r')
    output:   path of the HDF5 file written with h5py (opened with 'w')
    mode:     'PE', 'time' or 'combined'; selects the response variable
    alg:      'sms' (statsmodels), 'custom' (scipy minimize), 'sk'
              (scikit-learn) or 'h2o'
    basis:    'Legendre' or 'Zernike'
    order:    basis order; also appended to the output dataset names
    figure:   'ON' draws the fitted Zernike map to 'test.png'
    verbose:  truthy prints shapes, summaries and coefficients
    offset:   when truthy it is passed to pub.LoadBase
    qt:       quantile for the statsmodels quantile regression
    pre:      falsy = run the regression; 'w' = write the design matrix to
              'test1.parquet' and return; 'r' = fit from "electron-1.parquet"
    split:    fraction of the events / rows that is kept

    The datasets 'coeff<order>', 'std<order>' and 'AIC<order>' are written
    to `output`.  The names np, time, h5py, pub and PMTPos are taken from
    module scope.
    '''
    if pre != 'r':
        print('begin reading file', flush=True)
        EventID, ChannelID, Q, PETime, photonTime, PulseTime, dETime, x, y, z = pub.ReadFile(filename)
        # vertices as rows (x, y, z), divided by 1e3
        VertexTruth = (np.vstack((x, y, z))/1e3).T
        # `off` is assigned here but not read again inside this function
        if(offset):
            off = pub.LoadBase(offset)
        else:
            off = np.zeros_like(PMTPos[:,0])
        print('total event: %d' % np.size(np.unique(EventID)), flush=True)
        print('begin processing legendre coeff', flush=True)
        # this part for the same vertex
        tmp = time.time()
        EventNo = np.size(np.unique(EventID))
        PMTNo = np.size(PMTPos[:,0])
        # Pair every sample with a PMT position and an event vertex:
        # 'PE' and 'combined' use every (event, PMT) combination, 'time'
        # uses one row per entry of ChannelID.
        if mode == 'PE':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)
        elif mode == 'time':
            counts = np.bincount(EventID)
            counts = counts[counts!=0]
            PMTPosRep = PMTPos[ChannelID]
            vertex = np.repeat(VertexTruth, counts, axis=0)
        elif mode == 'combined':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)
        if basis == 'Legendre':
            X, cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=True)
        elif basis == 'Zernike':
            from zernike import RZern
            cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=False)
            cart = RZern(order)
            nk = cart.nk
            m = cart.mtab
            n = cart.ntab
            # radius is scaled by 0.65 before the polynomials are evaluated
            rho = np.linalg.norm(vertex, axis=1)/0.65
            theta = np.arccos(cos_theta)
            X = np.zeros((rho.shape[0], nk))
            for i in np.arange(nk):
                if not i % 5:
                    print(f'process {i}-th event')
                X[:,i] = cart.Zk(i, rho, theta)
            # keep only the columns whose mtab entry is non-negative
            X = X[:,m>=0]
        # NOTE(review): this print may belong inside the Zernike branch
        print(f'rank: {np.linalg.matrix_rank(X)}')
        print(f'use {time.time() - tmp} s')
        # which info should be used
        if mode == 'PE':
            y = Q
        elif mode == 'time':
            y = PulseTime
        elif mode == 'combined':
            # PulseTime = PulseTime - np.min(PulseTime)
            # PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            # print(np.min(PulseTime), np.max(PulseTime))
            PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            bins = np.arange(-1, 0.05, 0.1)
            N = 10
            # Legendre coeff
            x = pub.legval(bins, np.eye(N).reshape(N, N, 1))
            # 1st basis
            Y = np.tile(x, len(np.unique(EventID))*len(np.unique(ChannelID))).T
            # 2nd basis
            X = np.repeat(X, bins.shape[0], axis=0)
            # output
            y = np.zeros((len(np.unique(EventID)), len(np.unique(ChannelID)), len(bins)))
            '''
            basis = np.zeros((X.shape[0], X.shape[1]*Y.shape[1]))
            for i_index, i in enumerate(np.arange(X.shape[1])):
                for j_index, j in enumerate(np.arange(Y.shape[1])):
                    total_index = i_index*Y.shape[1] + j_index
                    if not total_index % 10:
                        print(total_index)
                    basis[:, total_index] = X[:,i_index]*Y[:,j_index]
            X = basis
            '''
            split_index = np.unique(EventID).shape[0]
            # histogram the pulse times per (event, PMT); processing stops
            # after the first `split` fraction of the events
            for k_index, k in enumerate(np.unique(EventID)): # event begin with 1
                if k_index > split_index * split:
                    break
                if not k % 100:
                    print(k)
                index = EventID == k
                CID = ChannelID[index]
                Pulse_t = PulseTime[index]
                for i in np.unique(CID): # PMT begin with 0
                    y[k_index, i, 1:], _ = np.histogram(Pulse_t[CID==i], bins=bins)
            y = np.reshape(y,(-1))
        if verbose:
            print(f'the basis shape is {X.shape}, and the dependent variable shape is {y.shape}')
        if pre =='w':
            # Dump the design matrix to parquet instead of fitting.
            # `Y` is only assigned in the mode == 'combined' branch above.
            if split != 1:
                # NOTE(review): recent NumPy releases no longer provide
                # the np.int alias - confirm the supported version
                split_index = np.int(split*y.shape[0])
                X = X[:split_index]
                Y = Y[:split_index]
                y = y[:split_index]
            import pandas as pd
            import pyarrow as pa
            import pyarrow.parquet as pq
            y = np.atleast_2d(y).T
            #data = np.hstack((X, y, np.ones_like(y)))
            # columns are named X<i>, Y<i> and 'output'
            df_X = pd.DataFrame(X)
            X_names = []
            for i in df_X.columns:
                X_names.append('X' + str(i))
            df_X.columns = X_names
            df_Y = pd.DataFrame(Y)
            Y_names = []
            for i in df_Y.columns:
                Y_names.append('Y' + str(i))
            df_Y.columns = Y_names
            df_y = pd.DataFrame(y)
            df_y.columns = ['output']
            df = pd.concat([df_X, df_Y, df_y], axis=1)
            table = pa.Table.from_pandas(df)
            pq.write_table(table, 'test1.parquet')
            return
    if not pre:
        # Regression methods:
        if alg == 'sms':
            import statsmodels.api as sm
            if mode == 'PE':
                model = sm.GLM(y, X, family=sm.families.Poisson(), fit_intercept=False)
                result = model.fit()
                if verbose:
                    print(result.summary())
                AIC = result.aic
                coef_ = result.params
                std = result.bse
            elif mode == 'time':
                import pandas as pd
                data = pd.DataFrame(data = np.hstack((X, np.atleast_2d(y).T)))
                # Build the formula 'y ~ X1 + X2 ...' and the column names
                # X0, X1, ..., y; the last column is the response.
                strs = 'y ~ '
                start = data.keys().start
                stop = data.keys().stop
                step = data.keys().step
                cname = []
                cname.append('X0')
                for i in np.arange(start+1, stop, step):
                    if i == start + 1:
                        strs += 'X%d ' % i
                    elif i == stop - step:
                        pass
                    else:
                        strs += ' + X%d ' % i
                    if i == stop - step:
                        cname.append('y')
                    else:
                        cname.append('X%d' % i)
                data.columns = cname
                mod = sm.formula.quantreg(strs, data[cname])
                result = mod.fit(q=qt,)
                coef_ = result.params
                AIC = np.zeros_like(coef_)
                std = np.zeros_like(coef_)
                print('Waring! No AIC and std value')
            elif mode == 'combined':
                # data = pd.DataFrame(data = np.hstack((basis, np.atleast_2d(y).T)))
                # the design matrix and response are stored in `output`
                # here; the same file is opened with 'w' again at the end
                with h5py.File(output,'w') as out:
                    out.create_dataset('X', data = X)
                    out.create_dataset('Y', data = y)
                print('begin...')
                model = sm.GLM(y, X, family=sm.families.Poisson())
                result = model.fit()
                if verbose:
                    print(result.summary())
                coef_ = result.params
                std = result.bse
                AIC = result.aic
            if verbose:
                print(result.summary())
        elif (alg == 'custom'):
            from scipy.optimize import minimize
            x0 = np.zeros_like(X[0]) # initial value (be careful of Zernike order)
            if mode == 'PE':
                x0[0] = 0.8 + np.log(2) # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))
            elif mode == 'time':
                x0[0] = np.mean(y)
                # the qt argument of the function is overwritten here
                qt = 0.1
                ts = 2.6
                result = minimize(pub.CalibTime, x0=x0, method='SLSQP', args = (np.hstack((EventID, EventID)), y, X, qt, ts))
            elif mode == 'combined':
                x0 = np.zeros_like(X[0])
                x0[0] = 0.8 + np.log(2) # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))
            coef_ = np.array(result.x, dtype=float)
            if verbose:
                print(result.message)
            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)
            # H is computed but not used further in this function
            H = pub.MyHessian(result.x, pub.CalibPE, *(y, PMTPos, X))
            # H = pub.MyHessian(result.x, *(Q, PMTPos, X, pub.CalibTime))
            # std = 1/np.sqrt(-np.diag(np.linalg.pinv(H1)))
            print(coef_)
            # print(std)
            print('Waring! No AIC and std value, std is testing')
        elif alg == 'sk':
            from sklearn.linear_model import TweedieRegressor
            alpha = 0.001
            # power=1 with a log link and no intercept
            reg = TweedieRegressor(power=1, alpha=alpha, link='log', max_iter=1000, tol=1e-6, fit_intercept=False)
            reg.fit(X, y)
            # just for point data
            # pred = reg.predict(X[0:30,0:cut+1])
            print('coeff:\n', reg.coef_,'\n')
            coef_ = reg.coef_
            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)
            print('Waring! No AIC and std value')
        elif alg == 'h2o':
            import h2o
            from h2o.estimators.gbm import H2OGradientBoostingEstimator
            from h2o.estimators.glm import H2OGeneralizedLinearEstimator
            if mode != 'combined':
                # frame layout: predictors, response, column of ones
                y = np.atleast_2d(y).T
                data = np.hstack((X, y, np.ones_like(y)))
                h2o.init()
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]
            if mode == 'PE':
                #offset_col = hf.columns[-1]
                glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                    #offset_column = offset_col,
                    lambda_ = 0,
                    compute_p_values = True)
                glm_model.train(predictors, response_col, training_frame=hf)
                coef_table = glm_model._model_json['output']['coefficients_table']
                coef_ = glm_model.coef()
            elif mode == 'time':
                gbm = H2OGradientBoostingEstimator(distribution="quantile", seed = 1234,
                    stopping_metric = "mse", stopping_tolerance = 1e-4)
                gbm.train(x = predictors, y = response_col, training_frame = hf)
                # debugging leftovers: drops into the debugger, then exits
                breakpoint()
                print(gbm)
                exit()
            elif mode == 'combined':
                y = np.atleast_2d(y).T
                data = np.hstack((X, Y, y, np.ones_like(y)))
                h2o.init()
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]
            if verbose:
                print(coef_)
            if basis == 'Zernike':
                print(f'Regession coef shape is f{np.array(coef_).shape}, Zernike shape is {nk}')
            # coef_table and glm_model are only assigned in the 'PE' branch
            coef_ = coef_table['coefficients']
            std = coef_table['std_error']
            AIC = glm_model.aic()
            h2o.cluster().shutdown()
    elif pre == 'r':
        # Fit with h2o directly from a previously written parquet file.
        import h2o
        from h2o.estimators.gbm import H2OGradientBoostingEstimator
        from h2o.estimators.glm import H2OGeneralizedLinearEstimator
        h2o.init()
        hf = h2o.import_file("electron-1.parquet")
        # interaction pairs: columns starting with 'Z' against columns
        # starting with 'L'
        pairs = []
        for i in hf.columns:
            for j in hf.columns:
                if (i.startswith('Z') and j.startswith('L')):
                    if ((i!='X0') and (j != 'Y0')):
                        pairs.append((i,j))
        predictors = hf.columns[2:]
        response_col = hf.columns[0]
        print(predictors)
        print(response_col)
        print(pairs)
        if mode == 'PE':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                #offset_column = offset_col,
                lambda_ = 0,
                compute_p_values = True)
            glm_model.train(predictors, response_col, training_frame=hf)
        elif mode == 'combined':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                #offset_column = offset_col,
                interaction_pairs=pairs,
                lambda_ = 0,
                #remove_collinear_columns = True,
                compute_p_values = True)
            glm_model.train(predictors, response_col, training_frame=hf)
            # debugging leftover
            breakpoint()
        coef_table = glm_model._model_json['output']['coefficients_table']
        coef_ = coef_table['coefficients']
        std = coef_table['std_error']
        AIC = glm_model.aic()
        print(f'Regession coef is f{np.array(coef_)}')
        # NOTE(review): `cart` is only assigned in the Zernike branch of the
        # `pre != 'r'` block; confirm at which level this figure block sits
        if (figure=='ON'):
            import matplotlib.pyplot as plt
            L, K = 500, 500
            ddx = np.linspace(-1.0, 1.0, K)
            ddy = np.linspace(-1.0, 1.0, L)
            xv, yv = np.meshgrid(ddx, ddy)
            cart.make_cart_grid(xv, yv)
            # normal scale
            # im = plt.imshow(np.exp(cart.eval_grid(np.array(coef_), matrix=True)), origin='lower', extent=(-1, 1, -1, 1))
            # log scale
            im = plt.imshow(cart.eval_grid(np.array(coef_), matrix=True), origin='lower', extent=(-1, 1, -1, 1))
            plt.colorbar()
            plt.savefig('test.png')
    # NOTE(review): confirm which `if` this `else` pairs with; the message
    # suggests it once closed the `alg` chain
    else:
        print('error regression algorithm')
    # persist the fit result; dataset names carry the basis order
    with h5py.File(output,'w') as out:
        out.create_dataset('coeff' + str(order), data = coef_)
        out.create_dataset('std' + str(order), data = std)
        out.create_dataset('AIC' + str(order), data = AIC)
# print(gks_test) gks_x = gks.iloc[:, :-1].values gks_y = gks.iloc[:, -1].values gks_x_test = gks_test.iloc[:, :-1].values gks_y_test = gks_test.iloc[:, -1].values scaler = StandardScaler() gks_x = scaler.fit_transform(gks_x) # reg = SVR(C=10, epsilon=0.2) reg = TweedieRegressor(power=1, alpha=0.5, link='log') reg.fit(gks_x, gks_y) gks_x_test = scaler.transform(gks_x_test) preds = reg.predict(gks_x_test) print(mean_squared_error(gks_y_test, preds)) # print(gks_test_names) with open('gks.csv', 'w') as file: for idx, val in enumerate(preds): file.write(gks_test_names.iloc[idx]['web_name'] + "," + str(val) + "," + str(gks_y_test[idx])) file.write('\n')
def __init__(self, data, col_num):
    """Initialise the base class, then attach a log-link Tweedie GLM.

    ``data`` and ``col_num`` are forwarded unchanged to the parent
    initialiser.
    """
    super().__init__(data, col_num)
    # power=2 with a log link
    self.model = TweedieRegressor(power=2, link='log')
    # NOTE(review): the meaning of ``_spread`` is not visible here;
    # presumably read by the base class or by sibling methods - confirm.
    self._spread = 0.01
def test_tweedie_link_auto(power, expected_link_class):
    """Check that ``link='auto'`` resolves to the expected link for ``power``."""
    target = np.array([0.1, 0.5])  # in range of all distributions
    features = np.array([[1], [2]])
    fitted = TweedieRegressor(link="auto", power=power).fit(features, target)
    resolved_link = fitted._base_loss.link
    assert isinstance(resolved_link, expected_link_class)
def test_tweedie_link_argument(name, link_class):
    """Check that a link given by name resolves to the expected link class."""
    target = np.array([0.1, 0.5])  # in range of all distributions
    features = np.array([[1], [2]])
    fitted = TweedieRegressor(power=1, link=name).fit(features, target)
    resolved_link = fitted._base_loss.link
    assert isinstance(resolved_link, link_class)
n_informative=80, noise=0.5, random_state=2) return X, y @pytest.fixture( params=itertools.product( ["long", "wide"], [ BinomialRegressor(), PoissonRegressor(), GammaRegressor(), # TweedieRegressor(power=3.0), # too difficult # TweedieRegressor(power=0, link="log"), # too difficult TweedieRegressor(power=1.5), ], ), ids=lambda param: f"{param[0]}-{param[1]}", ) def glm_dataset(global_random_seed, request): """Dataset with GLM solutions, well conditioned X. This is inspired by ols_ridge_dataset in test_ridge.py. The construction is based on the SVD decomposition of X = U S V'. Parameters ---------- type : {"long", "wide"} If "long", then n_samples > n_features.
tick.label.set_fontsize(6) axes.tick_params(width=4) # change all spines for axis in ['top','bottom','left','right']: axes.spines[axis].set_linewidth(6) #%% from sklearn.linear_model import TweedieRegressor X = np.array(x).reshape(-1,1) Y = np.array(y) pr = TweedieRegressor(power = 1, alpha=0, fit_intercept=True) y_pred_pr = pr.fit(X, Y).predict(X) fig, axes = utils.plot_make(size_length=5) sns.scatterplot(data = sc_vs_quickness_group_fill, x = "sc_LR_mean", y= "inverse_quickness", linewidth=0, s=100) sns.lineplot(x = X.flatten(), y = y_pred_pr) pr.score(X, Y) #% X2 = sm.add_constant(X) glm = sm.GLM(Y, X2, family=sm.families.Tweedie())
def tweedie_regression():
    """Fit a log-link Tweedie GLM (power=1, alpha=0.5) on a three-sample toy
    data set and print the coefficients and the intercept."""
    samples = [[0, 0], [0, 1], [2, 2]]
    targets = [0, 1, 2]
    model = TweedieRegressor(power=1, alpha=0.5, link='log')
    model.fit(samples, targets)
    print(model.coef_)
    print(model.intercept_)
# the power attribute is properly updated power = 2.0 est = TweedieRegressor(power=power) assert isinstance(est.family, TweedieDistribution) assert est.family.power == power assert est.power == power new_power = 0 new_family = TweedieDistribution(power=new_power) est.family = new_family assert isinstance(est.family, TweedieDistribution) assert est.family.power == new_power assert est.power == new_power msg = "TweedieRegressor.family must be of type TweedieDistribution!" with pytest.raises(TypeError, match=msg): est.family = None @pytest.mark.parametrize( "estimator, value", [ (PoissonRegressor(), True), (GammaRegressor(), True), (TweedieRegressor(power=1.5), True), (TweedieRegressor(power=0), False), ], ) def test_tags(estimator, value): assert estimator._get_tags()["requires_positive_y"] is value
def _rmse(actual, predicted):
    """Return the root mean squared error of ``predicted`` against ``actual``."""
    # The exponent must be 0.5: ``mse**1 / 2`` parses as ``(mse**1) / 2``.
    return mean_squared_error(actual, predicted) ** 0.5


def _fit_and_score(model, column, X_fit, X_val, y_train, y_validate):
    """Fit ``model``, store its predictions in ``column`` and return both RMSEs."""
    model.fit(X_fit, y_train.appraised_value)
    y_train[column] = model.predict(X_fit)
    y_validate[column] = model.predict(X_val)
    return (_rmse(y_train.appraised_value, y_train[column]),
            _rmse(y_validate.appraised_value, y_validate[column]))


def all_models_info():
    '''
    Fit four regressors on the zillow data and print their RMSE.

    The models are OLS, Lasso+LARS, a Tweedie GLM (power=1, alpha=0) and a
    degree-2 polynomial OLS.  Each model's predictions are stored as new
    ``appraised_value_pred_*`` columns on ``y_train`` and ``y_validate``,
    and the training and validation RMSE of each model is printed.  Nothing
    is returned.

    The RMSE is now the square root of the MSE for every model.  Previously
    only the OLS model used ``**(1 / 2)``; the others used ``**1 / 2``,
    which reported half of the MSE.
    '''
    # get data
    # NOTE(review): ``df`` and ``train`` are not used below; the calls are
    # kept in case they have side effects - confirm and remove if not.
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    # pull from add to train
    train = evaluate.add_to_train()
    X_train, y_train, X_validate, y_validate, X_test, y_test = (
        evaluate.xtrain_xval_xtest())

    # OLS Model
    # NOTE(review): ``normalize=`` is not accepted by recent scikit-learn
    # releases - confirm the supported version.
    rmse_train_lm, rmse_validate_lm = _fit_and_score(
        LinearRegression(normalize=True), 'appraised_value_pred_lm',
        X_train, X_validate, y_train, y_validate)

    # LARS Model
    rmse_train_lars, rmse_validate_lars = _fit_and_score(
        LassoLars(alpha=1.0), 'appraised_value_pred_lars',
        X_train, X_validate, y_train, y_validate)

    # GLM
    rmse_train_glm, rmse_validate_glm = _fit_and_score(
        TweedieRegressor(power=1, alpha=0), 'appraised_value_pred_glm',
        X_train, X_validate, y_train, y_validate)

    # Polynomial features: fitted on train, applied to validate
    pf = PolynomialFeatures(degree=2)
    X_train_degree2 = pf.fit_transform(X_train)
    X_validate_degree2 = pf.transform(X_validate)

    # LM2
    rmse_train_lm2, rmse_validate_lm2 = _fit_and_score(
        LinearRegression(normalize=True), 'appraised_value_pred_lm2',
        X_train_degree2, X_validate_degree2, y_train, y_validate)

    print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ",
          rmse_train_lm, "\nValidation/Out-of-Sample: ", rmse_validate_lm)
    print("--------------------------------------------------------------")
    print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train_lars,
          "\nValidation/Out-of-Sample: ", rmse_validate_lars)
    print("--------------------------------------------------------------")
    print(
        "RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ",
        rmse_train_glm, "\nValidation/Out-of-Sample: ", rmse_validate_glm)
    print("--------------------------------------------------------------")
    print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ",
          rmse_train_lm2, "\nValidation/Out-of-Sample: ", rmse_validate_lm2)
inv_map = {v: k for k, v in eco_vec_map.items()} df_eco.columns = df_eco.columns.to_series().map(inv_map) df_eco.index.names = ['Date'] # Extract eco data from Sep 2018 to Jan 2020 df_eco_sel = df_eco.loc['2018-09-01':'2020-01-31'] # put together eco and transaction counts for regression df_all = pd.concat([df_eco_sel, df_period.set_index(df_eco_sel.index)], axis=1) y_train = df_all['Transaction_Count'].values X_train = df_all[[ 'CPI', 'Exchange_Rate_USD', 'GDP', 'Unemployment_Rate', 'TSX' ]] # generalized linear model glm = TweedieRegressor(power=1, alpha=0.5, link='log') # Poisson distribution scaler = preprocessing.StandardScaler().fit(X_train) X_train_scaled = scaler.transform(X_train) glm.fit(X_train_scaled, y_train) # predict eco data for given year and month df_future = pd.DataFrame(columns=['Date']) for i, eco_var in enumerate(list(eco_vec_map.keys())): print("Forecasting " + eco_var + ' ' + str(Y) + ' ' + datetime.strptime(str(M), "%m").strftime("%b")) tmp = forecast_eco(df_eco, eco_var, Y, M) tmp = tmp[['ds', 'trend']] tmp.rename(columns={'ds': 'Date', 'trend': eco_var}, inplace=True) df_future = df_future.merge(tmp, on='Date', how='right') # predict transaction count using the glm model
def tweedieregressor(self, X_train, X_test, y_train, y_test):
    """Fit a default ``TweedieRegressor`` and return its test predictions.

    NOTE(review): the four parameters are not read; training and
    prediction use ``self.X_train``, ``self.y_train`` and ``self.X_test``.
    Confirm whether the arguments were meant to be used instead.
    """
    model = TweedieRegressor()
    model.fit(self.X_train, self.y_train)
    predictions = model.predict(self.X_test)
    return predictions