Example #1
def tweedie_test(X_train, y_train, X_test, y_test, pwr, alf):
    '''
    runs tweedie algorithm
    '''
    # Make Model
    tw = TweedieRegressor(power=pwr, alpha=alf)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_test)
    # Compute mean absolute error
    tw_MAE = mean_absolute_error(y_test, tw_pred)
    return tw_MAE, tw, tw_pred
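A quick usage sketch of the helper above, assuming scaled train/test splits already exist (the variable names below are hypothetical):

# hypothetical split names; power=0 (normal) and alpha=0.001 as an example setting
mae, model, preds = tweedie_test(X_train, y_train, X_test, y_test, pwr=0, alf=0.001)
print('Tweedie MAE:', mae)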
def tweedie(X_train_scaled, y_train):
    '''
    runs tweedie algorithm
    '''
    # Make Model
    tw = TweedieRegressor(power=0, alpha=.001)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train_scaled, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_train_scaled)
    # Compute root mean squared error
    tw_rmse = sqrt(mean_squared_error(y_train, tw_pred))
    return tw_rmse
Example #3
def tweedie05(X_train_scaled, y_train):
    '''
    runs tweedie algorithm
    '''
    # Make Model
    tw = TweedieRegressor(power=0, alpha=.5)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train_scaled, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_train_scaled)
    # Compute mean absolute error
    tw_MAE = mean_absolute_error(y_train, tw_pred)
    return tw_MAE
def tweedie_vt(X_train_scaled, X_validate_scaled, y_train, y_validate):
    '''
    runs tweedie algorithm on validate and test
    but fits model on train
    '''
    # Make Model
    tw = TweedieRegressor(power=0, alpha=0.001)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train_scaled, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_validate_scaled)
    # Compute root mean squared error
    tw_rmse = sqrt(mean_squared_error(y_validate, tw_pred))
    return tw_rmse
def test_tweedie_link_argument(name, link_class):
    """Test GLM link argument set as string."""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = TweedieRegressor(power=1, link=name).fit(X, y)
    assert isinstance(glm._base_loss.link, link_class)

    glm = TweedieRegressor(power=1, link="not a link")
    with pytest.raises(
            ValueError,
            match=re.escape(
                "The link must be an element of ['auto', 'identity', 'log']"),
    ):
        glm.fit(X, y)
Example #6
def hurdle(x, y, log=True, max_iter=1000):
    x, y = remove_nans(x, y)
    n_obs = len(x)

    clf = LogisticRegression(fit_intercept=True,
                             penalty='none',
                             max_iter=max_iter)

    if log:
        reg = TweedieRegressor(fit_intercept=True,
                               power=0,
                               link='log',
                               alpha=0,
                               tol=1e-8,
                               max_iter=max_iter)
    else:
        reg = LinearRegression(fit_intercept=True)

    clf.fit(x, y > 0)
    reg.fit(x[y > 0, :], y[y > 0])

    return HurdleModel(clf, reg, n_obs, log=log, x=x, y=y)
def sk_tweedie_regression(X_train,
                          X_test,
                          y_train,
                          y_test,
                          set_model='linear'):
    if set_model == 'Poisson':
        reg = TweedieRegressor(
            alpha=0,
            power=1,  # Poisson distribution
            link='log',
            fit_intercept=False,
            max_iter=300)
    elif set_model == 'linear':
        reg = TweedieRegressor(
            alpha=0,
            power=0,  # Normal distribution
            link='identity',
            fit_intercept=False,
            max_iter=300)
    else:
        print("set_model must be 'Poisson' or 'linear'.")
        return

    reg.fit(X_train, y_train)
    print('score: ', reg.score(X_test, y_test))

    y_hat = reg.predict(X_test)

    fig = plt.figure(figsize=(6.0, 6.0))
    plt.plot(X_test, y_test, 'o')
    plt.plot(X_test, y_hat, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
Example #8
    # print(gks_test)

    gks_x = gks.iloc[:, :-1].values
    gks_y = gks.iloc[:, -1].values

    gks_x_test = gks_test.iloc[:, :-1].values
    gks_y_test = gks_test.iloc[:, -1].values

    scaler = StandardScaler()

    gks_x = scaler.fit_transform(gks_x)

    # reg = SVR(C=10, epsilon=0.2)

    reg = TweedieRegressor(power=1, alpha=0.5, link='log')

    reg.fit(gks_x, gks_y)

    gks_x_test = scaler.transform(gks_x_test)
    preds = reg.predict(gks_x_test)

    print(mean_squared_error(gks_y_test, preds))

    # print(gks_test_names)

    with open('gks.csv', 'w') as file:
        for idx, val in enumerate(preds):
            file.write(gks_test_names.iloc[idx]['web_name'] + "," + str(val) +
                       "," + str(gks_y_test[idx]))
            file.write('\n')
Example #9
def all_models_info():
    '''takes in data
    sets baseline
    sets SSE, MSE, and RMSE
    returns info for all 4 models'''
    # get data
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    # pull train set from add_to_train
    train = evaluate.add_to_train()
    X_train, y_train, X_validate, y_validate, X_test, y_test = evaluate.xtrain_xval_xtest(
    )
    #OLS Model
    lm = LinearRegression(normalize=True)
    lm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lm'] = lm.predict(X_train)
    rmse_train_lm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm)**(1 / 2)
    y_validate['appraised_value_pred_lm'] = lm.predict(X_validate)
    rmse_validate_lm = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_lm)**(1 / 2)
    #LARS Model
    lars = LassoLars(alpha=1.0)
    lars.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lars'] = lars.predict(X_train)
    rmse_train_lars = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lars)**(1 / 2)
    y_validate['appraised_value_pred_lars'] = lars.predict(X_validate)
    rmse_validate_lars = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_lars)**(1 / 2)
    #GLM
    glm = TweedieRegressor(power=1, alpha=0)
    glm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_glm'] = glm.predict(X_train)
    rmse_train_glm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_glm)**(1 / 2)
    y_validate['appraised_value_pred_glm'] = glm.predict(X_validate)
    rmse_validate_glm = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_glm)**(1 / 2)
    # PF
    pf = PolynomialFeatures(degree=2)
    X_train_degree2 = pf.fit_transform(X_train)
    X_validate_degree2 = pf.transform(X_validate)
    X_test_degree2 = pf.transform(X_test)
    # LM2
    lm2 = LinearRegression(normalize=True)
    lm2.fit(X_train_degree2, y_train.appraised_value)
    y_train['appraised_value_pred_lm2'] = lm2.predict(X_train_degree2)
    rmse_train_lm2 = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm2)**(1 / 2)
    y_validate['appraised_value_pred_lm2'] = lm2.predict(X_validate_degree2)
    rmse_validate_lm2 = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_lm2)**(1 / 2)
    print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ",
          rmse_train_lm, "\nValidation/Out-of-Sample: ", rmse_validate_lm)
    print("--------------------------------------------------------------")
    print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train_lars,
          "\nValidation/Out-of-Sample: ", rmse_validate_lars)
    print("--------------------------------------------------------------")
    print(
        "RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ",
        rmse_train_glm, "\nValidation/Out-of-Sample: ", rmse_validate_glm)
    print("--------------------------------------------------------------")
    print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ",
          rmse_train_lm2, "\nValidation/Out-of-Sample: ", rmse_validate_lm2)
# We will compare the performance of both approaches.
# To quantify the performance of both models, one can compute
# the mean deviance of the train and test data assuming a Compound
# Poisson-Gamma distribution of the total claim amount. This is equivalent to
# a Tweedie distribution with a `power` parameter between 1 and 2.
#
# The :func:`sklearn.metrics.mean_tweedie_deviance` depends on a `power`
# parameter. As we do not know the true value of the `power` parameter, we here
# compute the mean deviances for a grid of possible values, and compare the
# models side by side, i.e. we compare them at identical values of `power`.
# Ideally, we hope that one model will be consistently better than the other,
# regardless of `power`.

glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, max_iter=10000)
glm_pure_premium.fit(X_train,
                     df_train["PurePremium"],
                     sample_weight=df_train["Exposure"])

tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999]

scores_product_model = score_estimator(
    (glm_freq, glm_sev),
    X_train,
    X_test,
    df_train,
    df_test,
    target="PurePremium",
    weights="Exposure",
    tweedie_powers=tweedie_powers,
)
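A rough sketch of the per-power comparison described above, reusing names from this excerpt (glm_pure_premium, X_test, df_test, tweedie_powers); it is illustrative only, not the score_estimator helper itself:

# evaluate the fitted pure-premium model at each candidate Tweedie power
from sklearn.metrics import mean_tweedie_deviance

y_pred = glm_pure_premium.predict(X_test)
for p in tweedie_powers:
    dev = mean_tweedie_deviance(
        df_test["PurePremium"], y_pred,
        sample_weight=df_test["Exposure"], power=p)
    print(f"power={p}: mean Tweedie deviance = {dev:.4g}")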
Example #11
# Extract eco data from Sep 2018 to Jan 2020
df_eco_sel = df_eco.loc['2018-09-01':'2020-01-31']

# put together eco and transaction counts for regression
df_all = pd.concat([df_eco_sel, df_period.set_index(df_eco_sel.index)], axis=1)
y_train = df_all['Transaction_Count'].values
X_train = df_all[[
    'CPI', 'Exchange_Rate_USD', 'GDP', 'Unemployment_Rate', 'TSX'
]]

# generalized linear model
glm = TweedieRegressor(power=1, alpha=0.5, link='log')  # Poisson distribution
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
glm.fit(X_train_scaled, y_train)

# predict eco data for given year and month
df_future = pd.DataFrame(columns=['Date'])
for i, eco_var in enumerate(list(eco_vec_map.keys())):
    print("Forecasting " + eco_var + ' ' + str(Y) + ' ' +
          datetime.strptime(str(M), "%m").strftime("%b"))
    tmp = forecast_eco(df_eco, eco_var, Y, M)
    tmp = tmp[['ds', 'trend']]
    tmp.rename(columns={'ds': 'Date', 'trend': eco_var}, inplace=True)
    df_future = df_future.merge(tmp, on='Date', how='right')

# predict transaction count using the glm model
eco_forecast = df_future.tail(1)[[
    'CPI', 'Exchange_Rate_USD', 'GDP', 'Unemployment_Rate', 'TSX'
]]
Example #12
    def tweedieregressor(self, X_train, X_test, y_train, y_test):
        # fit a TweedieRegressor with default parameters on the training data
        # and return predictions for the test data
        regressor = TweedieRegressor()
        regressor.fit(X_train, y_train)
        return regressor.predict(X_test)
axes.tick_params(width=4) 
# change all spines
for axis in ['top','bottom','left','right']:
    axes.spines[axis].set_linewidth(6)



#%%

from sklearn.linear_model import TweedieRegressor
X = np.array(x).reshape(-1,1)
Y = np.array(y)


pr = TweedieRegressor(power = 1, alpha=0, fit_intercept=True)
y_pred_pr = pr.fit(X, Y).predict(X)

fig, axes = utils.plot_make(size_length=5)
sns.scatterplot(data = sc_vs_quickness_group_fill, x = "sc_LR_mean", y= "inverse_quickness", linewidth=0, s=100)
sns.lineplot(x = X.flatten(), y = y_pred_pr)

pr.score(X, Y)




#%%

X2 = sm.add_constant(X)
glm = sm.GLM(Y, X2, family=sm.families.Tweedie())
glm_fit = glm.fit()
Example #14
def main_Calib(filename, output, mode, alg, basis, order, figure, verbose, offset, qt, pre, split):
    '''
    # main program
    # input: radius: %+.3f, 'str' (in makefile, str is default)
    #        path: file storage path, 'str'
    #        fout: file output name as .h5, 'str' (.h5 not included)
    #        cut_max: cut off of Legendre
    # output: the gathered result EventID, ChannelID, x, y, z
    '''
    if pre != 'r':
        print('begin reading file', flush=True)
        EventID, ChannelID, Q, PETime, photonTime, PulseTime, dETime, x, y, z = pub.ReadFile(filename)
        VertexTruth = (np.vstack((x, y, z))/1e3).T
        if(offset):
            off = pub.LoadBase(offset)
        else:
            off = np.zeros_like(PMTPos[:,0])
        print('total event: %d' % np.size(np.unique(EventID)), flush=True)
        print('begin processing legendre coeff', flush=True)
        # this part for the same vertex

        tmp = time.time()
        EventNo = np.size(np.unique(EventID))
        PMTNo = np.size(PMTPos[:,0])
        if mode == 'PE':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)
        elif mode == 'time':
            counts = np.bincount(EventID)
            counts = counts[counts!=0]
            PMTPosRep = PMTPos[ChannelID]
            vertex = np.repeat(VertexTruth, counts, axis=0)
        elif mode == 'combined':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)

        if basis == 'Legendre':
            X, cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=True)
        elif basis == 'Zernike':
            from zernike import RZern
            cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=False)
            cart = RZern(order)
            nk = cart.nk
            m = cart.mtab
            n = cart.ntab
            rho = np.linalg.norm(vertex, axis=1)/0.65
            theta = np.arccos(cos_theta)
            X = np.zeros((rho.shape[0], nk))

            for i in np.arange(nk):
                if not i % 5:
                    print(f'process {i}-th event')
                X[:,i] = cart.Zk(i, rho, theta)
            X = X[:,m>=0]
            print(f'rank: {np.linalg.matrix_rank(X)}')    
        print(f'use {time.time() - tmp} s')

        # which info should be used
        if mode == 'PE':
            y = Q
        elif mode == 'time':
            y = PulseTime 
        elif mode == 'combined':
            # PulseTime = PulseTime - np.min(PulseTime)
            # PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            # print(np.min(PulseTime), np.max(PulseTime))
            PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            bins = np.arange(-1, 0.05, 0.1)
            N = 10
            # Legendre coeff
            x = pub.legval(bins, np.eye(N).reshape(N, N, 1))
            # 1st basis
            Y = np.tile(x, len(np.unique(EventID))*len(np.unique(ChannelID))).T
            # 2nd basis
            X = np.repeat(X, bins.shape[0], axis=0)
            # output
            y = np.zeros((len(np.unique(EventID)), len(np.unique(ChannelID)), len(bins)))
            '''
            basis = np.zeros((X.shape[0], X.shape[1]*Y.shape[1]))
            for i_index, i in enumerate(np.arange(X.shape[1])):
                for j_index, j in enumerate(np.arange(Y.shape[1])):
                    total_index = i_index*Y.shape[1] + j_index
                    if not total_index % 10:
                        print(total_index)
                    basis[:, total_index] = X[:,i_index]*Y[:,j_index]
            X = basis
            '''
            split_index = np.unique(EventID).shape[0]
            for k_index, k in enumerate(np.unique(EventID)): # event begin with 1
                if k_index > split_index * split:
                    break
                if not k % 100:
                    print(k)
                index = EventID == k
                CID = ChannelID[index]
                Pulse_t = PulseTime[index]
                for i in np.unique(CID): # PMT begin with 0
                    y[k_index, i, 1:], _ = np.histogram(Pulse_t[CID==i], bins=bins)
            y = np.reshape(y,(-1))
        if verbose:
            print(f'the basis shape is {X.shape}, and the dependent variable shape is {y.shape}')
    if pre =='w':
        if split != 1:
            split_index = int(split * y.shape[0])
            X = X[:split_index]
            Y = Y[:split_index]
            y = y[:split_index]
        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq
        y = np.atleast_2d(y).T
        #data = np.hstack((X, y, np.ones_like(y)))
        df_X = pd.DataFrame(X)
        X_names = []
        for i in df_X.columns:
            X_names.append('X' + str(i))
        df_X.columns = X_names    
        
        df_Y = pd.DataFrame(Y)
        Y_names = []
        for i in df_Y.columns:
            Y_names.append('Y' + str(i))
        df_Y.columns = Y_names
        
        df_y = pd.DataFrame(y)
        df_y.columns = ['output']
        df = pd.concat([df_X, df_Y, df_y], axis=1)
        table = pa.Table.from_pandas(df)
        
        pq.write_table(table, 'test1.parquet')
        return

    if not pre:
        # Regression methods:
        if alg == 'sms':
            import statsmodels.api as sm
            if mode == 'PE':
                model = sm.GLM(y, X, family=sm.families.Poisson(), fit_intercept=False)
                result = model.fit()
                if verbose:
                    print(result.summary())
                AIC = result.aic
                coef_ = result.params
                std = result.bse
                
            elif mode == 'time':
                import pandas as pd
                data = pd.DataFrame(data = np.hstack((X, np.atleast_2d(y).T)))                
                strs = 'y ~ '
                start = data.keys().start
                stop = data.keys().stop
                step = data.keys().step

                cname = []
                cname.append('X0')
                for i in np.arange(start+1, stop, step):
                    if i == start + 1:
                        strs += 'X%d ' % i
                    elif i == stop - step:
                        pass
                    else:
                        strs += ' + X%d ' % i                      

                    if i == stop - step:
                        cname.append('y')
                    else:
                        cname.append('X%d' % i)
                data.columns = cname

                mod = sm.formula.quantreg(strs, data[cname])

                result = mod.fit(q=qt,)
                coef_ = result.params
                AIC = np.zeros_like(coef_)
                std = np.zeros_like(coef_)           
                print('Warning! No AIC and std values')
            elif mode == 'combined':
                # data = pd.DataFrame(data = np.hstack((basis, np.atleast_2d(y).T)))  
                with h5py.File(output,'w') as out:        
                    out.create_dataset('X', data = X)
                    out.create_dataset('Y', data = y)
                print('begin...')
                model = sm.GLM(y, X, family=sm.families.Poisson())
                result = model.fit()
                if verbose:
                    print(result.summary())
                coef_ = result.params
                std = result.bse
                AIC = result.aic
            if verbose:
                print(result.summary())

        elif (alg == 'custom'):
            from scipy.optimize import minimize
            x0 = np.zeros_like(X[0]) # initial value (be careful of Zernike order)
            
            if mode == 'PE':
                x0[0] = 0.8 + np.log(2) # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))
            elif mode == 'time':
                x0[0] = np.mean(y)
                qt = 0.1
                ts = 2.6
                result = minimize(pub.CalibTime, x0=x0, method='SLSQP', args = (np.hstack((EventID, EventID)), y, X, qt, ts))
            elif mode == 'combined':
                x0 = np.zeros_like(X[0])
                x0[0] = 0.8 + np.log(2) # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))

            coef_ = np.array(result.x, dtype=float)
            if verbose:
                print(result.message)
            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)

            H = pub.MyHessian(result.x, pub.CalibPE, *(y, PMTPos, X))
            # H = pub.MyHessian(result.x, *(Q, PMTPos, X, pub.CalibTime))
            # std = 1/np.sqrt(-np.diag(np.linalg.pinv(H1)))
            print(coef_)
            # print(std)
            print('Warning! No AIC and std values; std is still being tested')

        elif alg == 'sk':
            from sklearn.linear_model import TweedieRegressor
            alpha = 0.001
            reg = TweedieRegressor(power=1, alpha=alpha, link='log', max_iter=1000, tol=1e-6, fit_intercept=False)
            reg.fit(X, y)

            # just for point data
            # pred = reg.predict(X[0:30,0:cut+1])

            print('coeff:\n', reg.coef_,'\n')

            coef_ = reg.coef_ 

            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)
            print('Warning! No AIC and std values')

        elif alg == 'h2o':
            import h2o
            from h2o.estimators.gbm import H2OGradientBoostingEstimator
            from h2o.estimators.glm import H2OGeneralizedLinearEstimator           
            if mode != 'combined':
                y = np.atleast_2d(y).T
                data = np.hstack((X, y, np.ones_like(y)))

                h2o.init()
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]

                if mode == 'PE':
                    #offset_col = hf.columns[-1]
                    glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                        #offset_column = offset_col, 
                        lambda_ = 0,
                        compute_p_values = True)

                    glm_model.train(predictors, response_col, training_frame=hf)

                    coef_table = glm_model._model_json['output']['coefficients_table']
                    coef_ = glm_model.coef()

                elif mode == 'time':
                    gbm = H2OGradientBoostingEstimator(distribution="quantile", seed = 1234,
                                                      stopping_metric = "mse", stopping_tolerance = 1e-4)
                    gbm.train(x = predictors, y = response_col, training_frame = hf)
                    breakpoint()
                    print(gbm)
                    exit()
            elif mode == 'combined':
                y = np.atleast_2d(y).T
                data = np.hstack((X, Y, y, np.ones_like(y)))

                h2o.init() 
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]           

            if verbose:
                print(coef_)
                if basis == 'Zernike':
                    print(f'Regression coef shape is {np.array(coef_).shape}, Zernike shape is {nk}')
            coef_ = coef_table['coefficients']
            std = coef_table['std_error']
            AIC = glm_model.aic()

            h2o.cluster().shutdown()

    elif pre == 'r':
        import h2o
        from h2o.estimators.gbm import H2OGradientBoostingEstimator
        from h2o.estimators.glm import H2OGeneralizedLinearEstimator           
        h2o.init()
        hf = h2o.import_file("electron-1.parquet")
        pairs = []
        for i in hf.columns:
            for j in hf.columns:
                if (i.startswith('Z') and j.startswith('L')):
                    if ((i!='X0') and (j != 'Y0')):
                        pairs.append((i,j))
        predictors = hf.columns[2:]
        response_col = hf.columns[0]
        
        print(predictors)
        print(response_col)
        print(pairs)
        if mode == 'PE':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                #offset_column = offset_col, 
                lambda_ = 0,
                compute_p_values = True)

            glm_model.train(predictors, response_col, training_frame=hf)
        
        elif mode == 'combined':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                #offset_column = offset_col, 
                interaction_pairs=pairs,
                lambda_ = 0,
                #remove_collinear_columns = True, 
                compute_p_values = True)

            glm_model.train(predictors, response_col, training_frame=hf)
        breakpoint()
        coef_table = glm_model._model_json['output']['coefficients_table']
        coef_ = coef_table['coefficients']
        std = coef_table['std_error']
        AIC = glm_model.aic()
        print(f'Regression coef is {np.array(coef_)}')
        if (figure=='ON'):
            import matplotlib.pyplot as plt
            L, K = 500, 500
            ddx = np.linspace(-1.0, 1.0, K)
            ddy = np.linspace(-1.0, 1.0, L)
            xv, yv = np.meshgrid(ddx, ddy)
            cart.make_cart_grid(xv, yv)
            # normal scale
            # im = plt.imshow(np.exp(cart.eval_grid(np.array(coef_), matrix=True)), origin='lower', extent=(-1, 1, -1, 1))
            # log scale
            im = plt.imshow(cart.eval_grid(np.array(coef_), matrix=True), origin='lower', extent=(-1, 1, -1, 1))
            plt.colorbar()
            plt.savefig('test.png')
    else:
        print('Error: unknown regression algorithm')
            
    with h5py.File(output,'w') as out:        
        out.create_dataset('coeff' + str(order), data = coef_)
        out.create_dataset('std' + str(order), data = std)
        out.create_dataset('AIC' + str(order), data = AIC)
def tweedie_regression():
    reg = TweedieRegressor(power=1, alpha=0.5, link='log')
    reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2])
    print(reg.coef_)
    print(reg.intercept_)
Example #16
print('R2 ={}'.format(R2))


# generalized linear model: not working
from sklearn.linear_model import TweedieRegressor
list=[]
for i in np.arange(5,20):
    dfcorr=df[correlatedvar[:i]]
    from sklearn.preprocessing import MinMaxScaler
    scaler=MinMaxScaler(feature_range=(1,10))
    dfscal=scaler.fit_transform(dfcorr)
    Y=dfscal[:,0]
    X=dfscal[:,1:]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25,shuffle=False)
    regr=TweedieRegressor(power=1, alpha=0.5, link='log')
    regr.fit(X_train, y_train)
    prediction=regr.predict(X_test)
    R2=sklearn.metrics.r2_score(y_test,prediction)
    list.append(R2)
print('optimal number of variables: {}, best R2 = {}'.format(list.index(max(list)) + 5, max(list)))  # max at 12


#polynomial
from sklearn.preprocessing import PolynomialFeatures
list=[]
for i in np.arange(2,10):
    dfcorr=df[correlatedvar[:i]]
    from sklearn.preprocessing import MinMaxScaler
    scaler=MinMaxScaler(feature_range=(1,10))
    dfscal=scaler.fit_transform(dfcorr)
    Y=dfscal[:,0]