Example #1
from math import sqrt
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import mean_squared_error

def glm_model(X_tr, y_tr, X_v, y_v, X_te, y_te, d_str, **kwargs):
    '''
    Generalized Linear Model with a Tweedie distribution.
    This estimator can be used to model different GLMs depending 
    on the power parameter, which determines the underlying distribution.
    '''
    # create the model object
    glm = TweedieRegressor(**kwargs)

    # fit the model to our training data
    glm.fit(X_tr, y_tr)

    # predict on train
    glm_pred = glm.predict(X_tr)
    # compute root mean squared error
    glm_rmse = sqrt(mean_squared_error(y_tr, glm_pred))

    # predict on validate
    glm_pred_v = glm.predict(X_v)
    # compute root mean squared error
    glm_rmse_v = sqrt(mean_squared_error(y_v, glm_pred_v))

    # predict on test
    glm_pred_t = glm.predict(X_te)
    # compute root mean squared error
    glm_rmse_t = sqrt(mean_squared_error(y_te, glm_pred_t))
    print(f'RMSE for GLM using {d_str} Distribution\n')
    print('On train data:\n', round(glm_rmse, 6), '\n')
    print('On validate data:\n', round(glm_rmse_v, 6), '\n')
    return glm_rmse, glm_rmse_v, glm_rmse_t, glm_pred_t
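A minimal usage sketch (the split names here are hypothetical; extra keyword arguments are forwarded to TweedieRegressor, whose power parameter selects the distribution: 0 = Normal, 1 = Poisson, values in (1, 2) = compound Poisson-Gamma, 2 = Gamma):

# hypothetical splits; power=1 selects the Poisson distribution
rmse_tr, rmse_val, rmse_te, test_preds = glm_model(
    X_train, y_train, X_val, y_val, X_test, y_test,
    d_str='Poisson', power=1, alpha=0.0)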
Example #2
def test_tweedie_score(regression_data, power, link):
    """Test that GLM score equals d2_tweedie_score for Tweedie losses."""
    X, y = regression_data
    # make y positive
    y = np.abs(y) + 1.0
    glm = TweedieRegressor(power=power, link=link).fit(X, y)
    assert glm.score(X, y) == pytest.approx(
        d2_tweedie_score(y, glm.predict(X), power=power))
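A standalone sketch of the equivalence being tested, on synthetic data: TweedieRegressor.score returns D^2, the fraction of Tweedie deviance explained, which is exactly what d2_tweedie_score computes.

import numpy as np
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import d2_tweedie_score

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
y = np.abs(rng.randn(50)) + 1.0  # strictly positive target
glm = TweedieRegressor(power=1.5, link='log').fit(X, y)
print(glm.score(X, y))                                 # D^2 from the estimator
print(d2_tweedie_score(y, glm.predict(X), power=1.5))  # same value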
Example #3
def regression(linkfunc, x, y, test):

    reg = TweedieRegressor(power=POWER, alpha=ALPHA, link=linkfunc)

    # reshape when there is only one feature (a single independent variable)
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)  # convert to a "column" vector,
                              # as samples must be in rows
        test = test.reshape(-1, 1)

    variable_cnt = x.shape[1]

    plur = "s" if variable_cnt > 1 else ""
    print()
    print('generalized linear regression parametrized as')
    print(f' -- link = \'{linkfunc}\'')
    print(f' -- {variable_cnt} independent variable{plur}')
    print(reg)

    print()
    print(f'train: {x} -> {y}')
    print(f'test: {test} -> ???')

    reg.fit(x, y)

    predicted = reg.predict(test)

    print()
    print('predicted:')
    print(predicted)

    print()
    print('y = reg.coef_ * x + reg.intercept_')
    print(f'reg.coef_ = {reg.coef_}')
    print(f'reg.intercept_ = {reg.intercept_:.2f}')
    for t in test:
        x_val = t
        y_val = reg.coef_ * t + reg.intercept_
        print(f'{x_val} -> {y_val}')

    strs = []
    if variable_cnt > 1:
        strs.append('sum')
    if linkfunc != 'identity':
        strs.append(f'inverse of link function \'{linkfunc}\'')
    basic_str = 'to be applied!'

    if len(strs) > 0:
        print(' and '.join(strs), basic_str)

    print()
    print(f'n_iter_ = {reg.n_iter_}')

    print()
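A hedged driver for the function above; POWER and ALPHA are module-level constants the snippet assumes, so illustrative values are defined here.

import numpy as np
from sklearn.linear_model import TweedieRegressor

POWER, ALPHA = 0, 0.5  # assumed constants: normal distribution, ridge-like penalty
x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.1, 3.9, 6.2, 8.1])
regression('identity', x, y, np.array([5.0, 6.0]))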
Example #4
def tweedie(X_train_scaled, y_train):
    '''
    runs tweedie algorithm
    '''
    # Make Model
    tw = TweedieRegressor(power=0, alpha=.001)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train_scaled, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_train_scaled)
    # Compute root mean squared error
    tw_rmse = sqrt(mean_squared_error(y_train, tw_pred))
    return tw_rmse
Example #5
def tweedie_test(X_train, y_train, X_test, y_test, pwr, alf):
    '''
    runs tweedie algorithm
    '''
    # Make Model
    tw = TweedieRegressor(power=pwr, alpha=alf)  # power=0 -> normal distribution
    # Fit Model
    tw.fit(X_train, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_test)
    # Compute mean absolute error
    tw_MAE = mean_absolute_error(y_test, tw_pred)
    return tw_MAE, tw, tw_pred
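A small grid search over the helper above (hypothetical split names; a strictly positive target is assumed, since powers 1 and 2 use a log link by default):

for pwr, alf in [(0, 0.001), (1, 0.5), (2, 1.0)]:
    mae, model, preds = tweedie_test(X_train, y_train, X_test, y_test, pwr, alf)
    print(f'power={pwr}, alpha={alf}: test MAE = {mae:.4f}')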
Example #6
def tweedie05(X_train_scaled, y_train):
    '''
    runs tweedie algorithm
    '''
    # Make Model
    tw = TweedieRegressor(power=0, alpha=.5)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train_scaled, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_train_scaled)
    # Compute mean absolute error
    tw_MAE = mean_absolute_error(y_train, tw_pred)
    return tw_MAE
Example #7
def tweedie_vt(X_train_scaled, X_validate_scaled, y_train, y_validate):
    '''
    fits the tweedie model on train and evaluates it on validate
    '''
    # Make Model
    tw = TweedieRegressor(power=0, alpha=0.001)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train_scaled, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_validate_scaled)
    # Compute root mean squared error
    tw_rmse = sqrt(mean_squared_error(y_validate, tw_pred))
    return tw_rmse
Example #8
import matplotlib.pyplot as plt
from sklearn.linear_model import TweedieRegressor

def sk_tweedie_regression(X_train,
                          X_test,
                          y_train,
                          y_test,
                          set_model='linear'):
    if set_model == 'Poisson':
        reg = TweedieRegressor(
            alpha=0,
            power=1,  # Poisson distribution
            link='log',
            fit_intercept=False,
            max_iter=300)
    elif set_model == 'linear':
        reg = TweedieRegressor(
            alpha=0,
            power=0,  # Normal distribution
            link='identity',
            fit_intercept=False,
            max_iter=300)
    else:
        print('Unknown model name: use "Poisson" or "linear".')
        return

    reg.fit(X_train, y_train)
    print('score: ', reg.score(X_test, y_test))

    y_hat = reg.predict(X_test)

    fig = plt.figure(figsize=(6.0, 6.0))
    plt.plot(X_test, y_test, 'o')
    plt.plot(X_test, y_hat, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
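The axis labels suggest the classic tips data, so a hypothetical driver (assuming seaborn's bundled copy of that dataset) could look like this:

import seaborn as sns
from sklearn.model_selection import train_test_split

tips = sns.load_dataset('tips')  # assumed data source
X = tips[['total_bill']].values
y = tips['tip'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
sk_tweedie_regression(X_train, X_test, y_train, y_test, set_model='Poisson')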
Example #9
    # print(gks_test)

    gks_x = gks.iloc[:, :-1].values
    gks_y = gks.iloc[:, -1].values

    gks_x_test = gks_test.iloc[:, :-1].values
    gks_y_test = gks_test.iloc[:, -1].values

    scaler = StandardScaler()

    gks_x = scaler.fit_transform(gks_x)

    # reg = SVR(C=10, epsilon=0.2)

    reg = TweedieRegressor(power=1, alpha=0.5, link='log')

    reg.fit(gks_x, gks_y)

    gks_x_test = scaler.transform(gks_x_test)
    preds = reg.predict(gks_x_test)

    print(mean_squared_error(gks_y_test, preds))

    # print(gks_test_names)

    with open('gks.csv', 'w') as file:
        for idx, val in enumerate(preds):
            file.write(gks_test_names.iloc[idx]['web_name'] + "," + str(val) +
                       "," + str(gks_y_test[idx]))
            file.write('\n')
Example #10
def all_models_info():
    '''fits four models (OLS, LassoLars, Tweedie GLM, and a
    degree-2 polynomial regression), computes train and validate
    RMSE for each, and prints the results'''
    # get data
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    # pull from add_to_train
    train = evaluate.add_to_train()
    X_train, y_train, X_validate, y_validate, X_test, y_test = evaluate.xtrain_xval_xtest(
    )
    #OLS Model
    lm = LinearRegression(normalize=True)
    lm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lm'] = lm.predict(X_train)
    rmse_train_lm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm)**(1 / 2)
    y_validate['appraised_value_pred_lm'] = lm.predict(X_validate)
    rmse_validate_lm = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_lm)**(1 / 2)
    #LARS Model
    lars = LassoLars(alpha=1.0)
    lars.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lars'] = lars.predict(X_train)
    rmse_train_lars = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lars)**(1 / 2)
    y_validate['appraised_value_pred_lars'] = lars.predict(X_validate)
    rmse_validate_lars = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_lars)**(1 / 2)
    #GLM
    glm = TweedieRegressor(power=1, alpha=0)
    glm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_glm'] = glm.predict(X_train)
    rmse_train_glm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_glm)**(1 / 2)
    y_validate['appraised_value_pred_glm'] = glm.predict(X_validate)
    rmse_validate_glm = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_glm)**(1 / 2)
    # PF
    pf = PolynomialFeatures(degree=2)
    X_train_degree2 = pf.fit_transform(X_train)
    X_validate_degree2 = pf.transform(X_validate)
    X_test_degree2 = pf.transform(X_test)
    # LM2
    lm2 = LinearRegression(normalize=True)
    lm2.fit(X_train_degree2, y_train.appraised_value)
    y_train['appraised_value_pred_lm2'] = lm2.predict(X_train_degree2)
    rmse_train_lm2 = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm2)**(1 / 2)
    y_validate['appraised_value_pred_lm2'] = lm2.predict(X_validate_degree2)
    rmse_validate_lm2 = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_lm2)**(1 / 2)
    print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ",
          rmse_train_lm, "\nValidation/Out-of-Sample: ", rmse_validate_lm)
    print("--------------------------------------------------------------")
    print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train_lars,
          "\nValidation/Out-of-Sample: ", rmse_validate_lars)
    print("--------------------------------------------------------------")
    print(
        "RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ",
        rmse_train_glm, "\nValidation/Out-of-Sample: ", rmse_validate_glm)
    print("--------------------------------------------------------------")
    print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ",
          rmse_train_lm2, "\nValidation/Out-of-Sample: ", rmse_validate_lm2)
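Note that exponentiation binds tighter than division in Python, so mse ** 1 / 2 halves the MSE instead of taking its square root; the parenthesized form above avoids this. On scikit-learn 0.22 through 1.5 the root can also be requested directly (newer releases provide root_mean_squared_error instead), for example:

rmse_train_glm = mean_squared_error(
    y_train.appraised_value, y_train.appraised_value_pred_glm,
    squared=False)  # squared=False returns the RMSE directly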
Example #11
res = []
for subset_label, X, df in [
    ("train", X_train, df_train),
    ("test", X_test, df_test),
]:
    exposure = df["Exposure"].values
    res.append({
        "subset": subset_label,
        "observed": df["ClaimAmount"].values.sum(),
        "predicted, frequency*severity model":
            np.sum(exposure * glm_freq.predict(X) * glm_sev.predict(X)),
        "predicted, tweedie, power=%.2f" % glm_pure_premium.power:
            np.sum(exposure * glm_pure_premium.predict(X)),
    })

print(pd.DataFrame(res).set_index("subset").T)

# %%
# Finally, we can compare the two models using a plot of cumulated claims: for
# each model, the policyholders are ranked from safest to riskiest and the
# fraction of observed total cumulated claims is plotted on the y axis. This
# plot is often called the ordered Lorenz curve of the model.
#
# The Gini coefficient (based on the area under the curve) can be used as a
# model selection metric to quantify the ability of the model to rank
# policyholders. Note that this metric reflects only how well the models
# rank policyholders by relative risk, not how accurate their predicted
# claim amounts are in absolute terms.
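A sketch of the curve construction, closely following the scikit-learn tutorial this passage appears to come from (glm_pure_premium, X_test and df_test are assumed from the surrounding example):

import numpy as np
from sklearn.metrics import auc

def lorenz_curve(y_true, y_pred, exposure):
    # rank policyholders from safest to riskiest by predicted risk
    ranking = np.argsort(y_pred)
    ranked_exposure = np.asarray(exposure)[ranking]
    ranked_pure_premium = np.asarray(y_true)[ranking]
    cum_claims = np.cumsum(ranked_pure_premium * ranked_exposure)
    cum_claims /= cum_claims[-1]
    cum_samples = np.linspace(0, 1, len(cum_claims))
    return cum_samples, cum_claims

# Gini coefficient: twice the area between the diagonal and the curve
cum_samples, cum_claims = lorenz_curve(
    df_test["ClaimAmount"] / df_test["Exposure"],  # observed pure premium
    glm_pure_premium.predict(X_test),
    df_test["Exposure"])
gini = 1 - 2 * auc(cum_samples, cum_claims)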
Example #12
# predict eco data for given year and month
df_future = pd.DataFrame(columns=['Date'])
for i, eco_var in enumerate(list(eco_vec_map.keys())):
    print("Forecasting " + eco_var + ' ' + str(Y) + ' ' +
          datetime.strptime(str(M), "%m").strftime("%b"))
    tmp = forecast_eco(df_eco, eco_var, Y, M)
    tmp = tmp[['ds', 'trend']]
    tmp.rename(columns={'ds': 'Date', 'trend': eco_var}, inplace=True)
    df_future = df_future.merge(tmp, on='Date', how='right')

# predict transaction count using the glm model
eco_forecast = df_future.tail(1)[[
    'CPI', 'Exchange_Rate_USD', 'GDP', 'Unemployment_Rate', 'TSX'
]]
transaction_count_forecast = glm.predict(
    scaler.transform(eco_forecast)).astype(int)[0]

# load synthesizer from saved object
with open(synthesizer_file, 'rb') as f:
    synthesizer = CPU_Unpickler(f).load()
synthesizer.device = 'cpu'

# generate
print('Generating synthesized data with %i samples......' %
      transaction_count_forecast)
sample = synthesizer.sample(transaction_count_forecast)

# load column names for synthesized data
df_input = pd.read_csv(data_input_file)
input_columns = list(df_input.columns)[1:]
df_sample = pd.DataFrame(sample, columns=input_columns)
Example #13
def tweedieregressor(self, X_train, X_test, y_train, y_test):
    # fit a default TweedieRegressor on the training data
    # and return predictions for the test set
    regressor = TweedieRegressor()
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)
Example #14

# generalized (not working)
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

r2_scores = []
for i in np.arange(5, 20):
    dfcorr = df[correlatedvar[:i]]
    scaler = MinMaxScaler(feature_range=(1, 10))
    dfscal = scaler.fit_transform(dfcorr)
    Y = dfscal[:, 0]
    X = dfscal[:, 1:]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, shuffle=False)
    regr = TweedieRegressor(power=1, alpha=0.5, link='log')
    regr.fit(X_train, y_train)
    prediction = regr.predict(X_test)
    r2_scores.append(r2_score(y_test, prediction))
best_i = r2_scores.index(max(r2_scores)) + 5  # offset: the loop starts at i=5
print('optimal number of variables: {}, R2 = {:.3f}'.format(best_i, max(r2_scores)))  # max at 12


# polynomial
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

r2_scores = []
for i in np.arange(2, 10):
    dfcorr = df[correlatedvar[:i]]
    scaler = MinMaxScaler(feature_range=(1, 10))
    dfscal = scaler.fit_transform(dfcorr)
    Y = dfscal[:, 0]
    X = dfscal[:, 1:]