Ejemplo n.º 1
0
def prediction_wrapper(df_data, lags, target_ts=None, keys: list=None, match_lag: bool=False,
                       n_boot: int=1):
    """Fit a ridge regression on ``df_data`` and return predictions plus skill scores.

    Parameters
    ----------
    df_data : pandas.DataFrame
        MultiIndex (split, time) frame; column 0 holds the target series and
        the last two columns hold the train/forecast masks (used below).
    lags : array-like of int
        Lags to fit; ``min(lags)``/``max(lags)`` become ``tau_min``/``tau_max``.
    target_ts : pandas.DataFrame, optional
        Target timeseries. If None, it is extracted from split 0 of
        ``df_data`` using the forecast mask in the last column.
    keys : list, optional
        Predictor column names forwarded to the ridge fit.
    match_lag : bool
        Forwarded as ``match_lag_region_to_lag_fc``.
    n_boot : int
        Number of bootstrap samples for the skill scores.

    Returns
    -------
    tuple
        (prediction, df_test, df_test_m, df_boot, models_lags, weights,
        df_test_s_m, df_train_m)

    NOTE(review): relies on module-level globals ``rg``, ``blocksize`` and
    ``no_info_fc`` -- confirm they are defined before calling.
    """
    # alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
    # Candidate regularization strengths for the ridge CV fit.
    alphas = np.logspace(.1, 1.5, num=25)
    kwrgs_model = {'scoring':'neg_mean_absolute_error',
                   'alphas':alphas, # large a, strong regul.
                   'normalize':False}

    if target_ts is None:
        # Forecast mask of split 0 (last column) selects the target dates.
        fc_mask = df_data.iloc[:,-1].loc[0]#.shift(lag, fill_value=False)
        target_ts = df_data.iloc[:,[0]].loc[0][fc_mask]

    else:
        target_ts = target_ts  # no-op; kept as-is
    # Standardize the target to zero mean / unit variance.
    target_ts = (target_ts - target_ts.mean()) / target_ts.std()
    out = rg.fit_df_data_ridge(df_data=df_data,
                               target=target_ts,
                               keys=keys,
                               tau_min=min(lags), tau_max=max(lags),
                               kwrgs_model=kwrgs_model,
                               match_lag_region_to_lag_fc=match_lag,
                               transformer=fc_utils.standardize_on_train)

    prediction, weights, models_lags = out
    # get skill scores
    # Benchmark forecast: the climatological mean of the standardized target.
    clim_mean_temp = float(target_ts.mean())
    RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
    MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
    score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]

    # NOTE(review): ``blocksize`` is taken from module scope -- confirm defined.
    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(prediction,
                                                             df_data.iloc[:,-2:],
                                                             score_func_list,
                                                             n_boot = n_boot,
                                                             blocksize=blocksize,
                                                             rng_seed=1)
    # Build a compact row label: tokens without '..' and without 'PDO' are
    # joined behind an 'AR' prefix, the remaining tokens are appended.
    index = np.unique(core_pp.flatten([k.split('_') for k in  keys]))
    AR = [l for l in index if '..' not in l]
    AR = [l for l in AR if 'PDO' not in l]
    index = [k for k in index if k not in AR]
    df_test_m.index = ['AR' + ''.join(AR) +'_'+'_'.join(index)]
    n_splits = df_data.index.levels[0].size # test for high alpha
    for col in df_test_m.columns.levels[0]:
        # Alpha selected by CV in each train/test split for this lag.
        cvfitalpha = [models_lags[f'lag_{col}'][f'split_{s}'].alpha_ for s in range(n_splits)]
        print('lag {} mean alpha {:.0f}'.format(col, np.mean(cvfitalpha)))
        maxalpha_c = list(cvfitalpha).count(alphas[-1])
        if maxalpha_c > n_splits/3:
            # More than a third of the splits selected the maximum alpha:
            # maximal regularization, i.e. little predictive information.
            print(f'\nlag {col} alpha {int(np.mean(cvfitalpha))}')
            print(f'{maxalpha_c} splits are max alpha\n')
            # maximum regularization selected. No information in timeseries
            # df_test_m.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            # df_boot.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            no_info_fc.append(col)  # NOTE(review): module-level list -- confirm defined
    # Out-of-sample (test) prediction; the two mask columns are dropped again
    # after the merge.
    df_test = functions_pp.get_df_test(prediction.merge(df_data.iloc[:,-2:],
                                                    left_index=True,
                                                    right_index=True)).iloc[:,:-2]
    return prediction, df_test, df_test_m, df_boot, models_lags, weights, df_test_s_m, df_train_m
Ejemplo n.º 2
0
def prediction_wrapper(q):
    """Fit a logistic regression forecasting binary exceedance events.

    The target ``rg.TV_ts`` is binarized at its ``q``-quantile and forecast
    from the SST precursor columns of ``rg.df_data`` at lags 0..4.

    Parameters
    ----------
    q : float
        Quantile threshold defining an event (target > q-quantile -> 1).

    Returns
    -------
    tuple
        (df_train_m, df_test_m, df_boot, df_test, models_lags, SS)

    NOTE(review): relies on module-level names ``rg``, ``ScikitModel``,
    ``LogisticRegressionCV``, ``fc_utils``, ``functions_pp`` and ``metrics``
    -- confirm they are in scope before calling.
    """
    fcmodel = ScikitModel(scikitmodel=LogisticRegressionCV).fit
    kwrgs_model = {
        'class_weight': {
            0: 1,
            1: 1
        },
        'scoring': 'neg_brier_score',
        'penalty': 'l2',
        'solver': 'lbfgs'
    }

    lag = 4
    # Predictors: every SST precursor column between the target and the masks.
    # (Fix: the original assigned keys = ['0..PEPsv'] and immediately
    # overwrote it on the next line -- the dead assignment is removed.)
    keys = [k for k in rg.df_data.columns[2:-2] if 'sst' in k]
    target_ts = rg.TV_ts  # - rg.TV_ts.mean()) / rg.TV_ts.std()
    # target_ts = rg.df_data_ext.loc[0][['mx2t']][rg.df_data.loc[0]['RV_mask']]
    target_ts = target_ts.to_dataframe('target')[['target']]
    target_ts.index.name = None
    # Binarize: 1 where the target exceeds its q-quantile, else 0.
    target_ts = (target_ts > target_ts.quantile(q=q)).astype(int)
    out = rg.fit_df_data_ridge(target=target_ts,
                               fcmodel=fcmodel,
                               keys=keys,
                               tau_min=0,
                               tau_max=lag,
                               kwrgs_model=kwrgs_model)

    prediction, weights, models_lags = out

    # Out-of-sample (test) predictions; mask columns dropped after the merge.
    df_test = functions_pp.get_df_test(
        prediction.merge(rg.df_data.iloc[:, -2:].copy(),
                         left_index=True,
                         right_index=True)).iloc[:, :-2]

    # get skill scores
    clim_mean_temp = float(target_ts.mean())  # climatological event frequency
    SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp)
    BSS = SS.BSS
    score_func_list = [
        metrics.roc_auc_score, BSS,
        fc_utils.ErrorSkillScore().AUC_SS
    ]

    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
        prediction,
        rg.df_data.iloc[:, -2:],
        score_func_list,
        score_per_test=False,
        n_boot=0,
        blocksize=2,
        rng_seed=1)
    return df_train_m, df_test_m, df_boot, df_test, models_lags, SS
    keys=y_keys,
    standardize=False)
# NOTE(review): script fragment -- names such as ``rg``, ``sst``, ``fig``,
# ``keys``, ``lowpass``, ``period``, ``append_str``, ``df_data_r2PDO``,
# ``core_pp``, ``fc_utils``, ``pd``, ``os`` and ``merge_lagged_wrapper``
# are defined earlier in the full script (outside this excerpt).
fig_path = os.path.join(rg.path_outsub1,
                        f'{sst._name}_r{lowpass}PDO_{period}_{append_str}')
fig.savefig(fig_path + rg.figext, bbox_inches='tight')
# Bar plot: column-wise correlation between the regressed-out data and the
# original df_data (split 0 only).
df_data_r2PDO.loc[0][keys].corrwith(rg.df_data.loc[0][keys]).plot(kind='bar')

# =============================================================================
# Predictions
# =============================================================================
# out_regr2PDO = prediction_wrapper(df_data_r2PDO.copy(), keys=keys,
#                                  match_lag=match_lag, n_boot=n_boot)
# Restrict the target to 1980-2020 and build error skill scores benchmarked
# against the climatological mean of that subset.
dates = core_pp.get_subdates(rg.dates_TV, start_end_year=(1980, 2020))
target_ts_temp = rg.TV.RV_ts.loc[dates]
clim_mean_temp = float(target_ts_temp.mean())
RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
score_func_list = [
    RMSE_SS, fc_utils.corrcoef, MAE_SS, fc_utils.metrics.mean_absolute_error
]
# predictions temp using SST regions
# Build lag-1 and lag-2 versions of the precursor columns, subset to the
# 1980-2020 dates, then re-attach the train/test split masks.
df_prec = merge_lagged_wrapper(rg.df_data.copy(), [1, 2], keys)

df_prec = df_prec.loc[pd.IndexSlice[:, dates], :]
df_prec = df_prec.merge(rg.df_splits.loc[pd.IndexSlice[:, dates], :],
                        left_index=True,
                        right_index=True)
out = prediction_wrapper(df_prec,
                         lags=np.array([0]),
                         target_ts=target_ts_temp,
                         keys=None,
                    'kfold':5}
elif prediction == 'events':
    model = ScikitModel(LogisticRegressionCV, verbosity=0)
    kwrgs_model = {'kfold':5,
                    'scoring':'neg_brier_score'}


# target
# NOTE(review): script fragment -- ``prediction``, ``q``, ``rg``, ``fc_utils``,
# ``model`` and ``kwrgs_model`` come from earlier (truncated) lines.
target_ts = rg.TV.RV_ts.copy()
# Standardize to zero mean / unit variance.
target_ts = (target_ts - target_ts.mean()) / target_ts.std()
if prediction == 'events':
    # Binary event target: exceedance above the q-quantile (or below it
    # when q < .5).
    if q >= 0.5:
        target_ts = (target_ts > target_ts.quantile(q)).astype(int)
    elif q < .5:
        target_ts = (target_ts < target_ts.quantile(q)).astype(int)
    # Brier Skill Score benchmarked against the climatological event frequency.
    BSS = fc_utils.ErrorSkillScore(constant_bench=float(target_ts.mean())).BSS
    score_func_list = [BSS, fc_utils.metrics.roc_auc_score]

elif prediction == 'continuous':
    # Error skill scores benchmarked against the climatological mean.
    RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=float(target_ts.mean())).RMSE
    MAE_SS = fc_utils.ErrorSkillScore(constant_bench=float(target_ts.mean())).MAE
    score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]


# Fit at lags 1..2 with no transformer; keys=None uses all available columns.
out = rg.fit_df_data_ridge(target=target_ts,
                            keys=None,
                            fcmodel=model,
                            kwrgs_model=kwrgs_model,
                            transformer=False,
                            tau_min=1, tau_max=2)
predict, weights, model_lags = out
Ejemplo n.º 5
0
def forecast(rg, crossyr):
    """Forecasting pipeline 1: fit a statistical model on ``rg.df_data``.

    Depending on the hard-coded ``prediction`` switch below, fits either a
    tuned Ridge regression on the standardized continuous target, or a
    LogisticRegressionCV on a binarized (quantile-exceedance) event target,
    all at lag 0.

    Parameters
    ----------
    rg : project object exposing ``TV.RV_ts``, ``df_data`` and
        ``fit_df_data_ridge`` -- TODO confirm exact type.
    crossyr : unused; kept for interface compatibility with callers.

    Returns
    -------
    tuple
        (test_scores, train_scores, predict) where the score lists follow
        the metric order of the active prediction mode.
    """

    import func_models as fc_utils
    from stat_models_cont import ScikitModel
    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import LogisticRegressionCV

    # choose type prediciton (continuous or probabilistic) by making comment #
    prediction = 'continuous'
    # prediction = 'events' ; q = .66 # quantile threshold for event definition

    if prediction == 'continuous':
        model = ScikitModel(Ridge, verbosity=0)
        # You can also tune parameters by passing a list of values. Then GridSearchCV from sklearn will
        # find the set of parameters that give the best mean score on all kfold test sets.
        # below we pass a list of alpha's to tune the regularization.
        alphas = list(
            np.concatenate([[1E-20],
                            np.logspace(-5, 0, 6),
                            np.logspace(.01, 2.5, num=25)]))
        kwrgs_model = {
            'scoringCV': 'neg_mean_absolute_error',
            'kfold': 10,
            'alpha': alphas
        }  # large a, strong regul.
    elif prediction == 'events':
        model = ScikitModel(LogisticRegressionCV, verbosity=0)
        kwrgs_model = {'kfold': 5, 'scoring': 'neg_brier_score'}

    # Standardize the target to zero mean / unit variance.
    target_ts = rg.TV.RV_ts
    target_ts = (target_ts - target_ts.mean()) / target_ts.std()

    if prediction == 'events':
        q = 0.66
        if q >= 0.5:
            target_ts = (target_ts > target_ts.quantile(q)).astype(int)
        elif q < .5:
            target_ts = (target_ts < target_ts.quantile(q)).astype(int)
        clim_bench = float(target_ts.mean())  # climatological event frequency
        BSS = fc_utils.ErrorSkillScore(constant_bench=clim_bench).BSS
        score_func_list = [BSS, fc_utils.metrics.roc_auc_score]
        # Metric column names produced by fc_utils.get_scores in this mode.
        metric_names = ['BSS', 'roc_auc_score']

    elif prediction == 'continuous':
        clim_bench = float(target_ts.mean())  # climatological mean benchmark
        RMSE_SS = fc_utils.ErrorSkillScore(
            constant_bench=clim_bench).RMSE  #RMSE ERROR SKILL SCORE
        MAE_SS = fc_utils.ErrorSkillScore(
            constant_bench=clim_bench).MAE  #MAE ERROR SKILL SCORE
        score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]
        metric_names = ['RMSE', 'MAE', 'corrcoef']

    keys = [k for k in rg.df_data.columns[1:-2]]
    out = rg.fit_df_data_ridge(target=target_ts,
                               keys=keys,
                               fcmodel=model,
                               kwrgs_model=kwrgs_model,
                               transformer=None,
                               tau_min=0,
                               tau_max=0)  # <- lag should be zero
    predict, weights, model_lags = out

    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
        predict,
        rg.df_data.iloc[:, -2:],
        score_func_list,
        n_boot=100,  #intensive
        score_per_test=False,
        blocksize=1,
        rng_seed=1)
    lag = 0
    if prediction == 'events':
        print(
            model.scikitmodel.__name__, '\n', f'Test score at lag {lag}\n',
            'BSS {:.2f}\n'.format(df_test_m.loc[0].loc[0].loc['BSS']),
            'AUC {:.2f}'.format(df_test_m.loc[0].loc[0].loc['roc_auc_score']),
            '\nTrain score\n',
            'BSS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['BSS']),
            'AUC {:.2f}'.format(df_train_m.mean(0).loc[0]['roc_auc_score']))
    elif prediction == 'continuous':
        print(model.scikitmodel.__name__, '\n', 'Test score\n',
              'RMSE_SS {:.2f}\n'.format(df_test_m.loc[0][0]['RMSE']),
              'MAE_SS {:.2f}\n'.format(df_test_m.loc[0][0]['MAE']),
              'corrcoef {:.2f}'.format(df_test_m.loc[0][0]['corrcoef']),
              '\nTrain score\n',
              'RMSE_SS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['RMSE']),
              'MAE_SS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['MAE']),
              'corrcoef {:.2f}'.format(df_train_m.mean(0).loc[0]['corrcoef']))

    # Fix: the original hard-coded ['RMSE', 'MAE', 'corrcoef'] here, which
    # would raise KeyError when prediction == 'events'; the per-mode
    # metric_names list keeps continuous output identical while making the
    # events branch usable.
    test_scores = [df_test_m.loc[0][0][m] for m in metric_names]
    train_scores = [df_train_m.mean(0).loc[0][m] for m in metric_names]

    return test_scores, train_scores, predict
Ejemplo n.º 6
0
# NOTE(review): script fragment -- ``rg``, ``np`` and ``fc_utils`` are
# defined earlier in the full script; presumably these two calls (re)build
# the precursor timeseries in rg.df_data -- confirm against the rg class.
rg.cluster_list_MI()
rg.get_ts_prec()
#%%
# Forecast mask of split 0 (last column of df_data) selects the target dates.
fc_mask = rg.df_data.iloc[:, -1].loc[0]  #.shift(lag, fill_value=False)
# rg.df_data = rg._replace_RV_mask(rg.df_data, replace_RV_mask=(fc_mask))
target_ts = rg.df_data.iloc[:, [0]].loc[0][fc_mask]
# Standardize the target to zero mean / unit variance.
target_ts = (target_ts - target_ts.mean()) / target_ts.std()
# Candidate ridge penalties, with an extra much-stronger alpha (250) appended.
alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
kwrgs_model = {
    'scoring': 'neg_mean_squared_error',
    'alphas': alphas,  # large a, strong regul.
    'normalize': False
}

# Predictors: every column except the target, 'PDO' and the two mask columns,
# keeping only keys whose integer prefix (before '..') equals 2.
keys = [k for k in rg.df_data.columns[:-2] if k not in [rg.TV.name, 'PDO']]
keys = [k for k in keys if int(k.split('..')[0]) in [2]]
# keys = [k for k in keys if int(k.split('..')[1]) in [1,3]]

out_fit = rg.fit_df_data_ridge(target=target_ts,
                               tau_min=2,
                               tau_max=2,
                               keys=keys,
                               kwrgs_model=kwrgs_model)
predict, weights, models_lags = out_fit
# Score with correlation and an RMSE skill score against a constant-0 benchmark.
df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
    predict,
    score_func_list=[fc_utils.corrcoef,
                     fc_utils.ErrorSkillScore(0).RMSE])
print(df_test_m)

# rg.store_df(append_str='z500_'+'-'.join(map(str, z500_green_bb))+TV+str(cluster_label))
Ejemplo n.º 7
0
import func_models as fc_utils

# NOTE(review): script fragment -- ``rg`` and ``np`` are defined earlier in
# the full script. Switch the first MI precursor to region-mean aggregation,
# then presumably rebuild the precursor timeseries -- confirm against rg.
sst = rg.list_for_MI[0]
sst.calc_ts = 'region mean'
rg.cluster_list_MI()
rg.get_ts_prec()
#%%
# Forecast mask of split 0 (last column of df_data) selects the target dates.
fc_mask = rg.df_data.iloc[:,-1].loc[0]#.shift(lag, fill_value=False)
# rg.df_data = rg._replace_RV_mask(rg.df_data, replace_RV_mask=(fc_mask))
target_ts = rg.df_data.iloc[:,[0]].loc[0][fc_mask]
# Standardize the target to zero mean / unit variance.
target_ts = (target_ts - target_ts.mean()) / target_ts.std()
# Candidate ridge penalties, with an extra much-stronger alpha (250) appended.
alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
kwrgs_model = {'scoring':'neg_mean_squared_error',
               'alphas':alphas, # large a, strong regul.
               'normalize':False}

# Predictors: every column except the target, 'PDO' and the two mask columns,
# keeping only keys whose integer prefix (before '..') equals 2.
keys = [k for k in rg.df_data.columns[:-2] if k not in [rg.TV.name, 'PDO']]
keys = [k for k in keys if int(k.split('..')[0]) in [2]]
# keys = [k for k in keys if int(k.split('..')[1]) in [1,3]]

out_fit = rg.fit_df_data_ridge(target=target_ts,tau_min=2, tau_max=2,
                               keys=keys,
                               kwrgs_model=kwrgs_model)
predict, weights, models_lags = out_fit
# Score with correlation and an RMSE skill score against a constant-0 benchmark.
df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(predict,
                                                                  score_func_list=[fc_utils.corrcoef, fc_utils.ErrorSkillScore(0).RMSE])
print(df_test_m)



    # rg.store_df(append_str='z500_'+'-'.join(map(str, z500_green_bb))+TV+str(cluster_label))