def prediction_wrapper(df_data, lags, target_ts=None, keys: list=None,
                       match_lag: bool=False, n_boot: int=1):
    """Fit a cross-validated ridge regression on df_data and score it.

    Parameters
    ----------
    df_data : pd.DataFrame
        Multi-index (split, time) dataframe; the last two columns are treated
        as the train/RV masks (passed to fc_utils.get_scores and merged back
        to extract the test-set prediction).
    lags : array-like of int
        Lags to fit; min/max are forwarded as tau_min/tau_max.
    target_ts : pd.DataFrame or None, optional
        Target timeseries. If None, it is built from the first column of
        df_data of split 0, masked by the (boolean) last column of split 0.
    keys : list, optional
        Precursor columns to use; also parsed below to build the label for
        the test-score row.
    match_lag : bool, optional
        Forwarded as match_lag_region_to_lag_fc to fit_df_data_ridge.
    n_boot : int, optional
        Number of bootstrap samples for the skill scores.

    Returns
    -------
    tuple
        (prediction, df_test, df_test_m, df_boot, models_lags, weights,
         df_test_s_m, df_train_m)

    NOTE(review): relies on module-level globals ``rg``, ``blocksize`` and
    ``no_info_fc`` defined elsewhere in the script -- confirm they exist
    before calling.
    """
    # alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
    # Regularization strengths the ridge CV searches over.
    alphas = np.logspace(.1, 1.5, num=25)
    kwrgs_model = {'scoring':'neg_mean_absolute_error',
                   'alphas':alphas, # large a, strong regul.
                   'normalize':False}
    if target_ts is None:
        # Last column of split 0 is used as a boolean forecast mask.
        fc_mask = df_data.iloc[:,-1].loc[0]#.shift(lag, fill_value=False)
        target_ts = df_data.iloc[:,[0]].loc[0][fc_mask]
    else:
        target_ts = target_ts  # no-op: keep the caller-provided target
    # Standardize the target with full-series mean/std.
    target_ts = (target_ts - target_ts.mean()) / target_ts.std()
    out = rg.fit_df_data_ridge(df_data=df_data,
                               target=target_ts,
                               keys=keys,
                               tau_min=min(lags), tau_max=max(lags),
                               kwrgs_model=kwrgs_model,
                               match_lag_region_to_lag_fc=match_lag,
                               transformer=fc_utils.standardize_on_train)
    prediction, weights, models_lags = out
    # get skill scores; benchmark = climatological mean of the (standardized) target
    clim_mean_temp = float(target_ts.mean())
    RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
    MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
    score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]
    df_train_m, df_test_s_m, df_test_m, df_boot = \
        fc_utils.get_scores(prediction,
                            df_data.iloc[:,-2:],  # train/RV mask columns
                            score_func_list,
                            n_boot = n_boot,
                            blocksize=blocksize,  # NOTE(review): global, not a parameter
                            rng_seed=1)
    # Build a row label like 'AR<ar-keys>_<precursor ids>' for the test scores.
    index = np.unique(core_pp.flatten([k.split('_') for k in keys]))
    AR = [l for l in index if '..' not in l]  # keys without '..' -- presumably AR terms
    AR = [l for l in AR if 'PDO' not in l]    # PDO is excluded from the AR part
    index = [k for k in index if k not in AR]
    df_test_m.index = ['AR' + ''.join(AR) +'_'+'_'.join(index)]
    n_splits = df_data.index.levels[0].size
    # test for high alpha: flag lags where CV settled on maximum regularization
    for col in df_test_m.columns.levels[0]:
        cvfitalpha = [models_lags[f'lag_{col}'][f'split_{s}'].alpha_ for s in range(n_splits)]
        print('lag {} mean alpha {:.0f}'.format(col, np.mean(cvfitalpha)))
        maxalpha_c = list(cvfitalpha).count(alphas[-1])
        if maxalpha_c > n_splits/3:
            # More than a third of the splits selected the largest alpha.
            print(f'\nlag {col} alpha {int(np.mean(cvfitalpha))}')
            print(f'{maxalpha_c} splits are max alpha\n')
            # maximum regularization selected. No information in timeseries
            # df_test_m.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            # df_boot.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            no_info_fc.append(col)  # NOTE(review): appends to a module-level list
    # Concatenate test-set predictions across splits; drop the two mask columns.
    df_test = functions_pp.get_df_test(prediction.merge(df_data.iloc[:,-2:],
                                                        left_index=True,
                                                        right_index=True)).iloc[:,:-2]
    return prediction, df_test, df_test_m, df_boot, models_lags, weights, df_test_s_m, df_train_m
def prediction_wrapper(q):
    """Fit a logistic-regression event forecast for quantile threshold ``q``.

    Binarizes ``rg.TV_ts`` at its q-quantile, fits LogisticRegressionCV on the
    'sst' precursor columns for lags 0..4 and returns train/test skill metrics.

    Returns (df_train_m, df_test_m, df_boot, df_test, models_lags, SS).
    """
    classifier = ScikitModel(scikitmodel=LogisticRegressionCV).fit
    kwrgs_model = {
        'class_weight': {0: 1, 1: 1},
        'scoring': 'neg_brier_score',
        'penalty': 'l2',
        'solver': 'lbfgs',
    }
    max_lag = 4
    keys = ['0..PEPsv']  # rg.df_data.columns[2:-2]
    keys = [col for col in rg.df_data.columns[2:-2] if 'sst' in col]

    target = rg.TV_ts  # - rg.TV_ts.mean()) / rg.TV_ts.std()
    # target = rg.df_data_ext.loc[0][['mx2t']][rg.df_data.loc[0]['RV_mask']]
    target = target.to_dataframe('target')[['target']]
    target.index.name = None
    # Turn the continuous target into binary events at the q-quantile.
    target = (target > target.quantile(q=q)).astype(int)

    fit_output = rg.fit_df_data_ridge(target=target,
                                      fcmodel=classifier,
                                      keys=keys,
                                      tau_min=0,
                                      tau_max=max_lag,
                                      kwrgs_model=kwrgs_model)
    prediction, weights, models_lags = fit_output

    # Merge the mask columns back in so the test subset can be extracted,
    # then drop them again.
    merged = prediction.merge(rg.df_data.iloc[:, -2:].copy(),
                              left_index=True, right_index=True)
    df_test = functions_pp.get_df_test(merged).iloc[:, :-2]

    # Skill scores benchmarked against the climatological event frequency.
    clim_mean_temp = float(target.mean())
    SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp)
    BSS = SS.BSS
    score_func_list = [metrics.roc_auc_score, BSS,
                       fc_utils.ErrorSkillScore().AUC_SS]
    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
        prediction, rg.df_data.iloc[:, -2:], score_func_list,
        score_per_test=False, n_boot=0, blocksize=2, rng_seed=1)
    return df_train_m, df_test_m, df_boot, df_test, models_lags, SS
# NOTE(review): this chunk begins mid-statement -- `keys=y_keys,
# standardize=False)` closes a call whose opening lies outside this view.
                                  keys=y_keys, standardize=False)
fig_path = os.path.join(rg.path_outsub1,
                        f'{sst._name}_r{lowpass}PDO_{period}_{append_str}')
fig.savefig(fig_path + rg.figext, bbox_inches='tight')
# Bar plot of the correlation between the PDO-regressed data and the original.
df_data_r2PDO.loc[0][keys].corrwith(rg.df_data.loc[0][keys]).plot(kind='bar')
# =============================================================================
# Predictions
# =============================================================================
# out_regr2PDO = prediction_wrapper(df_data_r2PDO.copy(), keys=keys,
#                                   match_lag=match_lag, n_boot=n_boot)
# Restrict target to 1980-2020.
dates = core_pp.get_subdates(rg.dates_TV, start_end_year=(1980, 2020))
target_ts_temp = rg.TV.RV_ts.loc[dates]
# Benchmark for the error skill scores: climatological mean of the target.
clim_mean_temp = float(target_ts_temp.mean())
RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS,
                   fc_utils.metrics.mean_absolute_error]
# predictions temp using SST regions
df_prec = merge_lagged_wrapper(rg.df_data.copy(), [1, 2], keys)
df_prec = df_prec.loc[pd.IndexSlice[:, dates], :]  # subset to the same dates
df_prec = df_prec.merge(rg.df_splits.loc[pd.IndexSlice[:, dates], :],
                        left_index=True, right_index=True)
# NOTE(review): the call below is truncated at the end of this chunk.
out = prediction_wrapper(df_prec, lags=np.array([0]),
                         target_ts=target_ts_temp, keys=None,
# NOTE(review): this chunk begins mid-statement -- `'kfold':5}` closes a
# kwrgs_model dict, and the `elif` below belongs to an `if prediction == ...`
# opened outside this view.
                   'kfold':5}
elif prediction == 'events':
    model = ScikitModel(LogisticRegressionCV, verbosity=0)
    kwrgs_model = {'kfold':5,
                   'scoring':'neg_brier_score'}

# target
target_ts = rg.TV.RV_ts.copy()
target_ts = (target_ts - target_ts.mean()) / target_ts.std()  # standardize
if prediction == 'events':
    # Binarize: exceedance events for upper quantiles, deceedance for lower.
    if q >= 0.5:
        target_ts = (target_ts > target_ts.quantile(q)).astype(int)
    elif q < .5:
        target_ts = (target_ts < target_ts.quantile(q)).astype(int)
    # Benchmark = climatological event frequency.
    BSS = fc_utils.ErrorSkillScore(constant_bench=float(target_ts.mean())).BSS
    score_func_list = [BSS, fc_utils.metrics.roc_auc_score]
elif prediction == 'continuous':
    # Benchmark = climatological mean of the standardized target.
    RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=float(target_ts.mean())).RMSE
    MAE_SS = fc_utils.ErrorSkillScore(constant_bench=float(target_ts.mean())).MAE
    score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]

out = rg.fit_df_data_ridge(target=target_ts,
                           keys=None,
                           fcmodel=model,
                           kwrgs_model=kwrgs_model,
                           transformer=False,
                           tau_min=1, tau_max=2)
predict, weights, model_lags = out
def forecast(rg, crossyr, prediction='continuous', q=0.66):
    """Run forecasting pipeline 1 on the precursor dataframe of ``rg``.

    Fixes over the previous version:
    - ``prediction`` and ``q`` were only present as commented-out assignments,
      so the function raised NameError at the first ``if prediction == ...``.
      They are now keyword parameters with the previously commented defaults,
      which is backward compatible for positional callers.
    - ``test_scores``/``train_scores`` unconditionally indexed
      'RMSE'/'MAE'/'corrcoef', raising KeyError on the 'events' path; the
      extraction now follows the selected score list.

    Parameters
    ----------
    rg : RGCPD object
        Provides ``TV.RV_ts``, ``df_data`` and ``fit_df_data_ridge``.
    crossyr : object
        Currently unused here -- kept for interface compatibility.
    prediction : str, optional
        'continuous' (Ridge) or 'events' (LogisticRegressionCV).
    q : float, optional
        Quantile threshold for event definition (only used for 'events').

    Returns
    -------
    tuple
        (test_scores, train_scores, predict) where the score lists follow
        score_func_list order: RMSE/MAE/corrcoef for 'continuous',
        BSS/roc_auc_score for 'events'.
    """
    import func_models as fc_utils
    from stat_models_cont import ScikitModel
    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import LogisticRegressionCV

    if prediction == 'continuous':
        model = ScikitModel(Ridge, verbosity=0)
        # You can also tune parameters by passing a list of values. Then
        # GridSearchCV from sklearn will find the set of parameters that give
        # the best mean score on all kfold test sets. Below we pass a list of
        # alpha's to tune the regularization.
        alphas = list(np.concatenate([[1E-20],
                                      np.logspace(-5, 0, 6),
                                      np.logspace(.01, 2.5, num=25)]))
        kwrgs_model = {'scoringCV': 'neg_mean_absolute_error',
                       'kfold': 10,
                       'alpha': alphas}  # large a, strong regul.
    elif prediction == 'events':
        model = ScikitModel(LogisticRegressionCV, verbosity=0)
        kwrgs_model = {'kfold': 5, 'scoring': 'neg_brier_score'}
    else:
        raise ValueError(f"prediction must be 'continuous' or 'events', got {prediction!r}")

    # Standardize the target timeseries.
    target_ts = rg.TV.RV_ts
    target_ts = (target_ts - target_ts.mean()) / target_ts.std()
    if prediction == 'events':
        # Exceedance events for upper quantiles, deceedance for lower ones.
        if q >= 0.5:
            target_ts = (target_ts > target_ts.quantile(q)).astype(int)
        elif q < .5:
            target_ts = (target_ts < target_ts.quantile(q)).astype(int)
        BSS = fc_utils.ErrorSkillScore(
            constant_bench=float(target_ts.mean())).BSS
        score_func_list = [BSS, fc_utils.metrics.roc_auc_score]
    elif prediction == 'continuous':
        RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=float(
            target_ts.mean())).RMSE  # RMSE error skill score
        MAE_SS = fc_utils.ErrorSkillScore(
            constant_bench=float(target_ts.mean())).MAE  # MAE error skill score
        score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]

    # Use all precursor columns (skip the target and the two mask columns).
    keys = list(rg.df_data.columns[1:-2])
    out = rg.fit_df_data_ridge(target=target_ts,
                               keys=keys,
                               fcmodel=model,
                               kwrgs_model=kwrgs_model,
                               transformer=None,
                               tau_min=0, tau_max=0)  # <- lag should be zero
    predict, weights, model_lags = out

    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
        predict,
        rg.df_data.iloc[:, -2:],
        score_func_list,
        n_boot=100,  # intensive
        score_per_test=False,
        blocksize=1,
        rng_seed=1)

    lag = 0
    if prediction == 'events':
        print(
            model.scikitmodel.__name__, '\n', f'Test score at lag {lag}\n',
            'BSS {:.2f}\n'.format(df_test_m.loc[0].loc[0].loc['BSS']),
            'AUC {:.2f}'.format(df_test_m.loc[0].loc[0].loc['roc_auc_score']),
            '\nTrain score\n',
            'BSS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['BSS']),
            'AUC {:.2f}'.format(df_train_m.mean(0).loc[0]['roc_auc_score']))
    elif prediction == 'continuous':
        print(model.scikitmodel.__name__, '\n', 'Test score\n',
              'RMSE_SS {:.2f}\n'.format(df_test_m.loc[0][0]['RMSE']),
              'MAE_SS {:.2f}\n'.format(df_test_m.loc[0][0]['MAE']),
              'corrcoef {:.2f}'.format(df_test_m.loc[0][0]['corrcoef']),
              '\nTrain score\n',
              'RMSE_SS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['RMSE']),
              'MAE_SS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['MAE']),
              'corrcoef {:.2f}'.format(df_train_m.mean(0).loc[0]['corrcoef']))

    # Extract the scores that actually exist for the chosen prediction type
    # (the old code always used the continuous keys and crashed for 'events').
    if prediction == 'events':
        metric_names = ['BSS', 'roc_auc_score']
    else:
        metric_names = ['RMSE', 'MAE', 'corrcoef']
    test_scores = [df_test_m.loc[0][0][m] for m in metric_names]
    train_scores = [df_train_m.mean(0).loc[0][m] for m in metric_names]
    return test_scores, train_scores, predict
# Build precursor timeseries and fit a lag-2 ridge regression on them.
rg.cluster_list_MI()
rg.get_ts_prec()
#%%
# Last column of split 0 is the (boolean) forecast mask.
fc_mask = rg.df_data.iloc[:, -1].loc[0]  #.shift(lag, fill_value=False)
# rg.df_data = rg._replace_RV_mask(rg.df_data, replace_RV_mask=(fc_mask))
target_ts = rg.df_data.iloc[:, [0]].loc[0][fc_mask]
target_ts = (target_ts - target_ts.mean()) / target_ts.std()  # standardize target
# Ridge regularization strengths; 250 appended as an extra, very strong value.
alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
kwrgs_model = {
    'scoring': 'neg_mean_squared_error',
    'alphas': alphas,  # large a, strong regul.
    'normalize': False
}
# Use all precursor columns except the target itself and PDO.
keys = [k for k in rg.df_data.columns[:-2] if k not in [rg.TV.name, 'PDO']]
# Keep only keys whose '..'-prefix equals 2 -- presumably the lag prefix;
# TODO confirm against the key naming convention.
keys = [k for k in keys if int(k.split('..')[0]) in [2]]
# keys = [k for k in keys if int(k.split('..')[1]) in [1,3]]
out_fit = rg.fit_df_data_ridge(target=target_ts,
                               tau_min=2,
                               tau_max=2,
                               keys=keys,
                               kwrgs_model=kwrgs_model)
predict, weights, models_lags = out_fit
# Score against a zero-benchmark RMSE skill score and correlation.
df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
    predict,
    score_func_list=[fc_utils.corrcoef, fc_utils.ErrorSkillScore(0).RMSE])
print(df_test_m)
# rg.store_df(append_str='z500_'+'-'.join(map(str, z500_green_bb))+TV+str(cluster_label))
# NOTE(review): this chunk is a near-duplicate of the preceding lag-2 ridge
# script, with extra setup selecting the sst precursor's region-mean timeseries.
import func_models as fc_utils

sst = rg.list_for_MI[0]
sst.calc_ts = 'region mean'  # use area-mean timeseries for the sst precursor
rg.cluster_list_MI()
rg.get_ts_prec()
#%%
# Last column of split 0 is the (boolean) forecast mask.
fc_mask = rg.df_data.iloc[:,-1].loc[0]#.shift(lag, fill_value=False)
# rg.df_data = rg._replace_RV_mask(rg.df_data, replace_RV_mask=(fc_mask))
target_ts = rg.df_data.iloc[:,[0]].loc[0][fc_mask]
target_ts = (target_ts - target_ts.mean()) / target_ts.std()  # standardize target
# Ridge regularization strengths; 250 appended as an extra, very strong value.
alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
kwrgs_model = {'scoring':'neg_mean_squared_error',
               'alphas':alphas, # large a, strong regul.
               'normalize':False}
# Use all precursor columns except the target itself and PDO.
keys = [k for k in rg.df_data.columns[:-2] if k not in [rg.TV.name, 'PDO']]
# Keep only keys whose '..'-prefix equals 2 -- presumably the lag prefix;
# TODO confirm against the key naming convention.
keys = [k for k in keys if int(k.split('..')[0]) in [2]]
# keys = [k for k in keys if int(k.split('..')[1]) in [1,3]]
out_fit = rg.fit_df_data_ridge(target=target_ts,tau_min=2,
                               tau_max=2,
                               keys=keys,
                               kwrgs_model=kwrgs_model)
predict, weights, models_lags = out_fit
# Score against a zero-benchmark RMSE skill score and correlation.
df_train_m, df_test_s_m, df_test_m, df_boot = \
    fc_utils.get_scores(predict,
                        score_func_list=[fc_utils.corrcoef,
                                         fc_utils.ErrorSkillScore(0).RMSE])
print(df_test_m)
# rg.store_df(append_str='z500_'+'-'.join(map(str, z500_green_bb))+TV+str(cluster_label))