def prediction_wrapper(df_data, lags, target_ts=None, keys: list=None,
                       match_lag: bool=False, n_boot: int=1):
    # relies on the module-level globals `rg`, `blocksize` and `no_info_fc`
    # alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
    alphas = np.logspace(.1, 1.5, num=25)
    kwrgs_model = {'scoring': 'neg_mean_absolute_error',
                   'alphas': alphas,  # large alpha -> strong regularization
                   'normalize': False}

    if target_ts is None:
        fc_mask = df_data.iloc[:, -1].loc[0]  #.shift(lag, fill_value=False)
        target_ts = df_data.iloc[:, [0]].loc[0][fc_mask]
    target_ts = (target_ts - target_ts.mean()) / target_ts.std()

    out = rg.fit_df_data_ridge(df_data=df_data,
                               target=target_ts,
                               keys=keys,
                               tau_min=min(lags), tau_max=max(lags),
                               kwrgs_model=kwrgs_model,
                               match_lag_region_to_lag_fc=match_lag,
                               transformer=fc_utils.standardize_on_train)
    prediction, weights, models_lags = out

    # get skill scores
    clim_mean_temp = float(target_ts.mean())
    RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
    MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
    score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]

    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(prediction,
                                                                      df_data.iloc[:, -2:],
                                                                      score_func_list,
                                                                      n_boot=n_boot,
                                                                      blocksize=blocksize,
                                                                      rng_seed=1)
    index = np.unique(core_pp.flatten([k.split('_') for k in keys]))
    AR = [l for l in index if '..' not in l]
    AR = [l for l in AR if 'PDO' not in l]
    index = [k for k in index if k not in AR]
    df_test_m.index = ['AR' + ''.join(AR) + '_' + '_'.join(index)]

    n_splits = df_data.index.levels[0].size
    # test whether cross-validation selected the maximum alpha
    for col in df_test_m.columns.levels[0]:
        cvfitalpha = [models_lags[f'lag_{col}'][f'split_{s}'].alpha_
                      for s in range(n_splits)]
        print('lag {} mean alpha {:.0f}'.format(col, np.mean(cvfitalpha)))
        maxalpha_c = list(cvfitalpha).count(alphas[-1])
        if maxalpha_c > n_splits / 3:
            print(f'\nlag {col} alpha {int(np.mean(cvfitalpha))}')
            print(f'{maxalpha_c} splits are max alpha\n')
            # maximum regularization selected: no information in timeseries
            # df_test_m.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            # df_boot.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            no_info_fc.append(col)

    df_test = functions_pp.get_df_test(prediction.merge(df_data.iloc[:, -2:],
                                                        left_index=True,
                                                        right_index=True)).iloc[:, :-2]
    return prediction, df_test, df_test_m, df_boot, models_lags, weights, df_test_s_m, df_train_m
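# Hypothetical usage sketch (not from the source): prediction_wrapper expects
# df_data in the usual RGCPD layout, i.e. a (split, time) MultiIndex with
# 'TrainIsTrue' and 'RV_mask' as the last two columns, and the globals
# `rg`, `blocksize` and `no_info_fc` to be defined at module level.
# blocksize = 1
# no_info_fc = []
# keys = [k for k in rg.df_data.columns[1:-2]]
# (prediction, df_test, df_test_m, df_boot, models_lags,
#  weights, df_test_s_m, df_train_m) = prediction_wrapper(rg.df_data.copy(),
#                                                         lags=[0, 1],
#                                                         keys=keys, n_boot=100)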
def prediction_wrapper(q):
    fcmodel = ScikitModel(scikitmodel=LogisticRegressionCV).fit
    kwrgs_model = {'class_weight': {0: 1, 1: 1},
                   'scoring': 'neg_brier_score',
                   'penalty': 'l2',
                   'solver': 'lbfgs'}
    lag = 4
    keys = ['0..PEPsv']  # rg.df_data.columns[2:-2]
    keys = [k for k in rg.df_data.columns[2:-2] if 'sst' in k]
    target_ts = rg.TV_ts  # (rg.TV_ts - rg.TV_ts.mean()) / rg.TV_ts.std()
    # target_ts = rg.df_data_ext.loc[0][['mx2t']][rg.df_data.loc[0]['RV_mask']]
    target_ts = target_ts.to_dataframe('target')[['target']]
    target_ts.index.name = None
    target_ts = (target_ts > target_ts.quantile(q=q)).astype(int)

    out = rg.fit_df_data_ridge(target=target_ts,
                               fcmodel=fcmodel,
                               keys=keys,
                               tau_min=0, tau_max=lag,
                               kwrgs_model=kwrgs_model)
    prediction, weights, models_lags = out

    df_test = functions_pp.get_df_test(
        prediction.merge(rg.df_data.iloc[:, -2:].copy(),
                         left_index=True, right_index=True)).iloc[:, :-2]

    # get skill scores
    clim_mean_temp = float(target_ts.mean())
    SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp)
    BSS = SS.BSS
    score_func_list = [metrics.roc_auc_score, BSS,
                       fc_utils.ErrorSkillScore().AUC_SS]
    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
        prediction,
        rg.df_data.iloc[:, -2:],
        score_func_list,
        score_per_test=False,
        n_boot=0,
        blocksize=2,
        rng_seed=1)
    return df_train_m, df_test_m, df_boot, df_test, models_lags, SS
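# Hypothetical usage sketch (not from the source): forecast exceedance of the
# 66th percentile of the target with the logistic-regression wrapper above.
# df_train_m, df_test_m, df_boot, df_test, models_lags, SS = prediction_wrapper(q=.66)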
                           tau_min=0, tau_max=0,
                           kwrgs_model={'alphas': np.array([.01, .1, 1, 5, 10])})
predict = out[0].rename({0: 'AR1'}, axis=1)

lowPDO, df_lagmask = get_lagged_ts(rgPDO.df_data.copy(), 1, ['PDO0.5rm'])
# perPDO = rgPDO.df_data[keys_ext][persmask['x_fit']]
# persPDO[persmask['x_fit']] = persPDO[persmask['x_fit']]
# perPDO.index = rgPDO.df_data[rgPDO.df_data['RV_mask']].index
perPDO = lowPDO.rename({'PDO1.0rm_2': 'persistence'}, axis=1)
perPDO = perPDO.loc[df_prec.index]
predict = predict.merge(perPDO, left_index=True, right_index=True)

dates = core_pp.get_subdates(rgPDO.dates_TV, start_end_year=(1980, 2020))
predict = predict.loc[pd.IndexSlice[:, dates], :]
test = fc_utils.get_scores(predict,
                           score_func_list=[fc_utils.corrcoef,
                                            fc_utils.metrics.mean_squared_error])[2]
df_test = functions_pp.get_df_test(predict,
                                   df_splits=rgPDO.df_data.loc[predict.index][['TrainIsTrue', 'RV_mask']])
df_z = df_test[['AR1']]
df_z = lowPDO
# df_z = functions_pp.get_df_test(df_prec,
#                                 df_splits=rgPDO.df_data.loc[predict.index][['TrainIsTrue', 'RV_mask']])
# years = functions_pp.get_oneyr(df_z, *list(range(1980, 2020+1)))
# df_z = df_z.loc[years]
kwrgs_func = {'filepath': df_z, 'lag_z': 0}
def cond_forecast_table(df_test, df_forcings, score_func_list, n_boot=0):
    quantiles = [.15, .25]
    metricsused = np.array([m.__name__ for m in score_func_list])
    forcings = df_forcings.columns
    if forcings.size > 1:  # loop over forcings
        n_boot = 0
        index_level1 = forcings
    else:
        index_level1 = np.arange(n_boot)
    cond_df = np.zeros((metricsused.size, index_level1.size, len(quantiles) * 2))
    name_fc = 'test'
    for i, met in enumerate(metricsused):
        for j, col_forc in enumerate(forcings):
            df_forcing = df_forcings[col_forc]
            for k, l in enumerate(range(0, 4, 2)):
                q = quantiles[k]
                # anomalous boundary forcing
                low = df_forcing < df_forcing.quantile(q)
                high = df_forcing > df_forcing.quantile(1 - q)
                mask_anomalous = np.logical_or(low, high)
                condfc = df_test[mask_anomalous.values]
                condfc = condfc.rename({'causal': name_fc}, axis=1)
                cond_verif_tuple_ano = fc_utils.get_scores(condfc,
                                                           score_func_list=score_func_list,
                                                           n_boot=n_boot,
                                                           score_per_test=False,
                                                           blocksize=1,
                                                           rng_seed=seed)
                df_train_m, df_test_s_m, df_test_m, df_boot = cond_verif_tuple_ano
                rg.cond_verif_tuple_ano = cond_verif_tuple_ano
                if forcings.size > 1:
                    cond_df[i, j, l] = df_test_m[df_test_m.columns[0][0]].loc[0][met]
                else:
                    cond_df[i, :, l] = df_boot[df_boot.columns[0][0]][met]
                # mild boundary forcing
                larger_low = df_forcing > df_forcing.quantile(.5 - q)
                smaller_high = df_forcing < df_forcing.quantile(.5 + q)
                mask_mild = np.logical_and(larger_low, smaller_high)
                condfc = df_test[mask_mild.values]
                condfc = condfc.rename({'causal': name_fc}, axis=1)
                cond_verif_tuple_mild = fc_utils.get_scores(condfc,
                                                            score_func_list=score_func_list,
                                                            n_boot=n_boot,
                                                            score_per_test=False,
                                                            blocksize=1,
                                                            rng_seed=seed)
                df_train_m, df_test_s_m, df_test_m, df_boot = cond_verif_tuple_mild
                rg.cond_verif_tuple_mild = cond_verif_tuple_mild
                if forcings.size > 1:
                    cond_df[i, j, l + 1] = df_test_m[df_test_m.columns[0][0]].loc[0][met]
                else:
                    cond_df[i, :, l + 1] = df_boot[df_boot.columns[0][0]][met]

    columns = [[f'strong {int(q*200)}%', f'weak {int(q*200)}%'] for q in quantiles]
    df_cond_fc = pd.DataFrame(cond_df.reshape((len(metricsused) * index_level1.size, -1)),
                              index=pd.MultiIndex.from_product([list(metricsused),
                                                                index_level1]),
                              columns=functions_pp.flatten(columns))
    return df_cond_fc
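# Hypothetical usage sketch (names are placeholders, not from the source):
# verify the out-of-sample forecast conditional on the state of a boundary
# forcing; rows are the metrics, columns the strong/weak subsets per quantile.
# Assumes the globals `rg` and `seed` used inside the function are defined.
# df_cond_fc = cond_forecast_table(df_test,
#                                  df_forcings=df_forcing_ts,  # placeholder name
#                                  score_func_list=score_func_list,
#                                  n_boot=0)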
if monthkeys.index(month) == 0:
    weights_norm = weights.mean(axis=0, level=1)
    weights_norm.div(weights_norm.max(axis=0)).T.plot(kind='box')

clim_mean_temp = float(target_ts.mean())
RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
CRPSS = fc_utils.CRPSS_vs_constant_bench(constant_bench=clim_mean_temp).CRPSS
score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]
df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(prediction,
                                                                  rg.df_data.iloc[:, -2:],
                                                                  score_func_list,
                                                                  n_boot=n_boot,
                                                                  blocksize=blocksize,
                                                                  rng_seed=1)
df_test_m = df_test_m['Prediction']
df_boot = df_boot['Prediction']

# # Benchmark prediction
# observed = pd.concat(n_splits*[target_ts], keys=range(n_splits))
# benchpred = observed.copy()
# benchpred[:] = np.zeros_like(observed)  # fake pred
# benchpred = pd.concat([observed, benchpred], axis=1)
# bench_MSE = fc_utils.get_scores(benchpred,
#                                 rg.df_data.iloc[:,-2:][rg.df_data.iloc[:,-1:].values].dropna(),
#                                 score_func_list,
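# A minimal, self-contained sketch of what a skill score versus a constant
# (climatological) benchmark typically computes; the fc_utils implementation
# may differ in detail. SS = 1 - err_model / err_benchmark, so SS = 1 is a
# perfect forecast and SS = 0 is no better than forecasting the climatology.
import numpy as np

def rmse_skill_score_sketch(y_true, y_pred, constant_bench):
    # hypothetical stand-in for fc_utils.ErrorSkillScore(constant_bench).RMSE
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    rmse_model = np.sqrt(np.mean((y_true - y_pred)**2))
    rmse_bench = np.sqrt(np.mean((y_true - constant_bench)**2))
    return 1. - rmse_model / rmse_bench

# rmse_skill_score_sketch([1., 2., 3.], [1.1, 1.9, 3.2], constant_bench=2.)  # ~0.83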
MAE_SS = fc_utils.ErrorSkillScore(constant_bench=float(target_ts.mean())).MAE
score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]

out = rg.fit_df_data_ridge(target=target_ts,
                           keys=None,
                           fcmodel=model,
                           kwrgs_model=kwrgs_model,
                           transformer=False,
                           tau_min=1, tau_max=2)
predict, weights, model_lags = out

df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(predict,
                                                                  rg.df_data.iloc[:, -2:],
                                                                  score_func_list,
                                                                  n_boot=100,
                                                                  score_per_test=False,
                                                                  blocksize=1,
                                                                  rng_seed=1)
lag = 1
if prediction == 'events':
    print(model.scikitmodel.__name__, '\n',
          f'Test score at lag {lag}\n',
          'BSS {:.2f}\n'.format(df_test_m.loc[0].loc[lag].loc['BSS']),
          'AUC {:.2f}'.format(df_test_m.loc[0].loc[lag].loc['roc_auc_score']),
          '\nTrain score\n',
          'BSS {:.2f}\n'.format(df_train_m.mean(0).loc[lag]['BSS']),
          'AUC {:.2f}'.format(df_train_m.mean(0).loc[lag]['roc_auc_score']))
elif prediction == 'continuous':
    print(model.scikitmodel.__name__, '\n',
          'Test score\n',
          'RMSE {:.2f}\n'.format(df_test_m.loc[0][lag]['RMSE']),
          'MAE {:.2f}\n'.format(df_test_m.loc[0][lag]['MAE']),
          'corrcoef {:.2f}'.format(df_test_m.loc[0][lag]['corrcoef']))
def forecast(rg, crossyr):
    # Forecasting pipeline 1
    import func_models as fc_utils
    from stat_models_cont import ScikitModel
    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import LogisticRegressionCV

    # choose the type of prediction (continuous or probabilistic events) by
    # switching the assignment below
    prediction = 'continuous'
    # prediction = 'events'; q = .66  # quantile threshold for event definition

    if prediction == 'continuous':
        model = ScikitModel(Ridge, verbosity=0)
        # You can also tune parameters by passing a list of values; GridSearchCV
        # from sklearn will then find the set of parameters that gives the best
        # mean score over all k-fold test sets. Below we pass a list of alphas
        # to tune the regularization.
        alphas = list(np.concatenate([[1E-20],
                                      np.logspace(-5, 0, 6),
                                      np.logspace(.01, 2.5, num=25)]))
        kwrgs_model = {'scoringCV': 'neg_mean_absolute_error',
                       'kfold': 10,
                       'alpha': alphas}  # large alpha -> strong regularization
    elif prediction == 'events':
        model = ScikitModel(LogisticRegressionCV, verbosity=0)
        kwrgs_model = {'kfold': 5, 'scoring': 'neg_brier_score'}

    target_ts = rg.TV.RV_ts
    target_ts = (target_ts - target_ts.mean()) / target_ts.std()
    if prediction == 'events':
        q = 0.66
        if q >= 0.5:
            target_ts = (target_ts > target_ts.quantile(q)).astype(int)
        elif q < .5:
            target_ts = (target_ts < target_ts.quantile(q)).astype(int)
        BSS = fc_utils.ErrorSkillScore(constant_bench=float(target_ts.mean())).BSS
        score_func_list = [BSS, fc_utils.metrics.roc_auc_score]
    elif prediction == 'continuous':
        RMSE_SS = fc_utils.ErrorSkillScore(
            constant_bench=float(target_ts.mean())).RMSE  # RMSE skill score
        MAE_SS = fc_utils.ErrorSkillScore(
            constant_bench=float(target_ts.mean())).MAE  # MAE skill score
        score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]

    keys = [k for k in rg.df_data.columns[1:-2]]
    out = rg.fit_df_data_ridge(target=target_ts,
                               keys=keys,
                               fcmodel=model,
                               kwrgs_model=kwrgs_model,
                               transformer=None,
                               tau_min=0, tau_max=0)  # <- lag should be zero
    predict, weights, model_lags = out

    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
        predict,
        rg.df_data.iloc[:, -2:],
        score_func_list,
        n_boot=100,  # bootstrapping is computationally intensive
        score_per_test=False,
        blocksize=1,
        rng_seed=1)

    lag = 0
    if prediction == 'events':
        print(model.scikitmodel.__name__, '\n',
              f'Test score at lag {lag}\n',
              'BSS {:.2f}\n'.format(df_test_m.loc[0].loc[0].loc['BSS']),
              'AUC {:.2f}'.format(df_test_m.loc[0].loc[0].loc['roc_auc_score']),
              '\nTrain score\n',
              'BSS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['BSS']),
              'AUC {:.2f}'.format(df_train_m.mean(0).loc[0]['roc_auc_score']))
    elif prediction == 'continuous':
        print(model.scikitmodel.__name__, '\n',
              'Test score\n',
              'RMSE_SS {:.2f}\n'.format(df_test_m.loc[0][0]['RMSE']),
              'MAE_SS {:.2f}\n'.format(df_test_m.loc[0][0]['MAE']),
              'corrcoef {:.2f}'.format(df_test_m.loc[0][0]['corrcoef']),
              '\nTrain score\n',
              'RMSE_SS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['RMSE']),
              'MAE_SS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['MAE']),
              'corrcoef {:.2f}'.format(df_train_m.mean(0).loc[0]['corrcoef']))

    test_scores = [df_test_m.loc[0][0]['RMSE'],
                   df_test_m.loc[0][0]['MAE'],
                   df_test_m.loc[0][0]['corrcoef']]
    train_scores = [df_train_m.mean(0).loc[0]['RMSE'],
                    df_train_m.mean(0).loc[0]['MAE'],
                    df_train_m.mean(0).loc[0]['corrcoef']]
    return test_scores, train_scores, predict
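# Hypothetical usage sketch (not from the source): rg must be a fully
# initialized RGCPD object whose df_data already contains the precursor
# timeseries plus the 'TrainIsTrue' and 'RV_mask' columns.
# test_scores, train_scores, predict = forecast(rg, crossyr=True)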
rg.cluster_list_MI()
rg.get_ts_prec()
#%%
fc_mask = rg.df_data.iloc[:, -1].loc[0]  #.shift(lag, fill_value=False)
# rg.df_data = rg._replace_RV_mask(rg.df_data, replace_RV_mask=(fc_mask))
target_ts = rg.df_data.iloc[:, [0]].loc[0][fc_mask]
target_ts = (target_ts - target_ts.mean()) / target_ts.std()

alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
kwrgs_model = {'scoring': 'neg_mean_squared_error',
               'alphas': alphas,  # large alpha -> strong regularization
               'normalize': False}

keys = [k for k in rg.df_data.columns[:-2] if k not in [rg.TV.name, 'PDO']]
keys = [k for k in keys if int(k.split('..')[0]) in [2]]
# keys = [k for k in keys if int(k.split('..')[1]) in [1,3]]
out_fit = rg.fit_df_data_ridge(target=target_ts,
                               tau_min=2, tau_max=2,
                               keys=keys,
                               kwrgs_model=kwrgs_model)
predict, weights, models_lags = out_fit

df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
    predict,
    score_func_list=[fc_utils.corrcoef, fc_utils.ErrorSkillScore(0).RMSE])
print(df_test_m)
# rg.store_df(append_str='z500_'+'-'.join(map(str, z500_green_bb))+TV+str(cluster_label))
def cond_forecast_table(rg_list, score_func_list, n_boot=0):
    df_test_m = rg_list[0].verification_tuple[2]
    quantiles = [.15, .25]
    metrics = df_test_m.columns.levels[1]
    if n_boot > 0:
        cond_df = np.zeros((metrics.size, len(rg_list), len(quantiles) * 2, n_boot))
    else:
        cond_df = np.zeros((metrics.size, len(rg_list), len(quantiles) * 2))
    for i, met in enumerate(metrics):
        for j, rg in enumerate(rg_list):
            PacAtl_ts = rg.df_forcing
            prediction = rg.prediction_tuple[0]
            df_test = functions_pp.get_df_test(prediction,
                                               df_splits=rg.df_splits)
            # df_test_m = rg.verification_tuple[2]
            # cond_df[i, j, 0] = df_test_m[df_test_m.columns[0][0]].loc[0][met]
            for k, l in enumerate(range(0, 4, 2)):
                q = quantiles[k]
                # anomalous boundary forcing
                low = PacAtl_ts < PacAtl_ts.quantile(q)
                high = PacAtl_ts > PacAtl_ts.quantile(1 - q)
                mask_anomalous = np.logical_or(low, high)
                condfc = df_test[mask_anomalous.values]
                # condfc = condfc.rename({'causal':periodnames[i]}, axis=1)
                cond_verif_tuple = fc_utils.get_scores(condfc,
                                                       score_func_list=score_func_list,
                                                       n_boot=n_boot,
                                                       score_per_test=False,
                                                       blocksize=1,
                                                       rng_seed=1)
                df_train_m, df_test_s_m, df_test_m, df_boot = cond_verif_tuple
                rg.cond_verif_tuple = cond_verif_tuple
                if n_boot == 0:
                    cond_df[i, j, l] = df_test_m[df_test_m.columns[0][0]].loc[0][met]
                else:
                    cond_df[i, j, l, :] = df_boot[df_boot.columns[0][0]][met]
                # mild boundary forcing
                higher_low = PacAtl_ts > PacAtl_ts.quantile(.5 - q)
                lower_high = PacAtl_ts < PacAtl_ts.quantile(.5 + q)
                mask_anomalous = np.logical_and(higher_low, lower_high)  # changed 11-5-21
                condfc = df_test[mask_anomalous.values]
                # condfc = condfc.rename({'causal':periodnames[i]}, axis=1)
                cond_verif_tuple = fc_utils.get_scores(condfc,
                                                       score_func_list=score_func_list,
                                                       n_boot=n_boot,
                                                       score_per_test=False,
                                                       blocksize=1,
                                                       rng_seed=1)
                df_train_m, df_test_s_m, df_test_m, df_boot = cond_verif_tuple
                if n_boot == 0:
                    cond_df[i, j, l + 1] = df_test_m[df_test_m.columns[0][0]].loc[0][met]
                else:
                    cond_df[i, j, l + 1, :] = df_boot[df_boot.columns[0][0]][met]

    columns = [[f'strong {int(q*200)}%', f'weak {int(q*200)}%'] for q in quantiles]
    columns = functions_pp.flatten(columns)
    if n_boot > 0:
        columns = pd.MultiIndex.from_product([columns, list(range(n_boot))])
    df_cond_fc = pd.DataFrame(cond_df.reshape((len(metrics) * len(rg_list), -1)),
                              index=pd.MultiIndex.from_product([list(metrics),
                                                                [rg.fc_month for rg in rg_list]]),
                              columns=columns)
    return df_cond_fc
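# Hypothetical usage sketch (not from the source): rg_list holds one rg object
# per forecast month, each with .prediction_tuple, .verification_tuple,
# .df_forcing, .df_splits and .fc_month already set by the pipeline above.
# df_cond_fc = cond_forecast_table(rg_list, score_func_list, n_boot=0)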
                           keys=keys,
                           tau_min=lag, tau_max=lag,
                           kwrgs_model=kwrgs_model,
                           transformer=fc_utils.standardize_on_train)
predict, weights, models_lags = out
prediction = predict.rename({predict.columns[0]: 'target',
                             lag: 'Prediction'}, axis=1)

if monthkeys.index(month) == 0:
    weights_norm = weights.mean(axis=0, level=1)
    weights_norm.div(weights_norm.max(axis=0)).T.plot(kind='box')

df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(prediction,
                                                                  rg.df_data.iloc[:, -2:],
                                                                  score_func_list,
                                                                  n_boot=n_boot,
                                                                  blocksize=blocksize,
                                                                  rng_seed=1)

# Benchmark prediction
n_splits = rg.df_data.index.levels[0].size
observed = pd.concat(n_splits*[target_ts], keys=range(n_splits))
benchpred = observed.copy()
benchpred[:] = np.zeros_like(observed)  # fake pred
benchpred = pd.concat([observed, benchpred], axis=1)
bench_MSE = fc_utils.get_scores(benchpred,
                                rg.df_data.iloc[:, -2:],
                                [metrics.mean_squared_error],
                                n_boot=0,
                                blocksize=blocksize,
                                rng_seed=1)[2]
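# Why a constant-zero 'fake pred' is a reasonable benchmark here (assuming the
# target was standardized upstream, as in the snippets above): zero is then the
# climatological-mean forecast, and its MSE equals the target variance, i.e. ~1.
# A self-contained check:
import numpy as np
rng = np.random.default_rng(0)
y = rng.standard_normal(10_000)
y = (y - y.mean()) / y.std()
print(np.mean((y - 0.)**2))  # -> 1.0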
                                      fcmodel=fcmodel,
                                      transformer=None)
predict, weights, models_lags = prediction_tuple
prediction = predict.rename({predict.columns[0]: target_dataset,
                             lag_: fc_month}, axis=1)
prediction_tuple = (prediction, weights, models_lags)
list_prediction.append(prediction_tuple)
rg.prediction_tuple = prediction_tuple

verification_tuple = fc_utils.get_scores(prediction,
                                         rg.df_data.iloc[:, -2:],
                                         score_func_list,
                                         n_boot=n_boot,
                                         blocksize=1,
                                         rng_seed=seed)
df_train_m, df_test_s_m, df_test_m, df_boot = verification_tuple

m = models_lags[f'lag_{lag_}'][f'split_{0}']
# plt.plot(kwrgs_model['alpha'], m.cv_results_['mean_test_score'])
# plt.axvline(m.best_params_['alpha']); plt.show(); plt.close()
list_verification.append(verification_tuple)
rg.verification_tuple = verification_tuple

#%% Plotting continuous forecast
df_preds_save = utils_paper3.df_predictions_for_plot(rg_list)
d_dfs = {'df_predictions': df_preds_save}