def prediction_wrapper(df_data, lags, target_ts=None, keys: list=None, match_lag: bool=False, n_boot: int=1):
    """Fit a RidgeCV forecast on df_data and compute verification scores.

    NOTE(review): relies on module-level names not visible in this chunk:
    ``rg``, ``fc_utils``, ``functions_pp``, ``core_pp``, ``np``, ``blocksize``
    and the accumulator list ``no_info_fc`` — TODO confirm these exist at
    call time.

    Parameters
    ----------
    df_data : pd.DataFrame with (split, time) MultiIndex; last two columns
        are assumed to be 'TrainIsTrue' and 'RV_mask'.
    lags : iterable of int; min/max define tau_min/tau_max of the fit.
    target_ts : optional target series; if None, derived from df_data
        (first column masked by the last column of split 0).
    keys : columns of df_data to use as predictors.
    match_lag : passed through as match_lag_region_to_lag_fc.
    n_boot : number of bootstrap resamples for skill scores.

    Returns
    -------
    prediction, df_test, df_test_m, df_boot, models_lags, weights,
    df_test_s_m, df_train_m
    """
    # alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
    alphas = np.logspace(.1, 1.5, num=25)
    kwrgs_model = {'scoring':'neg_mean_absolute_error',
                   'alphas':alphas, # large a, strong regul.
                   'normalize':False}
    if target_ts is None:
        # target = first column of split 0, masked by the last column
        # (presumably the RV_mask) — TODO confirm column layout
        fc_mask = df_data.iloc[:,-1].loc[0]#.shift(lag, fill_value=False)
        target_ts = df_data.iloc[:,[0]].loc[0][fc_mask]
    else:
        target_ts = target_ts  # no-op branch kept for symmetry
    # standardize the target before fitting
    target_ts = (target_ts - target_ts.mean()) / target_ts.std()
    out = rg.fit_df_data_ridge(df_data=df_data,
                               target=target_ts,
                               keys=keys,
                               tau_min=min(lags), tau_max=max(lags),
                               kwrgs_model=kwrgs_model,
                               match_lag_region_to_lag_fc=match_lag,
                               transformer=fc_utils.standardize_on_train)
    prediction, weights, models_lags = out
    # get skill scores; benchmark is the climatological mean of the target
    clim_mean_temp = float(target_ts.mean())
    RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
    MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
    score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]
    # NOTE(review): ``blocksize`` is a module-level name here — confirm it
    # is defined before this function is called.
    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(prediction,
                                                                      df_data.iloc[:,-2:],
                                                                      score_func_list,
                                                                      n_boot = n_boot,
                                                                      blocksize=blocksize,
                                                                      rng_seed=1)
    # build a compact label for the row index from the predictor keys:
    # 'AR'-prefixed autoregressive keys + remaining precursor names
    index = np.unique(core_pp.flatten([k.split('_') for k in keys]))
    AR = [l for l in index if '..' not in l]
    AR = [l for l in AR if 'PDO' not in l]
    index = [k for k in index if k not in AR]
    df_test_m.index = ['AR' + ''.join(AR) +'_'+'_'.join(index)]
    n_splits = df_data.index.levels[0].size # test for high alpha
    for col in df_test_m.columns.levels[0]:
        cvfitalpha = [models_lags[f'lag_{col}'][f'split_{s}'].alpha_ for s in range(n_splits)]
        print('lag {} mean alpha {:.0f}'.format(col, np.mean(cvfitalpha)))
        maxalpha_c = list(cvfitalpha).count(alphas[-1])
        if maxalpha_c > n_splits/3:
            # more than a third of the splits selected the strongest
            # regularization -> flag this lag as uninformative
            print(f'\nlag {col} alpha {int(np.mean(cvfitalpha))}')
            print(f'{maxalpha_c} splits are max alpha\n')
            # maximum regularization selected. No information in timeseries
            # df_test_m.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            # df_boot.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            no_info_fc.append(col)
    # concatenate the test-fold predictions into a single timeseries,
    # dropping the two trailing mask columns
    df_test = functions_pp.get_df_test(prediction.merge(df_data.iloc[:,-2:],
                                                        left_index=True,
                                                        right_index=True)).iloc[:,:-2]
    return prediction, df_test, df_test_m, df_boot, models_lags, weights, df_test_s_m, df_train_m
def get_df_forcing_cond_fc(rg_list, target_ts, fcmodel, kwrgs_model, mean_vars=['sst', 'smi']):
    """Attach a weighted-mean 'boundary forcing' timeseries to each rg.

    For each rg in rg_list, selects Pacific SST precursor region(s),
    weights their mean timeseries by the (normalized) prediction
    coefficients and stores the result as ``rg.df_forcing``.

    NOTE(review): depends on module-level names not visible here:
    ``find_precursors``, ``pd``, ``np``, ``get_df_mean_SST``,
    ``functions_pp`` — confirm against the full file.
    NOTE(review): mutable default argument ``mean_vars=['sst', 'smi']`` —
    safe only because it is never mutated in this body.
    """
    for j, rg in enumerate(rg_list):
        PacAtl = []
        # find west-sub-tropical Atlantic region: label whose centroid is
        # within 10 deg of (29N, 290E)
        df_labels = find_precursors.labels_to_df(rg.list_for_MI[0].prec_labels)
        dlat = df_labels['latitude'] - 29
        dlon = df_labels['longitude'] - 290
        zz = pd.concat([dlat.abs(), dlon.abs()], axis=1)
        Atlan = zz.query('latitude < 10 & longitude < 10')
        if Atlan.size > 0:
            PacAtl.append(int(Atlan.index[0]))
        PacAtl.append(int(df_labels['n_gridcells'].idxmax())) # Pacific SST
        # NOTE(review): the next line discards the Atlantic selection above
        # on purpose (only-Pacific variant); the dead code is kept as found.
        PacAtl = [int(df_labels['n_gridcells'].idxmax())] # only Pacific

        weights_norm = rg.prediction_tuple[1] # .mean(axis=0, level=1)
        # weights_norm = weights_norm.sort_values(ascending=False, by=0)
        # keep only keys whose region label (int after '..') is in PacAtl
        keys = [k for k in weights_norm.index.levels[1]
                if int(k.split('..')[1]) in PacAtl]
        keys = [k for k in keys if 'sst' in k] # only SST
        labels = ['..'.join(k.split('..')[1:]) for k in keys] + [
            '0..smi_sp'
        ] # add smi just because it almost always in there
        df_mean, keys_dict = get_df_mean_SST(rg, mean_vars=mean_vars,
                                             n_strongest='all',
                                             weights=True,
                                             fcmodel=fcmodel,
                                             kwrgs_model=kwrgs_model,
                                             target_ts=target_ts,
                                             labels=labels)
        # apply weighted mean based on coefficients of precursor regions
        weights_norm = weights_norm.loc[pd.IndexSlice[:, keys], :]
        # weights_norm = weights_norm.div(weights_norm.max(axis=0))
        # normalize per fold (level 0) by the per-fold maximum weight
        weights_norm = weights_norm.div(weights_norm.max(axis=0, level=0),
                                        level=0)
        weights_norm = weights_norm.reset_index().pivot(index='level_0',
                                                        columns='level_1')[0]
        weights_norm.index.name = 'fold'
        df_mean.index.name = ('fold', 'time')
        PacAtl_ts = weights_norm.multiply(df_mean[keys], axis=1, level=0)
        # collapse to a single test timeseries across folds
        PacAtl_ts = functions_pp.get_df_test(PacAtl_ts.mean(axis=1),
                                             df_splits=rg.df_splits)
        rg.df_forcing = PacAtl_ts
def prediction_wrapper(q):
    """Fit a LogisticRegressionCV forecast of exceeding the q-th quantile.

    NOTE(review): relies on module-level names not visible in this chunk:
    ``rg``, ``ScikitModel``, ``LogisticRegressionCV``, ``functions_pp``,
    ``fc_utils``, ``metrics`` — TODO confirm these are imported/defined.

    Parameters
    ----------
    q : float in (0, 1); the target becomes 1 where the raw target exceeds
        its q-th quantile, else 0.

    Returns
    -------
    df_train_m, df_test_m, df_boot, df_test, models_lags, SS
    """
    fcmodel = ScikitModel(scikitmodel=LogisticRegressionCV).fit
    kwrgs_model = {
        'class_weight': {0: 1, 1: 1},
        'scoring': 'neg_brier_score',
        'penalty': 'l2',
        'solver': 'lbfgs'
    }
    lag = 4
    keys = ['0..PEPsv'] #rg.df_data.columns[2:-2]
    # NOTE(review): the next line overwrites the key list above on purpose
    # (only SST precursors); the dead assignment is kept as found.
    keys = [k for k in rg.df_data.columns[2:-2] if 'sst' in k]
    target_ts = rg.TV_ts # - rg.TV_ts.mean()) / rg.TV_ts.std()
    # target_ts = rg.df_data_ext.loc[0][['mx2t']][rg.df_data.loc[0]['RV_mask']]
    target_ts = target_ts.to_dataframe('target')[['target']]
    target_ts.index.name = None
    # binarize: 1 where the target exceeds its q-th quantile
    target_ts = (target_ts > target_ts.quantile(q=q)).astype(int)
    out = rg.fit_df_data_ridge(target=target_ts,
                               fcmodel=fcmodel,
                               keys=keys,
                               tau_min=0,
                               tau_max=lag,
                               kwrgs_model=kwrgs_model)
    prediction, weights, models_lags = out
    # concatenate test-fold predictions, dropping trailing mask columns
    df_test = functions_pp.get_df_test(
        prediction.merge(rg.df_data.iloc[:, -2:].copy(),
                         left_index=True,
                         right_index=True)).iloc[:, :-2]
    # get skill scores; benchmark is the climatological event frequency
    clim_mean_temp = float(target_ts.mean())
    SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp)
    BSS = SS.BSS
    score_func_list = [
        metrics.roc_auc_score, BSS,
        fc_utils.ErrorSkillScore().AUC_SS
    ]
    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
        prediction,
        rg.df_data.iloc[:, -2:],
        score_func_list,
        score_per_test=False,
        n_boot=0,
        blocksize=2,
        rng_seed=1)
    return df_train_m, df_test_m, df_boot, df_test, models_lags, SS
# Script cells: visually compare original csv / USDA yield data against the
# RGCPD target timeseries, then reconstruct detrended predictions.
# NOTE(review): depends on module-level names not visible here (``ax``,
# ``df_orig_midwest``, ``df_orig_all``, ``df_USDA_midwest``, ``rg_always``,
# ``plt``, ``functions_pp``, ``core_pp``) — confirm against the full file.
ax = df_orig_midwest.plot(ax=ax, c='red',
                          title='Red is orig csv mid-west spatial data mean')
rg_always.df_fullts.plot(ax=ax, c='blue')
f, ax = plt.subplots()
ax = df_orig_all.plot(ax=ax, c='red',
                      title='Red is orig csv all spatial data mean')
rg_always.df_fullts.plot(ax=ax, c='blue')
f, ax = plt.subplots()
ax = df_USDA_midwest[['obs_yield']].plot(ax=ax, c='red',
                                         title='Red is USDA obs Beguería et al. 2020')
rg_always.df_fullts.plot(ax=ax, c='blue')
df_orig_midwest.plot(ax=ax)
#%%
# Load the hindcast predictions and add the linear trend back onto the
# (detrended) prediction to reconstruct absolute values.
filepath_RGCPD_hindcast = '/Users/semvijverberg/surfdrive/output_paper3/USDA_Soy_csv_midwest_bimonthly_random_10_s1_1950_2019/predictions_s1_continuous.h5'
df_preds = functions_pp.load_hdf5(filepath_RGCPD_hindcast)['df_predictions']
df_preds = functions_pp.get_df_test(df_preds) ; df_preds.index.name='time'
xr_obs = df_preds[['raw_target']].to_xarray().to_array().squeeze()
trend = xr_obs - core_pp.detrend_lin_longterm(xr_obs)
recon = df_preds.iloc[:,[0]] + trend.values[None,:].T #.values[1:][None,:].T + float(rg_always.df_fullts.mean())
ax = recon.plot()
df_preds[['raw_target']].plot(ax=ax)
#%%
# Same reconstruction for the prediction column labelled 0, compared with
# the USDA August forecast.
pred = df_preds[[0]] + trend.values[None,:].T
ax = pred.plot()
df_USDA_midwest[['frcst_aug_yield']].plot(ax=ax)
df_preds[['raw_target']].plot(ax=ax)
#%%
f, ax = plt.subplots()
# NOTE(review): this chunk continues a call begun above it (presumably the
# ``rgPDO = RG(...)`` constructor) — the opening parenthesis is not visible
# here, so the keyword arguments below complete that call.
                       list_import_ts=[('PDO', z_filepath)],
                       start_end_TVdate=('05-01', '08-01'),
                       start_end_date=None,
                       # start year shifted forward by the low-pass window
                       start_end_year=(1979+int(round(lowpass+0.49)), 2020),
                       tfreq=2,
                       path_outmain=path_out_main,
                       append_pathsub='_' + exper)
# standard RGCPD preprocessing pipeline
rgPDO.pp_TV(name_ds, anomaly=True, kwrgs_core_pp_time={'dailytomonths':True})
rgPDO.pp_precursors()
rgPDO.traintest('random_10')
rgPDO.get_ts_prec()

# Predicting PDO at lag 1 vs start_end_data of RW
PDO1, df_lagmask1 = get_lagged_ts(rgPDO.df_data.copy() , 0, keys_ext)
target = functions_pp.get_df_test(PDO1,
                                  df_splits=rgPDO.df_data[['TrainIsTrue']].loc[PDO1.index])

PDO2, df_lagmask2 = get_lagged_ts(rgPDO.df_data.copy() , 2, keys_ext)
# PDO3, df_lagmask3 = get_lagged_ts(rgPDO.df_data.copy() , 3, keys_ext)
# PDO4, df_lagmask4 = get_lagged_ts(rgPDO.df_data.copy() , 4, keys_ext)
# PDO5, df_lagmask5 = get_lagged_ts(rgPDO.df_data.copy() , 5, keys_ext)
df_prec = PDO2 # AR1 model to predict PDO at lag 1 vs RW
# df_prec = df_prec.merge(PDO3, left_index=True, right_index=True)
# df_prec = df_prec.merge(PDO4, left_index=True, right_index=True)
# df_prec = df_prec.merge(PDO5, left_index=True, right_index=True)
out = rgPDO.fit_df_data_ridge(target=target,
                              df_data = df_prec,
                              tau_min=0, tau_max=0,
                              kwrgs_model={'alphas':np.array([.01,.1,1,5,10])})
predict = out[0].rename({0:'AR1'}, axis=1)
# predictions temp using PDO df_precPDOs = merge_lagged_wrapper(rg.df_data.copy(), [1, 2], ['PDO0.5rm']) dates = core_pp.get_subdates(rg.dates_TV, start_end_year=(1980, 2020)) df_precPDOs = df_precPDOs.loc[pd.IndexSlice[:, dates], :] df_precPDOs = df_precPDOs.merge(rg.df_splits.loc[pd.IndexSlice[:, dates], :], left_index=True, right_index=True) outPDOtemp = prediction_wrapper(df_precPDOs, lags=np.array([0]), target_ts=rg.TV.RV_ts.loc[dates], keys=None, match_lag=False, n_boot=n_boot) # predictions PDO using PDO target_PDO = functions_pp.get_df_test(rg.df_data.copy()[['PDO', 'TrainIsTrue' ]])[['PDO']] df_precPDOs = merge_lagged_wrapper(rg.df_data.copy(), [1], ['PDO']) dates = core_pp.get_subdates(rg.dates_TV, start_end_year=(1980, 2020)) df_precPDOs = df_precPDOs.loc[pd.IndexSlice[:, dates], :] df_precPDOs = df_precPDOs.merge(rg.df_splits.loc[pd.IndexSlice[:, dates], :], left_index=True, right_index=True) outPDO = prediction_wrapper(df_precPDOs, lags=np.array([0]), target_ts=target_PDO.loc[dates], keys=None, match_lag=False, n_boot=n_boot) # Conditional forecast df_forcings = merge_lagged_wrapper(rg.df_data, [1], df_PDOs.columns)
# NOTE(review): this chunk is the interior of an ``if``/loop begun above it
# (not visible); the trailing ``else:`` pairs with that invisible ``if``
# (presumably "precursor timeseries found"). Indentation below is a
# best-effort reconstruction — TODO confirm against the full file.
    # check whether CV consistently selected the maximum regularization
    n_splits = rg.df_data.index.levels[0].size
    cvfitalpha = [
        models_lags[f'lag_{lag}'][f'split_{s}'].alpha_
        for s in range(n_splits)
    ]
    print('mean alpha {:.2f}'.format(np.mean(cvfitalpha)))
    maxalpha_c = list(cvfitalpha).count(alphas[-1])
    if maxalpha_c > n_splits / 3:
        # more than a third of splits chose max alpha -> flag month
        print(f'\n{month} alpha {int(np.mean(cvfitalpha))}')
        print(f'{maxalpha_c} splits are max alpha\n')
        # maximum regularization selected. No information in timeseries
        # df_test_m['Prediction']['corrcoef'][:] = 0
        # df_boot['Prediction']['corrcoef'][:] = 0
        no_info_fc.append(month)
    # concatenate the test-fold predictions into one timeseries
    df_test = functions_pp.get_df_test(
        prediction.merge(rg.df_data.iloc[:, -2:],
                         left_index=True,
                         right_index=True)).iloc[:, :2]
else:
    # no predictors: fill scores with zeros so downstream tables line up
    print('no precursor timeseries found, scores all 0')
    df_boot = pd.DataFrame(data=np.zeros((n_boot, len(score_func_list))),
                           columns=['RMSE', 'corrcoef', 'MAE'])
    df_test_m = pd.DataFrame(np.zeros((1, len(score_func_list))),
                             columns=['RMSE', 'corrcoef', 'MAE'])
list_test_b.append(df_boot)
list_test.append(df_test_m)
append_dict(month, df_test_m)
# df_ana.loop_df(df=rg.df_data[keys], colwrap=1, sharex=False,
#                function=df_ana.plot_timeseries,
#                kwrgs={'timesteps':rg.fullts.size,
def get_scores(prediction,
               df_splits: pd.DataFrame = None,
               score_func_list: list = None,
               score_per_test=True,
               n_boot: int = 1,
               blocksize: int = 1,
               rng_seed=1):
    '''
    Compute train / per-split-test / full-test / bootstrap scores.

    Parameters
    ----------
    prediction : pd.DataFrame
        First column is the observed target; remaining columns are
        predictions (e.g. one per lag). May carry a (split, time)
        MultiIndex; a dummy split level is added if absent.
    df_splits : pd.DataFrame, optional
        Boolean 'TrainIsTrue' and 'RV_mask' columns defining the splits.
        If None, every timestep is treated as test data. The default is None.
    score_func_list : list, optional
        Score functions f(y_true, y_pred) -> float. The default is
        [metrics.mean_squared_error, corrcoef].
    score_per_test : bool, optional
        If True, also score each test split separately. The default is True.
    n_boot : int, optional
        Number of block-bootstrap resamples on the test data. The default is 1.
    blocksize : int, optional
        Block length for the bootstrap. The default is 1.
    rng_seed : TYPE, optional
        Seed for the bootstrap RNG. The default is 1.

    Returns
    -------
    pd.DataFrames format:
        index [opt. splits]
        Multi-index columns [lag, metric name]
    df_trains, df_test_s, df_tests, df_boots.
    '''
    #%%
    if df_splits is None:
        # assuming all is test data: TrainIsTrue all False, RV_mask all True
        TrainIsTrue = np.zeros((prediction.index.size, 1))
        RV_mask = np.ones((prediction.index.size, 1))
        df_splits = pd.DataFrame(np.concatenate([TrainIsTrue, RV_mask],
                                                axis=1),
                                 index=prediction.index,
                                 dtype=bool,
                                 columns=['TrainIsTrue', 'RV_mask'])

    # add empty multi-index to maintain same data format
    if not hasattr(df_splits.index, 'levels'):
        df_splits = pd.concat([df_splits], keys=[0])
    if not hasattr(prediction.index, 'levels'):
        prediction = pd.concat([prediction], keys=[0])

    pred = prediction.merge(df_splits, left_index=True, right_index=True)

    # score on train and per test split
    if score_func_list is None:
        score_func_list = [metrics.mean_squared_error, corrcoef]
    score_names = [f.__name__ for f in score_func_list]

    splits = pred.index.levels[0]
    columns = prediction.columns[1:]  # first column is the observed target
    df_trains = np.zeros((columns.size), dtype=object)
    df_tests_s = np.zeros((columns.size), dtype=object)
    for c, col in enumerate(columns):
        df_train = pd.DataFrame(np.zeros((splits.size, len(score_func_list))),
                                columns=score_names)
        df_test_s = pd.DataFrame(np.zeros((splits.size,
                                           len(score_func_list))),
                                 columns=score_names)
        for s in splits:
            sp = pred.loc[s]
            trainRV = np.logical_and(sp['TrainIsTrue'], sp['RV_mask'])
            testRV = np.logical_and(~sp['TrainIsTrue'], sp['RV_mask'])
            for f in score_func_list:
                name = f.__name__
                if trainRV.any():  # training data exists
                    train_score = f(sp[trainRV].iloc[:, 0],
                                    sp[trainRV].loc[:, col])
                else:
                    train_score = np.nan
                if score_per_test and testRV.any():
                    test_score = f(sp[testRV].iloc[:, 0],
                                   sp[testRV].loc[:, col])
                else:
                    test_score = np.nan
                df_train.loc[s, name] = train_score
                df_test_s.loc[s, name] = test_score
        df_trains[c] = df_train
        df_tests_s[c] = df_test_s
    df_trains = pd.concat(df_trains, keys=columns, axis=1)
    df_tests_s = pd.concat(df_tests_s, keys=columns, axis=1)

    # score on complete test (all test folds concatenated)
    df_tests = np.zeros((columns.size), dtype=object)
    pred_test = functions_pp.get_df_test(pred).iloc[:, :-2]
    if pred_test.size != 0:  # ensure test data is available
        for c, col in enumerate(columns):
            df_test = pd.DataFrame(np.zeros((1, len(score_func_list))),
                                   columns=score_names)
            y_true = pred_test.iloc[:, 0]
            y_pred = pred_test.loc[:, col]
            for f in score_func_list:
                df_test[f.__name__] = f(y_true, y_pred)
            df_tests[c] = df_test
        df_tests = pd.concat(df_tests, keys=columns, axis=1)

    # Bootstrapping with replacement
    df_boots = np.zeros((columns.size), dtype=object)
    if pred_test.size != 0:  # ensure test data is available
        # BUGFIX: was range(0, len(y_true), 1), silently relying on y_true
        # leaking out of the loop above; use pred_test directly (same
        # length). Chunks are loop-invariant, so build them once here.
        old_index = range(0, len(pred_test), 1)
        n_bl = blocksize
        chunks = [
            old_index[n_bl * i:n_bl * (i + 1)]
            for i in range(int(len(old_index) / n_bl))
        ]
        for c, col in enumerate(columns):
            score_list = _bootstrap(pred_test.iloc[:, [0, c + 1]], n_boot,
                                    chunks, score_func_list,
                                    rng_seed=rng_seed)
            df_boot = pd.DataFrame(score_list, columns=score_names)
            df_boots[c] = df_boot
        df_boots = pd.concat(df_boots, keys=columns, axis=1)
    out = (df_trains, df_tests_s, df_tests, df_boots)
    #%%
    return out
#%% fc_months_periodnames = { 'August': 'JJ', 'July': 'MJ', 'June': 'AM', 'May': 'MA', 'April': 'FM', 'March': 'JF', 'December': 'SO', 'February': 'DJ' } filepath_df_output = os.path.join( path_input_main, f'df_output_{fc_months_periodnames[fc_month]}.h5') df_output = functions_pp.load_hdf5(filepath_df_output) df_data = df_output['df_data'] df_splits = df_data.iloc[:, -2:] out = utils_paper3.load_scores(['Target'], model_name, model_name, 2000, filepath_df_datas, condition='strong 50%') df_scores, df_boots, df_preds = out df_test_m = [d[fc_month] for d in df_scores] df_boots_list = [d[fc_month] for d in df_boots] df_test = df_preds[0][['Target', fc_month]] df_test = functions_pp.get_df_test(df_test, df_splits=df_splits)
def parallel(cluster, month, agg_level, n_lags, kwrgs_MI, fold_method, row_arrays, column_array, subfolder):
    """Run the full define/process/forecast pipeline for one cluster+month.

    Builds a skill-score DataFrame, shifts prediction dates to the target
    month, renames the prediction column and saves intermediate csv output.

    NOTE(review): depends on module-level names not visible in this chunk:
    ``get_list_of_name_path``, ``define``, ``process``, ``forecast``,
    ``row_idx_2_arr``, ``row_idx_3_arr``, ``all_targetperiods_dict``,
    ``functions_pp``, ``main_dir``, ``pd``, ``np``, ``os`` — TODO confirm.

    Returns
    -------
    df_ss_result, test_df_pred, rg
    """
    #%%
    print(f'Starting cluster {cluster}, prediciting {month}')
    #get list_of_name_path
    list_of_name_path = get_list_of_name_path(agg_level, cluster)
    #run define
    rg, list_for_MI, lags, crossyr = define(list_of_name_path, month, n_lags,
                                            kwrgs_MI, subfolder)
    #run check (possible, not necessary)
    #check(rg, list_of_name_path, cluster)
    #run processing
    rg = process(rg, lags, fold_method, crossyr)
    #run forecast
    test_scores, train_scores, prediction = forecast(rg, crossyr)
    #store skill score results in df_ss_result dataframe
    df_ss_result = pd.DataFrame(np.zeros(
        (len(row_arrays[0]), len(column_array)), dtype=float),
                                index=row_arrays,
                                columns=column_array)
    for count, i in enumerate(
            row_idx_2_arr[:6]
    ):  #always loop over test test test train train train per cluster
        if count < 3:
            # first three rows hold test scores
            df_ss_result.loc[
                (cluster, i, row_idx_3_arr[count]),
                all_targetperiods_dict[month]] = test_scores[count]
        else:
            # last three rows hold train scores
            df_ss_result.loc[(cluster, i, row_idx_3_arr[count]),
                             all_targetperiods_dict[month]] = train_scores[
                                 count - 3]
    #get test df actual and predictions
    test_df_pred = functions_pp.get_df_test(prediction,
                                            df_splits=pd.DataFrame(
                                                rg.df_data.iloc[:, -2:]))
    #update dates: shift index forward to the target month
    delta = int(month[0][:2]) - 1
    date_list = test_df_pred.index.get_level_values(0).shift(delta, freq='MS')
    test_df_pred.set_index([date_list], inplace=True)
    #change column header of prediction to RV#ts_pred
    new_columns = test_df_pred.columns.values
    new_columns[1] = new_columns[0] + '_pred'
    test_df_pred.columns = new_columns
    #save intermediate cluster csv
    results_path = os.path.join(
        main_dir, 'Results', 'skillscores',
        f'{agg_level}_{fold_method}')  #path of results
    os.makedirs(results_path, exist_ok=True)  # make folder if it doesn't exist
    df_ss_result.to_csv(
        os.path.join(
            results_path,
            str(cluster) + '_' + str(all_targetperiods_dict[month]) +
            '_ss_scores_' + agg_level +
            '.csv'))  #intermediate save skillscores per cluster to csv
    #%%
    return df_ss_result, test_df_pred, rg
def cond_forecast_table(rg_list, score_func_list, n_boot=0):
    """Tabulate forecast skill conditional on boundary-forcing strength.

    For each metric and each rg (forecast month), scores the test
    predictions separately for strongly anomalous forcing (outside the
    q / 1-q quantiles of ``rg.df_forcing``) and weak forcing (inside the
    .5-q / .5+q quantiles), for q in {.15, .25}.

    NOTE(review): depends on module-level names not visible here:
    ``np``, ``pd``, ``functions_pp``, ``fc_utils`` — TODO confirm.
    NOTE(review): the local name ``metrics`` shadows any module-level
    ``metrics`` import inside this function.

    Returns
    -------
    df_cond_fc : pd.DataFrame indexed by (metric, fc_month), with one
        column per condition (times n_boot when bootstrapping).
    """
    df_test_m = rg_list[0].verification_tuple[2]
    quantiles = [.15, .25]
    metrics = df_test_m.columns.levels[1]
    if n_boot > 0:
        cond_df = np.zeros(
            (metrics.size, len(rg_list), len(quantiles) * 2, n_boot))
    else:
        cond_df = np.zeros((metrics.size, len(rg_list), len(quantiles) * 2))
    for i, met in enumerate(metrics):
        for j, rg in enumerate(rg_list):
            PacAtl_ts = rg.df_forcing
            prediction = rg.prediction_tuple[0]
            df_test = functions_pp.get_df_test(prediction,
                                               df_splits=rg.df_splits)
            # df_test_m = rg.verification_tuple[2]
            # cond_df[i, j, 0] = df_test_m[df_test_m.columns[0][0]].loc[0][met]
            # l = 0, 2 indexes the (strong, weak) column pair per quantile
            for k, l in enumerate(range(0, 4, 2)):
                q = quantiles[k]
                low = PacAtl_ts < PacAtl_ts.quantile(q)
                high = PacAtl_ts > PacAtl_ts.quantile(1 - q)
                mask_anomalous = np.logical_or(low, high)
                # anomalous Boundary forcing
                condfc = df_test[mask_anomalous.values]
                # condfc = condfc.rename({'causal':periodnames[i]}, axis=1)
                cond_verif_tuple = fc_utils.get_scores(
                    condfc,
                    score_func_list=score_func_list,
                    n_boot=n_boot,
                    score_per_test=False,
                    blocksize=1,
                    rng_seed=1)
                df_train_m, df_test_s_m, df_test_m, df_boot = cond_verif_tuple
                rg.cond_verif_tuple = cond_verif_tuple
                if n_boot == 0:
                    cond_df[i, j, l] = df_test_m[
                        df_test_m.columns[0][0]].loc[0][met]
                else:
                    cond_df[i, j, l, :] = df_boot[df_boot.columns[0][0]][met]
                # mild boundary forcing
                higher_low = PacAtl_ts > PacAtl_ts.quantile(.5 - q)
                lower_high = PacAtl_ts < PacAtl_ts.quantile(.5 + q)
                mask_anomalous = np.logical_and(higher_low, lower_high)
                # changed 11-5-21
                condfc = df_test[mask_anomalous.values]
                # condfc = condfc.rename({'causal':periodnames[i]}, axis=1)
                cond_verif_tuple = fc_utils.get_scores(
                    condfc,
                    score_func_list=score_func_list,
                    n_boot=n_boot,
                    score_per_test=False,
                    blocksize=1,
                    rng_seed=1)
                df_train_m, df_test_s_m, df_test_m, df_boot = cond_verif_tuple
                if n_boot == 0:
                    cond_df[i, j, l + 1] = df_test_m[
                        df_test_m.columns[0][0]].loc[0][met]
                else:
                    cond_df[i, j, l + 1, :] = df_boot[
                        df_boot.columns[0][0]][met]

    columns = [[f'strong {int(q*200)}%', f'weak {int(q*200)}%']
               for q in quantiles]
    columns = functions_pp.flatten(columns)
    if n_boot > 0:
        columns = pd.MultiIndex.from_product([columns, list(range(n_boot))])

    df_cond_fc = pd.DataFrame(cond_df.reshape(
        (len(metrics) * len(rg_list), -1)),
                              index=pd.MultiIndex.from_product([
                                  list(metrics),
                                  [rg.fc_month for rg in rg_list]
                              ]),
                              columns=columns)
    return df_cond_fc
# NOTE(review): this chunk continues a call begun above it (presumably
# ``... = fc_utils.get_scores(``) — the opening is not visible here, so the
# arguments below complete that call. The chunk also ends mid-call
# (``ax0.plot_date(...)`` continues past this view).
    prediction, rg.df_data.iloc[:, -2:], score_func_list,
    n_boot=n_boot,
    blocksize=1,
    rng_seed=seed)
m = models_lags[f'lag_{lag_}'][f'split_{0}']
# collect the CV-selected regularization strengths across splits and warn
# when the alpha grid was saturated at either end
cvfitalpha = [
    models_lags[f'lag_{lag_}'][f'split_{s}'].alpha_ for s in range(n_spl)
]
if kwrgs_model['alphas'].max() in cvfitalpha:
    print('Max a reached')
if kwrgs_model['alphas'].min() in cvfitalpha:
    print('Min a reached')
# assert kwrgs_model['alphas'].min() not in cvfitalpha, 'decrease min a'
df_test = functions_pp.get_df_test(predict.rename({lag_: 'causal'}, axis=1),
                                   df_splits=rg.df_splits)

print(df_test_m)
#%%
# Plot the test timeseries against the target.
from matplotlib import gridspec
from matplotlib.offsetbox import TextArea, VPacker, AnnotationBbox

fontsize = 16

fig = plt.figure(figsize=(12, 5))
gs = gridspec.GridSpec(1, 1, height_ratios=None)
facecolor = 'white'
ax0 = plt.subplot(gs[0], facecolor=facecolor)
# df_test.plot(ax=ax0)
ax0.plot_date(df_test.index, df_test[target_dataset],