def CPPA_precursor_regions(path_data, keys_options=('CPPA',)):
    """Select precursor column names per train-test split for each option.

    The object returned by ``functions_pp.load_hdf5(path_data)`` is indexed
    with ``'df_data'``; the first level of that frame's index enumerates the
    splits. For every split the first column and the bookkeeping columns
    'TrainIsTrue' and 'RV_mask' are never selected.

    Parameters
    ----------
    path_data :
        Passed unchanged to ``functions_pp.load_hdf5``.
    keys_options : iterable of str (or a single str)
        Any of 'robust', 'CPPA', 'PEP'.

    Returns
    -------
    dict
        ``{option: {split: np.ndarray}}``; each array is the
        ``np.unique`` (sorted, de-duplicated) selection of column names.

    Raises
    ------
    ValueError
        If an option is not one of the supported names. Previously such an
        option silently reused the selection of the preceding option, or
        raised a NameError when it came first.
    """
    #%%
    if isinstance(keys_options, str):
        # a bare string would otherwise be iterated character by character
        keys_options = [keys_options]

    df_data = functions_pp.load_hdf5(path_data)['df_data']
    splits = df_data.index.levels[0]
    skip = ('TrainIsTrue', 'RV_mask')

    keys_d = {}
    for option in keys_options:
        keys_d_ = {}
        for s in splits:
            # the first column is always excluded (columns[1:])
            all_keys = [k for k in df_data.loc[s].columns[1:]
                        if k not in skip]

            if option == 'robust':
                not_robust = ('0_101_PEPspatcov', 'PDO', 'ENSO_34')
                # NOTE(review): '0_100_CPPAspatcov' can never equal the
                # middle token of an underscore-split name, so only the
                # labels '2', '7', '9' take effect here - confirm intent.
                robust = ['0_100_CPPAspatcov', '2', '7', '9']
                # three-part names must carry a robust label in the middle;
                # every other name is kept as is
                keys_ = [k for k in all_keys
                         if k not in not_robust
                         and (len(k.split('_')) != 3
                              or k.split('_')[1] in robust)]
            elif option == 'CPPA':
                not_robust = ('0_101_PEPspatcov', '0_104_PDO', '0_103_ENSO34',
                              'ENSO_34', 'PDO', '0_900_ENSO34', '0_901_PDO')
                keys_ = [k for k in all_keys if k not in not_robust]
            elif option == 'PEP':
                keys_ = [k for k in all_keys
                         if k.split('_')[-1] == 'PEPspatcov']
            else:
                raise ValueError(
                    f"Unknown option {option!r}; choose from "
                    "'robust', 'CPPA', 'PEP'")

            keys_d_[s] = np.unique(keys_)
        keys_d[option] = keys_d_
    #%%
    return keys_d
# Compare the target timeseries held by rg_always (blue) against three
# reference series (red), one figure per reference.
f, ax = plt.subplots()
ax = df_orig_midwest.plot(ax=ax, c='red',
                          title='Red is orig csv mid-west spatial data mean')
rg_always.df_fullts.plot(ax=ax, c='blue')

f, ax = plt.subplots()
ax = df_orig_all.plot(ax=ax, c='red',
                      title='Red is orig csv all spatial data mean')
rg_always.df_fullts.plot(ax=ax, c='blue')

f, ax = plt.subplots()
# title previously contained a mis-encoded author name ("BeguerÃa")
ax = df_USDA_midwest[['obs_yield']].plot(ax=ax, c='red',
                                         title='Red is USDA obs Beguería et al. 2020')
rg_always.df_fullts.plot(ax=ax, c='blue')
df_orig_midwest.plot(ax=ax)

#%%
# Load the stored hindcast and add the removed long-term linear trend back.
filepath_RGCPD_hindcast = '/Users/semvijverberg/surfdrive/output_paper3/USDA_Soy_csv_midwest_bimonthly_random_10_s1_1950_2019/predictions_s1_continuous.h5'
df_preds = functions_pp.load_hdf5(filepath_RGCPD_hindcast)['df_predictions']
df_preds = functions_pp.get_df_test(df_preds)
df_preds.index.name = 'time'
xr_obs = df_preds[['raw_target']].to_xarray().to_array().squeeze()
# trend = what core_pp.detrend_lin_longterm removed from the raw target
trend = xr_obs - core_pp.detrend_lin_longterm(xr_obs)
# trend.values[None,:].T turns the trend into a single column so it can be
# added to the first (positional) column of df_preds
recon = df_preds.iloc[:,[0]] + trend.values[None,:].T
#.values[1:][None,:].T + float(rg_always.df_fullts.mean())
ax = recon.plot()
df_preds[['raw_target']].plot(ax=ax)
#%%
# NOTE(review): this selects the column *labelled* 0, whereas `recon` above
# takes the first column by position - confirm both are meant to differ.
pred = df_preds[[0]] + trend.values[None,:].T
ax = pred.plot()
df_USDA_midwest[['frcst_aug_yield']].plot(ax=ax)
df_preds[['raw_target']].plot(ax=ax)
#%%
def import_precur_ts(list_import_ts: List[tuple],
                     df_splits: pd.DataFrame,
                     start_end_date: Tuple[str, str],
                     start_end_year: Tuple[int, int],
                     start_end_TVdate: Tuple[str, str],
                     cols: list = None,
                     precur_aggr: int = 1):
    '''
    Import external precursor timeseries and align them with df_splits.

    list_import_ts has format List[tuples],
    [(name, path_data)]

    For every (name, path_data) the 'df_data' entry of
    functions_pp.load_hdf5(path_data) is loaded, subset to the dates given
    by core_pp.get_subdates(index, start_end_date, start_end_year) and, per
    split, aggregated with functions_pp.time_mean_bins when the spacing (in
    days) between its first two timestamps differs from precur_aggr.

    Parameters
    ----------
    list_import_ts : list of (name, path_data) tuples; only path_data is used.
    df_splits : frame whose first index level enumerates the splits.
    start_end_date, start_end_year : forwarded to core_pp.get_subdates and
        functions_pp.time_mean_bins.
    start_end_TVdate : forwarded to functions_pp.time_mean_bins.
    cols : column name(s) to import. Applies to the FIRST imported file
        only; cols is reset after every file, so later files import all of
        their non-boolean columns.
    precur_aggr : requested aggregation in days.

    Returns
    -------
    pd.DataFrame with the splits as outer index level; columns of
    successive files are merged on the index.

    Raises
    ------
    ValueError : if list_import_ts is empty.
    AssertionError : if the imported data carries its own 'TrainIsTrue'
        column and its test years differ from those of df_splits.
    '''
    #%%
    splits = df_splits.index.levels[0]
    orig_traintest = functions_pp.get_testyrs(df_splits)

    df_data_ext = None
    for _name, path_data in list_import_ts:

        df_data_e_all = functions_pp.load_hdf5(path_data)['df_data']
        if isinstance(df_data_e_all, pd.Series):
            df_data_e_all = pd.DataFrame(df_data_e_all)

        df_data_e_all = df_data_e_all.iloc[:, :]  # not sure why needed
        if cols is None:
            # default: every non-boolean column (drops the mask columns)
            cols = list(
                df_data_e_all.columns[(df_data_e_all.dtypes != bool).values])
        elif isinstance(cols, str):
            cols = [cols]

        if hasattr(df_data_e_all.index, 'levels'):
            # multi-index: take the dates from split 0 and select them
            # within every split
            dates_subset = core_pp.get_subdates(df_data_e_all.loc[0].index,
                                                start_end_date,
                                                start_end_year)
            df_data_e_all = df_data_e_all.loc[pd.IndexSlice[:, dates_subset], :]
        else:
            dates_subset = core_pp.get_subdates(df_data_e_all.index,
                                                start_end_date,
                                                start_end_year)
            df_data_e_all = df_data_e_all.loc[dates_subset]

        has_traintest = 'TrainIsTrue' in df_data_e_all.columns
        if has_traintest:
            _c = [k for k in df_splits.columns
                  if k in ['TrainIsTrue', 'RV_mask']]
            # check if traintest split is correct
            ext_traintest = functions_pp.get_testyrs(df_data_e_all[_c])
            _check_traintest = all(
                np.equal(core_pp.flatten(ext_traintest),
                         core_pp.flatten(orig_traintest)))
            if not _check_traintest:
                # raised explicitly (instead of `assert`) so the check is
                # not stripped when running with -O
                raise AssertionError('Train test years of df_splits are not the '
                                     'same as imported timeseries')

        df_data_ext_s = []
        for s in range(splits.size):
            df_data_e = df_data_e_all.loc[s] if has_traintest else df_data_e_all
            df_s = df_data_e[cols]
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days

            if precur_aggr != tfreq_date_e:
                try:
                    df_s = functions_pp.time_mean_bins(
                        df_s, precur_aggr, start_end_date, start_end_year,
                        start_end_TVdate=start_end_TVdate)[0]
                except KeyError as e:
                    # best effort: keep the un-aggregated data for this split
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in external pandas timeseries.\n{}'.format(
                              str(e)))
            df_data_ext_s.append(df_s)
        print(f'loaded in exterinal timeseres: {cols}')

        df_add = pd.concat(df_data_ext_s, keys=range(splits.size))
        if df_data_ext is None:
            df_data_ext = df_add
        else:
            df_data_ext = df_data_ext.merge(df_add,
                                            left_index=True,
                                            right_index=True)
        # next file selects its own (non-boolean) columns
        cols = None

    if df_data_ext is None:
        raise ValueError('list_import_ts is empty, no timeseries to import')
    #%%
    return df_data_ext
path_out_main = os.path.join(main_dir, f'publications/NPJ_2021/output/{west_east}_parcorrmaps') if os.path.isdir(path_out_main) != True: os.makedirs(path_out_main) cluster_label = '' # 'z500' # name_or_cluster_label = '' # name_ds = west_east + 'RW' # f'0..0..{name_or_cluster_label}_sp' name_or_cluster_label = 'z500' name_ds = f'0..0..{name_or_cluster_label}_sp' start_end_date = ('1-1', start_end_TVdate[-1]) filepath_df_PDOs = os.path.join(path_data, 'df_PDOs_monthly.h5') filepath_df_ENSO = os.path.join(path_data, 'df_ENSOs_monthly.h5') #%% Get PDO and apply low-pass filter if 'parcorr' == exper: try: df_PDOs = functions_pp.load_hdf5(filepath_df_PDOs)['df_data'] except: SST_pp_filepath = user_dir + '/surfdrive/ERA5/input_raw/preprocessed/sst_1979-2020_jan_dec_monthly_1.0deg.nc' if 'df_ENSO' not in globals(): df_PDO, PDO_patterns = climate_indices.PDO(SST_pp_filepath, None) PDO_plot_kwrgs = {'units':'[-]', 'cbar_vert':-.1, # 'zoomregion':(130,260,20,60), 'map_proj':ccrs.PlateCarree(central_longitude=220), 'y_ticks':np.array([25,40,50,60]), 'x_ticks':np.arange(130, 280, 25), 'clevels':np.arange(-.6,.61,.075), 'clabels':np.arange(-.6,.61,.3), 'subtitles':np.array([['PDO loading pattern']])}
# Collect the conditional-forecast metrics of every (method, seed) pair.
# np.array(...) over mixed inputs yields strings, so `method` and `s` are
# both str below (the index labels further down rely on that).
combinations = np.array(np.meshgrid(methods, seeds)).T.reshape(-1, 2)
metrics = ['corrcoef', 'MAE', 'RMSE']
np_out = np.zeros((len(metrics), combinations.shape[0], 4))
for i, (method, s) in enumerate(combinations):
    path = os.path.join(path_input, f'{method}')
    hash_str = f'cond_fc_{method}_s{s}.h5'
    f_name = None
    for root, dirs, files in os.walk(path):
        for file in files:
            # literal substring test: the previous re.findall() treated the
            # '.' (and any metacharacter in `method`) as regex syntax
            if hash_str in file:
                print(f'Found file {file}')
                f_name = file
    if f_name is not None:
        # NOTE(review): the file is searched in all sub-folders of `path`
        # but always loaded from path/s{s} - confirm that is intended.
        d_dfs = functions_pp.load_hdf5(os.path.join(path, f's{s}',
                                                    f_name))['df_cond_fc']
        for j, metric in enumerate(metrics):
            np_out[j][i] = d_dfs.loc[metric].loc[month]
    # combinations without a file keep their zero-initialised rows
#%%
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)
save = False

# rows: (metric, 'method_seed'); columns are taken from the last file loaded
# above (d_dfs is undefined if no file was found at all)
df_cond_fc = pd.DataFrame(np_out.reshape((-1, np_out.shape[-1])),
                          index=pd.MultiIndex.from_product([
                              metrics,
                              [c[0] + '_' + c[1] for c in combinations]
                          ]),
                          columns=d_dfs.columns)

plot_cols = ['strong 50%', 'weak 50%']
f'ETC {int(ETC/60)} min \t Progress {int(100*(time()-t00)/ETC)}% ') # In[8]: working_folder, filename = list_of_fc[0]._print_sett(list_of_fc=list_of_fc) store = False if __name__ == "__main__": filename = list_of_fc[0].filename store = True import valid_plots as dfplots import functions_pp dict_all = dfplots.merge_valid_info(list_of_fc, store=store) if store: dict_merge_all = functions_pp.load_hdf5(filename + '.h5') lag_rel = 50 kwrgs = { 'wspace': 0.16, 'hspace': .25, 'col_wrap': 2, 'skip_redundant_title': True, 'lags_relcurve': [lag_rel], 'fontbase': 14, 'figaspect': 2 } #kwrgs = {'wspace':0.25, 'col_wrap':3, 'threshold_bin':fc.threshold_pred} met = ['AUC-ROC', 'AUC-PR', 'Precision', 'BSS', 'Accuracy', 'Rel. Curve'] line_dim = 'exper' group_line_by = None
#%% fc_months_periodnames = { 'August': 'JJ', 'July': 'MJ', 'June': 'AM', 'May': 'MA', 'April': 'FM', 'March': 'JF', 'December': 'SO', 'February': 'DJ' } filepath_df_output = os.path.join( path_input_main, f'df_output_{fc_months_periodnames[fc_month]}.h5') df_output = functions_pp.load_hdf5(filepath_df_output) df_data = df_output['df_data'] df_splits = df_data.iloc[:, -2:] out = utils_paper3.load_scores(['Target'], model_name, model_name, 2000, filepath_df_datas, condition='strong 50%') df_scores, df_boots, df_preds = out df_test_m = [d[fc_month] for d in df_scores] df_boots_list = [d[fc_month] for d in df_boots] df_test = df_preds[0][['Target', fc_month]] df_test = functions_pp.get_df_test(df_test, df_splits=df_splits)
threshold_pred=(1.5, 'times_clim')) # In[8]: working_folder, pathexper = list_of_fc[0]._print_sett(list_of_fc=list_of_fc) store = False if __name__ == "__main__": pathexper = list_of_fc[0].pathexper store = True import valid_plots as dfplots import functions_pp dict_all = dfplots.merge_valid_info(list_of_fc, store=store) if store: dict_merge_all = functions_pp.load_hdf5(pathexper + '/data.h5') kwrgs = { 'wspace': 0.15, 'col_wrap': None, 'skip_redundant_title': True, 'lags_relcurve': [10, 20] } #kwrgs = {'wspace':0.25, 'col_wrap':3, 'threshold_bin':fc.threshold_pred} met = ['AUC-ROC', 'AUC-PR', 'BSS', 'Rel. Curve'] #met = ['AUC-ROC', 'AUC-PR', 'BSS', 'Rel. Curve'] line_dim = None group_line_by = 'dataset' # line_dim = 'exper' ; group_line_by = None fig = dfplots.valid_figures(dict_merge_all,
# (1) random_{int} : with the int(ex['method'][6:8]) determining the amount of folds # (2) ranstrat_{int}: random stratified folds, stratified based upon events, # requires kwrgs_events. # (3) leave_{int} : chronologically split train and test years. # (4) split_{int} : (should be updated) split dataset into single train and test set # (5) no_train_test_split or False 'random_#' col_wrap = 3 #3 months next to each other in figure n_jobs = 3 #--------------------------------------------------------------------------------------------------------------------# #LOOP #--------------------------------------------------------------------------------------------------------------------# results_path = os.path.join(main_dir, 'Results', 'skillscores', f'{agg_level}_{fold_method}') #path of results path_df_ss_result = os.path.join(results_path, 'df_skill_predictions.h5') if os.path.isfile(path_df_ss_result): df_data = functions_pp.load_hdf5(path_df_ss_result) df_ss_result = df_data['df_ss_result'] df_prediction_result = df_data['df_prediction_result'] else: df_ss_result, df_prediction_result, rg = loop_analysis( agg_level, n_lags, kwrgs_MI, fold_method, n_jobs=n_jobs, distinct_cl=cluster_numbers, distinct_targetperiods=TV_targetperiod) print(df_ss_result, '\n', df_prediction_result) #--------------------------------------------------------------------------------------------------------------------# #PLOT #--------------------------------------------------------------------------------------------------------------------#
def _select_precursor_keys(option, all_keys, skip):
    """Return the entries of `all_keys` that belong to `option`.

    Raises ValueError for an option that has no selection rule.
    """

    def drop(keys, *substrings):
        # keep the keys that contain none of the given substrings
        return [k for k in keys if not any(sub in k for sub in substrings)]

    if option in ('all', 'sp_and_regs'):
        keys_ = [k for k in all_keys if k not in skip]
    elif 'only_db_regs' in option:
        # Regions only: no spatial-covariance timeseries
        keys_ = [k for k in drop(all_keys, 'spatcov') if k not in skip]
    elif option == 'CPPA':
        skip_ex = ['0..103..PEPsv', 'sm123_spatcov', 'all_spatcov']
        keys_ = drop(all_keys, 'v200hpa', 'sm', 'ENSO', 'PDO', 'PEPsv', 'OLR')
        keys_ = [k for k in keys_ if k not in skip_ex]
    elif option == 'sst combined':
        keys_ = drop(all_keys, 'sm')
    elif option == 'sst combined+sm':
        keys_ = all_keys
    elif option in ('sst(CPPA Pattern)', 'CPPA Pattern'):
        keys_ = [k for k in all_keys if 'CPPAsv' in k]
    elif option == 'sst+sm+z500':
        keys_ = [k for k in all_keys
                 if any(tag in k for tag in ('..sst', '..sm', '..z500'))]
    elif option == 'CPPA+sm':
        keys_ = drop(all_keys, 'PDO', 'ENSO', 'PEP', 'OLR')
        keys_ = [k for k in keys_ if k not in skip]
    elif option == 'CPPA+PEP+sm':
        keys_ = drop(all_keys, 'PDO', 'ENSO')
    elif option == 'CPPApr+PEP+sm':
        keys_ = drop(all_keys, 'PDO', 'ENSO', 'CPPAsv')
    elif option == 'CPPA+sm+OLR':
        keys_ = drop(all_keys, 'PDO', 'ENSO', 'PEP')
        keys_ = [k for k in keys_ if k not in skip]
    elif option == 'CPPAregs+sm':
        keys_ = drop(all_keys, 'v200hpa', 'PDO', 'ENSO', 'PEP', 'CPPAsv')
    elif option == 'CPPApattern+sm':
        keys_ = drop(all_keys, 'v200hpa', 'PDO', 'ENSO', 'PEP', 'OLR')
        keys_ = [k for k in keys_ if ('spatcov' in k or 'sm' in k)]
    elif option == 'sm':
        keys_ = [k for k in all_keys if 'sm' in k and 'spatcov' not in k]
    elif option == 'sst(PEP)+sm':
        keys_ = [k for k in all_keys
                 if ('sm' in k or 'PEP' in k) and k != 'sm123_spatcov']
    elif option == 'PEP':
        keys_ = [k for k in all_keys if 'PEP' in k]
    elif option == 'PEP+sm':
        # union of PEP and sm keys; the former two chained filters required
        # BOTH substrings in one key, which selected nothing
        keys_ = [k for k in all_keys if 'PEP' in k or 'sm' in k]
    elif option == 'sst(PDO,ENSO)+sm':
        keys_ = [k for k in all_keys
                 if ('sm' in k or 'PDO' in k or 'ENSO' in k)
                 and 'spatcov' not in k]
    elif option == 'PDO+ENSO':
        keys_ = [k for k in all_keys
                 if ('PDO' in k or 'ENSO' in k) and 'spatcov' not in k]
    elif option == 'sst(CPPA) expert knowledge':
        expert = ['CPPAsv', '..9..sst', '..2..sst', '..6..sst',
                  '..1..sst', '..7..sst']
        keys_ = drop(all_keys, 'sm', 'PDO', 'ENSO', 'PEP')
        keys_ = [k for k in keys_ if any(e in k for e in expert)]
    elif option == ' ':
        # explicit "no precursors" option
        keys_ = []
    else:
        raise ValueError(f'Unknown keys option {option!r}')
    return keys_


def normal_precursor_regions(path_data, keys_options=('all',), causal=False):
    '''
    Select precursor names per train-test split for each requested option.

    Supported keys_options:
        'all', 'sp_and_regs', any name containing 'only_db_regs',
        'CPPA', 'PEP', 'sm', 'sst combined', 'sst combined+sm',
        'sst(CPPA Pattern)', 'CPPA Pattern', 'sst+sm+z500',
        'CPPA+sm', 'CPPA+PEP+sm', 'CPPApr+PEP+sm', 'CPPA+sm+OLR',
        'CPPAregs+sm', 'CPPApattern+sm', 'sst(PEP)+sm', 'PEP+sm',
        'sst(PDO,ENSO)+sm', 'PDO+ENSO', 'sst(CPPA) expert knowledge',
        ' ' (empty selection).

    Parameters
    ----------
    path_data :
        Passed to functions_pp.load_hdf5; the result must hold 'df_data'
        (first index level = splits) and, for causal selections, 'df_sum'.
    keys_options : iterable of str (or a single str)
    causal : bool
        If True, or if the option name contains 'causal', candidates are
        the rows of df_sum flagged in its 'causal' column. Otherwise
        candidates are the float32/float64 columns of df_data, without the
        first column and without names ending in 'caus'.

    Returns
    -------
    dict : {option: {split: np.ndarray of unique, sorted names}}

    Raises
    ------
    ValueError
        For an unknown option (previously the selection of the preceding
        option was silently reused), or when a causal selection is
        requested but the data holds no 'df_sum'.
    '''
    #%%
    if isinstance(keys_options, str):
        # a bare string would otherwise be iterated character by character
        keys_options = [keys_options]

    dict_of_dfs = functions_pp.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    try:
        df_sum = dict_of_dfs['df_sum']
    except KeyError:
        # only needed for causal selections; checked at the point of use
        df_sum = None

    skip = ['all_spatcov']

    keys_d = {}
    for option in keys_options:
        keys_d_ = {}
        for s in splits:
            if causal or 'causal' in option:
                # causal
                if df_sum is None:
                    raise ValueError(
                        "causal precursors requested, but 'df_sum' is not "
                        f'stored in {path_data}')
                all_keys = df_sum[df_sum['causal']].loc[s].index
            else:
                # correlated
                df_s = df_data.loc[s]
                all_keys = df_s.columns.delete(0)
                # extract only float columns
                mask_f = np.logical_or(df_s.dtypes == 'float64',
                                       df_s.dtypes == 'float32')
                all_keys = all_keys[mask_f[1:].values]
                # remove spatcov_causals
                # NOTE(review): assumed to apply to the correlated branch
                # only - confirm against the original indentation.
                all_keys = [k for k in all_keys if not k.endswith('caus')]

            keys_ = _select_precursor_keys(option, all_keys, skip)
            keys_d_[s] = np.unique(keys_)
        keys_d[option] = keys_d_
    #%%
    return keys_d
def compare_use_spatcov(path_data, causal=True):
    """Build four alternative precursor selections per train-test split.

    Candidate names are read from the 'df_sum' entry of
    ``functions_pp.load_hdf5(path_data)``: the rows flagged in its 'causal'
    column when `causal` is true, otherwise all rows of the split except
    the first. The splits are the first index level of 'df_data'.

    Returns
    -------
    dict
        ``{selection_name: {split: np.ndarray of names}}`` with, in order:

        'Only_all_spatcov'          : names containing 'spatcov_caus' that
                                      start with 'all'
        'Regions_all_spatcov'       : names without 'spatcov_caus', plus
                                      those starting with 'all'
        'only_sp_caus_no_all_sp'    : names containing 'spatcov' but not
                                      'all'
        'Regions_sp_caus_no_all_sp' : names not containing 'all'
    """
    #%%
    dict_of_dfs = functions_pp.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    df_sum = dict_of_dfs['df_sum']

    # selection name -> predicate deciding whether a key is kept
    selections = {
        'Only_all_spatcov':
            lambda k: ('spatcov_caus' in k) and k[:3] == 'all',
        'Regions_all_spatcov':
            lambda k: ('spatcov_caus' not in k) or k[:3] == 'all',
        'only_sp_caus_no_all_sp':
            lambda k: ('spatcov' in k) and ('all' not in k),
        'Regions_sp_caus_no_all_sp':
            lambda k: 'all' not in k,
    }

    keys_d = {name: {} for name in selections}
    for s in splits:
        # the candidate list is identical for all four selections, so it is
        # computed once per split
        if causal:
            # causal
            all_keys = df_sum[df_sum['causal']].loc[s].index
        else:
            # correlated
            all_keys = df_sum.loc[s].index.delete(0)

        for name, keep in selections.items():
            keys_d[name][s] = np.array([k for k in all_keys if keep(k)])
    #%%
    return keys_d
#%% Plotting Continuous forecast df_preds_save = utils_paper3.df_predictions_for_plot(rg_list) d_dfs = {'df_predictions': df_preds_save} filepath_dfs = os.path.join(rg.path_outsub1, f'predictions_s{seed}_continuous.h5') functions_pp.store_hdf_df(d_dfs, filepath_dfs) df_scores, df_boot, df_tests = utils_paper3.df_scores_for_plot( rg_list, name_object='verification_tuple') d_dfs = {'df_scores': df_scores, 'df_boot': df_boot, 'df_tests': df_tests} filepath_dfs = os.path.join(rg.path_outsub1, f'scores_s{seed}_continuous.h5') functions_pp.store_hdf_df(d_dfs, filepath_dfs) d_dfs = functions_pp.load_hdf5(filepath_dfs) f = utils_paper3.plot_scores_wrapper(df_scores, df_boot) f_name = f'{method}_{seed}_cf_PacAtl' fig_path = os.path.join(rg.path_outsub1, f_name) + rg.figext if save: f.savefig(fig_path, bbox_inches='tight') for rg in rg_list: # plotting score per test # plot timeseries predict = rg.prediction_tuple[0] df_test = functions_pp.get_df_test(predict.rename({lag_: 'causal'}, axis=1), df_splits=rg.df_splits) df_test_m = rg.verification_tuple[2] utils_paper3.plot_forecast_ts(df_test_m, df_test)