def store_ts(df_data, df_sum, dict_ds, outdic_actors, ex, add_spatcov=True):
    """Store the data and summary DataFrames in a date-stamped HDF5 file.

    The target path ``<ex['fig_subpath']>/fulldata_<ex['params']>_<YYYY-MM-DD>.h5``
    is always written to ``ex['path_data']`` (``ex`` is mutated in place),
    even when nothing ends up being stored.

    Parameters
    ----------
    df_data : DataFrame whose first index level is the split number
        (it is sliced with ``df_data.loc[s]`` for every split).
    df_sum : summary DataFrame stored next to ``df_data``.
    dict_ds, outdic_actors : forwarded unchanged to ``rgcpd.get_spatcovs``.
    ex : dict; reads the keys 'params', 'fig_subpath', 'store_format' and,
        when ``add_spatcov`` is true, 'n_spl'. Gets the key 'path_data' set.
    add_spatcov : when true, the per-split output of ``rgcpd.get_spatcovs``
        is merged into ``df_data`` (on the index) and passed to
        ``rgcpd.add_sp_info`` together with ``df_sum``.

    Returns
    -------
    None
    """
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    file_name = 'fulldata_{}_{}'.format(ex['params'], today)
    ex['path_data'] = os.path.join(ex['fig_subpath'], file_name + '.h5')

    df_data_to_store, df_sum_to_store = df_data, df_sum
    if add_spatcov:
        # One get_spatcovs result per split, stacked under the split number.
        spatcov_per_split = [
            rgcpd.get_spatcovs(dict_ds, df_data.loc[split], split,
                               outdic_actors, normalize=True)
            for split in range(ex['n_spl'])
        ]
        df_sp = pd.concat(spatcov_per_split, keys=range(ex['n_spl']))
        df_data_to_store = pd.merge(df_data, df_sp,
                                    left_index=True, right_index=True)
        df_sum_to_store = rgcpd.add_sp_info(df_sum, df_sp)

    # Only the hdf5 format is handled; any other value stores nothing.
    if ex['store_format'] != 'hdf5':
        return

    functions_pp.store_hdf_df(
        {'df_data': df_data_to_store, 'df_sum': df_sum_to_store},
        ex['path_data'])
    # NOTE(review): the source formatting was collapsed; this message is
    # assumed to belong to the hdf5 branch - confirm against the original.
    print('Data stored in \n{}'.format(ex['path_data']))
    return
def store_ts(df_data, df_sum, dict_ds, filename):  # outdic_precur, add_spatcov=True
    """Store ``df_data`` and ``df_sum`` in the HDF5 file ``filename``.

    Both frames are handed to ``functions_pp.store_hdf_df`` under the keys
    'df_data' and 'df_sum', after which the target path is printed.

    Parameters
    ----------
    df_data, df_sum : DataFrames to store, written unchanged.
    dict_ds : accepted for signature compatibility; not used here.
    filename : path passed to ``functions_pp.store_hdf_df``.

    Returns
    -------
    None
    """
    # Imported at call time, as in the original.
    import functions_pp

    functions_pp.store_hdf_df({'df_data': df_data, 'df_sum': df_sum}, filename)
    print('Data stored in \n{}'.format(filename))
    return
#%% Get timeseries at specific points within gridcell ds_t2m = core_pp.import_ds_lazy(var_filename, selbox=selbox) npts = np.zeros((np_array_xy.shape[0], ds_t2m.time.size)) for i, xy in enumerate(np_array_xy): npts[i] = ds_t2m.sel(longitude=(180 + (180 + xy[0])), latitude=xy[1]) columns = [f'{abs(c[0])}W-{c[1]}N' for c in np_array_xy] df_ts = pd.DataFrame(npts.T, index=pd.to_datetime(ds_t2m.time.values), columns=columns) TVpath = os.path.join( user_dir, 'surfdrive/Scripts/RGCPD/publications/NPJ_2021/data/', 'df_ts_paper2_clustercorr_{}.h5'.format(xrclustered.attrs['hash'])) functions_pp.store_hdf_df({'df_ts': df_ts}, file_path=TVpath) #%% Calculate corr maps list_xr = [] for point in df_ts.columns: list_of_name_path = [ ('', TVpath), ('t2m', root_data + '/input_raw/t2m_US_1979-2020_1_12_daily_0.25deg.nc') ] list_for_MI = [ BivariateMI(name='t2m', func=class_BivariateMI.corr_map, alpha=.05, FDR_control=True, lags=np.array([0]))
index=dates, name=f'PDO{yr}bw') ax.plot_date(dates, df_PDObw, label=f'Butterworth {yr}-year low-pass', color='red',linestyle=ls[i], linewidth=1, marker=None) df_PDOrm = df_PDOsplit.rolling(window=window, closed='right', min_periods=window).mean() df_PDOrm = df_PDOrm.rename({'PDO':f'PDO{yr}rm'}, axis=1) ax.plot_date(dates, df_PDOrm, label=f'Rolling mean {yr}-year low-pass (closed right)', color='green',linestyle=ls[i], linewidth=1, marker=None) list_dfPDO.append(df_PDObw) ; list_dfPDO.append(df_PDOrm) ax.legend() filepath = os.path.join(path_out_main, 'Low-pass_filter.pdf') plt.savefig(filepath, bbox_inches='tight') df_PDOs = pd.concat(list_dfPDO,axis=1) functions_pp.store_hdf_df({'df_data':df_PDOs}, file_path=filepath_df_PDOs) #%% Get ENSO 3.4 index if 'parcorrENSO' == exper: try: df_ENSOs = functions_pp.load_hdf5(filepath_df_ENSO)['df_data'] except: SST_pp_filepath = user_dir + '/surfdrive/ERA5/input_raw/preprocessed/sst_1979-2020_jan_dec_monthly_1.0deg.nc' if 'df_PDOsplit' not in globals(): df_ENSO, ENSO_years, ENSO_cycle = climate_indices.ENSO_34(SST_pp_filepath) df_ENSO = (df_ENSO - df_ENSO.mean()) / df_ENSO.std() # Butter Lowpass dates = df_ENSO.index
# Script fragment: add the PDO and ENSO indices plus the target variable to
# the lagged data, store the result as HDF5, then plot the correlation maps
# for the selected lags.

# Index-aligned merge of each climate index onto the lagged data.
df_PDO, PDO_patterns = climate_indices.PDO(filepath, df_splits)
df_data_lag = df_PDO.merge(df_data_lag, left_index=True, right_index=True)
print('calculating ENSO')
df_ENSO_34 = climate_indices.ENSO_34(filepath, df_splits)
df_data_lag = df_ENSO_34.merge(df_data_lag, left_index=True, right_index=True)
df_data_lag = add_RV(df_data_lag, RV)

# Output file: <datafolder>_<today>_lag_<lag>_<hash>.h5 in ex['path_data_out'].
dict_of_dfs = {'df_data': df_data_lag}
fname = '{}_{}_lag_{}_{}.h5'.format(ex['datafolder'], today, lag, ex['hash'])
file_path = os.path.join(ex['path_data_out'], fname)
functions_pp.store_hdf_df(dict_of_dfs, file_path)

#actor.ts_corr[ex['RV_name']] = pd.Series(RV.RVfullts.values, index=actor.ts_corr[0].index)

# Map projection and plot settings.
central_lon_plots = 200
map_proj = ccrs.LambertCylindrical(central_longitude=central_lon_plots)
kwrgs_corr = {'clim': (-0.5, 0.5), 'hspace': -0.6}

pdfs_folder = os.path.join(ex['path_fig'], 'pdfs')
# exist_ok=True replaces the former `isdir(...) != True` pre-check: same
# outcome, without the window between the check and the creation.
os.makedirs(pdfs_folder, exist_ok=True)

f_format = '.png'
#lags_plot = [0, 20, 50]
lags_to_plot = lags_i
# Boolean mask of the cells with a positive label, for the lags being plotted.
contour_mask = (CPPA_prec['prec_labels'] > 0).sel(
    lag=lags_to_plot).astype(bool)
plot_maps.plot_corr_maps(CPPA_prec.sel(lag=lags_to_plot), contour_mask,
                         map_proj, **kwrgs_corr)
def loop_analysis(agg_level, n_lags, kwrgs_MI, fold_method, n_jobs,
                  distinct_cl=None, distinct_targetperiods=None):
    """Run define -> process -> forecast for every (cluster, month) pair.

    Each pair is dispatched as one joblib job (loky backend). A job writes
    its own skill-score csv; afterwards all skill scores and all test
    predictions are combined and stored in ``df_skill_predictions.h5``.
    All output goes to ``<main_dir>/Results/skillscores/<agg_level>_<fold_method>``.

    Parameters
    ----------
    agg_level : str
        Used in output folder and file names and forwarded to
        ``get_list_of_name_path``. When ``distinct_cl`` is None it also
        selects the number of clusters ('high': 20, 'medium': 42,
        'low': 135) and clusters 1..n are analysed.
    n_lags, kwrgs_MI :
        Forwarded unchanged to ``define``.
    fold_method :
        Forwarded to ``process``; also part of the output folder name.
    n_jobs : int
        Number of workers given to ``joblib.Parallel``.
    distinct_cl : sequence, optional
        Cluster labels to analyse instead of the full range.
    distinct_targetperiods : sequence of (start, end) tuples, optional
        Target periods to predict instead of all twelve months. Every entry
        must be one of the calendar-month tuples listed in
        ``all_targetperiods`` below.

    Returns
    -------
    df_ss_result_all : DataFrame
        Rows: MultiIndex (cluster, 'test'/'train', score name); columns:
        month names of the requested target periods.
    df_prediction_result : DataFrame
        Test-set targets and predictions; rows come from the first cluster's
        months, later clusters are added as extra columns.
    rg :
        The object returned by ``process`` for the last job.

    Raises
    ------
    ValueError
        If there are no clusters or no target periods to analyse.
    KeyError
        If ``agg_level`` (with ``distinct_cl`` None) or a target period is
        not one of the known values.
    """
    #%%
    # distinct_cl = cluster_numbers; distinct_targetperiods = TV_targetperiod
    # retrieve number of clusters with aggregation level
    if distinct_cl is None:
        ncl_dict = {'high': 20, 'medium': 42, 'low': 135}
        ncl = ncl_dict['{}'.format(agg_level)]
        cl_list = list(range(1, ncl + 1))
    else:
        cl_list = distinct_cl
    subfolder = f'{agg_level}_{fold_method}'

    # target periods, all or given
    all_targetperiods = [('01-01', '01-31'), ('02-01', '02-28'),
                         ('03-01', '03-31'), ('04-01', '04-30'),
                         ('05-01', '05-31'), ('06-01', '06-30'),
                         ('07-01', '07-31'), ('08-01', '08-31'),
                         ('09-01', '09-30'), ('10-01', '10-31'),
                         ('11-01', '11-30'), ('12-01', '12-31')]
    if distinct_targetperiods is None:
        targetperiods = all_targetperiods
    else:
        targetperiods = distinct_targetperiods

    # Fail early instead of building mismatched index arrays or indexing
    # into an empty result list after the (expensive) parallel run.
    if len(cl_list) == 0 or len(targetperiods) == 0:
        raise ValueError(
            'loop_analysis needs at least one cluster and one target period')

    # dictionary of periods -> month names
    all_targetperiods_names_list = [
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December'
    ]
    all_targetperiods_dict = dict(
        zip(all_targetperiods, all_targetperiods_names_list))

    # Row MultiIndex: per cluster 2 subsets x 3 scores = 6 rows.
    subsets = ['test', 'train']
    score_names = ['RMSE_SS', 'MAE_SS', 'corrcoef']
    n_clusters = len(cl_list)
    rows_per_cluster = len(subsets) * len(score_names)
    # 1 1 1 1 1 1 2 2 2 2 2 2 ...
    row_idx_1_arr = np.repeat(np.asarray(cl_list), rows_per_cluster)
    # test test test train train train test test test train ...
    row_idx_2_arr = np.tile(np.repeat(subsets, len(score_names)), n_clusters)
    # RMSE MAE corrcoef RMSE MAE corrcoef RMSE MAE ...
    row_idx_3_arr = np.tile(score_names, len(subsets) * n_clusters)
    # cluster, test/train, scores as row multi index
    row_arrays = [row_idx_1_arr, row_idx_2_arr, row_idx_3_arr]
    # month names as column index
    column_array = [all_targetperiods_dict[x] for x in targetperiods]

    # zero-initialised skill-score frame that the per-job frames are added to
    df_ss_result_all = pd.DataFrame(
        np.zeros((len(row_arrays[0]), len(column_array)), dtype=float),
        index=row_arrays, columns=column_array)

    # parallel function
    def parallel(cluster, month, agg_level, n_lags, kwrgs_MI, fold_method,
                 row_arrays, column_array, subfolder):
        """Analyse one (cluster, month) pair; return scores, predictions, rg."""
        #%%
        print(f'Starting cluster {cluster}, prediciting {month}')
        month_name = all_targetperiods_dict[month]
        # get list_of_name_path
        list_of_name_path = get_list_of_name_path(agg_level, cluster)
        # run define
        rg, list_for_MI, lags, crossyr = define(list_of_name_path, month,
                                                n_lags, kwrgs_MI, subfolder)
        # run check (possible, not necessary)
        # check(rg, list_of_name_path, cluster)
        # run processing
        rg = process(rg, lags, fold_method, crossyr)
        # run forecast
        test_scores, train_scores, prediction = forecast(rg, crossyr)

        # Full-size zero frame with only this job's six cells filled, so the
        # caller can combine all jobs by plain addition.
        df_ss_result = pd.DataFrame(
            np.zeros((len(row_arrays[0]), len(column_array)), dtype=float),
            index=row_arrays, columns=column_array)
        for subset, scores in (('test', test_scores),
                               ('train', train_scores)):
            for pos, score_name in enumerate(score_names):
                df_ss_result.loc[(cluster, subset, score_name),
                                 month_name] = scores[pos]

        # get test df actual and predictions
        test_df_pred = functions_pp.get_df_test(
            prediction, df_splits=pd.DataFrame(rg.df_data.iloc[:, -2:]))

        # Shift the dates by (month number - 1) month starts so that every
        # target month gets its own dates.
        delta = int(month[0][:2]) - 1
        date_list = test_df_pred.index.get_level_values(0).shift(delta,
                                                                 freq='MS')
        test_df_pred.set_index([date_list], inplace=True)

        # Rename the prediction column to '<target column>_pred'. Work on a
        # list copy: writing into Index.values mutates an immutable Index.
        new_columns = list(test_df_pred.columns)
        new_columns[1] = new_columns[0] + '_pred'
        test_df_pred.columns = new_columns

        # intermediate save of the skill scores per cluster and month to csv
        results_path = os.path.join(main_dir, 'Results', 'skillscores',
                                    f'{agg_level}_{fold_method}')
        os.makedirs(results_path, exist_ok=True)
        df_ss_result.to_csv(
            os.path.join(
                results_path,
                str(cluster) + '_' + str(month_name) + '_ss_scores_' +
                agg_level + '.csv'))
        #%%
        return df_ss_result, test_df_pred, rg

    # Jobs are generated cluster-major / month-minor; the prediction
    # assembly below relies on results coming back in that order.
    with joblib.parallel_backend('loky'):
        results = joblib.Parallel(n_jobs=n_jobs)(
            joblib.delayed(parallel)(cluster, month, agg_level, n_lags,
                                     kwrgs_MI, fold_method, row_arrays,
                                     column_array, subfolder)
            for cluster in cl_list for month in targetperiods)

    # combine all results in one dataframe - skill scores
    for result in results:
        df_ss_result_all = df_ss_result_all + result[0].values

    # combine all results in one dataframe - predictions
    n_periods = len(targetperiods)
    # First cluster: stack its months as rows. pd.concat replaces
    # DataFrame.append, which no longer exists in pandas >= 2.0.
    df_prediction_result = pd.concat(
        [result[1] for result in results[:n_periods]])
    for result_counter, result in enumerate(results[n_periods:],
                                            start=n_periods):
        if result_counter % n_periods == 0:
            # first month of a new cluster: add its columns
            df_prediction_result = df_prediction_result.join(result[1],
                                                             how='left')
        else:
            # remaining months of that cluster: fill those columns in place
            df_prediction_result.update(result[1], join='left')

    # return one rg
    rg = results[-1][-1]

    results_path = os.path.join(main_dir, 'Results', 'skillscores', subfolder)
    # The workers normally created this folder already; make sure it exists.
    os.makedirs(results_path, exist_ok=True)
    functions_pp.store_hdf_df(
        {
            'df_ss_result': df_ss_result_all,
            'df_prediction_result': df_prediction_result
        },
        file_path=os.path.join(results_path, 'df_skill_predictions.h5'))
    # return df_ss_result dataframe and prediction
    #%%
    return df_ss_result_all, df_prediction_result, rg
plt.savefig(filepath + '.png', dpi=200, bbox_inches='tight') df_ana.plot_ts_matric(df_data, win=30, columns=cols, period='RV_mask', plot_sign_stars=False, fontsizescaler=-8) filepath = os.path.join( rg.path_outsub1, '30d_z500_' + '-'.join(map(str, z500_green_bb)) + rg.hash) plt.savefig(filepath + '.png', dpi=200, bbox_inches='tight') filepath = os.path.join( path_out_main, 'z500_' + '-'.join(map(str, z500_green_bb)) + rg.hash) functions_pp.store_hdf_df({'df_data': df_data}, filepath + '.h5') #%% SST vs T list_of_name_path = [(cluster_label, TVpath), ('sst', os.path.join(path_raw, 'sst_1979-2020_1_12_daily_1.0deg.nc'))] lags = np.array([0, 2]) list_for_MI = [ BivariateMI(name='sst', func=class_BivariateMI.corr_map, alpha=.05, FDR_control=True, lags=lags,
# Script fragment: keep the verification results on `rg`, then store and plot
# the continuous-forecast predictions and scores.
# NOTE(review): the first statements presumably sit inside a loop whose header
# is not visible here - confirm the indentation against the full script.

# Unpack the four verification frames.
df_train_m, df_test_s_m, df_test_m, df_boot = verification_tuple
# Model of the first split ('split_0') for the current lag.
m = models_lags[f'lag_{lag_}'][f'split_{0}']
# plt.plot(kwrgs_model['alpha'], m.cv_results_['mean_test_score'])
# plt.axvline(m.best_params_['alpha']) ; plt.show() ; plt.close()
list_verification.append(verification_tuple)
# Attached to rg so it can be looked up by name
# (name_object='verification_tuple') further down.
rg.verification_tuple = verification_tuple
#%% Plotting Continuous forecast
# Store the predictions of all rg objects in one HDF5 file.
df_preds_save = utils_paper3.df_predictions_for_plot(rg_list)
d_dfs = {'df_predictions': df_preds_save}
filepath_dfs = os.path.join(rg.path_outsub1, f'predictions_s{seed}_continuous.h5')
functions_pp.store_hdf_df(d_dfs, filepath_dfs)
# Store the scores; note that this rebinds df_boot from the unpacking above.
df_scores, df_boot, df_tests = utils_paper3.df_scores_for_plot(rg_list, name_object='verification_tuple')
d_dfs = {'df_scores': df_scores, 'df_boot': df_boot, 'df_tests': df_tests}
filepath_dfs = os.path.join(rg.path_outsub1, f'scores_s{seed}_continuous.h5')
functions_pp.store_hdf_df(d_dfs, filepath_dfs)
# Read the file just written back in; d_dfs now holds the loaded content.
d_dfs = functions_pp.load_hdf5(filepath_dfs)
# Plot the scores and, if requested, save the figure next to the data.
f = utils_paper3.plot_scores_wrapper(df_scores, df_boot)
f_name = f'{method}_{seed}_cf_PacAtl'
fig_path = os.path.join(rg.path_outsub1, f_name) + rg.figext
if save: f.savefig(fig_path, bbox_inches='tight')