# Load the first REA2 file; column 0 becomes a parsed datetime index.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where strict format inference is already the default.
in_df_rea2 = pd.read_csv(
    pcp_Rea[0],
    sep=';',
    index_col=0,
    parse_dates=True,
)
# Negative values are replaced by zero.
in_df_rea2[in_df_rea2 < 0] = 0

# read data and get station ids and coords
# All-NaN frame sharing the index and columns of the REA2 data.
empty_df_rea2 = pd.DataFrame(
    index=in_df_rea2.index,
    columns=in_df_rea2.columns,
    data=np.full(shape=in_df_rea2.shape, fill_value=np.nan),
)

dwd_pcp = dwd_hdf5.get_pandas_dataframe_between_dates(
    dwd_ids, event_start=start_year, event_end=end_year)
dwd_pcp_hourly = resampleDf(dwd_pcp, 'H')

# for temp_agg in (aggs):
#     print(temp_agg)
#     dwd_pcp_res = resampleDf(dwd_pcp, temp_agg)
#     in_df_rea2_res = resampleDf(in_df_rea2, temp_agg)

# `_ii` stays bound next to `stn_id` because the rest of the loop body is not
# visible in this fragment and may still use the positional index.
for _ii, stn_id in enumerate(tqdm.tqdm(dwd_ids)):
    # print(_ii, '/', len(dwd_ids))

    # dwd pcp historical
    print(stn_id)
    dwd_pcp_hist = dwd_hdf5.get_pandas_dataframe(stn_id)
    dwd_pcp_hist_hourly = resampleDf(dwd_pcp_hist, 'H').dropna(how='any')
    # dwd_pcp_hist_hourly

    # Series of this station for the selected period, without missing steps.
    df_dwd1 = dwd_pcp_hourly.loc[:, stn_id].dropna(how='any')
# infer_datetime_format=True)
# Load the first REA2 file; column 0 becomes a parsed datetime index.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where strict format inference is already the default.
in_df_rea2_qq = pd.read_csv(
    pcp_Rea[0],
    sep=';',
    index_col=0,
    parse_dates=True,
)
in_df_rea2 = in_df_rea2_qq
# in_df_rea2.iloc[:,-1]
# in_df_rea2_qq.iloc[:,-1]

# read data and get station ids and coords
dwd_pcp = dwd_hdf5.get_pandas_dataframe_between_dates(
    dwd_ids, event_start=start_year, event_end=end_year)

# NOTE(review): the nesting below is reconstructed from the data dependencies
# of a whitespace-collapsed fragment -- confirm against the original script.
for temp_agg, percentile_level in zip(aggs, percentile_levels):
    print(temp_agg)

    # Bring observations and REA2 data to the same temporal aggregation.
    dwd_pcp_res = resampleDf(dwd_pcp, temp_agg)
    in_df_rea2_res = resampleDf(in_df_rea2, temp_agg)

    df_distance_corr = pd.DataFrame(index=dwd_ids)

    # `_ii` stays bound next to `stn_id` because the rest of the loop body is
    # not visible in this fragment and may still use the positional index.
    for _ii, stn_id in enumerate(tqdm.tqdm(dwd_ids)):
        # print(_ii, '/', len(dwd_ids))

        x_dwd_interpolate = dwd_coords_utm32.loc[stn_id, 'X']
        y_dwd_interpolate = dwd_coords_utm32.loc[stn_id, 'Y']

        # drop stns
        # Every other station with a non-empty id.
        all_dwd_stns_except_interp_loc = [
            stn for stn in dwd_ids
            if stn != stn_id and len(stn) > 0
        ]
for temp_agg in aggs:
    print(temp_agg)

    # One row per station, one column per entry of `x_ticks`.
    df_auto_corr_dwd = pd.DataFrame(index=dwd_ids_hannover, columns=x_ticks)
    df_auto_corr_rea = pd.DataFrame(index=dwd_ids_hannover, columns=x_ticks)

    for stn_id in tqdm.tqdm(dwd_ids_hannover):
        # print(_ii, '/', len(dwd_ids))

        # Keep only the time steps present in both data sets.
        dwd_pcp = dwd_hdf5_de.get_pandas_dataframe(stn_id).dropna()
        in_df_rea6_stn = in_df_rea6.loc[:, stn_id].dropna()
        cmn_idx = dwd_pcp.index.intersection(in_df_rea6_stn.index)
        # break

        df_dwd1 = resampleDf(dwd_pcp.loc[cmn_idx, :], temp_agg)
        df_rea1 = resampleDf(
            in_df_rea6_stn.loc[cmn_idx],
            temp_agg,
            closed='right',
            label='left',
        )

        # Yield normalized autocorrelation function of number lags
        autocorr_dwd = stattools.acf(df_dwd1, nlags=12, fft=True)
        autocorr_rea = stattools.acf(
            df_rea1.values.ravel(), nlags=12, fft=True)

        # Store the lag values in this station's row.
        df_auto_corr_dwd.loc[stn_id, :] = autocorr_dwd
        df_auto_corr_rea.loc[stn_id, :] = autocorr_rea

    # all stns
# Keep the files whose path contains the year as a substring.
_year_tag = str(_year)
pcp_Rea = [grib_path for grib_path in all_grib_files
           if _year_tag in grib_path]

in_df_rea2 = pd.read_csv(
    pcp_Rea[0],
    sep=';',
    index_col=0,
    # parse_dates=True,
    # infer_datetime_format=True,
    engine='c',
)
# The index is converted separately with an explicit timestamp format.
in_df_rea2.index = pd.to_datetime(
    in_df_rea2.index, format='%Y-%m-%d %H:%M:%S')

# read data and get station ids and coords
dwd_pcp = dwd_hdf5.get_pandas_dataframe_between_dates(
    dwd_ids,
    event_start=start_year,
    event_end=end_year,
)
dwd_pcp_hourly = resampleDf(dwd_pcp, 'H')

# for _ii in range(len(dwd_ids[:50])):
#     cmn_idx = dwd_pcp_hourly.iloc[:, _ii].dropna().index.intersection(
#         in_df_rea2.iloc[:, _ii].dropna().index)
#
#     plt.ioff()
#     plt.figure(figsize=(12, 8), dpi=100)
#     plt.plot(range(len(cmn_idx)),
#              np.cumsum(in_df_rea2.loc[
#                  cmn_idx, dwd_ids[_ii]].dropna().values), alpha=0.5,
#              c='b', label='REA2')
#
#     plt.plot(range(len(cmn_idx)),
#              np.cumsum(dwd_pcp_hourly.loc[
#                  cmn_idx, dwd_ids[_ii]].values), alpha=0.95,
# NOTE(review): whitespace-collapsed fragment. It opens with the closing
# argument of a call whose start is not visible here, and the nesting of the
# `if` / `for` bodies cannot be recovered from this view, so the code below is
# left untouched.
# Visible steps, in order:
#   * read the 'X' and 'Y' columns of `dwd_coords_utm32` for `cmn_stns`,
#     dropping NaNs from each column separately;
#   * compute Euclidean distances from (x_dwd_interpolate, y_dwd_interpolate)
#     to those coordinates with `distance.cdist`;
#   * order the distances with `np.sort` and `cmn_stns` with `np.argsort`;
#   * resample `dwd_pcp` and `in_df_rea6_stn` on `cmn_idx` to `temp_agg`;
#   * when `df_dwd1` is non-empty and `test_for_extremes` is set, pass both
#     series through `transform_to_bools` with `percentile_level`;
#   * loop over `ids_sorted`, loading and resampling each neighbour and
#     setting negative values of its `in_df_rea6` column to 0.
# NOTE(review): X and Y are filtered with independent `.dropna()` calls; if a
# station were missing only one coordinate the two arrays (and `ids_sorted`)
# would no longer line up -- TODO confirm the coordinates are complete.
# NOTE(review): the result of `df_dwd1.max()` is not assigned -- presumably a
# leftover from debugging; verify before removing.
all_dwd_stns_except_interp_loc) # coords of all other stns for event x_dwd_all = dwd_coords_utm32.loc[cmn_stns, 'X'].dropna().values y_dwd_all = dwd_coords_utm32.loc[cmn_stns, 'Y'].dropna().values # coords of neighbors dwd_neighbors_coords = np.c_[x_dwd_all.ravel(), y_dwd_all.ravel()] distrance_to_ngbrs = distance.cdist( [(x_dwd_interpolate, y_dwd_interpolate)], dwd_neighbors_coords, 'euclidean')[0] distance_sorted = np.sort(distrance_to_ngbrs) ids_sorted = cmn_stns[np.argsort(distrance_to_ngbrs)] # calc rank correlation df_dwd1 = resampleDf(dwd_pcp.loc[cmn_idx, :], temp_agg) df_rea1 = resampleDf(in_df_rea6_stn.loc[cmn_idx], temp_agg) if df_dwd1.size > 0: if test_for_extremes: df_dwd1 = transform_to_bools(df_dwd1, percentile_level) df_rea1 = transform_to_bools(df_rea1, percentile_level) df_dwd1.max() for ix2, _id2 in enumerate(ids_sorted): # print(ix2, '/', len(ids_sorted)) df_dwd2 = dwd_hdf5_de.get_pandas_dataframe(_id2).dropna( how='any') df_dwd2 = resampleDf(df_dwd2, temp_agg) df_rea2 = in_df_rea6.loc[:, _id2].dropna(how='any') df_rea2[df_rea2 < 0] = 0 # break
print('Raeding rea6 data')
in_df_rea6 = pd.read_csv(
    path_to_rea6_files,
    sep=';',
    index_col=0,
    engine='c',
)
# The index read from the file is replaced by the externally built range.
in_df_rea6.index = time_range
in_df_rea6 = in_df_rea6.round(2)
# pd.to_datetime(in_df_rea6.index, format='%Y-%m-%d %H:%M:%S')
# in_df_rea6 = in_df_rea6 * 3600
# in_df_rea6.dropna(how='all')

print('Raeding rea6 data hourly ')
in_df_rea6_hourly = pd.read_csv(
    path_to_rea6_hourly,
    sep=';',
    index_col=0,
    engine='c',
)
in_df_rea6_hourly.index = pd.to_datetime(
    in_df_rea6_hourly.index, format='%Y-%m-%d %H:%M:%S')

# Aggregate the hourly data to days and round like the other data set.
in_df_rea6_hourly_daily = resampleDf(
    in_df_rea6_hourly, 'D', closed='right', label='left')
in_df_rea6_hourly_daily = in_df_rea6_hourly_daily.round(2)

# Restrict both data sets to the days they have in common.
cmn_idx_hourly_daily = in_df_rea6_hourly_daily.index.intersection(
    in_df_rea6.index)
_agg_on_cmn_days = in_df_rea6_hourly_daily.loc[cmn_idx_hourly_daily, :]
_rea6_on_cmn_days = in_df_rea6.loc[cmn_idx_hourly_daily, :]

_agg_on_cmn_days.hist()

# Per-column maximum of (aggregated hourly - other data set).
df_diff = _agg_on_cmn_days - _rea6_on_cmn_days
# NOTE(review): no new figure is opened between `.hist()` and this `.plot()`;
# presumably the line lands on an axes of the histogram figure -- confirm
# whether that is intended.
df_diff.max().plot()

# Scatter of the two data sets against a grey 1:1 reference line.
plt.ioff()
plt.figure()
plt.plot([0, 50], [0, 50], c='grey')
plt.scatter(_agg_on_cmn_days.values, _rea6_on_cmn_days.values)
plt.show()
path_dwd_data = (
    r"/home/abbas/Documents/REA2"
    r"/dwd_comb_5min_data_agg_5min_2020_flagged_Hannover.h5")

dwd_hdf5 = HDF5(infile=path_dwd_data)
dwd_ids = dwd_hdf5.get_all_names()

# Easting/northing of every station, stored as columns 'X' and 'Y'.
# The coordinates are fetched once and reused for both columns
# (assumes `get_coordinates` is a plain read -- TODO confirm).
_dwd_coords = dwd_hdf5.get_coordinates(dwd_ids)
dwd_coords_utm32 = pd.DataFrame(
    index=dwd_ids,
    data=_dwd_coords['easting'],
    columns=['X'])
dwd_coords_utm32['Y'] = _dwd_coords['northing']

# transform to boolean
dwd_pcp = dwd_hdf5.get_pandas_dataframe(dwd_ids[0])
dwd_pcp_daily = resampleDf(dwd_pcp, '1440min')

thr = 1

# Both masks are taken from the untouched daily values before anything is
# overwritten. Thresholding the already modified frame would turn the freshly
# written 1s back into 0s as soon as `thr` is larger than 1.
_at_or_above_thr = dwd_pcp_daily >= thr
_below_thr = dwd_pcp_daily < thr

dwd_pcp_daily_boolean = dwd_pcp_daily.copy()
dwd_pcp_daily_boolean[_at_or_above_thr] = 1
dwd_pcp_daily_boolean[_below_thr] = 0

# First six daily flags as a flat integer array.
dwd_pcp_daily_boolean_test = (
    dwd_pcp_daily_boolean.values[:6].ravel().astype(int))

# Functionally:
# Concatenate the flags into a string of digits, e.g. '010011'.
num = ''.join(map(str, dwd_pcp_daily_boolean_test))
decimal = 0