def test_insert_leap_year_value(self):
    """Interpolate the missing Dec-31 values of leap years in Daymet forcing files.

    Daymet basin-mean forcing files omit Dec 31 in leap years.  For every
    forcing file under each HUC2 subdirectory this routine re-indexes the
    data onto a complete daily calendar, linearly interpolates the missing
    rows, writes the result to a new ``*_leap.txt`` file, and deletes the
    original file.  NOTE: destructive — originals are removed via os.remove.
    """
    data_dir = os.path.join(self.config_data.data_path["DB"], "basin_mean_forcing", "daymet")
    # HUC2 region subdirectories (region 10 is split into lower/upper halves)
    subdir_str = [
        "01", "02", "03", "04", "05", "06", "07", "08", "09", "10L", "10U",
        "11", "12", "13", "14", "15", "16", "17", "18"
    ]
    # full daily calendar to re-index onto (end date exclusive per t_range_days)
    t_range = ["1980-01-01", "2020-01-01"]
    # the seven Daymet forcing variables, in file column order
    col_lst = [
        "dayl(s)", "prcp(mm/day)", "srad(W/m2)", "swe(mm)", "tmax(C)",
        "tmin(C)", "vp(Pa)"
    ]
    for i in range(len(subdir_str)):
        subdir = os.path.join(data_dir, subdir_str[i])
        path_list = os.listdir(subdir)
        path_list.sort()  # sort the file paths that were read
        for filename in path_list:
            data_file = os.path.join(subdir, filename)
            # skip files this routine already produced ("..._leap.txt")
            is_leap_file_name = data_file[-8:]
            if "leap" in is_leap_file_name:
                continue
            print("reading", data_file)
            data_temp = pd.read_csv(data_file, sep=r'\s+')
            # pd.to_datetime needs the column named 'Month', not 'Mnth'
            data_temp.rename(columns={'Mnth': 'Month'}, inplace=True)
            df_date = data_temp[['Year', 'Month', 'Day']]
            date = pd.to_datetime(df_date).values.astype('datetime64[D]')
            # daymet file not for leap year, there is no data in 12.31 in leap year;
            # dates must still be strictly increasing
            assert (all(x < y for x, y in zip(date, date[1:])))
            t_range_list = hydro_time.t_range_days(t_range)
            # map each existing file date to its position in the full calendar
            [c, ind1, ind2] = np.intersect1d(date, t_range_list, return_indices=True)
            # file must cover the whole target calendar at both ends
            assert date[0] <= t_range_list[0] and date[-1] >= t_range_list[-1]
            nt = t_range_list.size
            # NaN-filled [days x 7-variable] grid; existing rows are copied in,
            # leap-year Dec-31 rows stay NaN for interpolation below
            out = np.full([nt, 7], np.nan)
            out[ind2, :] = data_temp[col_lst].values[ind1]
            x = pd.DataFrame(out, columns=col_lst)
            x_intepolate = x.interpolate(method='linear', limit_direction='forward', axis=0)
            csv_date = pd.to_datetime(t_range_list)
            # rebuild the Year/Mnth/Day/Hr columns for the complete calendar
            year_month_day_hour = pd.DataFrame(
                [[dt.year, dt.month, dt.day, dt.hour] for dt in csv_date],
                columns=['Year', 'Mnth', 'Day', "Hr"])
            # concat date columns with the interpolated forcing columns
            new_data_df = pd.concat([year_month_day_hour, x_intepolate], axis=1)
            output_file = data_file[:-4] + "_leap.txt"
            new_data_df.to_csv(output_file, header=True, index=False, sep=' ', float_format='%.2f')
            os.remove(data_file)
def test_check_streamflow_data(self):
    """Read streamflow for one gage and print the positions of missing (NaN) values."""
    train_range = self.config_data.model_dict["data"]["tRangeTrain"]
    gages = GagesSource(self.config_data, train_range,
                        screen_basin_area_huc4=False)
    days = hydro_time.t_range_days(["1990-01-01", "2010-01-01"])
    flow = gages.read_usge_gage("08", '08013000', days)
    print(flow)
    print(np.argwhere(np.isnan(flow)))
def data_models_of_train_test(cls, data_model, t_train, t_test):
    """Split *data_model* into train and test data models for LSTM use.

    Notice: nothing about the test set may be known before evaluation, so
    the normalization statistics computed on the training period are reused
    for the test period.
    """

    def _subset_for_period(flow, forcing, origin, period, stat_from_train=None):
        # Attributes are time-invariant, so the full matrix is kept.
        attrs = origin.data_attr[:, :]
        source = copy.deepcopy(origin.data_source)
        source.t_range = period
        model = cls(source, flow, forcing, attrs, origin.var_dict,
                    origin.f_dict, {}, {})
        model.t_s_dict = {
            'sites_id': origin.t_s_dict['sites_id'],
            't_final_range': period,
        }
        # train: compute stats; test: reuse the train stats passed in
        model.stat_dict = (model.cal_stat_all()
                           if stat_from_train is None else stat_from_train)
        return model

    n_train = hydro_time.t_range_days(t_train).size
    train_model = _subset_for_period(data_model.data_flow[:, :n_train],
                                     data_model.data_forcing[:, :n_train, :],
                                     data_model, t_train)
    test_model = _subset_for_period(data_model.data_flow[:, n_train:],
                                    data_model.data_forcing[:, n_train:, :],
                                    data_model, t_test,
                                    train_model.stat_dict)
    return train_model, test_model
def plot_gages_map_and_ts(data_model, obs, pred, inds_df, show_ind_key, idx_lst,
                          pertile_range, plot_ts=True, fig_size=(8, 8),
                          cmap_str="viridis"):
    """Plot an indicator map for the chosen gages, optionally with obs/pred
    time series per site; returns the cartopy figure in map-only mode."""
    map_values = (inds_df.loc[idx_lst])[show_ind_key].values
    gage_dict = data_model.data_source.gage_dict
    chosen_sites = np.array(data_model.t_s_dict['sites_id'])[idx_lst]
    # position of each chosen site within the full gauge table
    positions = np.array(
        [np.where(gage_dict["STAID"] == site) for site in chosen_sites]).flatten()
    lat = gage_dict["LAT_GAGE"][positions]
    lon = gage_dict["LNG_GAGE"][positions]
    obs_chosen = obs[idx_lst, :]
    pred_chosen = pred[idx_lst, :]
    # one [observed, predicted] pair of series per site
    series_pairs = [[obs_chosen[k], pred_chosen[k]]
                    for k in range(obs_chosen.shape[0])]
    t = hydro_time.t_range_days(data_model.t_s_dict["t_final_range"]).tolist()
    if plot_ts:
        plot_ts_map(map_values.tolist(), series_pairs, lat, lon, t,
                    chosen_sites.tolist(), pertile_range=pertile_range)
    else:
        f = plot_map_carto(map_values, lat=lat, lon=lon,
                           pertile_range=pertile_range,
                           fig_size=(fig_size[0], fig_size[1] - 2),
                           cmap_str=cmap_str)
        return f
def test_test_gages(self):
    """Run model inference on the test span after a 120-day warm-up,
    renormalize flow, save results, and plot them."""
    model_all = GagesModel.load_datamodel(
        self.config_data.data_path["Temp"],
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    warmup_len = 120
    all_days = hydro_time.t_range_days(model_all.t_s_dict["t_final_range"])
    warmup_range = hydro_time.t_days_lst2range(all_days[:warmup_len])
    test_range = hydro_time.t_days_lst2range(all_days[warmup_len:])
    data_model_warmup, data_model = GagesModel.data_models_of_train_test(
        model_all, warmup_range, test_range)
    # use the statistics of the full original model, not the split's own
    data_model.stat_dict = model_all.stat_dict
    with torch.cuda.device(0):
        pred, obs = master_test(data_model, epoch=self.test_epoch)
    sites = data_model.t_s_dict["sites_id"]
    basin_area = data_model.data_source.read_attr(
        sites, ['DRAIN_SQKM'], is_return_dict=False)
    mean_prep = data_model.data_source.read_attr(
        sites, ['PPTAVG_BASIN'], is_return_dict=False)
    # annual mean precip scaled to a daily value (the factor 10 presumably
    # converts units for _basin_norm — confirm against its definition)
    mean_prep = mean_prep / 365 * 10
    pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
    obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
    save_result(data_model.data_source.data_config.data_path['Temp'],
                self.test_epoch, pred, obs)
    plot_we_need(data_model, obs, pred, id_col="STAID",
                 lon_col="LNG_GAGE", lat_col="LAT_GAGE")
def update_data_model(cls, config_data, data_model_origin, sites_id_update=None,
                      t_range_update=None, data_attr_update=False,
                      train_stat_dict=None, screen_basin_area_huc4=False):
    """Build a new data model from *data_model_origin*, optionally restricted
    to a subset of sites and/or a sub-time-range, with optionally re-read
    attributes and externally supplied normalization statistics.

    Parameters
    ----------
    config_data : project config object passed through to GagesSource
    data_model_origin : the source data model to subset/copy
    sites_id_update : optional sorted list of site ids to intersect with the
        origin's sites (must be sorted ascending, like the origin's)
    t_range_update : optional [start, end] range inside the origin's range
    data_attr_update : if True, re-read attribute data from the new source
    train_stat_dict : if given, reuse these statistics instead of recomputing
        (i.e. normalize a test period with training-period stats)
    screen_basin_area_huc4 : forwarded to GagesSource.choose_some_basins

    Returns
    -------
    a new instance of *cls* with consistent flow/forcing/attr arrays,
    t_s_dict, gage_dict and stat_dict.
    """
    t_s_dict_origin = data_model_origin.t_s_dict
    data_flow_origin = data_model_origin.data_flow
    data_forcing_origin = data_model_origin.data_forcing
    data_attr_origin = data_model_origin.data_attr
    var_dict_origin = data_model_origin.var_dict
    f_dict_origin = data_model_origin.f_dict
    stat_dict_origin = data_model_origin.stat_dict
    if sites_id_update is not None:
        t_s_dict = {}
        t_range_origin_cpy = t_s_dict_origin["t_final_range"].copy()
        sites_id_origin_cpy = t_s_dict_origin["sites_id"].copy()
        sites_id_new = sites_id_update
        # both id lists must be strictly ascending so the row subsetting
        # below by membership keeps rows aligned with intersect1d's output
        assert (all(
            x < y
            for x, y in zip(sites_id_origin_cpy, sites_id_origin_cpy[1:])))
        assert (all(x < y for x, y in zip(sites_id_new, sites_id_new[1:])))
        sites_id = np.intersect1d(sites_id_origin_cpy, sites_id_new)
        assert sites_id.size > 0
        new_source_data = GagesSource.choose_some_basins(
            config_data,
            t_range_origin_cpy,
            screen_basin_area_huc4=screen_basin_area_huc4,
            sites_id=sites_id.tolist())
        t_s_dict["t_final_range"] = t_range_origin_cpy
        t_s_dict["sites_id"] = sites_id.tolist()
        # row indices of the kept sites within the origin arrays
        chosen_idx = [
            i for i in range(len(sites_id_origin_cpy))
            if sites_id_origin_cpy[i] in sites_id
        ]
        data_flow = data_flow_origin[chosen_idx, :]
        data_forcing = data_forcing_origin[chosen_idx, :, :]
        data_attr = data_attr_origin[chosen_idx, :]
    else:
        # no site subsetting: deep-copy bookkeeping, copy the data arrays
        t_range_origin_cpy = t_s_dict_origin["t_final_range"].copy()
        t_s_dict = copy.deepcopy(t_s_dict_origin)
        new_source_data = GagesSource.choose_some_basins(
            config_data,
            t_range_origin_cpy,
            screen_basin_area_huc4=screen_basin_area_huc4)
        data_flow = data_flow_origin.copy()
        data_forcing = data_forcing_origin.copy()
        data_attr = data_attr_origin.copy()
    if data_attr_update:
        # re-read attributes for the (possibly reduced) site list
        attr_lst = new_source_data.all_configs.get("attr_chosen")
        data_attr, var_dict, f_dict = new_source_data.read_attr(
            t_s_dict["sites_id"], attr_lst)
    else:
        var_dict = var_dict_origin.copy()
        f_dict = f_dict_origin.copy()
    data_model = cls(new_source_data, data_flow, data_forcing, data_attr,
                     var_dict, f_dict, stat_dict_origin, t_s_dict)
    if t_range_update is not None:
        sites_id_temp = data_model.t_s_dict['sites_id'].copy()
        t_range = t_range_update.copy()
        stat_dict_temp = {}
        t_s_dict_temp = {}
        # offset (in days) of the new start date within the current range;
        # must be non-negative, i.e. the new range starts inside the old one
        start_index = int(
            (np.datetime64(t_range[0]) -
             np.datetime64(data_model.t_s_dict["t_final_range"][0])) /
            np.timedelta64(1, 'D'))
        assert start_index >= 0
        t_lst_temp = hydro_time.t_range_days(t_range)
        end_index = start_index + t_lst_temp.size
        # slice the time axis of flow and forcing to the new range
        data_flow = data_model.data_flow[:, start_index:end_index]
        data_forcing = data_model.data_forcing[:, start_index:end_index, :]
        # rebuild the model, then fill in its time/space dict afterwards
        data_model = cls(new_source_data, data_flow, data_forcing, data_attr,
                         var_dict, f_dict, stat_dict_temp, t_s_dict_temp)
        t_s_dict_temp['sites_id'] = sites_id_temp
        t_s_dict_temp['t_final_range'] = t_range
        data_model.t_s_dict = t_s_dict_temp
        data_model.data_source.t_range = t_range
    # keep the data source's gage_dict consistent with the chosen sites
    if not data_model.data_source.gage_dict["STAID"].tolist(
    ) == data_model.t_s_dict['sites_id']:
        gage_dict_new = dict()
        usgs_all_sites = data_model.data_source.gage_dict["STAID"]
        sites_chosen = np.zeros(usgs_all_sites.shape[0])
        usgs_ids = data_model.t_s_dict['sites_id']
        sites_index = np.where(np.in1d(usgs_all_sites, usgs_ids))[0]
        sites_chosen[sites_index] = 1
        # filter every per-site column of gage_dict by the chosen mask
        for key, value in data_model.data_source.gage_dict.items():
            value_new = np.array([
                value[i] for i in range(len(sites_chosen))
                if sites_chosen[i] > 0
            ])
            gage_dict_new[key] = value_new
        data_model.data_source.gage_dict = gage_dict_new
        assert (np.array(usgs_ids) == gage_dict_new["STAID"]).all()
    if train_stat_dict is None:
        stat_dict_temp = data_model.cal_stat_all()
    else:
        # reuse externally supplied (training-period) statistics
        stat_dict_temp = train_stat_dict
    data_model.stat_dict = stat_dict_temp
    return data_model
def usgs_screen_streamflow(self, streamflow, usgs_ids=None, time_range=None):
    """Screen streamflow data, returning (screened flow, chosen ids, days).

    NOTE(review): this looks like a stub — *streamflow*, *usgs_ids* and
    *time_range* are all ignored, the screened flow is always ``None``,
    and the returned ids come from the "HUC10" column rather than station
    ids; confirm the intended behavior against callers.
    """
    chosen_ids = self.gage_dict["HUC10"]
    days = hydro_time.t_range_days(self.t_range)
    return None, chosen_ids, days
def test_t_range_days(self):
    """Smoke-check t_range_days on the fixture's time range."""
    print(t_range_days(self.t_range))