def load_datamodel(cls, dir_temp_orgin, num_str=None, **kwargs):
    """Rebuild a data model instance from its serialized parts on disk.

    The model is stored piecewise because serializing it as one object is
    slow: dict parts live in json files, array parts in npy files, and the
    data source in a pickle.  File names are taken from ``kwargs`` (keys
    ``data_source_file_name``, ``stat_file_name``, ``flow_file_name``,
    ``forcing_file_name``, ``attr_file_name``, ``f_dict_file_name``,
    ``var_dict_file_name`` and ``t_s_dict_file_name``).

    :param dir_temp_orgin: base temp directory holding the serialized files
    :param num_str: optional subdirectory name (e.g. a dam-purpose label)
    :return: a new instance of ``cls`` built from the deserialized parts
    """
    # Resolve the directory the files live in; num_str selects a subcategory.
    base_dir = os.path.join(dir_temp_orgin, num_str) if num_str else dir_temp_orgin

    def _path(key):
        # Join the base dir with the file name supplied under this kwargs key.
        return os.path.join(base_dir, kwargs[key])

    source_data = unserialize_pickle(_path('data_source_file_name'))
    stat_dict = unserialize_json(_path('stat_file_name'))
    data_flow = unserialize_numpy(_path('flow_file_name'))
    data_forcing = unserialize_numpy(_path('forcing_file_name'))
    data_attr = unserialize_numpy(_path('attr_file_name'))
    # dictFactorize.json is the explanation of value of categorical variables
    var_dict = unserialize_json(_path('var_dict_file_name'))
    f_dict = unserialize_json(_path('f_dict_file_name'))
    t_s_dict = unserialize_json(_path('t_s_dict_file_name'))
    return cls(source_data, data_flow, data_forcing, data_attr, var_dict,
               f_dict, stat_dict, t_s_dict)
def test_plot_map(self):
    """Plot each site's NSE on a map and save the figure as map_NSE.png.

    Fix: removed the unused local ``gauge_dict`` (it was read from the data
    source but never referenced afterwards).
    """
    data_model = GagesModel.load_datamodel(
        self.dir_temp, data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json', flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    t_s_dict = unserialize_json(self.t_s_dict_file)
    sites = np.array(t_s_dict["sites_id"])
    keys = ["NSE"]
    inds_test = subset_of_dict(self.inds, keys)
    sites_df = pd.DataFrame({"sites": sites, keys[0]: inds_test[keys[0]]})
    # keep only the sites whose NSE lies in [0, 1] for the colored markers
    nse_range = [0, 1]
    idx_lstl_nse = sites_df[(sites_df[keys[0]] >= nse_range[0]) & (
            sites_df[keys[0]] <= nse_range[1])].index.tolist()
    colorbar_size = [0.91, 0.323, 0.02, 0.346]
    # colorbar_size = None
    plot_gages_map(data_model, sites_df, keys[0], idx_lstl_nse,
                   colorbar_size=colorbar_size, cbar_font_size=14)
    plt.savefig(os.path.join(self.dir_out, 'map_NSE.png'), dpi=500,
                bbox_inches="tight")
    plt.show()
def test_dam_train(self):
    """Train on the dammed basins selected by ``choose_which_purpose`` (GPU 0).

    Loads the quick-serialized 1985-2005 training datamodel, joins it with
    the NID dam data and the per-gage main-dam-purpose dict, then trains.
    """
    with torch.cuda.device(0):
        quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
        data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")
        data_model_8595 = GagesModel.load_datamodel(
            data_dir, data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json', flow_file_name='flow.npy',
            forcing_file_name='forcing.npy', attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        gages_model_train = GagesModel.update_data_model(
            self.config_data, data_model_8595)
        # the "nid" directory sits next to the gages DB directory
        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "quickdata")
        nid_input = NidModel.load_nidmodel(
            nid_dir, nid_file=self.nid_file,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        data_input = GagesDamDataModel(gages_model_train, nid_input, True,
                                       gage_main_dam_purpose)
        gages_input = choose_which_purpose(data_input)
        master_train(gages_input)
def test_dam_train(self):
    """just test for one purpose as a case"""
    with torch.cuda.device(2):
        quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
        data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
        df = GagesModel.load_datamodel(
            data_dir, data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json', flow_file_name='flow.npy',
            forcing_file_name='forcing.npy', attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "quickdata")
        nid_input = NidModel.load_nidmodel(
            nid_dir, nid_file=self.nid_file,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        data_input = GagesDamDataModel(df, nid_input, True, gage_main_dam_purpose)
        # train only on basins whose main-dam-purpose code is 'C'
        purpose_chosen = 'C'
        gages_input = choose_which_purpose(data_input, purpose=purpose_chosen)
        master_train(gages_input)
def test_plot_cases(self):
    """Load the saved results of every dam-purpose class and print its error stats.

    Fix: ``obs`` was reshaped with ``pred``'s dimensions; it is now reshaped
    with its own, so a shape mismatch raises loudly instead of silently
    mis-shaping the observations (matches the pattern used elsewhere in the
    file, e.g. test_purposes_seperate).
    """
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    for i in range(0, gage_main_dam_purpose_unique.size):
        data_model = GagesModel.load_datamodel(
            self.config_data.data_path["Temp"], gage_main_dam_purpose_unique[i],
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy', forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        # each purpose class has its own temp/out subdirectory
        new_temp_dir = os.path.join(
            data_model.data_source.data_config.model_dict["dir"]["Temp"],
            gage_main_dam_purpose_unique[i])
        new_out_dir = os.path.join(
            data_model.data_source.data_config.model_dict["dir"]["Out"],
            gage_main_dam_purpose_unique[i])
        data_model.update_datamodel_dir(new_temp_dir, new_out_dir)
        pred, obs = load_result(new_temp_dir, self.test_epoch)
        # flatten any trailing singleton dimension to (sites, time)
        pred = pred.reshape(pred.shape[0], pred.shape[1])
        obs = obs.reshape(obs.shape[0], obs.shape[1])
        inds = statError(obs, pred)
        inds_df = pd.DataFrame(inds)
        print(gage_main_dam_purpose_unique[i])
        print(inds_df.median(axis=0))
        print(inds_df.mean(axis=0))
def test_data_temp_test_damcls(self):
    """Run the trained model on every dam-purpose class's test data and save results.

    For each purpose class, loads its saved test datamodel, runs inference,
    converts the normalized outputs back to physical streamflow, and saves
    the (pred, obs) pair into the class's own temp directory.
    """
    with torch.cuda.device(0):
        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "quickdata")
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
        gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
        for i in range(0, gage_main_dam_purpose_unique.size):
            df = GagesModel.load_datamodel(
                self.config_data.data_path["Temp"],
                gage_main_dam_purpose_unique[i],
                data_source_file_name='test_data_source.txt',
                stat_file_name='test_Statistics.json',
                flow_file_name='test_flow.npy',
                forcing_file_name='test_forcing.npy',
                attr_file_name='test_attr.npy',
                f_dict_file_name='test_dictFactorize.json',
                var_dict_file_name='test_dictAttribute.json',
                t_s_dict_file_name='test_dictTimeSpace.json')
            # redirect the datamodel to this purpose class's own subdirectories
            new_temp_dir = os.path.join(
                df.data_source.data_config.model_dict["dir"]["Temp"],
                gage_main_dam_purpose_unique[i])
            new_out_dir = os.path.join(
                df.data_source.data_config.model_dict["dir"]["Out"],
                gage_main_dam_purpose_unique[i])
            df.update_datamodel_dir(new_temp_dir, new_out_dir)
            pred, obs = master_test(df, epoch=self.test_epoch)
            basin_area = df.data_source.read_attr(
                df.t_s_dict["sites_id"], ['DRAIN_SQKM'], is_return_dict=False)
            mean_prep = df.data_source.read_attr(
                df.t_s_dict["sites_id"], ['PPTAVG_BASIN'], is_return_dict=False)
            # unit conversion for mean precipitation before denormalization
            # (presumably cm/year -> mm/day) -- TODO confirm units
            mean_prep = mean_prep / 365 * 10
            # transform the normalized outputs back to physical streamflow
            pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
            obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
            save_result(new_temp_dir, self.test_epoch, pred, obs)
def test_plot_ts(self): """测试可视化代码""" # plot time series show_me_num = 5 t_s_dict = unserialize_json(self.t_s_dict_file) sites = np.array(t_s_dict["sites_id"]) t_range = np.array(t_s_dict["t_final_range"]) plot_ts_obs_pred(self.obs, self.pred, sites, t_range, show_me_num)
def test_plot_ind_map(self):
    """Plot each site's NSE value on a map."""
    t_s_dict = unserialize_json(self.t_s_dict_file)
    sites = np.array(t_s_dict["sites_id"])
    keys = ["NSE"]
    inds_test = subset_of_dict(self.inds, keys)
    # concat sites and inds into one frame: one row per site with its NSE
    sites_df = pd.DataFrame({"sites": sites, keys[0]: inds_test[keys[0]]})
    plot_ind_map(self.gage_point_file, sites_df)
def test_plot_kuai_cdf(self):
    """Plot the CDF of the sites' NSE values with ``plotCDF``."""
    t_s_dict = unserialize_json(self.t_s_dict_file)
    # NOTE(review): ``sites`` is loaded but not used below -- confirm needed
    sites = np.array(t_s_dict["sites_id"])
    keys = ["NSE"]
    inds_test = subset_of_dict(self.inds, keys)
    plotCDF([inds_test[keys[0]]], ref=None, legendLst=["LSTM"],
            linespec=['-', '-', ':', ':', ':'])
def test_plot_pdf_cdf(self):
    """Plot the PDF and CDF of the sites' NSE values."""
    t_s_dict = unserialize_json(self.t_s_dict_file)
    # NOTE(review): ``sites`` is loaded but not used below -- confirm needed
    sites = np.array(t_s_dict["sites_id"])
    keys = ["NSE"]
    inds_test = subset_of_dict(self.inds, keys)
    x = pd.DataFrame(inds_test)
    # x = inds_test[keys[0]]
    # plot_dist(x)
    plot_pdf_cdf(x, keys[0])
def test_dam_test(self):
    """Evaluate the model on the dammed test basins and save renormalized results.

    The test datamodel is normalized with the training period's statistics,
    and the outputs are converted back to physical streamflow before saving.
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    data_model_train = GagesModel.load_datamodel(
        data_dir, data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json', flow_file_name='flow.npy',
        forcing_file_name='forcing.npy', attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_test = GagesModel.load_datamodel(
        data_dir, data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json', flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_model_train = GagesModel.update_data_model(
        self.config_data, data_model_train)
    # normalize the test data with the training period's statistics
    gages_model_test = GagesModel.update_data_model(
        self.config_data, data_model_test,
        train_stat_dict=gages_model_train.stat_dict)
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                   gage_main_dam_purpose)
    gages_input = choose_which_purpose(data_input)
    pred, obs = master_test(gages_input)
    basin_area = gages_input.data_source.read_attr(
        gages_input.t_s_dict["sites_id"], ['DRAIN_SQKM'], is_return_dict=False)
    mean_prep = gages_input.data_source.read_attr(
        gages_input.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
        is_return_dict=False)
    # unit conversion of mean precipitation before denormalization
    # (presumably cm/year -> mm/day) -- TODO confirm units
    mean_prep = mean_prep / 365 * 10
    # transform the normalized outputs back to physical streamflow
    pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
    obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
    save_result(gages_input.data_source.data_config.data_path['Temp'],
                self.test_epoch, pred, obs)
def test_forecast(self): source_data = unserialize_pickle(self.data_source_test_file) # 存储data_model,因为data_model里的数据如果直接序列化会比较慢,所以各部分分别序列化,dict的直接序列化为json文件,数据的HDF5 stat_dict = unserialize_json(self.stat_file) data_flow = unserialize_numpy(self.flow_npy_file) data_forcing = unserialize_numpy(self.forcing_npy_file) data_attr = unserialize_numpy(self.attr_npy_file) # dictFactorize.json is the explanation of value of categorical variables var_dict = unserialize_json(self.var_dict_file) f_dict = unserialize_json(self.f_dict_file) t_s_dict = unserialize_json(self.t_s_dict_file) data_model_test = DataModel(source_data, data_flow, data_forcing, data_attr, var_dict, f_dict, stat_dict, t_s_dict) pred, obs = hydroDL.master_test(data_model_test) print(pred) print(obs) serialize_numpy(pred, self.flow_pred_file) serialize_numpy(obs, self.flow_obs_file)
def test_purposes_inds(self):
    """Print median/mean error statistics for each dam-purpose region.

    Groups the sites by their main dam purpose, intersects each group with
    the datamodel's site list, and splits the saved results accordingly.
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "allnonref-dam_95-05_nan-0.1_00-1.0")
    data_model = GagesModel.load_datamodel(
        data_dir, data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json', flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_data_model = GagesModel.update_data_model(
        self.config_data, data_model)
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    # purpose label -> sorted list of site ids with that main purpose
    purpose_regions = {}
    for i in range(gage_main_dam_purpose_unique.size):
        sites_id = []
        for key, value in gage_main_dam_purpose.items():
            if value == gage_main_dam_purpose_unique[i]:
                sites_id.append(key)
        # the intersect1d logic below relies on strictly increasing ids
        assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
        purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
    id_regions_idx = []
    id_regions_sites_ids = []
    df_id_region = np.array(gages_data_model.t_s_dict["sites_id"])
    for key, value in purpose_regions.items():
        gages_id = value
        # indices (into the datamodel's site list) of this purpose's sites
        c, ind1, ind2 = np.intersect1d(df_id_region, gages_id,
                                       return_indices=True)
        assert (all(x < y for x, y in zip(ind1, ind1[1:])))
        assert (all(x < y for x, y in zip(c, c[1:])))
        id_regions_idx.append(ind1)
        id_regions_sites_ids.append(c)
    preds, obss, inds_dfs = split_results_to_regions(
        gages_data_model, self.test_epoch, id_regions_idx,
        id_regions_sites_ids)
    region_names = list(purpose_regions.keys())
    inds_medians = []
    inds_means = []
    for i in range(len(region_names)):
        inds_medians.append(inds_dfs[i].median(axis=0))
        inds_means.append(inds_dfs[i].mean(axis=0))
    print(inds_medians)
    print(inds_means)
def test_dam_train(self):
    """Train the two-stage inverse LSTM model separately per dam purpose (GPU 1).

    Two configs (config_data_1/2) share the same 1985-2005 datamodel but use
    their own training time ranges; for each purpose class, both are
    restricted to that class's basins and fed to ``GagesInvDataModel``.
    """
    quick_data_dir = os.path.join(self.config_data_1.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")
    # for inv model, datamodel of train and test are same
    data_model_8595 = GagesModel.load_datamodel(
        data_dir, data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json', flow_file_name='flow.npy',
        forcing_file_name='forcing.npy', attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    t_range1_train = self.config_data_1.model_dict["data"]["tRangeTrain"]
    gages_model1_train = GagesModel.update_data_model(
        self.config_data_1, data_model_8595, t_range_update=t_range1_train,
        data_attr_update=True)
    t_range2_train = self.config_data_2.model_dict["data"]["tRangeTrain"]
    gages_model2_train = GagesModel.update_data_model(
        self.config_data_2, data_model_8595, t_range_update=t_range2_train,
        data_attr_update=True)
    nid_dir = os.path.join(
        "/".join(self.config_data_1.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    with torch.cuda.device(1):
        for i in range(0, gage_main_dam_purpose_unique.size):
            data_input1 = GagesDamDataModel(gages_model1_train, nid_input,
                                            True, gage_main_dam_purpose)
            gages_input1 = choose_which_purpose(
                data_input1, purpose=gage_main_dam_purpose_unique[i])
            data_input2 = GagesDamDataModel(gages_model2_train, nid_input,
                                            True, gage_main_dam_purpose)
            gages_input2 = choose_which_purpose(
                data_input2, purpose=gage_main_dam_purpose_unique[i])
            data_model = GagesInvDataModel(gages_input1, gages_input2)
            # pre_trained_model_epoch = 165
            train_lstm_inv(data_model)
def test_gages_dam_all_save(self):
    """Build the dammed-basin test datamodel from NID data and save it to disk."""
    quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    data_model_train = GagesModel.load_datamodel(
        data_dir, data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json', flow_file_name='flow.npy',
        forcing_file_name='forcing.npy', attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    gages_model_train = GagesModel.update_data_model(
        self.config_data, data_model_train)
    data_model_test = GagesModel.load_datamodel(
        data_dir, data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json', flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    # normalize the test data with the training period's statistics
    gages_model_test = GagesModel.update_data_model(
        self.config_data, data_model_test,
        train_stat_dict=gages_model_train.stat_dict)
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "test")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    # NOTE(review): sibling tests call GagesDamDataModel(model, nid, True,
    # purpose_dict) with an extra positional ``True`` before the purpose dict;
    # confirm omitting it here is intentional.
    data_input = GagesDamDataModel(gages_model_test, nid_input,
                                   gage_main_dam_purpose)
    data_model_dam = choose_which_purpose(data_input)
    save_datamodel(data_model_dam, data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow', forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
def test_explore_damcls_datamodel(self):
    """Export normalized storage (STOR_NOR_2009) of the known-purpose basins to a csv."""
    config_data = self.config_data
    # site ids come from the keys of the purpose dict (hard-coded path)
    sites_id_dict = unserialize_json(
        "/mnt/data/owen411/code/hydro-anthropogenic-lstm/example/data/gages/nid/test/dam_main_purpose_dict.json")
    sites_id = list(sites_id_dict.keys())
    source_data_dor1 = GagesSource.choose_some_basins(
        config_data, config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False, sites_id=sites_id)
    norsto = source_data_dor1.read_attr(sites_id, ["STOR_NOR_2009"],
                                        is_return_dict=False)
    df = pd.DataFrame({"GAGE_ID": sites_id, "STOR_NOR": norsto.flatten()})
    # df.to_csv(os.path.join(source_data_dor1.all_configs["out_dir"], '3557basins_NORSTOR.csv'),
    #           quoting=csv.QUOTE_NONNUMERIC, index=None)
    df.to_csv(os.path.join(source_data_dor1.all_configs["out_dir"],
                           '2909basins_NORSTOR.csv'),
              quoting=csv.QUOTE_NONNUMERIC, index=None)
def test_explore_(self):
    """Export the NSE of the known-purpose basins to a csv.

    Fix: the original paired ``sites_id`` positionally with
    ``nse_all["NSE"].values[idx]``, which is only correct if both id lists are
    in the same order; it also used an O(n*m) list-membership test. NSE values
    are now selected through an id -> NSE mapping, guaranteeing each GAGE_ID
    row gets its own NSE, and membership is O(1).
    """
    config_data = self.config_data
    sites_id_dict = unserialize_json(
        "/mnt/data/owen411/code/hydro-anthropogenic-lstm/example/data/gages/nid/test/dam_main_purpose_dict.json")
    sites_id = list(sites_id_dict.keys())
    source_data_dor1 = GagesSource.choose_some_basins(
        config_data, config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False, sites_id=sites_id)
    nse_all = pd.read_csv(
        "/mnt/data/owen411/code/hydro-anthropogenic-lstm/example/output/gages/basic/exp37/3557basins_ID_NSE_DOR.csv",
        dtype={0: str})
    # build the gauge-id -> NSE lookup once, then select in sites_id order
    nse_map = dict(zip(nse_all["GAUGE ID"].values, nse_all["NSE"].values))
    chosen_sites = [s for s in sites_id if s in nse_map]
    df = pd.DataFrame({"GAGE_ID": chosen_sites,
                       "NSE": [nse_map[s] for s in chosen_sites]})
    # df.to_csv(os.path.join(source_data_dor1.all_configs["out_dir"], '3557basins_NORSTOR.csv'),
    #           quoting=csv.QUOTE_NONNUMERIC, index=None)
    df.to_csv(os.path.join(source_data_dor1.all_configs["out_dir"],
                           '2909basins_NSE.csv'),
              quoting=csv.QUOTE_NONNUMERIC, index=None)
def test_dam_train(self):
    """Train a separate model for every dam-purpose class on GPU 0."""
    with torch.cuda.device(0):
        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "quickdata")
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
        gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
        for i in range(0, gage_main_dam_purpose_unique.size):
            # each purpose class has its own pre-saved training datamodel
            df = GagesModel.load_datamodel(
                self.config_data.data_path["Temp"],
                gage_main_dam_purpose_unique[i],
                data_source_file_name='data_source.txt',
                stat_file_name='Statistics.json', flow_file_name='flow.npy',
                forcing_file_name='forcing.npy', attr_file_name='attr.npy',
                f_dict_file_name='dictFactorize.json',
                var_dict_file_name='dictAttribute.json',
                t_s_dict_file_name='dictTimeSpace.json')
            # redirect outputs into this class's own subdirectories
            new_temp_dir = os.path.join(
                df.data_source.data_config.model_dict["dir"]["Temp"],
                gage_main_dam_purpose_unique[i])
            new_out_dir = os.path.join(
                df.data_source.data_config.model_dict["dir"]["Out"],
                gage_main_dam_purpose_unique[i])
            df.update_datamodel_dir(new_temp_dir, new_out_dir)
            master_train(df)
def test_read_sites_id_see_dor(self):
    """Compare site-id lists across experiments/subsets and report each list's DOR values.

    For every experiment and subset, reads three serialized site-id files,
    checks each is sorted, computes the basins' degree of regulation
    (normal storage / mean annual runoff) and logs it; finally logs the
    pairwise intersections of the three site lists.
    """
    exp_lst = ["exp18", "exp19", "exp20", "exp21", "exp22", "exp23"]
    sub_lst = ["0", "1"]
    diff_lst = [
        "dictTimeSpace.json", "test_dictTimeSpace.json",
        "test_dictTimeSpace_2.json"
    ]
    for exp_str in exp_lst:
        for sub_str in sub_lst:
            comp_sites = []
            for item in diff_lst:
                gage_id_file = os.path.join(
                    self.config_data.config_file["ROOT_DIR"], "temp", "gages",
                    "ecoregion", exp_str, sub_str, item)
                usgs_id = unserialize_json(gage_id_file)["sites_id"]
                # ids must be strictly increasing (sorted, no duplicates)
                assert (all(x < y for x, y in zip(usgs_id, usgs_id[1:])))
                comp_sites.append(usgs_id)
                # mm/year 1-km grid, megaliters total storage per sq km (1 megaliters = 1,000,000 liters = 1,000 cubic meters)
                # attr_lst = ["RUNAVE7100", "STOR_NID_2009"]
                attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
                source_data = GagesSource.choose_some_basins(
                    self.config_data,
                    self.config_data.model_dict["data"]["tRangeTrain"],
                    screen_basin_area_huc4=False, sites_id=usgs_id)
                data_attr, var_dict, f_dict = source_data.read_attr(
                    usgs_id, attr_lst)
                run_avg = data_attr[:, 0] * (10**(-3)) * (10**6
                                                          )  # m^3 per year
                nor_storage = data_attr[:, 1] * 1000  # m^3
                # degree of regulation: storage relative to annual runoff
                dors = nor_storage / run_avg
                results = [round(i, 3) for i in dors]
                hydro_logger.info(
                    exp_str + "-" + sub_str + "-" + item + " DOR: %s",
                    results)
            hydro_logger.info(
                "the intersection of each pair of sites: %s, %s, %s",
                np.intersect1d(comp_sites[0], comp_sites[1]),
                np.intersect1d(comp_sites[0], comp_sites[2]),
                np.intersect1d(comp_sites[1], comp_sites[2]))
def test_damcls_test_datamodel(self):
    """Build and save a test datamodel for every dam-purpose class."""
    quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")
    data_model_train = GagesModel.load_datamodel(
        data_dir, data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json', flow_file_name='flow.npy',
        forcing_file_name='forcing.npy', attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_test = GagesModel.load_datamodel(
        data_dir, data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json', flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_model_train = GagesModel.update_data_model(
        self.config_data, data_model_train)
    # normalize the test data with the training period's statistics
    df = GagesModel.update_data_model(
        self.config_data, data_model_test,
        train_stat_dict=gages_model_train.stat_dict)
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    data_input = GagesDamDataModel(df, nid_input, True, gage_main_dam_purpose)
    for i in range(gage_main_dam_purpose_unique.size):
        # restrict to this purpose class and save it under its own label
        gages_input = choose_which_purpose(
            data_input, purpose=gage_main_dam_purpose_unique[i])
        save_datamodel(gages_input, gage_main_dam_purpose_unique[i],
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
def test_gages_dam_stor_hist_basin(self):
    """Plot dam-storage histograms for four randomly picked large-DOR basins."""
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "test")
    # per-basin lists of dam storages
    dam_storages = unserialize_json(
        os.path.join(nid_dir, "dam_storages_dict.json"))
    sites = np.array(list(dam_storages.keys()))
    dor_2 = 0.02
    source_data_dor2 = GagesSource.choose_some_basins(
        self.config_data,
        self.config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        DOR=dor_2)
    sites_id_largedam = source_data_dor2.all_configs['flow_screen_gage_id']
    # basins that both have storage data and pass the DOR screen
    c, ind1, idx_lst_nse_range = np.intersect1d(sites, sites_id_largedam,
                                                return_indices=True)
    num = 4
    # NOTE(review): no random seed is set -- the chosen basins differ each run
    num_lst = np.sort(np.random.choice(len(c), num, replace=False))
    chosen_sites = c[num_lst]
    hist_bins = 20
    fig = plt.figure(figsize=(8, 9))
    gs = gridspec.GridSpec(2, 2)  # 2x2 grid, one subplot per chosen basin
    for i in range(num):
        ax_k = plt.subplot(gs[int(i / 2), i % 2])
        ax_k.hist(dam_storages[chosen_sites[i]], hist_bins,
                  orientation='vertical', color='red', alpha=0.5)
    plt.show()
def test_dam_test(self):
    """Test the two-stage inverse LSTM model on the dammed basins (GPU 2).

    The 1st model is evaluated on its test time range of the 1985-2005
    datamodel; the 2nd model's test data (1995-2005 datamodel) is normalized
    with the 2nd model's training statistics. Outputs are converted back to
    physical streamflow and saved.
    """
    quick_data_dir = os.path.join(self.config_data_1.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")
    # for inv model, datamodel of train and test are same
    data_model_8595 = GagesModel.load_datamodel(
        data_dir, data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json', flow_file_name='flow.npy',
        forcing_file_name='forcing.npy', attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    # for 2nd model, datamodel of train and test belong to parts of the test time
    data_model_9505 = GagesModel.load_datamodel(
        data_dir, data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json', flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    t_range1_test = self.config_data_1.model_dict["data"]["tRangeTest"]
    # Because we know data of period "90-95", so that we can get its statistics according to this period
    gages_model1_test = GagesModel.update_data_model(
        self.config_data_1, data_model_8595, t_range_update=t_range1_test,
        data_attr_update=True)
    t_range2_train = self.config_data_2.model_dict["data"]["tRangeTrain"]
    t_range2_test = self.config_data_2.model_dict["data"]["tRangeTest"]
    gages_model2_train = GagesModel.update_data_model(
        self.config_data_2, data_model_8595, t_range_update=t_range2_train,
        data_attr_update=True)
    gages_model2_test = GagesModel.update_data_model(
        self.config_data_2, data_model_9505, t_range_update=t_range2_test,
        data_attr_update=True,
        train_stat_dict=gages_model2_train.stat_dict)
    nid_dir = os.path.join(
        "/".join(self.config_data_2.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    data_input1 = GagesDamDataModel(gages_model1_test, nid_input, True,
                                    gage_main_dam_purpose)
    df1 = choose_which_purpose(data_input1)
    data_input2 = GagesDamDataModel(gages_model2_test, nid_input, True,
                                    gage_main_dam_purpose)
    df2 = choose_which_purpose(data_input2)
    with torch.cuda.device(2):
        data_model = GagesInvDataModel(df1, df2)
        pred, obs = test_lstm_inv(data_model, epoch=self.test_epoch)
        basin_area = df2.data_source.read_attr(df2.t_s_dict["sites_id"],
                                               ['DRAIN_SQKM'],
                                               is_return_dict=False)
        mean_prep = df2.data_source.read_attr(df2.t_s_dict["sites_id"],
                                              ['PPTAVG_BASIN'],
                                              is_return_dict=False)
        # unit conversion of mean precipitation before denormalization
        # (presumably cm/year -> mm/day) -- TODO confirm units
        mean_prep = mean_prep / 365 * 10
        # transform the normalized outputs back to physical streamflow
        pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
        obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
        save_result(df2.data_source.data_config.data_path['Temp'],
                    self.test_epoch, pred, obs)
flow_file_name='flow.npy', forcing_file_name='forcing.npy', attr_file_name='attr.npy', f_dict_file_name='dictFactorize.json', var_dict_file_name='dictAttribute.json', t_s_dict_file_name='dictTimeSpace.json') nid_input = NidModel(cfg) nid_dir = os.path.join(cfg.NID.NID_DIR, "test") save_nidinput(nid_input, nid_dir, nid_source_file_name='nid_source.txt', nid_data_file_name='nid_data.shp') data_input = GagesDamDataModel(df, nid_input) serialize_json(data_input.gage_main_dam_purpose, os.path.join(nid_dir, "dam_main_purpose_dict.json")) gage_main_dam_purpose = unserialize_json(nid_gene_file) gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values()) gage_main_dam_purpose_lst_merge = "".join(gage_main_dam_purpose_lst) gage_main_dam_purpose_unique = np.unique( list(gage_main_dam_purpose_lst_merge)) # gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst) purpose_regions = {} for i in range(gage_main_dam_purpose_unique.size): sites_id = [] for key, value in gage_main_dam_purpose.items(): if gage_main_dam_purpose_unique[i] in value: sites_id.append(key) assert (all(x < y for x, y in zip(sites_id, sites_id[1:]))) purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id id_regions_idx = [] id_regions_sites_ids = []
def test_purposes_seperate(self):
    """For each dam-purpose group, compute error stats and save box/ts/ecdf/map plots."""
    quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "allnonref-dam_95-05_nan-0.1_00-1.0")
    data_model_test = GagesModel.load_datamodel(
        data_dir, data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json', flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    data_model = GagesModel.update_data_model(self.config_data,
                                              data_model_test)
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    # purpose label -> sorted list of site ids with that main purpose
    purpose_regions = {}
    for i in range(gage_main_dam_purpose_unique.size):
        sites_id = []
        for key, value in gage_main_dam_purpose.items():
            if value == gage_main_dam_purpose_unique[i]:
                sites_id.append(key)
        # the intersect1d logic below relies on strictly increasing ids
        assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
        purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
    id_regions_idx = []
    id_regions_sites_ids = []
    df_id_region = np.array(data_model.t_s_dict["sites_id"])
    for key, value in purpose_regions.items():
        gages_id = value
        c, ind1, ind2 = np.intersect1d(df_id_region, gages_id,
                                       return_indices=True)
        assert (all(x < y for x, y in zip(ind1, ind1[1:])))
        assert (all(x < y for x, y in zip(c, c[1:])))
        id_regions_idx.append(ind1)
        id_regions_sites_ids.append(c)
    pred_all, obs_all = load_result(self.config_data.data_path["Temp"],
                                    self.test_epoch)
    # flatten any trailing singleton dimension to (sites, time)
    pred_all = pred_all.reshape(pred_all.shape[0], pred_all.shape[1])
    obs_all = obs_all.reshape(obs_all.shape[0], obs_all.shape[1])
    # NOTE(review): starts at 9, skipping the first nine purpose groups --
    # looks like a hard-coded resume point; confirm it is intentional.
    for i in range(9, len(gage_main_dam_purpose_unique)):
        pred = pred_all[id_regions_idx[i], :]
        obs = obs_all[id_regions_idx[i], :]
        inds = statError(obs, pred)
        inds['STAID'] = id_regions_sites_ids[i]
        inds_df = pd.DataFrame(inds)
        inds_df.to_csv(
            os.path.join(
                self.config_data.data_path["Out"],
                gage_main_dam_purpose_unique[i] + "epoch" +
                str(self.test_epoch) + 'data_df.csv'))
        # plot box (uses the seaborn library)
        keys = ["Bias", "RMSE", "NSE"]
        inds_test = subset_of_dict(inds, keys)
        box_fig = plot_diff_boxes(inds_test)
        box_fig.savefig(
            os.path.join(
                self.config_data.data_path["Out"],
                gage_main_dam_purpose_unique[i] + "epoch" +
                str(self.test_epoch) + "box_fig.png"))
        # plot ts
        sites = np.array(df_id_region[id_regions_idx[i]])
        t_range = np.array(data_model.t_s_dict["t_final_range"])
        show_me_num = 1
        ts_fig = plot_ts_obs_pred(obs, pred, sites, t_range, show_me_num)
        ts_fig.savefig(
            os.path.join(
                self.config_data.data_path["Out"],
                gage_main_dam_purpose_unique[i] + "epoch" +
                str(self.test_epoch) + "ts_fig.png"))
        # plot nse ecdf
        sites_df_nse = pd.DataFrame({
            "sites": sites,
            keys[2]: inds_test[keys[2]]
        })
        plot_ecdf(
            sites_df_nse, keys[2],
            os.path.join(
                self.config_data.data_path["Out"],
                gage_main_dam_purpose_unique[i] + "epoch" +
                str(self.test_epoch) + "ecdf_fig.png"))
        # plot map
        gauge_dict = data_model.data_source.gage_dict
        save_map_file = os.path.join(
            self.config_data.data_path["Out"],
            gage_main_dam_purpose_unique[i] + "epoch" +
            str(self.test_epoch) + "map_fig.png")
        plot_map(gauge_dict, sites_df_nse, save_file=save_map_file,
                 id_col="STAID", lon_col="LNG_GAGE", lat_col="LAT_GAGE")
def test_gages_nse_dam_attr(self):
    """Relate ensemble NSE to dam-related basin attributes.

    Produces boxplots of NSE vs. DOR (degree of regulation) ranges and vs.
    dam-count ranges, reg/joint plots of NSE vs. DOR, and scatter plots of
    NSE vs. dam-storage variability and vs. dam-location dispersion metrics.
    All figures are written to config_data.data_path["Out"].
    """
    figure_dpi = 600
    config_data = self.config_data
    data_dir = config_data.data_path["Temp"]
    data_model = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_id = data_model.t_s_dict["sites_id"]
    exp_lst = [
        "basic_exp37", "basic_exp39", "basic_exp40", "basic_exp41",
        "basic_exp42", "basic_exp43"
    ]
    # Ensemble-mean predictions/observations across the six experiments.
    self.inds_df, pred_mean, obs_mean = load_ensemble_result(
        config_data.config_file,
        exp_lst,
        config_data.config_file.TEST_EPOCH,
        return_value=True)
    show_ind_key = 'NSE'
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.serif'] = ['Times New Roman'
                                  ] + plt.rcParams['font.serif']
    # plot NSE-DOR
    attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
    attrs_runavg_stor = data_model.data_source.read_attr(gages_id,
                                                         attr_lst,
                                                         is_return_dict=False)
    run_avg = attrs_runavg_stor[:, 0] * (10**(-3)) * (10**6
                                                      )  # m^3 per year
    nor_storage = attrs_runavg_stor[:, 1] * 1000  # m^3
    # DOR = normal storage / mean annual runoff volume.
    dors = nor_storage / run_avg
    # dor = 0 is not totally same with dam_num=0 (some dammed basins' dor is about 0.00),
    # here for zero-dor we mainly rely on dam_num = 0
    attr_dam_num = ["NDAMS_2009"]
    attrs_dam_num = data_model.data_source.read_attr(gages_id,
                                                     attr_dam_num,
                                                     is_return_dict=False)
    df = pd.DataFrame({
        "DOR": dors,
        "DAM_NUM": attrs_dam_num[:, 0],
        show_ind_key: self.inds_df[show_ind_key].values
    })
    hydro_logger.info("statistics of dors:\n %s", df.describe())
    hydro_logger.info("percentiles of dors:\n %s", df.quantile(q=0.95))
    hydro_logger.info("ecdf of dors:\n %s", ecdf(dors))
    # boxplot
    # add a column to represent the dor range for the df
    # Bins: exact zero, six half-open (lo, hi] bins, and an open top bin.
    dor_value_range_lst = [[0, 0], [0, 0.02], [0.02, 0.05], [0.05, 0.1],
                           [0.1, 0.2], [0.2, 0.4], [0.4, 0.8], [0.8, 10000]]
    dor_range_lst = ["0"] + [
        str(dor_value_range_lst[i][0]) + "-" + str(dor_value_range_lst[i][1])
        for i in range(1,
                       len(dor_value_range_lst) - 1)
    ] + [">" + str(dor_value_range_lst[-1][0])]
    # add a column to represent the dam_num range for the df
    dam_num_value_range_lst = [[0, 0], [0, 1], [1, 3], [3, 5], [5, 10],
                               [10, 20], [20, 50], [50, 10000]]
    dam_num_range_lst = ["0", "1"] + [
        str(dam_num_value_range_lst[i][0]) + "-" +
        str(dam_num_value_range_lst[i][1])
        for i in range(2,
                       len(dam_num_value_range_lst) - 1)
    ] + [">" + str(dam_num_value_range_lst[-1][0])]

    def in_which_range(value_temp):
        # Map a DOR value to its bin label; the last bin becomes ">lo".
        if value_temp == 0:
            return "0"
        the_range = [
            a_range for a_range in dor_value_range_lst
            if a_range[0] < value_temp <= a_range[1]
        ]
        if the_range[0][0] == dor_value_range_lst[-1][0]:
            the_range_str = ">" + str(the_range[0][0])
        else:
            the_range_str = str(the_range[0][0]) + "-" + str(
                the_range[0][1])
        return the_range_str

    def in_which_dam_num_range(value_tmp):
        # Map a dam count to its bin label; 0 and 1 are their own bins.
        if value_tmp == 0:
            return "0"
        if value_tmp == 1:
            return "1"
        the_ran = [
            a_ran for a_ran in dam_num_value_range_lst
            if a_ran[0] < value_tmp <= a_ran[1]
        ]
        if the_ran[0][0] == dam_num_value_range_lst[-1][0]:
            the_ran_str = ">" + str(the_ran[0][0])
        else:
            the_ran_str = str(the_ran[0][0]) + "-" + str(the_ran[0][1])
        return the_ran_str

    df["DOR_RANGE"] = df["DOR"].apply(in_which_range)
    df["DAM_NUM_RANGE"] = df["DAM_NUM"].apply(in_which_dam_num_range)
    # Basins with dams but near-zero DOR are moved to the first nonzero bin,
    # since zero-DOR is trusted only when dam_num is also 0 (see note above).
    df.loc[(df["DAM_NUM"] > 0) & (df["DOR_RANGE"] == "0"),
           "DOR_RANGE"] = dor_range_lst[1]
    shown_nse_range_boxplots = [-0.5, 1.0]
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    plot_boxs(df,
              "DOR_RANGE",
              show_ind_key,
              ylim=shown_nse_range_boxplots,
              order=dor_range_lst)
    plt.savefig(os.path.join(
        config_data.data_path["Out"],
        'NSE~DOR-boxplots-' + str(shown_nse_range_boxplots) + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    plt.figure()
    shown_nse_range_boxplots = [0, 1.0]
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    plot_boxs(df,
              "DAM_NUM_RANGE",
              show_ind_key,
              ylim=shown_nse_range_boxplots,
              order=dam_num_range_lst)
    plt.savefig(os.path.join(
        config_data.data_path["Out"],
        'NSE~DAM_NUM-boxplots-' + str(shown_nse_range_boxplots) + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    # Log the sample count and fraction of basins per bin.
    nums_in_dor_range = [
        df[df["DOR_RANGE"] == a_range_rmp].shape[0]
        for a_range_rmp in dor_range_lst
    ]
    ratios_in_dor_range = [
        a_num / df.shape[0] for a_num in nums_in_dor_range
    ]
    hydro_logger.info(
        "the number and ratio of basins in each dor range\n: %s \n %s",
        nums_in_dor_range, ratios_in_dor_range)
    nums_in_dam_num_range = [
        df[df["DAM_NUM_RANGE"] == a_range_rmp].shape[0]
        for a_range_rmp in dam_num_range_lst
    ]
    ratios_in_dam_num_range = [
        a_num / df.shape[0] for a_num in nums_in_dam_num_range
    ]
    hydro_logger.info(
        "the number and ratio of basins in each dam_num range\n: %s \n %s",
        nums_in_dam_num_range, ratios_in_dam_num_range)
    # regplot
    plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    sr = sns.regplot(x="DOR",
                     y=show_ind_key,
                     data=df[df[show_ind_key] >= 0],
                     scatter_kws={'s': 10})
    # Clip the x-axis at the 95th percentile to keep outliers off the plot.
    show_dor_max = df.quantile(
        q=0.95)["DOR"]  # 30 # max(dors) # 0.8 # 10
    show_dor_min = min(dors)
    plt.ylim(0, 1)
    plt.xlim(show_dor_min, show_dor_max)
    plt.savefig(os.path.join(
        config_data.data_path["Out"],
        'NSE~DOR-shown-max-' + str(show_dor_max) + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    # jointplot
    # dor_range = [0.2, 0.9]
    dor_range = [0.002, 0.2]
    # plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    # g = sns.jointplot(x="DOR", y=show_ind_key, data=df[(df["DOR"] < 1) & (df[show_ind_key] >= 0)], kind="reg",
    #                   marginal_kws=dict(bins=25))
    # g = sns.jointplot(x="DOR", y=show_ind_key, data=df[(df["DOR"] < 1) & (df[show_ind_key] >= 0)], kind="hex",
    #                   color="b", marginal_kws=dict(bins=50))
    g = sns.jointplot(
        x="DOR",
        y=show_ind_key,
        data=df[(df["DOR"] < dor_range[1]) & (df["DOR"] > dor_range[0]) &
                (df[show_ind_key] >= 0)],
        kind="hex",
        color="b")
    g.ax_marg_x.set_xlim(dor_range[0], dor_range[1])
    # g.ax_marg_y.set_ylim(-0.5, 1)
    plt.savefig(os.path.join(
        config_data.data_path["Out"],
        'NSE~DOR(range-)' + str(dor_range) + '-jointplot.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "test")
    nid_input = NidModel.load_nidmodel(
        nid_dir,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    data_input = GagesDamDataModel(data_model, nid_input,
                                   gage_main_dam_purpose)
    # Ordered dicts keyed by site id: dam coordinates and dam storages.
    dam_coords = unserialize_json_ordered(
        os.path.join(nid_dir, "dam_points_dict.json"))
    dam_storages = unserialize_json_ordered(
        os.path.join(nid_dir, "dam_storages_dict.json"))
    dam_ids_1 = list(gage_main_dam_purpose.keys())
    dam_ids_2 = list(dam_coords.keys())
    dam_ids_3 = list(dam_storages.keys())
    # All three dicts must share the same ascending site-id order.
    assert (all(x < y for x, y in zip(dam_ids_1, dam_ids_1[1:])))
    assert (all(x < y for x, y in zip(dam_ids_2, dam_ids_2[1:])))
    assert (all(x < y for x, y in zip(dam_ids_3, dam_ids_3[1:])))
    sites = list(dam_coords.keys())
    # idx_lst_nse_range: positions of common sites within gages_id.
    c, ind1, idx_lst_nse_range = np.intersect1d(sites,
                                                gages_id,
                                                return_indices=True)
    # log(std(storages in basin) + 1): variability of dam storage per basin.
    std_storage_in_a_basin = list(map(np.std, dam_storages.values()))
    log_std_storage_in_a_basin = list(
        map(np.log,
            np.array(std_storage_in_a_basin) + 1))
    nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
    df = pd.DataFrame({
        "DAM_STORAGE_STD": log_std_storage_in_a_basin,
        show_ind_key: nse_values
    })
    plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    g = sns.regplot(x="DAM_STORAGE_STD",
                    y=show_ind_key,
                    data=df[df[show_ind_key] >= 0],
                    scatter_kws={'s': 10})
    show_max = max(log_std_storage_in_a_basin)
    show_min = min(log_std_storage_in_a_basin)
    if show_min < 0:
        show_min = 0
    # g.ax_marg_x.set_xlim(show_min, show_max)
    # g.ax_marg_y.set_ylim(0, 1)
    plt.ylim(0, 1)
    plt.xlim(show_min, show_max)
    plt.savefig(os.path.join(config_data.data_path["Out"],
                             'NSE~' + "DAM_STORAGE_STD" + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    gages_loc_lat = data_model.data_source.gage_dict["LAT_GAGE"]
    gages_loc_lon = data_model.data_source.gage_dict["LNG_GAGE"]
    gages_loc = [[gages_loc_lat[i], gages_loc_lon[i]]
                 for i in range(len(gages_id))]
    # calculate index of dispersion, then plot the NSE-dispersion scatterplot
    # Geo coord system of gages_loc and dam_coords are both NAD83
    coefficient_of_var = list(
        map(coefficient_of_variation, gages_loc, dam_coords.values()))
    coefficient_of_var_min = min(coefficient_of_var)
    coefficient_of_var_max = max(coefficient_of_var)
    dispersion_var = "DAM_GAGE_DIS_VAR"
    nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
    df = pd.DataFrame({
        dispersion_var: coefficient_of_var,
        show_ind_key: nse_values
    })
    plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    g = sns.regplot(x=dispersion_var,
                    y=show_ind_key,
                    data=df[df[show_ind_key] >= 0],
                    scatter_kws={'s': 10})
    show_max = coefficient_of_var_max
    show_min = coefficient_of_var_min
    if show_min < 0:
        show_min = 0
    # g.ax_marg_x.set_xlim(show_min, show_max)
    # g.ax_marg_y.set_ylim(0, 1)
    plt.ylim(0, 1)
    plt.xlim(show_min, show_max)
    plt.savefig(os.path.join(config_data.data_path["Out"],
                             'NSE~' + dispersion_var + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    idx_dispersions = list(
        map(ind_of_dispersion, gages_loc, dam_coords.values()))
    idx_dispersion_min = min(idx_dispersions)
    idx_dispersion_max = max(idx_dispersions)
    dispersion_var = "DAM_DISPERSION_BASIN"
    # nse_range = [0, 1]
    # idx_lst_nse_range = inds_df_now[(inds_df_now[show_ind_key] >= nse_range[0]) & (inds_df_now[show_ind_key] < nse_range[1])].index.tolist()
    nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
    df = pd.DataFrame({
        dispersion_var: idx_dispersions,
        show_ind_key: nse_values
    })
    # g = sns.regplot(x=dispersion_var, y=show_ind_key, data=df[df[show_ind_key] >= 0], scatter_kws={'s': 10})
    if idx_dispersion_min < 0:
        idx_dispersion_min = 0
    plt.ylim(0, 1)
    plt.xlim(idx_dispersion_min, idx_dispersion_max)
    # plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    g = sns.jointplot(x=dispersion_var,
                      y=show_ind_key,
                      data=df[df[show_ind_key] >= 0],
                      kind="reg")
    g.ax_marg_x.set_xlim(idx_dispersion_min, idx_dispersion_max)
    g.ax_marg_y.set_ylim(0, 1)
    plt.show()
def test_dam_train(self):
    """Train one natural-flow model per main dam purpose.

    Loads cached 1985-2005 training data models for the reference (sim) and
    non-reference basin sets, joins them with NID dam data, then for each
    unique main dam purpose selects the matching basins, redirects the
    Temp/Out dirs to purpose-specific subdirectories, and runs training.

    Side effects: reads quickdata/NID caches from disk, writes model output
    under per-purpose directories, and runs training on CUDA device 0.
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
    sim_data_dir = os.path.join(quick_data_dir,
                                "allref_85-05_nan-0.1_00-1.0")
    data_dir = os.path.join(quick_data_dir,
                            "allnonref_85-05_nan-0.1_00-1.0")
    data_model_sim8595 = GagesModel.load_datamodel(
        sim_data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_8595 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    sim_gages_model_train = GagesModel.update_data_model(
        self.sim_config_data, data_model_sim8595, data_attr_update=True)
    gages_model_train = GagesModel.update_data_model(self.config_data,
                                                     data_model_8595,
                                                     data_attr_update=True)
    # The NID quickdata dir sits beside the DB dir (parent of data_path["DB"]).
    # FIX: nid_dir was computed twice with identical expressions and
    # dam_main_purpose_dict.json was deserialized twice; the duplicates
    # are removed — a single load serves both the unique-purpose list and
    # the GagesDamDataModel below.
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    nid_input = NidModel.load_nidmodel(
        nid_dir,
        nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    data_input = GagesDamDataModel(gages_model_train, nid_input, True,
                                   gage_main_dam_purpose)
    with torch.cuda.device(0):
        for i in range(0, gage_main_dam_purpose_unique.size):
            # Ensure the training epoch count is set for every run.
            sim_gages_model_train.update_model_param('train', nEpoch=300)
            gages_input = choose_which_purpose(
                data_input, purpose=gage_main_dam_purpose_unique[i])
            # Redirect output into a purpose-named subdirectory so runs
            # for different purposes do not overwrite each other.
            new_temp_dir = os.path.join(
                gages_input.data_source.data_config.model_dict["dir"]
                ["Temp"], gage_main_dam_purpose_unique[i])
            new_out_dir = os.path.join(
                gages_input.data_source.data_config.model_dict["dir"]
                ["Out"], gage_main_dam_purpose_unique[i])
            gages_input.update_datamodel_dir(new_temp_dir, new_out_dir)
            data_model = GagesSimDataModel(sim_gages_model_train,
                                           gages_input)
            # pre_trained_model_epoch = 25
            # master_train_natural_flow(data_model, pre_trained_model_epoch=pre_trained_model_epoch)
            master_train_natural_flow(data_model)
def test_dam_test(self):
    """Evaluate the per-purpose natural-flow models on the test period.

    Loads cached training (1985-95) and test (1995-2005) data models for the
    reference (sim) and non-reference basin sets, joins the test set with NID
    dam data, and for each main dam purpose runs inference, de-normalizes the
    flows with basin area and mean precipitation, saves results, and plots.
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
    sim_data_dir = os.path.join(quick_data_dir,
                                "allref_85-05_nan-0.1_00-1.0")
    data_dir = os.path.join(quick_data_dir,
                            "allnonref_85-05_nan-0.1_00-1.0")
    # Training-period data models (needed for their normalization stats).
    data_model_sim8595 = GagesModel.load_datamodel(
        sim_data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_8595 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    # Test-period data models.
    data_model_sim9505 = GagesModel.load_datamodel(
        sim_data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    data_model_9505 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    sim_gages_model_train = GagesModel.update_data_model(
        self.sim_config_data, data_model_sim8595, data_attr_update=True)
    gages_model_train = GagesModel.update_data_model(self.config_data,
                                                     data_model_8595,
                                                     data_attr_update=True)
    # Test models are normalized with the training-period statistics.
    sim_gages_model_test = GagesModel.update_data_model(
        self.sim_config_data,
        data_model_sim9505,
        data_attr_update=True,
        train_stat_dict=sim_gages_model_train.stat_dict)
    gages_model_test = GagesModel.update_data_model(
        self.config_data,
        data_model_9505,
        data_attr_update=True,
        train_stat_dict=gages_model_train.stat_dict)
    # The NID quickdata dir sits beside the DB dir (parent of data_path["DB"]).
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir,
        nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                   gage_main_dam_purpose)
    for i in range(0, gage_main_dam_purpose_unique.size):
        sim_gages_model_test.update_model_param('train', nEpoch=300)
        gages_input = choose_which_purpose(
            data_input, purpose=gage_main_dam_purpose_unique[i])
        # Point Temp/Out at the purpose-specific subdirectory used in training.
        new_temp_dir = os.path.join(
            gages_input.data_source.data_config.model_dict["dir"]["Temp"],
            gage_main_dam_purpose_unique[i])
        new_out_dir = os.path.join(
            gages_input.data_source.data_config.model_dict["dir"]["Out"],
            gage_main_dam_purpose_unique[i])
        gages_input.update_datamodel_dir(new_temp_dir, new_out_dir)
        model_input = GagesSimDataModel(sim_gages_model_test, gages_input)
        pred, obs = master_test_natural_flow(model_input,
                                             epoch=self.test_epoch)
        basin_area = model_input.data_model2.data_source.read_attr(
            model_input.data_model2.t_s_dict["sites_id"], ['DRAIN_SQKM'],
            is_return_dict=False)
        mean_prep = model_input.data_model2.data_source.read_attr(
            model_input.data_model2.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
            is_return_dict=False)
        # Convert precipitation to daily values (unit scaling; presumably
        # in/yr -> mm/day — TODO confirm attribute units).
        mean_prep = mean_prep / 365 * 10
        # De-normalize predictions/observations back to physical flow units.
        pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
        obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
        save_result(
            model_input.data_model2.data_source.data_config.
            data_path['Temp'], str(self.test_epoch), pred, obs)
        plot_we_need(gages_input,
                     obs,
                     pred,
                     id_col="STAID",
                     lon_col="LNG_GAGE",
                     lat_col="LAT_GAGE")
def test_3factors(self):
    """Plot NSE grouped by three factors: dam purpose, DOR class, diversion.

    Classifies every basin by DOR (< / >= 0.02) and by whether its comment
    attributes mention diversion, groups basins by the individual purpose
    letters of their main dam purpose, then draws box/cat plots of NSE
    across purposes, split by DOR class and diversion flag.
    """
    data_model = self.data_model
    config_data = self.config_data
    test_epoch = self.test_epoch
    # plot three factors
    attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
    usgs_id = data_model.t_s_dict["sites_id"]
    attrs_runavg_stor = data_model.data_source.read_attr(
        usgs_id, attr_lst, is_return_dict=False)
    run_avg = attrs_runavg_stor[:, 0] * (10**(-3)) * (10**6
                                                      )  # m^3 per year
    nor_storage = attrs_runavg_stor[:, 1] * 1000  # m^3
    dors_value = nor_storage / run_avg
    # Fixed-width numpy string array: dtype width is taken from the fill
    # value "dor<0.02" (8 chars); "dor≥0.02" is also 8 chars so the
    # assignment below is not truncated — keep the labels the same length.
    dors = np.full(len(usgs_id), "dor<0.02")
    for i in range(len(usgs_id)):
        if dors_value[i] >= 0.02:
            dors[i] = "dor≥0.02"
    # Same width trick: "no " is padded to 3 chars so "yes" fits.
    diversions = np.full(len(usgs_id), "no ")
    diversion_strs = ["diversion", "divert"]
    attr_lst = ["WR_REPORT_REMARKS", "SCREENING_COMMENTS"]
    data_attr = data_model.data_source.read_attr_origin(usgs_id, attr_lst)
    diversion_strs_lower = [elem.lower() for elem in diversion_strs]
    # Lower-case the two comment columns; non-string entries (e.g. NaN)
    # are passed through unchanged.
    data_attr0_lower = np.array([
        elem.lower() if type(elem) == str else elem
        for elem in data_attr[0]
    ])
    data_attr1_lower = np.array([
        elem.lower() if type(elem) == str else elem
        for elem in data_attr[1]
    ])
    data_attr_lower = np.vstack((data_attr0_lower, data_attr1_lower)).T
    for i in range(len(usgs_id)):
        # Mark as diverted if any keyword occurs in either comment field.
        if is_any_elem_in_a_lst(diversion_strs_lower,
                                data_attr_lower[i],
                                include=True):
            diversions[i] = "yes"
    nid_dir = os.path.join(
        "/".join(config_data.data_path["DB"].split("/")[:-1]), "nid",
        "test")
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    # Concatenate all purpose strings and take the unique single characters:
    # purposes are encoded as letter codes, so each letter is one purpose.
    gage_main_dam_purpose_lst_merge = "".join(gage_main_dam_purpose_lst)
    gage_main_dam_purpose_unique = np.unique(
        list(gage_main_dam_purpose_lst_merge))
    # gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    purpose_regions = {}
    for i in range(gage_main_dam_purpose_unique.size):
        sites_id = []
        for key, value in gage_main_dam_purpose.items():
            # Substring test: a site belongs to every purpose letter its
            # purpose code contains, so groups may overlap.
            if gage_main_dam_purpose_unique[i] in value:
                sites_id.append(key)
        assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
        purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
    id_regions_idx = []
    id_regions_sites_ids = []
    regions_name = []
    # Skip purpose groups with fewer than this many basins.
    show_min_num = 10
    df_id_region = np.array(data_model.t_s_dict["sites_id"])
    for key, value in purpose_regions.items():
        gages_id = value
        c, ind1, ind2 = np.intersect1d(df_id_region,
                                       gages_id,
                                       return_indices=True)
        if c.size < show_min_num:
            continue
        assert (all(x < y for x, y in zip(ind1, ind1[1:])))
        assert (all(x < y for x, y in zip(c, c[1:])))
        id_regions_idx.append(ind1)
        id_regions_sites_ids.append(c)
        regions_name.append(key)
    preds, obss, inds_dfs = split_results_to_regions(
        data_model, test_epoch, id_regions_idx, id_regions_sites_ids)
    frames = []
    x_name = "purposes"
    y_name = "NSE"
    hue_name = "DOR"
    col_name = "diversion"
    for i in range(len(id_regions_idx)):
        # Box plots drawn with seaborn.
        keys = ["NSE"]
        inds_test = subset_of_dict(inds_dfs[i], keys)
        inds_test = inds_test[keys[0]].values
        # One long-format frame per purpose group: purpose, NSE, DOR class,
        # diversion flag per basin.
        df_dict_i = {}
        str_i = regions_name[i]
        df_dict_i[x_name] = np.full([inds_test.size], str_i)
        df_dict_i[y_name] = inds_test
        df_dict_i[hue_name] = dors[id_regions_idx[i]]
        df_dict_i[col_name] = diversions[id_regions_idx[i]]
        # df_dict_i[hue_name] = nor_storage[id_regions_idx[i]]
        df_i = pd.DataFrame(df_dict_i)
        frames.append(df_i)
    result = pd.concat(frames)
    plot_boxs(result, x_name, y_name, ylim=[0, 1.0])
    plt.savefig(os.path.join(config_data.data_path["Out"],
                             'purpose_distribution.png'),
                dpi=500,
                bbox_inches="tight")
    # g = sns.catplot(x=x_name, y=y_name, hue=hue_name, col=col_name,
    #                 data=result, kind="swarm",
    #                 height=4, aspect=.7)
    sns.set(font_scale=1.5)
    fig, ax = plt.subplots()
    fig.set_size_inches(11.7, 8.27)
    g = sns.catplot(ax=ax,
                    x=x_name,
                    y=y_name,
                    hue=hue_name,
                    col=col_name,
                    data=result,
                    palette="Set1",
                    kind="box",
                    dodge=True,
                    showfliers=False)
    # g.set(ylim=(-1, 1))
    plt.savefig(os.path.join(config_data.data_path["Out"],
                             '3factors_distribution.png'),
                dpi=500,
                bbox_inches="tight")
    plt.show()
def test_scatter_dam_purpose(self):
    """Plot NSE distributions across main dam purposes, colored by DOR.

    Groups basins by their (whole) main dam purpose string, computes NSE per
    group, draws a box plot of NSE by purpose and a swarm plot with a DOR
    colorbar.
    """
    attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
    sites_nonref = self.data_model.t_s_dict["sites_id"]
    attrs_runavg_stor = self.data_model.data_source.read_attr(
        sites_nonref, attr_lst, is_return_dict=False)
    run_avg = attrs_runavg_stor[:, 0] * (10**(-3)) * (10**6
                                                      )  # m^3 per year
    nor_storage = attrs_runavg_stor[:, 1] * 1000  # m^3
    # DOR = normal storage / mean annual runoff volume.
    dors = nor_storage / run_avg
    # NOTE(review): here nid_dir is under data_path["DB"] itself, unlike the
    # sibling tests that use the DB dir's parent — confirm which is intended.
    nid_dir = os.path.join(self.config_data.data_path["DB"], "nid", "test")
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    # Build purpose -> sorted list of site ids (exact purpose-string match).
    purpose_regions = {}
    for i in range(gage_main_dam_purpose_unique.size):
        sites_id = []
        for key, value in gage_main_dam_purpose.items():
            if value == gage_main_dam_purpose_unique[i]:
                sites_id.append(key)
        assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
        purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
    id_regions_idx = []
    id_regions_sites_ids = []
    regions_name = []
    # Skip purpose groups with fewer than this many basins.
    show_min_num = 10
    df_id_region = np.array(self.data_model.t_s_dict["sites_id"])
    for key, value in purpose_regions.items():
        gages_id = value
        c, ind1, ind2 = np.intersect1d(df_id_region,
                                       gages_id,
                                       return_indices=True)
        if c.size < show_min_num:
            continue
        assert (all(x < y for x, y in zip(ind1, ind1[1:])))
        assert (all(x < y for x, y in zip(c, c[1:])))
        id_regions_idx.append(ind1)
        id_regions_sites_ids.append(c)
        regions_name.append(key)
    preds, obss, inds_dfs = split_results_to_regions(
        self.data_model, self.test_epoch, id_regions_idx,
        id_regions_sites_ids)
    frames = []
    x_name = "purposes"
    y_name = "NSE"
    hue_name = "DOR"
    # hue_name = "STOR"
    for i in range(len(id_regions_idx)):
        # Box plots drawn with seaborn.
        keys = ["NSE"]
        inds_test = subset_of_dict(inds_dfs[i], keys)
        inds_test = inds_test[keys[0]].values
        # Long-format frame per purpose group: purpose label, NSE, DOR.
        df_dict_i = {}
        str_i = regions_name[i]
        df_dict_i[x_name] = np.full([inds_test.size], str_i)
        df_dict_i[y_name] = inds_test
        df_dict_i[hue_name] = dors[id_regions_idx[i]]
        # df_dict_i[hue_name] = nor_storage[id_regions_idx[i]]
        df_i = pd.DataFrame(df_dict_i)
        frames.append(df_i)
    result = pd.concat(frames)
    # can remove high hue value to keep a good map
    plot_boxs(result, x_name, y_name, ylim=[-1.0, 1.0])
    plt.savefig(os.path.join(self.config_data.data_path["Out"],
                             'purpose_distribution_test.png'),
                dpi=500,
                bbox_inches="tight")
    plt.show()
    # plot_boxs(result, x_name, y_name, uniform_color="skyblue", swarm_plot=True, hue=hue_name, colormap=True,
    #           ylim=[-1.0, 1.0])
    cmap_str = 'viridis'
    # cmap = plt.get_cmap('Spectral')
    cbar_label = hue_name
    plt.title('Distribution of different purposes')
    swarmplot_with_cbar(cmap_str,
                        cbar_label, [-1, 1.0],
                        x=x_name,
                        y=y_name,
                        hue=hue_name,
                        palette=cmap_str,
                        data=result)