def test_dam_train(self):
    """Train the natural-flow simulation model on dam-purpose basins.

    Loads cached data models for the reference ("allref", used by the
    simulation LSTM) and non-reference ("allnonref") basins, joins the
    non-reference gages with NID dam records filtered by main purpose,
    and runs ``master_train_natural_flow`` on CUDA device 2.
    """
    db_root = self.config_data.data_path["DB"]
    quickdata_dir = os.path.join(db_root, "quickdata")
    ref_dir = os.path.join(quickdata_dir, "allref_85-05_nan-0.1_00-1.0")
    nonref_dir = os.path.join(quickdata_dir, "allnonref_85-05_nan-0.1_00-1.0")
    sim_datamodel = GagesModel.load_datamodel(
        ref_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    base_datamodel = GagesModel.load_datamodel(
        nonref_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    sim_train_model = GagesModel.update_data_model(
        self.sim_config_data, sim_datamodel, data_attr_update=True)
    train_model = GagesModel.update_data_model(
        self.config_data, base_datamodel, data_attr_update=True)
    # The simulation LSTM is trained for 300 epochs regardless of config default.
    sim_train_model.update_model_param('train', nEpoch=300)
    nid_root = "/".join(db_root.split("/")[:-1])
    nid_dir = os.path.join(nid_root, "nid", "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    dam_purposes = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    dam_data = GagesDamDataModel(train_model, nid_input, True, dam_purposes)
    purpose_data = choose_which_purpose(dam_data)
    with torch.cuda.device(2):
        sim_data_model = GagesSimDataModel(sim_train_model, purpose_data)
        # pre_trained_model_epoch = 230
        # master_train_natural_flow(data_model, pre_trained_model_epoch=pre_trained_model_epoch)
        master_train_natural_flow(sim_data_model)
def test_gages_data_model_quickdata(self):
    """Refresh the CONUS-wide train/test data models from quick data and cache them.

    The test model reuses the training statistics (``train_stat_dict``) so
    that normalization is consistent between the two periods.
    """
    quickdata_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    conus_dir = os.path.join(quickdata_dir, "conus-all_90-10_nan-0.0_00-1.0")
    train_dm = GagesModel.load_datamodel(
        conus_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    test_dm = GagesModel.load_datamodel(
        conus_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    model_train = GagesModel.update_data_model(
        self.config_data, train_dm, data_attr_update=True,
        screen_basin_area_huc4=False)
    model_test = GagesModel.update_data_model(
        self.config_data, test_dm, data_attr_update=True,
        train_stat_dict=model_train.stat_dict,
        screen_basin_area_huc4=False)
    save_datamodel(model_train,
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(model_test,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_dam_test(self):
    """Evaluate the trained model on dam-purpose basins and save results.

    Loads the CONUS quick-data models, attaches NID dam-purpose info to the
    test model, runs inference, converts predictions/observations back to
    physical units via ``_basin_norm``, and writes them with ``save_result``.

    Fix: ``master_test`` is now called with ``epoch=self.test_epoch`` so the
    checkpoint tested matches the epoch label used by ``save_result`` —
    previously the default epoch was tested but results were saved under
    ``self.test_epoch`` (the sibling ``test_dam_test`` already passed it).
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    data_model_train = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_test = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_model_train = GagesModel.update_data_model(
        self.config_data, data_model_train)
    # Test model normalizes with the training statistics.
    gages_model_test = GagesModel.update_data_model(
        self.config_data, data_model_test,
        train_stat_dict=gages_model_train.stat_dict)
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                   gage_main_dam_purpose)
    gages_input = choose_which_purpose(data_input)
    # Test the checkpoint of the same epoch that the results are saved under.
    pred, obs = master_test(gages_input, epoch=self.test_epoch)
    basin_area = gages_input.data_source.read_attr(
        gages_input.t_s_dict["sites_id"], ['DRAIN_SQKM'],
        is_return_dict=False)
    mean_prep = gages_input.data_source.read_attr(
        gages_input.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
        is_return_dict=False)
    # Convert annual mean precipitation to daily depth (mm/day).
    mean_prep = mean_prep / 365 * 10
    pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
    obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
    save_result(gages_input.data_source.data_config.data_path['Temp'],
                self.test_epoch, pred, obs)
def test_dam_train(self):
    """Train one inverse-LSTM per dam main-purpose class.

    Builds two data models over the same non-reference basins (one per
    config/time range) and, for every unique dam purpose, selects the
    matching basins and trains ``train_lstm_inv`` on CUDA device 1.

    Fix: the two ``GagesDamDataModel`` objects are loop-invariant (they do
    not depend on the purpose), so they are now built once before the loop
    instead of being rebuilt on every iteration; ``choose_which_purpose``
    is applied per purpose, as elsewhere in this file.
    """
    quick_data_dir = os.path.join(self.config_data_1.data_path["DB"],
                                  "quickdata")
    data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")
    # for inv model, datamodel of train and test are same
    data_model_8595 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    t_range1_train = self.config_data_1.model_dict["data"]["tRangeTrain"]
    gages_model1_train = GagesModel.update_data_model(
        self.config_data_1, data_model_8595, t_range_update=t_range1_train,
        data_attr_update=True)
    t_range2_train = self.config_data_2.model_dict["data"]["tRangeTrain"]
    gages_model2_train = GagesModel.update_data_model(
        self.config_data_2, data_model_8595, t_range_update=t_range2_train,
        data_attr_update=True)
    nid_dir = os.path.join(
        "/".join(self.config_data_1.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    # Hoisted out of the loop: these do not depend on the purpose being
    # selected (test_damcls_test_datamodel reuses a single instance too).
    data_input1 = GagesDamDataModel(gages_model1_train, nid_input, True,
                                    gage_main_dam_purpose)
    data_input2 = GagesDamDataModel(gages_model2_train, nid_input, True,
                                    gage_main_dam_purpose)
    with torch.cuda.device(1):
        for purpose in gage_main_dam_purpose_unique:
            gages_input1 = choose_which_purpose(data_input1, purpose=purpose)
            gages_input2 = choose_which_purpose(data_input2, purpose=purpose)
            data_model = GagesInvDataModel(gages_input1, gages_input2)
            # pre_trained_model_epoch = 165
            train_lstm_inv(data_model)
def test_gages_dam_all_save(self):
    """Attach NID dam-purpose data to the CONUS test model and cache it.

    Fix: ``GagesDamDataModel`` is now called with ``True`` before the
    purpose dict, matching every other call site in this file
    (``GagesDamDataModel(model, nid_input, True, gage_main_dam_purpose)``).
    Previously the purpose dict was passed in the boolean's position.
    NOTE(review): confirm against the GagesDamDataModel constructor
    signature that the third positional parameter is indeed a flag.
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    data_model_train = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    gages_model_train = GagesModel.update_data_model(
        self.config_data, data_model_train)
    data_model_test = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_model_test = GagesModel.update_data_model(
        self.config_data, data_model_test,
        train_stat_dict=gages_model_train.stat_dict)
    # NID data here comes from the "test" directory (no nid_file override),
    # unlike the "quickdata" NID loads elsewhere in this file.
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "test")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                   gage_main_dam_purpose)
    data_model_dam = choose_which_purpose(data_input)
    save_datamodel(data_model_dam,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
def test_da_data_temp(self):
    """Reload the allnonref 85-05 train/test data models and cache them to Temp.

    Note: unlike similar save helpers in this file, the test model here is
    updated WITHOUT the training statistics dict.
    """
    quickdata_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    src_dir = os.path.join(quickdata_dir, "allnonref_85-05_nan-0.1_00-1.0")
    train_dm = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    test_dm = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    model_train = GagesModel.update_data_model(self.config_data, train_dm)
    model_test = GagesModel.update_data_model(self.config_data, test_dm)
    save_datamodel(model_train,
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(model_test,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_purposes_inds(self):
    """Summarize test metrics per dam main-purpose group.

    Groups sites by their dam main purpose, splits the saved test results
    into those groups with ``split_results_to_regions``, and prints the
    median and mean of each group's metric DataFrame.
    """
    quickdata_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    dam_dir = os.path.join(quickdata_dir, "allnonref-dam_95-05_nan-0.1_00-1.0")
    test_dm = GagesModel.load_datamodel(
        dam_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_data_model = GagesModel.update_data_model(self.config_data, test_dm)
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    purpose_of_gage = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    unique_purposes = np.unique(list(purpose_of_gage.values()))
    purpose_regions = {}
    for purpose in unique_purposes:
        # Collect the sites whose main dam purpose equals this purpose;
        # dict order must already be sorted by site id (checked below).
        members = [site for site, p in purpose_of_gage.items() if p == purpose]
        assert (all(x < y for x, y in zip(members, members[1:])))
        purpose_regions[purpose] = members
    id_regions_idx = []
    id_regions_sites_ids = []
    all_sites = np.array(gages_data_model.t_s_dict["sites_id"])
    for region_sites in purpose_regions.values():
        common, idx_all, _ = np.intersect1d(all_sites, region_sites,
                                            return_indices=True)
        # intersect1d returns sorted results; verify strict ordering.
        assert (all(x < y for x, y in zip(idx_all, idx_all[1:])))
        assert (all(x < y for x, y in zip(common, common[1:])))
        id_regions_idx.append(idx_all)
        id_regions_sites_ids.append(common)
    preds, obss, inds_dfs = split_results_to_regions(
        gages_data_model, self.test_epoch, id_regions_idx,
        id_regions_sites_ids)
    region_names = list(purpose_regions.keys())
    inds_medians = [inds_dfs[i].median(axis=0)
                    for i in range(len(region_names))]
    inds_means = [inds_dfs[i].mean(axis=0)
                  for i in range(len(region_names))]
    print(inds_medians)
    print(inds_means)
def test_some_reservoirs(self):
    """choose some small reservoirs to train and test"""
    # Read the model configuration and pick basins with a major dam.
    config_data = self.config_data
    source_data = GagesSource.choose_some_basins(
        config_data, config_data.model_dict["data"]["tRangeTrain"],
        major_dam=1)
    chosen_sites = source_data.all_configs['flow_screen_gage_id']
    quickdata_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    src_dir = os.path.join(quickdata_dir, "allnonref_85-05_nan-0.1_00-1.0")
    train_dm = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    test_dm = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    # Restrict both data models to the chosen reservoir basins.
    model_train = GagesModel.update_data_model(
        self.config_data, train_dm, sites_id_update=chosen_sites)
    model_test = GagesModel.update_data_model(
        self.config_data, test_dm, sites_id_update=chosen_sites,
        train_stat_dict=model_train.stat_dict)
    save_datamodel(model_train,
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(model_test,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_damcls_test_datamodel(self):
    """Cache one test data model per dam main-purpose class.

    Joins the test-period data model with NID dam info once, then for each
    unique purpose selects the matching basins and saves that subset under
    the purpose's own subdirectory key.
    """
    quickdata_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
    src_dir = os.path.join(quickdata_dir, "allnonref_85-05_nan-0.1_00-1.0")
    train_dm = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    test_dm = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    model_train = GagesModel.update_data_model(self.config_data, train_dm)
    df = GagesModel.update_data_model(self.config_data, test_dm,
                                      train_stat_dict=model_train.stat_dict)
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    purpose_of_gage = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    unique_purposes = np.unique(list(purpose_of_gage.values()))
    dam_data = GagesDamDataModel(df, nid_input, True, purpose_of_gage)
    for purpose in unique_purposes:
        subset = choose_which_purpose(dam_data, purpose=purpose)
        save_datamodel(subset, purpose,
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
def test_dam_test(self):
    """Run inference on the dam quick-data test set, save and plot results."""
    with torch.cuda.device(1):
        quickdata_dir = os.path.join(self.config_data.data_path["DB"],
                                     "quickdata")
        dam_dir = os.path.join(quickdata_dir,
                               "allnonref-dam_95-05_nan-0.1_00-1.0")
        test_dm = GagesModel.load_datamodel(
            dam_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        gages_input = GagesModel.update_data_model(self.config_data, test_dm)
        pred, obs = master_test(gages_input, epoch=self.test_epoch)
        site_ids = gages_input.t_s_dict["sites_id"]
        basin_area = gages_input.data_source.read_attr(
            site_ids, ['DRAIN_SQKM'], is_return_dict=False)
        mean_prep = gages_input.data_source.read_attr(
            site_ids, ['PPTAVG_BASIN'], is_return_dict=False)
        # Convert annual mean precipitation to daily depth (mm/day).
        mean_prep = mean_prep / 365 * 10
        pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
        obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
        save_result(gages_input.data_source.data_config.data_path['Temp'],
                    self.test_epoch, pred, obs)
        plot_we_need(gages_input, obs, pred, id_col="STAID",
                     lon_col="LNG_GAGE", lat_col="LAT_GAGE")
def test_siminv_data_temp(self):
    """Prepare and cache data models for the sim ("1"), inv ("2") and main ("3") LSTMs.

    NOTE(review): ``data_model_allref_9505`` is loaded but never used below —
    the sim test model is built from the 85-95 data with an updated time
    range instead; the load is kept to preserve behavior, but confirm it is
    intentional.
    """
    quickdata_dir = os.path.join(self.config_data_sim.data_path["DB"],
                                 "quickdata")
    allref_dir = os.path.join(quickdata_dir, "allref_85-05_nan-0.1_00-1.0")
    allnonref_dir = os.path.join(quickdata_dir,
                                 "allnonref_85-05_nan-0.1_00-1.0")
    data_model_allref_8595 = GagesModel.load_datamodel(
        allref_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_allref_9505 = GagesModel.load_datamodel(
        allref_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    data_model_allnonref_8595 = GagesModel.load_datamodel(
        allnonref_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_allnonref_9505 = GagesModel.load_datamodel(
        allnonref_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    # Simulation LSTM: both train and test come from the allref 85-95 data,
    # sliced to the configured train/test time ranges.
    t_range_sim_train = self.config_data_sim.model_dict["data"]["tRangeTrain"]
    t_range_sim_test = self.config_data_sim.model_dict["data"]["tRangeTest"]
    sim_gages_model_train = GagesModel.update_data_model(
        self.config_data_sim, data_model_allref_8595,
        t_range_update=t_range_sim_train, data_attr_update=True)
    sim_gages_model_test = GagesModel.update_data_model(
        self.config_data_sim, data_model_allref_8595,
        t_range_update=t_range_sim_test, data_attr_update=True)
    # Inverse LSTM: train and test share the allnonref 85-95 data.
    t_range_inv_train = self.config_data_inv.model_dict["data"]["tRangeTrain"]
    t_range_inv_test = self.config_data_inv.model_dict["data"]["tRangeTest"]
    inv_gages_model_train = GagesModel.update_data_model(
        self.config_data_inv, data_model_allnonref_8595,
        t_range_update=t_range_inv_train, data_attr_update=True)
    inv_gages_model_test = GagesModel.update_data_model(
        self.config_data_inv, data_model_allnonref_8595,
        t_range_update=t_range_inv_test, data_attr_update=True)
    # Main LSTM: train on allnonref 85-95, test on allnonref 95-05 with the
    # training statistics for normalization.
    t_range_train = self.config_data.model_dict["data"]["tRangeTrain"]
    t_range_test = self.config_data.model_dict["data"]["tRangeTest"]
    gages_model_train = GagesModel.update_data_model(
        self.config_data, data_model_allnonref_8595,
        t_range_update=t_range_train, data_attr_update=True)
    gages_model_test = GagesModel.update_data_model(
        self.config_data, data_model_allnonref_9505,
        t_range_update=t_range_test, data_attr_update=True,
        train_stat_dict=gages_model_train.stat_dict)
    save_datamodel(sim_gages_model_train, "1",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(sim_gages_model_test, "1",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    save_datamodel(inv_gages_model_train, "2",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(inv_gages_model_test, "2",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    save_datamodel(gages_model_train, "3",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test, "3",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_inv_data_temp(self):
    """Prepare and cache data models for the inverse-LSTM experiment.

    Model "1" is the historical-input LSTM-Inv (its train and test data
    models both come from the 85-95 data); model "2" is the second LSTM,
    whose test model comes from the 95-05 data normalized with model 2's
    training statistics.
    """
    quickdata_dir = os.path.join(self.config_data_1.data_path["DB"],
                                 "quickdata")
    src_dir = os.path.join(quickdata_dir, "allnonref_85-05_nan-0.1_00-1.0")
    # For the inv model, the train and test data models are the same source.
    data_model_8595 = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    # For the 2nd model, the test data model belongs to the test period.
    data_model_9505 = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    t_range1_train = self.config_data_1.model_dict["data"]["tRangeTrain"]
    t_range1_test = self.config_data_1.model_dict["data"]["tRangeTest"]
    gages_model1_train = GagesModel.update_data_model(
        self.config_data_1, data_model_8595,
        t_range_update=t_range1_train, data_attr_update=True)
    # Data for "90-95" is known, so its statistics can come from that period.
    gages_model1_test = GagesModel.update_data_model(
        self.config_data_1, data_model_8595,
        t_range_update=t_range1_test, data_attr_update=True)
    t_range2_train = self.config_data_2.model_dict["data"]["tRangeTrain"]
    t_range2_test = self.config_data_2.model_dict["data"]["tRangeTest"]
    gages_model2_train = GagesModel.update_data_model(
        self.config_data_2, data_model_8595,
        t_range_update=t_range2_train, data_attr_update=True)
    gages_model2_test = GagesModel.update_data_model(
        self.config_data_2, data_model_9505,
        t_range_update=t_range2_test, data_attr_update=True,
        train_stat_dict=gages_model2_train.stat_dict)
    save_datamodel(gages_model1_train, "1",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model1_test, "1",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    save_datamodel(gages_model2_train, "2",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model2_test, "2",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_dam_test(self):
    """Evaluate the inverse-LSTM on dam-purpose basins and save the results.

    Builds the two test data models (historical-input model 1 from 85-95,
    model 2 from 95-05 normalized with model 2's training statistics),
    joins each with NID dam-purpose data, and runs ``test_lstm_inv`` on
    CUDA device 2 before denormalizing and saving predictions.
    """
    quickdata_dir = os.path.join(self.config_data_1.data_path["DB"],
                                 "quickdata")
    src_dir = os.path.join(quickdata_dir, "allnonref_85-05_nan-0.1_00-1.0")
    # For the inv model, the train and test data models are the same source.
    data_model_8595 = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    # For the 2nd model, the test data model belongs to the test period.
    data_model_9505 = GagesModel.load_datamodel(
        src_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    t_range1_test = self.config_data_1.model_dict["data"]["tRangeTest"]
    # Data for "90-95" is known, so its statistics can come from that period.
    gages_model1_test = GagesModel.update_data_model(
        self.config_data_1, data_model_8595,
        t_range_update=t_range1_test, data_attr_update=True)
    t_range2_train = self.config_data_2.model_dict["data"]["tRangeTrain"]
    t_range2_test = self.config_data_2.model_dict["data"]["tRangeTest"]
    gages_model2_train = GagesModel.update_data_model(
        self.config_data_2, data_model_8595,
        t_range_update=t_range2_train, data_attr_update=True)
    gages_model2_test = GagesModel.update_data_model(
        self.config_data_2, data_model_9505,
        t_range_update=t_range2_test, data_attr_update=True,
        train_stat_dict=gages_model2_train.stat_dict)
    nid_dir = os.path.join(
        "/".join(self.config_data_2.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir, nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    purpose_of_gage = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    df1 = choose_which_purpose(
        GagesDamDataModel(gages_model1_test, nid_input, True, purpose_of_gage))
    df2 = choose_which_purpose(
        GagesDamDataModel(gages_model2_test, nid_input, True, purpose_of_gage))
    with torch.cuda.device(2):
        inv_data = GagesInvDataModel(df1, df2)
        pred, obs = test_lstm_inv(inv_data, epoch=self.test_epoch)
        site_ids = df2.t_s_dict["sites_id"]
        basin_area = df2.data_source.read_attr(site_ids, ['DRAIN_SQKM'],
                                               is_return_dict=False)
        mean_prep = df2.data_source.read_attr(site_ids, ['PPTAVG_BASIN'],
                                              is_return_dict=False)
        # Convert annual mean precipitation to daily depth (mm/day).
        mean_prep = mean_prep / 365 * 10
        pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
        obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
        save_result(df2.data_source.data_config.data_path['Temp'],
                    self.test_epoch, pred, obs)
# NOTE(review): fragment of a larger routine — `camels_pub_split_num`, `j`,
# `gages_configs_test`, `gages_sites_test`, `data_model_test`, `test_epoch`
# and `camels_exp_lst` are defined in an enclosing scope not visible here.
# For each CAMELS PUB split i: load that split's cached data model, build the
# gages test model normalized with split i's statistics, then run inference
# with split i's pretrained checkpoint.
for i in range(camels_pub_split_num):
    # Cached data model of split i (stored under Temp/<i>).
    data_model_i = GagesModel.load_datamodel(
        config_data.data_path["Temp"], str(i),
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json', flow_file_name='flow.npy',
        forcing_file_name='forcing.npy', attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    # Test model for the j-th gages config/site set, normalized with the
    # statistics of split i's training data.
    gages_model_test = GagesModel.update_data_model(
        gages_configs_test[j], data_model_test,
        sites_id_update=gages_sites_test[j], data_attr_update=True,
        train_stat_dict=data_model_i.stat_dict,
        screen_basin_area_huc4=False)
    with torch.cuda.device(0):
        # Checkpoint of split i at the chosen test epoch.
        pretrained_model_file = os.path.join(
            data_model_i.data_source.data_config.data_path["Out"],
            "model_Ep" + str(test_epoch) + ".pt")
        pretrained_model_name = camels_exp_lst[
            0] + "_pretrained_model" + str(i)
        pred, obs = master_test_with_pretrained_model(
            gages_model_test, pretrained_model_file, pretrained_model_name)
        # Basin drainage area, used later for unit conversion of the flows.
        basin_area = gages_model_test.data_source.read_attr(
            gages_model_test.t_s_dict["sites_id"], ['DRAIN_SQKM'],
            is_return_dict=False)
def synergy_ecoregion(args):
    """Train and evaluate one model per ECO2 ecoregion on the GAGES dataset.

    Loads the cached CONUS-wide 1990-2010 train/test data models, then for
    each ecoregion code in ``eco_names``: subsets the basins belonging to that
    ecoregion, optionally serializes the subset (cfg.CACHE.STATE), trains
    (when cfg.TRAIN_MODE) and tests a model on GPU 0, denormalizes the
    predictions/observations, and saves the results.

    Args:
        args: command-line style arguments merged into the global ``cfg``
            via ``update_cfg``.
    """
    update_cfg(cfg, args)
    cache = cfg.CACHE.STATE  # whether to serialize each ecoregion's subset
    train_mode = cfg.TRAIN_MODE
    test_epoch = cfg.TEST_EPOCH
    config_data = GagesConfig(cfg)
    # (attribute name, ECO2 code) pairs: the level-2 ecoregions to process
    eco_names = [("ECO2_CODE", 5.2), ("ECO2_CODE", 5.3), ("ECO2_CODE", 6.2),
                 ("ECO2_CODE", 7.1), ("ECO2_CODE", 8.1), ("ECO2_CODE", 8.2),
                 ("ECO2_CODE", 8.3), ("ECO2_CODE", 8.4), ("ECO2_CODE", 8.5),
                 ("ECO2_CODE", 9.2), ("ECO2_CODE", 9.3), ("ECO2_CODE", 9.4),
                 ("ECO2_CODE", 9.5), ("ECO2_CODE", 9.6), ("ECO2_CODE", 10.1),
                 ("ECO2_CODE", 10.2), ("ECO2_CODE", 10.4), ("ECO2_CODE", 11.1),
                 ("ECO2_CODE", 12.1), ("ECO2_CODE", 13.1)]
    quick_data_dir = os.path.join(config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    # cached CONUS-wide training-period data model
    data_model_train = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    # cached CONUS-wide testing-period data model
    data_model_test = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    for eco_name in eco_names:
        # NOTE(review): config_data is rebound to the per-ecoregion subdir
        # config further down in this loop, so from the second iteration
        # onward this call uses the previous iteration's subdir config
        # rather than the original one -- confirm this is intended.
        source_data = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            ecoregion=eco_name)
        sites_id = source_data.all_configs['flow_screen_gage_id']
        sites_id_inter = np.intersect1d(data_model_train.t_s_dict["sites_id"],
                                        sites_id)
        # skip ecoregions with no gauges present in the cached CONUS dataset
        if sites_id_inter.size < 1:
            continue
        # each ecoregion writes to its own subdirectory named by its code
        config_data = GagesConfig.set_subdir(cfg, str(eco_name[1]))
        gages_model_train = GagesModel.update_data_model(
            config_data,
            data_model_train,
            sites_id_update=sites_id,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        # test set is normalized with the training set's statistics
        gages_model_test = GagesModel.update_data_model(
            config_data,
            data_model_test,
            sites_id_update=sites_id,
            data_attr_update=True,
            train_stat_dict=gages_model_train.stat_dict,
            screen_basin_area_huc4=False)
        if cache:
            save_datamodel(gages_model_train,
                           data_source_file_name='data_source.txt',
                           stat_file_name='Statistics.json',
                           flow_file_name='flow',
                           forcing_file_name='forcing',
                           attr_file_name='attr',
                           f_dict_file_name='dictFactorize.json',
                           var_dict_file_name='dictAttribute.json',
                           t_s_dict_file_name='dictTimeSpace.json')
            save_datamodel(gages_model_test,
                           data_source_file_name='test_data_source.txt',
                           stat_file_name='test_Statistics.json',
                           flow_file_name='test_flow',
                           forcing_file_name='test_forcing',
                           attr_file_name='test_attr',
                           f_dict_file_name='test_dictFactorize.json',
                           var_dict_file_name='test_dictAttribute.json',
                           t_s_dict_file_name='test_dictTimeSpace.json')
            print("save ecoregion " + str(eco_name[1]) + " data model")
        with torch.cuda.device(0):
            if train_mode:
                master_train(gages_model_train)
            pred, obs = master_test(gages_model_test, epoch=test_epoch)
            basin_area = gages_model_test.data_source.read_attr(
                gages_model_test.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                is_return_dict=False)
            mean_prep = gages_model_test.data_source.read_attr(
                gages_model_test.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                is_return_dict=False)
            # presumably converts annual precipitation to a per-day value in
            # the units _basin_norm expects -- same factor used file-wide;
            # TODO confirm units
            mean_prep = mean_prep / 365 * 10
            # denormalize model output back to streamflow
            pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
            obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
            save_result(
                gages_model_test.data_source.data_config.data_path['Temp'],
                test_epoch, pred, obs)
def test_gages_data_model(self):
    """Build and cache train/test data models restricted to undammed basins.

    Screens the GAGES basins to those with zero dams, subsets the cached
    1985-2005 CONUS quick-load data models to those basins, and serializes
    the resulting train/test data models.
    """
    # Basins with no dams at all.
    undammed_source = GagesSource.choose_some_basins(
        self.config_data,
        self.config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        dam_num=0)
    chosen_ids = undammed_source.all_configs['flow_screen_gage_id']
    # Pre-generated quick-load binaries for the 1985-2005 CONUS dataset.
    base_dir = os.path.join(self.config_data.data_path["DB"], "quickdata",
                            "conus-all_85-05_nan-0.1_00-1.0")
    train_model = GagesModel.load_datamodel(
        base_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    test_model = GagesModel.load_datamodel(
        base_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_model_train = GagesModel.update_data_model(
        self.config_data, train_model,
        sites_id_update=chosen_ids,
        screen_basin_area_huc4=False)
    # Test-period model reuses the training statistics for normalization.
    gages_model_test = GagesModel.update_data_model(
        self.config_data, test_model,
        sites_id_update=chosen_ids,
        train_stat_dict=gages_model_train.stat_dict,
        screen_basin_area_huc4=False)
    save_datamodel(gages_model_train,
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_gages_data_model(self):
    """Prepare and cache train/test data models for basins with major dams.

    Basins are screened to those with 1-200 major dams (the actual maximum
    in the dataset is 155).  When cfg.CACHE.QUICK_DATA is set, the cached
    CONUS 1990-2010 quick-load data models are loaded and subset to those
    basins; otherwise the data models are built from scratch via
    ``GagesModels``.  The resulting train/test data models are serialized
    when cfg.CACHE.STATE is set.
    """
    config_data = self.config_data
    major_dam_num = [1, 200]  # max major dam num is 155
    if cfg.CACHE.QUICK_DATA:
        source_data = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            major_dam_num=major_dam_num)
        sites_id = source_data.all_configs['flow_screen_gage_id']
        # fixed typo in user-facing message: "has exsited"
        print("The binary data already exists")
        quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                      "quickdata")
        # data_dir = os.path.join(quick_data_dir, "conus-all_85-05_nan-0.1_00-1.0")
        data_dir = os.path.join(quick_data_dir,
                                "conus-all_90-10_nan-0.0_00-1.0")
        data_model_train = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        data_model_test = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        gages_model_train = GagesModel.update_data_model(
            self.config_data,
            data_model_train,
            sites_id_update=sites_id,
            screen_basin_area_huc4=False)
        # test set is normalized with the training set's statistics
        gages_model_test = GagesModel.update_data_model(
            self.config_data,
            data_model_test,
            sites_id_update=sites_id,
            train_stat_dict=gages_model_train.stat_dict,
            screen_basin_area_huc4=False)
    else:
        # no quick-load cache: build both data models from the raw sources
        gages_model = GagesModels(config_data,
                                  screen_basin_area_huc4=False,
                                  major_dam_num=major_dam_num)
        gages_model_train = gages_model.data_model_train
        gages_model_test = gages_model.data_model_test
    if cfg.CACHE.STATE:
        save_datamodel(gages_model_train,
                       data_source_file_name='data_source.txt',
                       stat_file_name='Statistics.json',
                       flow_file_name='flow',
                       forcing_file_name='forcing',
                       attr_file_name='attr',
                       f_dict_file_name='dictFactorize.json',
                       var_dict_file_name='dictAttribute.json',
                       t_s_dict_file_name='dictTimeSpace.json')
        save_datamodel(gages_model_test,
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_siminv_data_temp(self):
    """Prepare and cache paired data models for the sim/inv experiment.

    Splits the cached CONUS basins into two groups: basins with no major
    dams (used by the natural-flow generator model, saved under subdir "1")
    and basins with at least one major dam (used by the LSTM, saved under
    subdir "2").  Each group gets a train and a test data model; test models
    are normalized with their training statistics.
    """
    quick_data_dir = os.path.join(self.config_data_natflow.data_path["DB"],
                                  "quickdata")
    # data_dir = os.path.join(quick_data_dir, "conus-all_85-05_nan-0.1_00-1.0")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    # NOTE(review): the _8595/_9505 suffixes look historical -- the data
    # actually loaded is the 1990-2010 dataset selected above; confirm.
    data_model_8595 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_9505 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    conus_sites_id = data_model_8595.t_s_dict["sites_id"]
    # basins with zero major dams -> natural-flow group
    nomajordam_source_data = GagesSource.choose_some_basins(
        self.config_data_natflow,
        self.config_data_natflow.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        major_dam_num=0)
    nomajordam_sites_id = nomajordam_source_data.all_configs[
        'flow_screen_gage_id']
    nomajordam_in_conus = np.intersect1d(conus_sites_id, nomajordam_sites_id)
    # basins with at least one major dam -> LSTM group
    majordam_source_data = GagesSource.choose_some_basins(
        self.config_data_natflow,
        self.config_data_natflow.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        major_dam_num=[1, 2000])
    majordam_sites_id = majordam_source_data.all_configs[
        'flow_screen_gage_id']
    majordam_in_conus = np.intersect1d(conus_sites_id, majordam_sites_id)
    gages_model_train_natflow = GagesModel.update_data_model(
        self.config_data_natflow,
        data_model_8595,
        sites_id_update=nomajordam_in_conus,
        data_attr_update=True,
        screen_basin_area_huc4=False)
    # test model normalized with the natural-flow training statistics
    gages_model_test_natflow = GagesModel.update_data_model(
        self.config_data_natflow,
        data_model_9505,
        sites_id_update=nomajordam_in_conus,
        data_attr_update=True,
        train_stat_dict=gages_model_train_natflow.stat_dict,
        screen_basin_area_huc4=False)
    gages_model_train_lstm = GagesModel.update_data_model(
        self.config_data_lstm,
        data_model_8595,
        sites_id_update=majordam_in_conus,
        data_attr_update=True,
        screen_basin_area_huc4=False)
    # test model normalized with the LSTM training statistics
    gages_model_test_lstm = GagesModel.update_data_model(
        self.config_data_lstm,
        data_model_9505,
        sites_id_update=majordam_in_conus,
        data_attr_update=True,
        train_stat_dict=gages_model_train_lstm.stat_dict,
        screen_basin_area_huc4=False)
    # subdir "1": natural-flow group; subdir "2": LSTM group
    save_datamodel(gages_model_train_natflow, "1",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test_natflow, "1",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    save_datamodel(gages_model_train_lstm, "2",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test_lstm, "2",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_some_reservoirs(self):
    """Cache train/test data models for no-dam basins plus DOR-screened basins.

    Takes the union of two basin groups -- basins screened by degree of
    regulation (DOR=0.02) and basins without any dams -- then subsets the
    cached CONUS 1990-2010 quick-load data models to that union and
    serializes the result.
    """
    conf = self.config_data
    train_range = conf.model_dict["data"]["tRangeTrain"]
    # Group 1: basins screened by degree of regulation.
    dor_source = GagesSource.choose_some_basins(
        conf, train_range, screen_basin_area_huc4=False, DOR=0.02)
    # basins with dams
    nodam_source = GagesSource.choose_some_basins(
        conf, train_range, screen_basin_area_huc4=False, dam_num=0)
    dor_ids = dor_source.all_configs['flow_screen_gage_id']
    nodam_ids = nodam_source.all_configs['flow_screen_gage_id']
    # Sorted union of both groups, as a plain list of site ids.
    sites_id = np.sort(
        np.union1d(np.array(dor_ids), np.array(nodam_ids))).tolist()
    base_dir = os.path.join(self.config_data.data_path["DB"], "quickdata",
                            "conus-all_90-10_nan-0.0_00-1.0")
    train_model = GagesModel.load_datamodel(
        base_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    test_model = GagesModel.load_datamodel(
        base_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_model_train = GagesModel.update_data_model(
        self.config_data, train_model,
        sites_id_update=sites_id,
        data_attr_update=True,
        screen_basin_area_huc4=False)
    # Normalize the test period with the training statistics.
    gages_model_test = GagesModel.update_data_model(
        self.config_data, test_model,
        sites_id_update=sites_id,
        data_attr_update=True,
        train_stat_dict=gages_model_train.stat_dict,
        screen_basin_area_huc4=False)
    save_datamodel(gages_model_train,
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_gages_sim_data_model(self):
    """Prepare and cache the paired simulate/target data models.

    Loads four cached 1985-2005 data models: the reference-basin ("allref")
    train/test pair used by the simulator, and the non-reference-basin
    ("allnonref") train/test pair used as the target.  Each test model is
    normalized with its corresponding training statistics.  Simulator models
    are saved under subdir "1", target models under subdir "2".
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
    # reference basins drive the simulator; non-reference are the targets
    sim_data_dir = os.path.join(quick_data_dir, "allref_85-05_nan-0.1_00-1.0")
    data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")
    data_model_sim8595 = GagesModel.load_datamodel(
        sim_data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_8595 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_sim9505 = GagesModel.load_datamodel(
        sim_data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    data_model_9505 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    sim_gages_model_train = GagesModel.update_data_model(
        self.sim_config_data, data_model_sim8595, data_attr_update=True)
    gages_model_train = GagesModel.update_data_model(self.config_data,
                                                     data_model_8595,
                                                     data_attr_update=True)
    # test models are normalized with their matching training statistics
    sim_gages_model_test = GagesModel.update_data_model(
        self.sim_config_data,
        data_model_sim9505,
        data_attr_update=True,
        train_stat_dict=sim_gages_model_train.stat_dict)
    gages_model_test = GagesModel.update_data_model(
        self.config_data,
        data_model_9505,
        data_attr_update=True,
        train_stat_dict=gages_model_train.stat_dict)
    # subdir "1": simulator (reference basins); subdir "2": target basins
    save_datamodel(sim_gages_model_train, "1",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(sim_gages_model_test, "1",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    save_datamodel(gages_model_train, "2",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test, "2",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_dam_test(self):
    """Test the sim-LSTM separately for each main dam purpose.

    Rebuilds the simulate (allref) and target (allnonref) train/test data
    models, loads the NID dam inventory, and then for every distinct main
    dam purpose: subsets the target basins to that purpose, redirects their
    temp/output dirs into a purpose-named subdirectory, runs
    master_test_natural_flow, denormalizes predictions/observations, saves
    the results, and plots them.
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
    sim_data_dir = os.path.join(quick_data_dir, "allref_85-05_nan-0.1_00-1.0")
    data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")
    data_model_sim8595 = GagesModel.load_datamodel(
        sim_data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_8595 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_sim9505 = GagesModel.load_datamodel(
        sim_data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    data_model_9505 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    sim_gages_model_train = GagesModel.update_data_model(
        self.sim_config_data, data_model_sim8595, data_attr_update=True)
    gages_model_train = GagesModel.update_data_model(self.config_data,
                                                     data_model_8595,
                                                     data_attr_update=True)
    # test models are normalized with their matching training statistics
    sim_gages_model_test = GagesModel.update_data_model(
        self.sim_config_data,
        data_model_sim9505,
        data_attr_update=True,
        train_stat_dict=sim_gages_model_train.stat_dict)
    gages_model_test = GagesModel.update_data_model(
        self.config_data,
        data_model_9505,
        data_attr_update=True,
        train_stat_dict=gages_model_train.stat_dict)
    # NID quick data lives beside (not inside) the GAGES DB directory
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    nid_input = NidModel.load_nidmodel(
        nid_dir,
        nid_file=self.nid_file,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    # distinct main dam purposes; one test run per purpose
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                   gage_main_dam_purpose)
    for i in range(0, gage_main_dam_purpose_unique.size):
        # NOTE(review): this call does not depend on i and could be hoisted
        # out of the loop -- confirm update_model_param has no per-iteration
        # side effect that is relied upon here.
        sim_gages_model_test.update_model_param('train', nEpoch=300)
        gages_input = choose_which_purpose(
            data_input, purpose=gage_main_dam_purpose_unique[i])
        # redirect temp/output dirs into a purpose-named subdirectory
        new_temp_dir = os.path.join(
            gages_input.data_source.data_config.model_dict["dir"]["Temp"],
            gage_main_dam_purpose_unique[i])
        new_out_dir = os.path.join(
            gages_input.data_source.data_config.model_dict["dir"]["Out"],
            gage_main_dam_purpose_unique[i])
        gages_input.update_datamodel_dir(new_temp_dir, new_out_dir)
        model_input = GagesSimDataModel(sim_gages_model_test, gages_input)
        pred, obs = master_test_natural_flow(model_input,
                                             epoch=self.test_epoch)
        basin_area = model_input.data_model2.data_source.read_attr(
            model_input.data_model2.t_s_dict["sites_id"], ['DRAIN_SQKM'],
            is_return_dict=False)
        mean_prep = model_input.data_model2.data_source.read_attr(
            model_input.data_model2.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
            is_return_dict=False)
        # presumably converts annual precipitation to a per-day value in the
        # units _basin_norm expects -- same factor used file-wide; TODO confirm
        mean_prep = mean_prep / 365 * 10
        # denormalize model output back to streamflow
        pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
        obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
        save_result(
            model_input.data_model2.data_source.data_config.
            data_path['Temp'], str(self.test_epoch), pred, obs)
        plot_we_need(gages_input,
                     obs,
                     pred,
                     id_col="STAID",
                     lon_col="LNG_GAGE",
                     lat_col="LAT_GAGE")
def test_some_reservoirs(self):
    """choose some small reservoirs for 2nd lstm not for simulate

    Splits the cached CONUS basins into a natural-flow group (no major
    dams, saved under subdir "1") and a 2nd-LSTM group (DOR-screened basins
    that also have at least one major dam, saved under subdir "2"), then
    serializes the four resulting train/test data models.
    """
    # read the model configuration file
    config_data = self.config_data_lstm
    # according to paper "High-resolution mapping of the world's reservoirs and dams for sustainable river-flow management"
    dor = 0.02
    source_data = GagesSource.choose_some_basins(
        config_data,
        config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        DOR=dor)
    sites_id_dor = source_data.all_configs['flow_screen_gage_id']
    quick_data_dir = os.path.join(self.config_data_lstm.data_path["DB"],
                                  "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    # NOTE(review): the _8595/_9505 suffixes look historical -- the data
    # actually loaded is the 1990-2010 dataset selected above; confirm.
    data_model_8595 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_9505 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    conus_sites_id_all = data_model_8595.t_s_dict["sites_id"]
    nomajordam_source_data = GagesSource.choose_some_basins(
        self.config_data_natflow,
        self.config_data_natflow.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        major_dam_num=0)
    nomajordam_sites_id = nomajordam_source_data.all_configs[
        'flow_screen_gage_id']
    # In no major dam case, all sites are chosen as natural flow generator
    nomajordam_in_conus = np.intersect1d(conus_sites_id_all,
                                         nomajordam_sites_id)
    # DOR-screened basins restricted to those in the cached CONUS set
    conus_sites_id_dor = np.intersect1d(conus_sites_id_all, sites_id_dor)
    majordam_source_data = GagesSource.choose_some_basins(
        self.config_data_natflow,
        self.config_data_natflow.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        major_dam_num=[1, 2000])
    majordam_sites_id = majordam_source_data.all_configs[
        'flow_screen_gage_id']
    # 2nd-LSTM group: DOR-screened basins that also have a major dam
    majordam_in_conus = np.intersect1d(conus_sites_id_dor, majordam_sites_id)
    gages_model_train_natflow = GagesModel.update_data_model(
        self.config_data_natflow,
        data_model_8595,
        sites_id_update=nomajordam_in_conus,
        data_attr_update=True,
        screen_basin_area_huc4=False)
    # test model normalized with the natural-flow training statistics
    gages_model_test_natflow = GagesModel.update_data_model(
        self.config_data_natflow,
        data_model_9505,
        sites_id_update=nomajordam_in_conus,
        data_attr_update=True,
        train_stat_dict=gages_model_train_natflow.stat_dict,
        screen_basin_area_huc4=False)
    gages_model_train_lstm = GagesModel.update_data_model(
        self.config_data_lstm,
        data_model_8595,
        sites_id_update=majordam_in_conus,
        data_attr_update=True,
        screen_basin_area_huc4=False)
    # test model normalized with the LSTM training statistics
    gages_model_test_lstm = GagesModel.update_data_model(
        self.config_data_lstm,
        data_model_9505,
        sites_id_update=majordam_in_conus,
        data_attr_update=True,
        train_stat_dict=gages_model_train_lstm.stat_dict,
        screen_basin_area_huc4=False)
    # subdir "1": natural-flow group; subdir "2": 2nd-LSTM group
    save_datamodel(gages_model_train_natflow, "1",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test_natflow, "1",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    save_datamodel(gages_model_train_lstm, "2",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test_lstm, "2",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_some_reservoirs(self):
    """Prepare and cache the control-group data models for simulate/exp3.

    Selects basins with DOR below 0.02, plus all basins with no major dams,
    subsets the cached CONUS 1990-2010 quick-load train/test data models to
    that combined set, and serializes the result.
    """
    # # a control group for simulate/exp3
    dor = -0.02  # meaning dor < 0.02
    source_data = GagesSource.choose_some_basins(
        self.config_data,
        self.config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        DOR=dor)
    sites_id_dor = source_data.all_configs['flow_screen_gage_id']
    quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    # 1990-2000 training-period data model
    data_model_9000 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    # 2000-2010 testing-period data model
    data_model_0010 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    conus_sites_id_all = data_model_9000.t_s_dict["sites_id"]
    nomajordam_source_data = GagesSource.choose_some_basins(
        self.config_data,
        self.config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        major_dam_num=0)
    nomajordam_sites_id = nomajordam_source_data.all_configs[
        'flow_screen_gage_id']
    # In no major dam case, all sites are chosen as natural flow generator
    nomajordam_in_conus = np.intersect1d(conus_sites_id_all,
                                         nomajordam_sites_id)
    # low-DOR basins restricted to those in the cached CONUS set
    conus_sites_id_dor = np.intersect1d(conus_sites_id_all, sites_id_dor)
    majordam_source_data = GagesSource.choose_some_basins(
        self.config_data,
        self.config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        major_dam_num=[1, 2000])
    majordam_sites_id = majordam_source_data.all_configs[
        'flow_screen_gage_id']
    # low-DOR basins that also have at least one major dam
    majordam_in_conus = np.intersect1d(conus_sites_id_dor, majordam_sites_id)
    # final control group: no-major-dam basins plus low-DOR major-dam basins
    chosen_sites_id = np.sort(
        np.append(nomajordam_in_conus, majordam_in_conus))
    gages_model_train_lstm = GagesModel.update_data_model(
        self.config_data,
        data_model_9000,
        sites_id_update=chosen_sites_id,
        data_attr_update=True,
        screen_basin_area_huc4=False)
    # test model normalized with the training statistics
    gages_model_test_lstm = GagesModel.update_data_model(
        self.config_data,
        data_model_0010,
        sites_id_update=chosen_sites_id,
        data_attr_update=True,
        train_stat_dict=gages_model_train_lstm.stat_dict,
        screen_basin_area_huc4=False)
    save_datamodel(gages_model_train_lstm,
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test_lstm,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def pub_lstm(args):
    """Run the PUB (prediction in ungauged basins) train/test experiment.

    Reads all experiment switches from the global ``cfg`` (after applying
    command-line ``args``):

    - ``pub_plan`` selects how train/test basin sets are chosen (CAMELS-531
      split, dammed vs. undammed, or dor-threshold splits);
    - ``plus`` selects the k-fold strategy (0: per-ecoregion folds with a
      second test group; -1/-2: CAMELS-only pub; otherwise a mixed
      train-1/train-2 scheme);
    - ``cache`` controls whether the per-split data models are (re)built
      and saved to disk before training/testing.

    After the optional caching phase, trains (if ``train_mode``) and tests
    one model per split on the configured GPU, denormalizing predictions
    with basin area and mean precipitation before saving results.
    """
    update_cfg(cfg, args)
    # unpack experiment switches from the global config object
    random_seed = cfg.RANDOM_SEED
    test_epoch = cfg.TEST_EPOCH
    gpu_num = cfg.CTX
    train_mode = cfg.TRAIN_MODE
    cache = cfg.CACHE.STATE
    pub_plan = cfg.PUB_PLAN
    plus = cfg.PLUS
    dor = cfg.GAGES.attrScreenParams.DOR
    split_num = cfg.SPLIT_NUM
    print("train and test for PUB: \n")
    config_data = GagesConfig(cfg)
    if cache:
        # level-II ecoregions used to stratify the k-fold splits
        eco_names = [("ECO2_CODE", 5.2), ("ECO2_CODE", 5.3),
                     ("ECO2_CODE", 6.2), ("ECO2_CODE", 7.1),
                     ("ECO2_CODE", 8.1), ("ECO2_CODE", 8.2),
                     ("ECO2_CODE", 8.3), ("ECO2_CODE", 8.4),
                     ("ECO2_CODE", 8.5), ("ECO2_CODE", 9.2),
                     ("ECO2_CODE", 9.3), ("ECO2_CODE", 9.4),
                     ("ECO2_CODE", 9.5), ("ECO2_CODE", 9.6),
                     ("ECO2_CODE", 10.1), ("ECO2_CODE", 10.2),
                     ("ECO2_CODE", 10.4), ("ECO2_CODE", 11.1),
                     ("ECO2_CODE", 12.1), ("ECO2_CODE", 13.1)]
        # load the pre-generated "quickdata" cache of the CONUS dataset
        quick_data_dir = os.path.join(config_data.data_path["DB"],
                                      "quickdata")
        data_dir = os.path.join(quick_data_dir,
                                "conus-all_90-10_nan-0.0_00-1.0")
        data_model_train = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        data_model_test = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        conus_sites_id = data_model_train.t_s_dict["sites_id"]
        if pub_plan == 0:
            """do a pub test like freddy's"""
            camels531_gageid_file = os.path.join(config_data.data_path["DB"],
                                                 "camels531", "camels531.txt")
            gauge_df = pd.read_csv(camels531_gageid_file,
                                   dtype={"GaugeID": str})
            gauge_list = gauge_df["GaugeID"].values
            # gauge ids are zero-padded to the 8-char USGS format
            all_sites_camels_531 = np.sort(
                [str(gauge).zfill(8) for gauge in gauge_list])
            sites_id_train = np.intersect1d(conus_sites_id,
                                            all_sites_camels_531)
            # basins not in CAMELS
            sites_id_test = [
                a_temp_site for a_temp_site in conus_sites_id
                if a_temp_site not in all_sites_camels_531
            ]
            # sanity check: the test list must be strictly sorted
            assert (all(x < y
                        for x, y in zip(sites_id_test, sites_id_test[1:])))
        elif pub_plan == 1 or pub_plan == 4:
            # plan 1: train on undammed, test on large-dor dammed basins;
            # plan 4 is the reverse direction
            source_data_dor1 = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                DOR=-dor)
            # basins with dams
            source_data_withdams = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                dam_num=[1, 100000])
            # basins without dams
            source_data_withoutdams = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                dam_num=0)
            sites_id_dor1 = source_data_dor1.all_configs[
                'flow_screen_gage_id']
            sites_id_withdams = source_data_withdams.all_configs[
                'flow_screen_gage_id']
            if pub_plan == 1:
                sites_id_train = source_data_withoutdams.all_configs[
                    'flow_screen_gage_id']
                sites_id_test = np.intersect1d(
                    np.array(sites_id_dor1),
                    np.array(sites_id_withdams)).tolist()
            else:
                sites_id_train = np.intersect1d(
                    np.array(sites_id_dor1),
                    np.array(sites_id_withdams)).tolist()
                sites_id_test = source_data_withoutdams.all_configs[
                    'flow_screen_gage_id']
        elif pub_plan == 2 or pub_plan == 5:
            # plan 2: train on undammed, test on small-dor basins;
            # plan 5 is the reverse direction
            source_data_dor1 = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                DOR=dor)
            # basins without dams
            source_data_withoutdams = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                dam_num=0)
            if pub_plan == 2:
                sites_id_train = source_data_withoutdams.all_configs[
                    'flow_screen_gage_id']
                sites_id_test = source_data_dor1.all_configs[
                    'flow_screen_gage_id']
            else:
                sites_id_train = source_data_dor1.all_configs[
                    'flow_screen_gage_id']
                sites_id_test = source_data_withoutdams.all_configs[
                    'flow_screen_gage_id']
        elif pub_plan == 3 or pub_plan == 6:
            # plan 3: train on large-dor dammed, test on small-dor basins;
            # plan 6 is the reverse direction
            dor_1 = -dor
            dor_2 = dor
            source_data_dor1 = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                DOR=dor_1)
            # basins with dams
            source_data_withdams = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                dam_num=[1, 100000])
            sites_id_dor1 = source_data_dor1.all_configs[
                'flow_screen_gage_id']
            sites_id_withdams = source_data_withdams.all_configs[
                'flow_screen_gage_id']
            source_data_dor2 = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                DOR=dor_2)
            if pub_plan == 3:
                sites_id_train = np.intersect1d(
                    np.array(sites_id_dor1),
                    np.array(sites_id_withdams)).tolist()
                sites_id_test = source_data_dor2.all_configs[
                    'flow_screen_gage_id']
            else:
                sites_id_train = source_data_dor2.all_configs[
                    'flow_screen_gage_id']
                sites_id_test = np.intersect1d(
                    np.array(sites_id_dor1),
                    np.array(sites_id_withdams)).tolist()
        else:
            print("wrong plan")
            sites_id_train = None
            sites_id_test = None
        # keep only sites that exist in the loaded CONUS data models
        train_sites_in_conus = np.intersect1d(conus_sites_id, sites_id_train)
        test_sites_in_conus = np.intersect1d(conus_sites_id, sites_id_test)
        if plus == 0:
            # per-ecoregion k-fold on the train set; for each fold draw a
            # size-matched random sample from the test set (test group 2)
            all_index_lst_train_1 = []
            # all sites come from train1 dataset
            sites_lst_train = []
            all_index_lst_test_1 = []
            sites_lst_test_1 = []
            all_index_lst_test_2 = []
            sites_lst_test_2 = []
            np.random.seed(random_seed)
            kf = KFold(n_splits=split_num,
                       shuffle=True,
                       random_state=random_seed)
            eco_name_chosen = []
            for eco_name in eco_names:
                eco_source_data = GagesSource.choose_some_basins(
                    config_data,
                    config_data.model_dict["data"]["tRangeTrain"],
                    screen_basin_area_huc4=False,
                    ecoregion=eco_name)
                eco_sites_id = eco_source_data.all_configs[
                    'flow_screen_gage_id']
                train_sites_id_inter = np.intersect1d(train_sites_in_conus,
                                                      eco_sites_id)
                test_sites_id_inter = np.intersect1d(test_sites_in_conus,
                                                     eco_sites_id)
                # skip ecoregions too small to split or with no test sites
                if train_sites_id_inter.size < split_num or test_sites_id_inter.size < 1:
                    continue
                for train, test in kf.split(train_sites_id_inter):
                    all_index_lst_train_1.append(train)
                    sites_lst_train.append(train_sites_id_inter[train])
                    all_index_lst_test_1.append(test)
                    sites_lst_test_1.append(train_sites_id_inter[test])
                    if test_sites_id_inter.size < test.size:
                        # not enough test-group sites: take them all
                        all_index_lst_test_2.append(
                            np.arange(test_sites_id_inter.size))
                        sites_lst_test_2.append(test_sites_id_inter)
                    else:
                        test2_chosen_idx = np.random.choice(
                            test_sites_id_inter.size,
                            test.size,
                            replace=False)
                        all_index_lst_test_2.append(test2_chosen_idx)
                        sites_lst_test_2.append(
                            test_sites_id_inter[test2_chosen_idx])
                eco_name_chosen.append(eco_name)
        elif plus == -1:
            print("camels pub, only do pub on the camels basins")
            # per-ecoregion k-fold on the train set only (no test group 2)
            all_index_lst_train_1 = []
            # all sites come from train1 dataset
            sites_lst_train = []
            all_index_lst_test_1 = []
            sites_lst_test_1 = []
            np.random.seed(random_seed)
            kf = KFold(n_splits=split_num,
                       shuffle=True,
                       random_state=random_seed)
            eco_name_chosen = []
            for eco_name in eco_names:
                eco_source_data = GagesSource.choose_some_basins(
                    config_data,
                    config_data.model_dict["data"]["tRangeTrain"],
                    screen_basin_area_huc4=False,
                    ecoregion=eco_name)
                eco_sites_id = eco_source_data.all_configs[
                    'flow_screen_gage_id']
                train_sites_id_inter = np.intersect1d(train_sites_in_conus,
                                                      eco_sites_id)
                if train_sites_id_inter.size < split_num:
                    continue
                for train, test in kf.split(train_sites_id_inter):
                    all_index_lst_train_1.append(train)
                    sites_lst_train.append(train_sites_id_inter[train])
                    all_index_lst_test_1.append(test)
                    sites_lst_test_1.append(train_sites_id_inter[test])
                eco_name_chosen.append(eco_name)
        elif plus == -2:
            print(
                "camels pub, only do pub on the camels basins, same with freddy's split method"
            )
            # plain k-fold over all train sites, no ecoregion stratification
            all_index_lst_train_1 = []
            # all sites come from train1 dataset
            sites_lst_train = []
            all_index_lst_test_1 = []
            sites_lst_test_1 = []
            np.random.seed(random_seed)
            kf = KFold(n_splits=split_num,
                       shuffle=True,
                       random_state=random_seed)
            for train, test in kf.split(train_sites_in_conus):
                all_index_lst_train_1.append(train)
                sites_lst_train.append(train_sites_in_conus[train])
                all_index_lst_test_1.append(test)
                sites_lst_test_1.append(train_sites_in_conus[test])
        else:
            # mixed scheme: fold on the smaller group per ecoregion and
            # sample a size-matched train/test pair from the other group
            sites_lst_train = []
            sites_lst_test_1 = []
            sites_lst_test_2 = []
            np.random.seed(random_seed)
            kf = KFold(n_splits=split_num,
                       shuffle=True,
                       random_state=random_seed)
            eco_name_chosen = []
            for eco_name in eco_names:
                eco_source_data = GagesSource.choose_some_basins(
                    config_data,
                    config_data.model_dict["data"]["tRangeTrain"],
                    screen_basin_area_huc4=False,
                    ecoregion=eco_name)
                eco_sites_id = eco_source_data.all_configs[
                    'flow_screen_gage_id']
                sites_id_inter_1 = np.intersect1d(train_sites_in_conus,
                                                  eco_sites_id)
                sites_id_inter_2 = np.intersect1d(test_sites_in_conus,
                                                  eco_sites_id)
                if sites_id_inter_1.size < sites_id_inter_2.size:
                    if sites_id_inter_1.size < split_num:
                        continue
                    for train, test in kf.split(sites_id_inter_1):
                        sites_lst_train_1 = sites_id_inter_1[train]
                        sites_lst_test_1.append(sites_id_inter_1[test])
                        chosen_lst_2 = random_choice_no_return(
                            sites_id_inter_2, [train.size, test.size])
                        sites_lst_train_2 = chosen_lst_2[0]
                        sites_lst_test_2.append(chosen_lst_2[1])
                        sites_lst_train.append(
                            np.sort(
                                np.append(sites_lst_train_1,
                                          sites_lst_train_2)))
                else:
                    if sites_id_inter_2.size < split_num:
                        continue
                    for train, test in kf.split(sites_id_inter_2):
                        sites_lst_train_2 = sites_id_inter_2[train]
                        sites_lst_test_2.append(sites_id_inter_2[test])
                        chosen_lst_1 = random_choice_no_return(
                            sites_id_inter_1, [train.size, test.size])
                        sites_lst_train_1 = chosen_lst_1[0]
                        sites_lst_test_1.append(chosen_lst_1[1])
                        sites_lst_train.append(
                            np.sort(
                                np.append(sites_lst_train_1,
                                          sites_lst_train_2)))
                eco_name_chosen.append(eco_name)
        # aggregate per-ecoregion folds into split_num splits (fold j goes
        # to split j % split_num) and persist one data model per split
        for i in range(split_num):
            sites_ids_train_ilst = [
                sites_lst_train[j] for j in range(len(sites_lst_train))
                if j % split_num == i
            ]
            sites_ids_train_i = np.sort(
                reduce(lambda x, y: np.hstack((x, y)), sites_ids_train_ilst))
            sites_ids_test_ilst_1 = [
                sites_lst_test_1[j] for j in range(len(sites_lst_test_1))
                if j % split_num == i
            ]
            sites_ids_test_i_1 = np.sort(
                reduce(lambda x, y: np.hstack((x, y)),
                       sites_ids_test_ilst_1))
            if plus >= 0:
                sites_ids_test_ilst_2 = [
                    sites_lst_test_2[j] for j in range(len(sites_lst_test_2))
                    if j % split_num == i
                ]
                sites_ids_test_i_2 = np.sort(
                    reduce(lambda x, y: np.hstack((x, y)),
                           sites_ids_test_ilst_2))
            # each split gets its own output subdirectory
            config_data_i = GagesConfig.set_subdir(cfg, str(i))
            gages_model_train_i = GagesModel.update_data_model(
                config_data_i,
                data_model_train,
                sites_id_update=sites_ids_train_i,
                data_attr_update=True,
                screen_basin_area_huc4=False)
            # baseline = test-period data for the TRAIN sites
            gages_model_test_baseline_i = GagesModel.update_data_model(
                config_data_i,
                data_model_test,
                sites_id_update=sites_ids_train_i,
                data_attr_update=True,
                train_stat_dict=gages_model_train_i.stat_dict,
                screen_basin_area_huc4=False)
            gages_model_test_i_1 = GagesModel.update_data_model(
                config_data_i,
                data_model_test,
                sites_id_update=sites_ids_test_i_1,
                data_attr_update=True,
                train_stat_dict=gages_model_train_i.stat_dict,
                screen_basin_area_huc4=False)
            if plus >= 0:
                gages_model_test_i_2 = GagesModel.update_data_model(
                    config_data_i,
                    data_model_test,
                    sites_id_update=sites_ids_test_i_2,
                    data_attr_update=True,
                    train_stat_dict=gages_model_train_i.stat_dict,
                    screen_basin_area_huc4=False)
            save_datamodel(gages_model_train_i,
                           data_source_file_name='data_source.txt',
                           stat_file_name='Statistics.json',
                           flow_file_name='flow',
                           forcing_file_name='forcing',
                           attr_file_name='attr',
                           f_dict_file_name='dictFactorize.json',
                           var_dict_file_name='dictAttribute.json',
                           t_s_dict_file_name='dictTimeSpace.json')
            save_datamodel(gages_model_test_baseline_i,
                           data_source_file_name='test_data_source_base.txt',
                           stat_file_name='test_Statistics_base.json',
                           flow_file_name='test_flow_base',
                           forcing_file_name='test_forcing_base',
                           attr_file_name='test_attr_base',
                           f_dict_file_name='test_dictFactorize_base.json',
                           var_dict_file_name='test_dictAttribute_base.json',
                           t_s_dict_file_name='test_dictTimeSpace_base.json')
            save_datamodel(gages_model_test_i_1,
                           data_source_file_name='test_data_source.txt',
                           stat_file_name='test_Statistics.json',
                           flow_file_name='test_flow',
                           forcing_file_name='test_forcing',
                           attr_file_name='test_attr',
                           f_dict_file_name='test_dictFactorize.json',
                           var_dict_file_name='test_dictAttribute.json',
                           t_s_dict_file_name='test_dictTimeSpace.json')
            if plus >= 0:
                save_datamodel(gages_model_test_i_2,
                               data_source_file_name='test_data_source_2.txt',
                               stat_file_name='test_Statistics_2.json',
                               flow_file_name='test_flow_2',
                               forcing_file_name='test_forcing_2',
                               attr_file_name='test_attr_2',
                               f_dict_file_name='test_dictFactorize_2.json',
                               var_dict_file_name='test_dictAttribute_2.json',
                               t_s_dict_file_name='test_dictTimeSpace_2.json')
            print("save ecoregion " + str(i) + " data model")
    with torch.cuda.device(gpu_num):
        if train_mode:
            # train one model per split from the cached data models
            for i in range(split_num):
                data_model = GagesModel.load_datamodel(
                    config_data.data_path["Temp"],
                    str(i),
                    data_source_file_name='data_source.txt',
                    stat_file_name='Statistics.json',
                    flow_file_name='flow.npy',
                    forcing_file_name='forcing.npy',
                    attr_file_name='attr.npy',
                    f_dict_file_name='dictFactorize.json',
                    var_dict_file_name='dictAttribute.json',
                    t_s_dict_file_name='dictTimeSpace.json')
                master_train(data_model, random_seed=random_seed)
        # test each split: baseline (train sites), test group 1 and,
        # when plus >= 0, test group 2
        for i in range(split_num):
            data_model_baseline = GagesModel.load_datamodel(
                config_data.data_path["Temp"],
                str(i),
                data_source_file_name='test_data_source_base.txt',
                stat_file_name='test_Statistics_base.json',
                flow_file_name='test_flow_base.npy',
                forcing_file_name='test_forcing_base.npy',
                attr_file_name='test_attr_base.npy',
                f_dict_file_name='test_dictFactorize_base.json',
                var_dict_file_name='test_dictAttribute_base.json',
                t_s_dict_file_name='test_dictTimeSpace_base.json')
            data_model = GagesModel.load_datamodel(
                config_data.data_path["Temp"],
                str(i),
                data_source_file_name='test_data_source.txt',
                stat_file_name='test_Statistics.json',
                flow_file_name='test_flow.npy',
                forcing_file_name='test_forcing.npy',
                attr_file_name='test_attr.npy',
                f_dict_file_name='test_dictFactorize.json',
                var_dict_file_name='test_dictAttribute.json',
                t_s_dict_file_name='test_dictTimeSpace.json')
            if plus >= 0:
                data_model_2 = GagesModel.load_datamodel(
                    config_data.data_path["Temp"],
                    str(i),
                    data_source_file_name='test_data_source_2.txt',
                    stat_file_name='test_Statistics_2.json',
                    flow_file_name='test_flow_2.npy',
                    forcing_file_name='test_forcing_2.npy',
                    attr_file_name='test_attr_2.npy',
                    f_dict_file_name='test_dictFactorize_2.json',
                    var_dict_file_name='test_dictAttribute_2.json',
                    t_s_dict_file_name='test_dictTimeSpace_2.json')
            pred_baseline, obs_baseline = master_test(
                data_model_baseline,
                epoch=test_epoch,
                save_file_suffix="base")
            basin_area_baseline = data_model_baseline.data_source.read_attr(
                data_model_baseline.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                is_return_dict=False)
            mean_prep_baseline = data_model_baseline.data_source.read_attr(
                data_model_baseline.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                is_return_dict=False)
            # unit conversion for mean precipitation before denormalizing
            mean_prep_baseline = mean_prep_baseline / 365 * 10
            pred_baseline = _basin_norm(pred_baseline,
                                        basin_area_baseline,
                                        mean_prep_baseline,
                                        to_norm=False)
            obs_baseline = _basin_norm(obs_baseline,
                                       basin_area_baseline,
                                       mean_prep_baseline,
                                       to_norm=False)
            save_result(
                data_model_baseline.data_source.data_config.data_path['Temp'],
                test_epoch,
                pred_baseline,
                obs_baseline,
                pred_name='flow_pred_base',
                obs_name='flow_obs_base')
            pred, obs = master_test(data_model, epoch=test_epoch)
            basin_area = data_model.data_source.read_attr(
                data_model.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                is_return_dict=False)
            mean_prep = data_model.data_source.read_attr(
                data_model.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                is_return_dict=False)
            mean_prep = mean_prep / 365 * 10
            pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
            obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
            save_result(data_model.data_source.data_config.data_path['Temp'],
                        test_epoch, pred, obs)
            if plus >= 0:
                pred_2, obs_2 = master_test(data_model_2,
                                            epoch=test_epoch,
                                            save_file_suffix="2")
                basin_area_2 = data_model_2.data_source.read_attr(
                    data_model_2.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                    is_return_dict=False)
                mean_prep_2 = data_model_2.data_source.read_attr(
                    data_model_2.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                    is_return_dict=False)
                mean_prep_2 = mean_prep_2 / 365 * 10
                pred_2 = _basin_norm(pred_2,
                                     basin_area_2,
                                     mean_prep_2,
                                     to_norm=False)
                obs_2 = _basin_norm(obs_2,
                                    basin_area_2,
                                    mean_prep_2,
                                    to_norm=False)
                save_result(
                    data_model_2.data_source.data_config.data_path['Temp'],
                    test_epoch,
                    pred_2,
                    obs_2,
                    pred_name='flow_pred_2',
                    obs_name='flow_obs_2')
var_dict_file_name='dictAttribute.json', t_s_dict_file_name='dictTimeSpace.json') data_model_test = GagesModel.load_datamodel( data_dir, data_source_file_name='test_data_source.txt', stat_file_name='test_Statistics.json', flow_file_name='test_flow.npy', forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy', f_dict_file_name='test_dictFactorize.json', var_dict_file_name='test_dictAttribute.json', t_s_dict_file_name='test_dictTimeSpace.json') gages_model_train = GagesModel.update_data_model( all_config_Data, data_model_train, data_attr_update=True, screen_basin_area_huc4=False) gages_model_test = GagesModel.update_data_model( all_config_Data, data_model_test, data_attr_update=True, train_stat_dict=gages_model_train.stat_dict, screen_basin_area_huc4=False) save_datamodel(gages_model_test, data_source_file_name='test_data_source.txt', stat_file_name='test_Statistics.json', flow_file_name='test_flow', forcing_file_name='test_forcing', attr_file_name='test_attr', f_dict_file_name='test_dictFactorize.json',
def test_some_reservoirs(self):
    """choose some small reservoirs randomly to train and test

    Builds and caches four data models for the two-stage LSTM-Inv setup:
    model-1 (historical-period kernel input) train/test pairs, and
    model-2 (later period) train/test pairs, restricted to basins with
    dor < 0.02.
    """
    # read the model configuration
    config_data = self.config_data_1
    # according to paper "High-resolution mapping of the world's reservoirs and dams for sustainable river-flow management"
    dor = -0.02  # negative value encodes "dor less than 0.02"
    source_data = GagesSource.choose_some_basins(
        config_data, config_data.model_dict["data"]["tRangeTrain"], DOR=dor)
    sites_id = source_data.all_configs['flow_screen_gage_id']
    # data1 is historical data as input of LSTM-Inv, which will be a kernel for the second LSTM
    quick_data_dir = os.path.join(self.config_data_1.data_path["DB"],
                                  "quickdata")
    data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")
    # for inv model, datamodel of train and test are same
    data_model_8595 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    # for 2nd model, datamodel of train and test belong to parts of the test time
    data_model_9505 = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    t_range1_train = self.config_data_1.model_dict["data"]["tRangeTrain"]
    t_range1_test = self.config_data_1.model_dict["data"]["tRangeTest"]
    gages_model1_train = GagesModel.update_data_model(
        self.config_data_1,
        data_model_8595,
        sites_id_update=sites_id,
        t_range_update=t_range1_train,
        data_attr_update=True)
    # Because we know data of period "90-95", so that we can get its statistics according to this period
    gages_model1_test = GagesModel.update_data_model(
        self.config_data_1,
        data_model_8595,
        sites_id_update=sites_id,
        t_range_update=t_range1_test,
        data_attr_update=True)
    t_range2_train = self.config_data_2.model_dict["data"]["tRangeTrain"]
    t_range2_test = self.config_data_2.model_dict["data"]["tRangeTest"]
    gages_model2_train = GagesModel.update_data_model(
        self.config_data_2,
        data_model_8595,
        sites_id_update=sites_id,
        t_range_update=t_range2_train,
        data_attr_update=True)
    # model-2 test reuses model-2 training statistics for normalization
    gages_model2_test = GagesModel.update_data_model(
        self.config_data_2,
        data_model_9505,
        sites_id_update=sites_id,
        t_range_update=t_range2_test,
        data_attr_update=True,
        train_stat_dict=gages_model2_train.stat_dict)
    # cache the four data models under subdirectories "1" and "2"
    save_datamodel(gages_model1_train,
                   "1",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model1_test,
                   "1",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    save_datamodel(gages_model2_train,
                   "2",
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model2_test,
                   "2",
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_some_reservoirs(self):
    """choose some small reservoirs to train and test

    Selects CONUS basins with dor < 0.02, rebuilds train/test data models
    restricted to those sites, and caches them to disk.
    """
    # read the model configuration
    config_data = self.config_data
    # according to paper "High-resolution mapping of the world's reservoirs and dams for sustainable river-flow management"
    dor = -0.02  # meaning dor < 0.02 (negative value encodes "less than")
    source_data = GagesSource.choose_some_basins(
        config_data,
        config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        DOR=dor)
    sites_id = source_data.all_configs['flow_screen_gage_id']
    # load the pre-generated "quickdata" cache of the CONUS 1990-2010 dataset
    quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    data_model_train = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_test = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_model_train = GagesModel.update_data_model(
        self.config_data,
        data_model_train,
        sites_id_update=sites_id,
        data_attr_update=True,
        screen_basin_area_huc4=False)
    # the test model reuses the training statistics for normalization
    gages_model_test = GagesModel.update_data_model(
        self.config_data,
        data_model_test,
        sites_id_update=sites_id,
        data_attr_update=True,
        train_stat_dict=gages_model_train.stat_dict,
        screen_basin_area_huc4=False)
    save_datamodel(gages_model_train,
                   data_source_file_name='data_source.txt',
                   stat_file_name='Statistics.json',
                   flow_file_name='flow',
                   forcing_file_name='forcing',
                   attr_file_name='attr',
                   f_dict_file_name='dictFactorize.json',
                   var_dict_file_name='dictAttribute.json',
                   t_s_dict_file_name='dictTimeSpace.json')
    save_datamodel(gages_model_test,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    print("read and save data model")
def test_purposes_seperate(self):
    """Evaluate test results separately for each main dam purpose.

    Groups test basins by their main dam purpose (from the NID-derived
    dictionary), computes error statistics per group, and writes CSVs and
    box/time-series/ECDF/map figures for each purpose.
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
    data_dir = os.path.join(quick_data_dir,
                            "allnonref-dam_95-05_nan-0.1_00-1.0")
    data_model_test = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    data_model = GagesModel.update_data_model(self.config_data,
                                              data_model_test)
    # NID quickdata lives in a sibling directory of the gages DB
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "quickdata")
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
    gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
    # invert the site->purpose dict into purpose->sorted list of sites
    purpose_regions = {}
    for i in range(gage_main_dam_purpose_unique.size):
        sites_id = []
        for key, value in gage_main_dam_purpose.items():
            if value == gage_main_dam_purpose_unique[i]:
                sites_id.append(key)
        # dict iteration order must yield strictly sorted site ids
        assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
        purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
    # map each purpose group onto row indices of the data model's site list
    id_regions_idx = []
    id_regions_sites_ids = []
    df_id_region = np.array(data_model.t_s_dict["sites_id"])
    for key, value in purpose_regions.items():
        gages_id = value
        c, ind1, ind2 = np.intersect1d(df_id_region,
                                       gages_id,
                                       return_indices=True)
        assert (all(x < y for x, y in zip(ind1, ind1[1:])))
        assert (all(x < y for x, y in zip(c, c[1:])))
        id_regions_idx.append(ind1)
        id_regions_sites_ids.append(c)
    pred_all, obs_all = load_result(self.config_data.data_path["Temp"],
                                    self.test_epoch)
    # drop the trailing singleton dimension: (sites, time, 1) -> (sites, time)
    pred_all = pred_all.reshape(pred_all.shape[0], pred_all.shape[1])
    obs_all = obs_all.reshape(obs_all.shape[0], obs_all.shape[1])
    # NOTE(review): loop starts at 9, skipping the first nine purpose
    # groups — looks like a hard-coded resume index; confirm intent
    for i in range(9, len(gage_main_dam_purpose_unique)):
        pred = pred_all[id_regions_idx[i], :]
        obs = obs_all[id_regions_idx[i], :]
        inds = statError(obs, pred)
        inds['STAID'] = id_regions_sites_ids[i]
        inds_df = pd.DataFrame(inds)
        inds_df.to_csv(
            os.path.join(
                self.config_data.data_path["Out"],
                gage_main_dam_purpose_unique[i] + "epoch" +
                str(self.test_epoch) + 'data_df.csv'))
        # plot box plots using the seaborn library
        keys = ["Bias", "RMSE", "NSE"]
        inds_test = subset_of_dict(inds, keys)
        box_fig = plot_diff_boxes(inds_test)
        box_fig.savefig(
            os.path.join(
                self.config_data.data_path["Out"],
                gage_main_dam_purpose_unique[i] + "epoch" +
                str(self.test_epoch) + "box_fig.png"))
        # plot ts
        sites = np.array(df_id_region[id_regions_idx[i]])
        t_range = np.array(data_model.t_s_dict["t_final_range"])
        show_me_num = 1
        ts_fig = plot_ts_obs_pred(obs, pred, sites, t_range, show_me_num)
        ts_fig.savefig(
            os.path.join(
                self.config_data.data_path["Out"],
                gage_main_dam_purpose_unique[i] + "epoch" +
                str(self.test_epoch) + "ts_fig.png"))
        # plot nse ecdf
        sites_df_nse = pd.DataFrame({
            "sites": sites,
            keys[2]: inds_test[keys[2]]
        })
        plot_ecdf(
            sites_df_nse, keys[2],
            os.path.join(
                self.config_data.data_path["Out"],
                gage_main_dam_purpose_unique[i] + "epoch" +
                str(self.test_epoch) + "ecdf_fig.png"))
        # plot map
        gauge_dict = data_model.data_source.gage_dict
        save_map_file = os.path.join(
            self.config_data.data_path["Out"],
            gage_main_dam_purpose_unique[i] + "epoch" +
            str(self.test_epoch) + "map_fig.png")
        plot_map(gauge_dict,
                 sites_df_nse,
                 save_file=save_map_file,
                 id_col="STAID",
                 lon_col="LNG_GAGE",
                 lat_col="LAT_GAGE")
def test_split_nomajordam_ecoregion(self):
    """Build ecoregion-stratified k-fold splits of no-major-dam vs
    major-dam basins and cache one train/test data-model triple per split.

    Per ecoregion, k-fold the smaller of the two basin groups and draw a
    size-matched random sample (without replacement) from the other group,
    so every fold trains on a mixed set and tests the two groups
    separately.
    """
    quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
    # data_dir = os.path.join(quick_data_dir, "conus-all_85-05_nan-0.1_00-1.0")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    data_model_train = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_test = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    conus_sites_id = data_model_train.t_s_dict["sites_id"]
    nomajordam_source_data = GagesSource.choose_some_basins(
        self.config_data,
        self.config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        major_dam_num=0)
    nomajordam_sites_id = nomajordam_source_data.all_configs[
        'flow_screen_gage_id']
    nomajordam_in_conus = np.intersect1d(conus_sites_id,
                                         nomajordam_sites_id)
    majordam_source_data = GagesSource.choose_some_basins(
        self.config_data,
        self.config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        major_dam_num=[1, 2000])
    majordam_sites_id = majordam_source_data.all_configs[
        'flow_screen_gage_id']
    majordam_in_conus = np.intersect1d(conus_sites_id, majordam_sites_id)
    sites_lst_train = []
    sites_lst_test_nomajordam = []
    sites_lst_test_majordam = []
    random_seed = 1  # fixed seed so the splits are reproducible
    np.random.seed(random_seed)
    kf = KFold(n_splits=self.split_num,
               shuffle=True,
               random_state=random_seed)
    eco_name_chosen = []
    for eco_name in self.eco_names:
        eco_source_data = GagesSource.choose_some_basins(
            self.config_data,
            self.config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            ecoregion=eco_name)
        eco_sites_id = eco_source_data.all_configs['flow_screen_gage_id']
        nomajordam_sites_id_inter = np.intersect1d(nomajordam_in_conus,
                                                   eco_sites_id)
        majordam_sites_id_inter = np.intersect1d(majordam_in_conus,
                                                 eco_sites_id)
        # k-fold the smaller group; sample a size-matched train/test pair
        # from the other group for each fold
        if nomajordam_sites_id_inter.size < majordam_sites_id_inter.size:
            # skip ecoregions too small to split
            if nomajordam_sites_id_inter.size < self.split_num:
                continue
            for train, test in kf.split(nomajordam_sites_id_inter):
                sites_lst_train_nomajordam = nomajordam_sites_id_inter[train]
                sites_lst_test_nomajordam.append(
                    nomajordam_sites_id_inter[test])
                majordam_chosen_lst = random_choice_no_return(
                    majordam_sites_id_inter, [train.size, test.size])
                sites_lst_train_majordam = majordam_chosen_lst[0]
                sites_lst_test_majordam.append(majordam_chosen_lst[1])
                sites_lst_train.append(
                    np.sort(
                        np.append(sites_lst_train_nomajordam,
                                  sites_lst_train_majordam)))
        else:
            if majordam_sites_id_inter.size < self.split_num:
                continue
            for train, test in kf.split(majordam_sites_id_inter):
                sites_lst_train_majordam = majordam_sites_id_inter[train]
                sites_lst_test_majordam.append(majordam_sites_id_inter[test])
                nomajordam_chosen_lst = random_choice_no_return(
                    nomajordam_sites_id_inter, [train.size, test.size])
                sites_lst_train_nomajordam = nomajordam_chosen_lst[0]
                sites_lst_test_nomajordam.append(nomajordam_chosen_lst[1])
                sites_lst_train.append(
                    np.sort(
                        np.append(sites_lst_train_nomajordam,
                                  sites_lst_train_majordam)))
        eco_name_chosen.append(eco_name)
    # aggregate per-ecoregion folds into split_num splits (fold j goes to
    # split j % split_num) and persist one data-model triple per split
    for i in range(self.split_num):
        sites_ids_train_ilst = [
            sites_lst_train[j] for j in range(len(sites_lst_train))
            if j % self.split_num == i
        ]
        sites_ids_train_i = np.sort(
            reduce(lambda x, y: np.hstack((x, y)), sites_ids_train_ilst))
        sites_ids_test_ilst = [
            sites_lst_test_nomajordam[j]
            for j in range(len(sites_lst_test_nomajordam))
            if j % self.split_num == i
        ]
        sites_ids_test_i = np.sort(
            reduce(lambda x, y: np.hstack((x, y)), sites_ids_test_ilst))
        sites_ids_test_majordam_ilst = [
            sites_lst_test_majordam[j]
            for j in range(len(sites_lst_test_majordam))
            if j % self.split_num == i
        ]
        sites_ids_test_majordam_i = np.sort(
            reduce(lambda x, y: np.hstack((x, y)),
                   sites_ids_test_majordam_ilst))
        # each split gets its own output subdirectory
        subdir_i = os.path.join(self.subdir, str(i))
        config_data_i = GagesConfig.set_subdir(self.config_file, subdir_i)
        gages_model_train_i = GagesModel.update_data_model(
            config_data_i,
            data_model_train,
            sites_id_update=sites_ids_train_i,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        # both test models reuse the training statistics for normalization
        gages_model_test_i = GagesModel.update_data_model(
            config_data_i,
            data_model_test,
            sites_id_update=sites_ids_test_i,
            data_attr_update=True,
            train_stat_dict=gages_model_train_i.stat_dict,
            screen_basin_area_huc4=False)
        gages_model_test_majordam_i = GagesModel.update_data_model(
            config_data_i,
            data_model_test,
            sites_id_update=sites_ids_test_majordam_i,
            data_attr_update=True,
            train_stat_dict=gages_model_train_i.stat_dict,
            screen_basin_area_huc4=False)
        save_datamodel(gages_model_train_i,
                       data_source_file_name='data_source.txt',
                       stat_file_name='Statistics.json',
                       flow_file_name='flow',
                       forcing_file_name='forcing',
                       attr_file_name='attr',
                       f_dict_file_name='dictFactorize.json',
                       var_dict_file_name='dictAttribute.json',
                       t_s_dict_file_name='dictTimeSpace.json')
        save_datamodel(gages_model_test_i,
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
        save_datamodel(gages_model_test_majordam_i,
                       data_source_file_name='test_data_source_majordam.txt',
                       stat_file_name='test_Statistics_majordam.json',
                       flow_file_name='test_flow_majordam',
                       forcing_file_name='test_forcing_majordam',
                       attr_file_name='test_attr_majordam',
                       f_dict_file_name='test_dictFactorize_majordam.json',
                       var_dict_file_name='test_dictAttribute_majordam.json',
                       t_s_dict_file_name='test_dictTimeSpace_majordam.json')
        print("save ecoregion " + str(i) + " data model")
def test_split_dor(self):
    """Build and save k-fold PUB (prediction-in-ungauged-basins) train/test splits.

    Reads the pre-saved CONUS train/test data models, partitions basins into a
    train pool and a test pool according to ``self.pub_plan`` (CAMELS-531 vs.
    rest, dammed vs. undammed, low-DOR vs. high-DOR combinations), then splits
    the pools into ``self.split_num`` folds — either per-ecoregion or globally,
    controlled by ``self.plus`` — and saves one data model per fold via
    ``save_datamodel``.

    Not a behavioral assertion test: it is a data-preparation step that writes
    fold data models to the configured subdirectories.
    """
    # Pull config knobs off the fixture into locals.
    pub_plan = self.pub_plan
    config_file = self.config_file
    config_data = self.config_data
    plus = self.plus
    random_seed = self.random_seed
    split_num = self.split_num
    eco_names = self.eco_names
    dor = self.dor
    # Load the previously serialized CONUS-wide train and test data models.
    quick_data_dir = os.path.join(config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    data_model_train = GagesModel.load_datamodel(
        data_dir, data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json', flow_file_name='flow.npy',
        forcing_file_name='forcing.npy', attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_test = GagesModel.load_datamodel(
        data_dir, data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    conus_sites_id = data_model_train.t_s_dict["sites_id"]
    # ------------------------------------------------------------------
    # Step 1: choose the train pool / test pool of basin ids by pub_plan.
    # ------------------------------------------------------------------
    if pub_plan == 0:
        """do a pub test like freddy's"""
        # Train on basins in the CAMELS-531 list; test on everything else.
        camels531_gageid_file = os.path.join(config_data.data_path["DB"],
                                             "camels531", "camels531.txt")
        gauge_df = pd.read_csv(camels531_gageid_file, dtype={"GaugeID": str})
        gauge_list = gauge_df["GaugeID"].values
        # Gauge ids are zero-padded to 8 chars to match GAGES site-id format.
        all_sites_camels_531 = np.sort(
            [str(gauge).zfill(8) for gauge in gauge_list])
        sites_id_train = np.intersect1d(conus_sites_id, all_sites_camels_531)
        # basins not in CAMELS
        sites_id_test = [
            a_temp_site for a_temp_site in conus_sites_id
            if a_temp_site not in all_sites_camels_531
        ]
        # Downstream fold logic relies on sorted id lists; conus_sites_id is
        # presumably already sorted, so the filtered list stays sorted.
        assert (all(x < y for x, y in zip(sites_id_test, sites_id_test[1:])))
    elif pub_plan == 1 or pub_plan == 4:
        # Plan 1: train undammed -> test dammed/low-DOR; plan 4 is the reverse.
        # NOTE(review): negative DOR appears to be this project's convention
        # for selecting basins below the |dor| threshold — confirm in
        # GagesSource.choose_some_basins.
        source_data_dor1 = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=-dor)
        # basins with dams
        source_data_withdams = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            dam_num=[1, 100000])
        # basins without dams
        source_data_withoutdams = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            dam_num=0)
        sites_id_dor1 = source_data_dor1.all_configs['flow_screen_gage_id']
        sites_id_withdams = source_data_withdams.all_configs[
            'flow_screen_gage_id']
        if pub_plan == 1:
            sites_id_train = source_data_withoutdams.all_configs[
                'flow_screen_gage_id']
            # Dammed basins that also pass the DOR screen.
            sites_id_test = np.intersect1d(
                np.array(sites_id_dor1),
                np.array(sites_id_withdams)).tolist()
        else:
            sites_id_train = np.intersect1d(
                np.array(sites_id_dor1),
                np.array(sites_id_withdams)).tolist()
            sites_id_test = source_data_withoutdams.all_configs[
                'flow_screen_gage_id']
    elif pub_plan == 2 or pub_plan == 5:
        # Plan 2: train undammed -> test high-DOR; plan 5 is the reverse.
        source_data_dor1 = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=dor)
        # basins without dams
        source_data_withoutdams = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            dam_num=0)
        if pub_plan == 2:
            sites_id_train = source_data_withoutdams.all_configs[
                'flow_screen_gage_id']
            sites_id_test = source_data_dor1.all_configs[
                'flow_screen_gage_id']
        else:
            sites_id_train = source_data_dor1.all_configs[
                'flow_screen_gage_id']
            sites_id_test = source_data_withoutdams.all_configs[
                'flow_screen_gage_id']
    elif pub_plan == 3 or pub_plan == 6:
        # Plan 3: train dammed low-DOR -> test high-DOR; plan 6 is the reverse.
        dor_1 = -dor
        dor_2 = dor
        source_data_dor1 = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=dor_1)
        # basins with dams
        source_data_withdams = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            dam_num=[1, 100000])
        sites_id_dor1 = source_data_dor1.all_configs['flow_screen_gage_id']
        sites_id_withdams = source_data_withdams.all_configs[
            'flow_screen_gage_id']
        source_data_dor2 = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=dor_2)
        if pub_plan == 3:
            sites_id_train = np.intersect1d(
                np.array(sites_id_dor1),
                np.array(sites_id_withdams)).tolist()
            sites_id_test = source_data_dor2.all_configs[
                'flow_screen_gage_id']
        else:
            sites_id_train = source_data_dor2.all_configs[
                'flow_screen_gage_id']
            sites_id_test = np.intersect1d(
                np.array(sites_id_dor1),
                np.array(sites_id_withdams)).tolist()
    else:
        # Unknown plan: leaves the pools as None, so the intersect1d calls
        # below will fail — this branch is effectively a loud misconfiguration.
        print("wrong plan")
        sites_id_train = None
        sites_id_test = None
    # Restrict both pools to basins actually present in the CONUS dataset.
    train_sites_in_conus = np.intersect1d(conus_sites_id, sites_id_train)
    test_sites_in_conus = np.intersect1d(conus_sites_id, sites_id_test)
    # ------------------------------------------------------------------
    # Step 2: build per-fold site lists according to `plus`.
    # ------------------------------------------------------------------
    if plus == 0:
        # k-fold per ecoregion on the train pool; for each fold also sample an
        # equally-sized test-2 subset from the test pool in that ecoregion.
        all_index_lst_train_1 = []
        # all sites come from train1 dataset
        sites_lst_train = []
        all_index_lst_test_1 = []
        sites_lst_test_1 = []
        all_index_lst_test_2 = []
        sites_lst_test_2 = []
        np.random.seed(random_seed)  # fixes np.random.choice below
        kf = KFold(n_splits=split_num,
                   shuffle=True,
                   random_state=random_seed)
        eco_name_chosen = []
        for eco_name in eco_names:
            eco_source_data = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                ecoregion=eco_name)
            eco_sites_id = eco_source_data.all_configs[
                'flow_screen_gage_id']
            train_sites_id_inter = np.intersect1d(train_sites_in_conus,
                                                  eco_sites_id)
            test_sites_id_inter = np.intersect1d(test_sites_in_conus,
                                                 eco_sites_id)
            # Skip ecoregions too small to split k ways or with no test sites.
            if train_sites_id_inter.size < split_num or test_sites_id_inter.size < 1:
                continue
            for train, test in kf.split(train_sites_id_inter):
                all_index_lst_train_1.append(train)
                sites_lst_train.append(train_sites_id_inter[train])
                all_index_lst_test_1.append(test)
                sites_lst_test_1.append(train_sites_id_inter[test])
                if test_sites_id_inter.size < test.size:
                    # Not enough test-pool sites: take them all.
                    all_index_lst_test_2.append(
                        np.arange(test_sites_id_inter.size))
                    sites_lst_test_2.append(test_sites_id_inter)
                else:
                    # Sample a test-2 subset the same size as the held-out fold.
                    test2_chosen_idx = np.random.choice(
                        test_sites_id_inter.size, test.size, replace=False)
                    all_index_lst_test_2.append(test2_chosen_idx)
                    sites_lst_test_2.append(
                        test_sites_id_inter[test2_chosen_idx])
            eco_name_chosen.append(eco_name)
    elif plus == -1:
        print("camels pub, only do pub on the camels basins")
        # k-fold per ecoregion on the train pool only; no test-2 sampling.
        all_index_lst_train_1 = []
        # all sites come from train1 dataset
        sites_lst_train = []
        all_index_lst_test_1 = []
        sites_lst_test_1 = []
        np.random.seed(random_seed)
        kf = KFold(n_splits=split_num,
                   shuffle=True,
                   random_state=random_seed)
        eco_name_chosen = []
        for eco_name in eco_names:
            eco_source_data = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                ecoregion=eco_name)
            eco_sites_id = eco_source_data.all_configs[
                'flow_screen_gage_id']
            train_sites_id_inter = np.intersect1d(train_sites_in_conus,
                                                  eco_sites_id)
            if train_sites_id_inter.size < split_num:
                continue
            for train, test in kf.split(train_sites_id_inter):
                all_index_lst_train_1.append(train)
                sites_lst_train.append(train_sites_id_inter[train])
                all_index_lst_test_1.append(test)
                sites_lst_test_1.append(train_sites_id_inter[test])
            eco_name_chosen.append(eco_name)
    elif plus == -2:
        print(
            "camels pub, only do pub on the camels basins, same with freddy's split method"
        )
        # Single global k-fold over the whole train pool (no ecoregion loop).
        all_index_lst_train_1 = []
        # all sites come from train1 dataset
        sites_lst_train = []
        all_index_lst_test_1 = []
        sites_lst_test_1 = []
        np.random.seed(random_seed)
        kf = KFold(n_splits=split_num,
                   shuffle=True,
                   random_state=random_seed)
        for train, test in kf.split(train_sites_in_conus):
            all_index_lst_train_1.append(train)
            sites_lst_train.append(train_sites_in_conus[train])
            all_index_lst_test_1.append(test)
            sites_lst_test_1.append(train_sites_in_conus[test])
    else:
        # plus > 0: per ecoregion, k-fold the larger of the two pools and
        # draw a matched-size train/test sample from the smaller one, then
        # merge the two train parts into a combined training set per fold.
        sites_lst_train = []
        sites_lst_test_1 = []
        sites_lst_test_2 = []
        np.random.seed(random_seed)
        kf = KFold(n_splits=split_num,
                   shuffle=True,
                   random_state=random_seed)
        eco_name_chosen = []
        for eco_name in eco_names:
            eco_source_data = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                ecoregion=eco_name)
            eco_sites_id = eco_source_data.all_configs[
                'flow_screen_gage_id']
            sites_id_inter_1 = np.intersect1d(train_sites_in_conus,
                                              eco_sites_id)
            sites_id_inter_2 = np.intersect1d(test_sites_in_conus,
                                              eco_sites_id)
            if sites_id_inter_1.size < sites_id_inter_2.size:
                if sites_id_inter_1.size < split_num:
                    continue
                for train, test in kf.split(sites_id_inter_1):
                    sites_lst_train_1 = sites_id_inter_1[train]
                    sites_lst_test_1.append(sites_id_inter_1[test])
                    # Sample [train.size, test.size] disjoint subsets from the
                    # other pool (project helper; presumably without
                    # replacement across the two subsets — confirm).
                    chosen_lst_2 = random_choice_no_return(
                        sites_id_inter_2, [train.size, test.size])
                    sites_lst_train_2 = chosen_lst_2[0]
                    sites_lst_test_2.append(chosen_lst_2[1])
                    sites_lst_train.append(
                        np.sort(
                            np.append(sites_lst_train_1,
                                      sites_lst_train_2)))
            else:
                if sites_id_inter_2.size < split_num:
                    continue
                for train, test in kf.split(sites_id_inter_2):
                    sites_lst_train_2 = sites_id_inter_2[train]
                    sites_lst_test_2.append(sites_id_inter_2[test])
                    chosen_lst_1 = random_choice_no_return(
                        sites_id_inter_1, [train.size, test.size])
                    sites_lst_train_1 = chosen_lst_1[0]
                    sites_lst_test_1.append(chosen_lst_1[1])
                    sites_lst_train.append(
                        np.sort(
                            np.append(sites_lst_train_1,
                                      sites_lst_train_2)))
            eco_name_chosen.append(eco_name)
    # ------------------------------------------------------------------
    # Step 3: assemble fold i from every k-th per-ecoregion entry
    # (entries were appended ecoregion-by-ecoregion, split_num per region,
    # so j % split_num == i picks fold i across all regions) and save it.
    # ------------------------------------------------------------------
    for i in range(split_num):
        sites_ids_train_ilst = [
            sites_lst_train[j] for j in range(len(sites_lst_train))
            if j % split_num == i
        ]
        sites_ids_train_i = np.sort(
            reduce(lambda x, y: np.hstack((x, y)), sites_ids_train_ilst))
        sites_ids_test_ilst_1 = [
            sites_lst_test_1[j] for j in range(len(sites_lst_test_1))
            if j % split_num == i
        ]
        sites_ids_test_i_1 = np.sort(
            reduce(lambda x, y: np.hstack((x, y)), sites_ids_test_ilst_1))
        if plus >= 0:
            # test-2 lists only exist for plus == 0 and plus > 0 modes.
            sites_ids_test_ilst_2 = [
                sites_lst_test_2[j] for j in range(len(sites_lst_test_2))
                if j % split_num == i
            ]
            sites_ids_test_i_2 = np.sort(
                reduce(lambda x, y: np.hstack((x, y)),
                       sites_ids_test_ilst_2))
        # One subdirectory per fold; models share the fold's training stats.
        config_data_i = GagesConfig.set_subdir(config_file, str(i))
        gages_model_train_i = GagesModel.update_data_model(
            config_data_i,
            data_model_train,
            sites_id_update=sites_ids_train_i,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        # Baseline: the *training* sites evaluated on the test period.
        gages_model_test_baseline_i = GagesModel.update_data_model(
            config_data_i,
            data_model_test,
            sites_id_update=sites_ids_train_i,
            data_attr_update=True,
            train_stat_dict=gages_model_train_i.stat_dict,
            screen_basin_area_huc4=False)
        gages_model_test_i_1 = GagesModel.update_data_model(
            config_data_i,
            data_model_test,
            sites_id_update=sites_ids_test_i_1,
            data_attr_update=True,
            train_stat_dict=gages_model_train_i.stat_dict,
            screen_basin_area_huc4=False)
        if plus >= 0:
            gages_model_test_i_2 = GagesModel.update_data_model(
                config_data_i,
                data_model_test,
                sites_id_update=sites_ids_test_i_2,
                data_attr_update=True,
                train_stat_dict=gages_model_train_i.stat_dict,
                screen_basin_area_huc4=False)
        save_datamodel(gages_model_train_i,
                       data_source_file_name='data_source.txt',
                       stat_file_name='Statistics.json',
                       flow_file_name='flow',
                       forcing_file_name='forcing',
                       attr_file_name='attr',
                       f_dict_file_name='dictFactorize.json',
                       var_dict_file_name='dictAttribute.json',
                       t_s_dict_file_name='dictTimeSpace.json')
        save_datamodel(gages_model_test_baseline_i,
                       data_source_file_name='test_data_source_base.txt',
                       stat_file_name='test_Statistics_base.json',
                       flow_file_name='test_flow_base',
                       forcing_file_name='test_forcing_base',
                       attr_file_name='test_attr_base',
                       f_dict_file_name='test_dictFactorize_base.json',
                       var_dict_file_name='test_dictAttribute_base.json',
                       t_s_dict_file_name='test_dictTimeSpace_base.json')
        save_datamodel(gages_model_test_i_1,
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
        if plus >= 0:
            save_datamodel(gages_model_test_i_2,
                           data_source_file_name='test_data_source_2.txt',
                           stat_file_name='test_Statistics_2.json',
                           flow_file_name='test_flow_2',
                           forcing_file_name='test_forcing_2',
                           attr_file_name='test_attr_2',
                           f_dict_file_name='test_dictFactorize_2.json',
                           var_dict_file_name='test_dictAttribute_2.json',
                           t_s_dict_file_name='test_dictTimeSpace_2.json')
        print("save ecoregion " + str(i) + " data model")