Example #1
 def load_datamodel(cls, dir_temp_origin, num_str=None, **kwargs):
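     """Load a saved data model from dir_temp_origin (or its num_str subdirectory):
     the data source is unpickled, the dicts are read from JSON files, and the
     arrays from npy files."""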
     if num_str:
         dir_temp = os.path.join(dir_temp_origin, num_str)
     else:
         dir_temp = dir_temp_origin
     data_source_file = os.path.join(dir_temp,
                                     kwargs['data_source_file_name'])
     stat_file = os.path.join(dir_temp, kwargs['stat_file_name'])
     flow_npy_file = os.path.join(dir_temp, kwargs['flow_file_name'])
     forcing_npy_file = os.path.join(dir_temp, kwargs['forcing_file_name'])
     attr_npy_file = os.path.join(dir_temp, kwargs['attr_file_name'])
     f_dict_file = os.path.join(dir_temp, kwargs['f_dict_file_name'])
     var_dict_file = os.path.join(dir_temp, kwargs['var_dict_file_name'])
     t_s_dict_file = os.path.join(dir_temp, kwargs['t_s_dict_file_name'])
     source_data = unserialize_pickle(data_source_file)
     # the data model is saved in parts because serializing it as a whole is slow: dicts -> json, arrays -> npy
     stat_dict = unserialize_json(stat_file)
     data_flow = unserialize_numpy(flow_npy_file)
     data_forcing = unserialize_numpy(forcing_npy_file)
     data_attr = unserialize_numpy(attr_npy_file)
     # dictFactorize.json explains the values of the categorical variables
     var_dict = unserialize_json(var_dict_file)
     f_dict = unserialize_json(f_dict_file)
     t_s_dict = unserialize_json(t_s_dict_file)
     data_model = cls(source_data, data_flow, data_forcing, data_attr,
                      var_dict, f_dict, stat_dict, t_s_dict)
     return data_model
Example #2
    def test_plot_map(self):
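        """Plot each test site's NSE on a map (sites with NSE in [0, 1]) and save the figure."""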
        data_model = GagesModel.load_datamodel(
            self.dir_temp,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        gauge_dict = data_model.data_source.gage_dict
        t_s_dict = unserialize_json(self.t_s_dict_file)
        sites = np.array(t_s_dict["sites_id"])
        keys = ["NSE"]
        inds_test = subset_of_dict(self.inds, keys)
        sites_df = pd.DataFrame({"sites": sites, keys[0]: inds_test[keys[0]]})

        nse_range = [0, 1]
        idx_lstl_nse = sites_df[(sites_df[keys[0]] >= nse_range[0]) & (
            sites_df[keys[0]] <= nse_range[1])].index.tolist()
        colorbar_size = [0.91, 0.323, 0.02, 0.346]
        # colorbar_size = None
        plot_gages_map(data_model,
                       sites_df,
                       keys[0],
                       idx_lstl_nse,
                       colorbar_size=colorbar_size,
                       cbar_font_size=14)
        plt.savefig(os.path.join(self.dir_out, 'map_NSE.png'),
                    dpi=500,
                    bbox_inches="tight")
        plt.show()
Example #3
    def test_dam_train(self):
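        """Load the training data model, attach NID dam-purpose data, and train the model."""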
        with torch.cuda.device(0):
            quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                          "quickdata")
            data_dir = os.path.join(quick_data_dir,
                                    "allnonref_85-05_nan-0.1_00-1.0")
            data_model_8595 = GagesModel.load_datamodel(
                data_dir,
                data_source_file_name='data_source.txt',
                stat_file_name='Statistics.json',
                flow_file_name='flow.npy',
                forcing_file_name='forcing.npy',
                attr_file_name='attr.npy',
                f_dict_file_name='dictFactorize.json',
                var_dict_file_name='dictAttribute.json',
                t_s_dict_file_name='dictTimeSpace.json')

            gages_model_train = GagesModel.update_data_model(
                self.config_data, data_model_8595)
            nid_dir = os.path.join(
                "/".join(self.config_data.data_path["DB"].split("/")[:-1]),
                "nid", "quickdata")
            nid_input = NidModel.load_nidmodel(
                nid_dir,
                nid_file=self.nid_file,
                nid_source_file_name='nid_source.txt',
                nid_data_file_name='nid_data.shp')
            gage_main_dam_purpose = unserialize_json(
                os.path.join(nid_dir, "dam_main_purpose_dict.json"))
            data_input = GagesDamDataModel(gages_model_train, nid_input, True,
                                           gage_main_dam_purpose)
            gages_input = choose_which_purpose(data_input)
            master_train(gages_input)
Example #4
 def test_dam_train(self):
     """just test for one purpose as a case"""
     with torch.cuda.device(2):
         quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                       "quickdata")
         data_dir = os.path.join(quick_data_dir,
                                 "conus-all_90-10_nan-0.0_00-1.0")
         df = GagesModel.load_datamodel(
             data_dir,
             data_source_file_name='data_source.txt',
             stat_file_name='Statistics.json',
             flow_file_name='flow.npy',
             forcing_file_name='forcing.npy',
             attr_file_name='attr.npy',
             f_dict_file_name='dictFactorize.json',
             var_dict_file_name='dictAttribute.json',
             t_s_dict_file_name='dictTimeSpace.json')
         nid_dir = os.path.join(
             "/".join(self.config_data.data_path["DB"].split("/")[:-1]),
             "nid", "quickdata")
         nid_input = NidModel.load_nidmodel(
             nid_dir,
             nid_file=self.nid_file,
             nid_source_file_name='nid_source.txt',
             nid_data_file_name='nid_data.shp')
         gage_main_dam_purpose = unserialize_json(
             os.path.join(nid_dir, "dam_main_purpose_dict.json"))
         data_input = GagesDamDataModel(df, nid_input, True,
                                        gage_main_dam_purpose)
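         # pick a single purpose code as the case ('C' should be flood control in the NID purpose coding)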
         purpose_chosen = 'C'
         gages_input = choose_which_purpose(data_input,
                                            purpose=purpose_chosen)
         master_train(gages_input)
Example #5
 def test_plot_cases(self):
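     """Print the median and mean test statistics for every dam-purpose group."""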
     nid_dir = os.path.join("/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid", "quickdata")
     gage_main_dam_purpose = unserialize_json(os.path.join(nid_dir, "dam_main_purpose_dict.json"))
     gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
     gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
     for i in range(0, gage_main_dam_purpose_unique.size):
         data_model = GagesModel.load_datamodel(self.config_data.data_path["Temp"], gage_main_dam_purpose_unique[i],
                                                data_source_file_name='test_data_source.txt',
                                                stat_file_name='test_Statistics.json',
                                                flow_file_name='test_flow.npy',
                                                forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy',
                                                f_dict_file_name='test_dictFactorize.json',
                                                var_dict_file_name='test_dictAttribute.json',
                                                t_s_dict_file_name='test_dictTimeSpace.json')
         new_temp_dir = os.path.join(data_model.data_source.data_config.model_dict["dir"]["Temp"],
                                     gage_main_dam_purpose_unique[i])
         new_out_dir = os.path.join(data_model.data_source.data_config.model_dict["dir"]["Out"],
                                    gage_main_dam_purpose_unique[i])
         data_model.update_datamodel_dir(new_temp_dir, new_out_dir)
         pred, obs = load_result(new_temp_dir, self.test_epoch)
         pred = pred.reshape(pred.shape[0], pred.shape[1])
         obs = obs.reshape(pred.shape[0], pred.shape[1])
         inds = statError(obs, pred)
         inds_df = pd.DataFrame(inds)
         print(gage_main_dam_purpose_unique[i])
         print(inds_df.median(axis=0))
         print(inds_df.mean(axis=0))
Example #6
    def test_data_temp_test_damcls(self):
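        """Run the trained model on every dam-purpose test set, denormalize the outputs, and save them."""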

        with torch.cuda.device(0):
            nid_dir = os.path.join("/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid", "quickdata")
            gage_main_dam_purpose = unserialize_json(os.path.join(nid_dir, "dam_main_purpose_dict.json"))
            gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
            gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
            for i in range(0, gage_main_dam_purpose_unique.size):
                df = GagesModel.load_datamodel(self.config_data.data_path["Temp"], gage_main_dam_purpose_unique[i],
                                               data_source_file_name='test_data_source.txt',
                                               stat_file_name='test_Statistics.json', flow_file_name='test_flow.npy',
                                               forcing_file_name='test_forcing.npy', attr_file_name='test_attr.npy',
                                               f_dict_file_name='test_dictFactorize.json',
                                               var_dict_file_name='test_dictAttribute.json',
                                               t_s_dict_file_name='test_dictTimeSpace.json')
                new_temp_dir = os.path.join(df.data_source.data_config.model_dict["dir"]["Temp"],
                                            gage_main_dam_purpose_unique[i])
                new_out_dir = os.path.join(df.data_source.data_config.model_dict["dir"]["Out"],
                                           gage_main_dam_purpose_unique[i])
                df.update_datamodel_dir(new_temp_dir, new_out_dir)
                pred, obs = master_test(df, epoch=self.test_epoch)
                basin_area = df.data_source.read_attr(df.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                                                      is_return_dict=False)
                mean_prep = df.data_source.read_attr(df.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                                                     is_return_dict=False)
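                # assuming PPTAVG_BASIN (GAGES-II) is in cm/year, this converts it to mm/day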
                mean_prep = mean_prep / 365 * 10
                pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
                obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
                save_result(new_temp_dir, self.test_epoch, pred, obs)
Example #7
 def test_plot_ts(self):
     """测试可视化代码"""
     # plot time series
     show_me_num = 5
     t_s_dict = unserialize_json(self.t_s_dict_file)
     sites = np.array(t_s_dict["sites_id"])
     t_range = np.array(t_s_dict["t_final_range"])
     plot_ts_obs_pred(self.obs, self.pred, sites, t_range, show_me_num)
Example #8
 def test_plot_ind_map(self):
     """plot nse value on map"""
     t_s_dict = unserialize_json(self.t_s_dict_file)
     sites = np.array(t_s_dict["sites_id"])
     keys = ["NSE"]
     inds_test = subset_of_dict(self.inds, keys)
     # concat sites and inds
     sites_df = pd.DataFrame({"sites": sites, keys[0]: inds_test[keys[0]]})
     plot_ind_map(self.gage_point_file, sites_df)
Example #9
 def test_plot_kuai_cdf(self):
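     """Plot the empirical CDF of the NSE values."""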
     t_s_dict = unserialize_json(self.t_s_dict_file)
     sites = np.array(t_s_dict["sites_id"])
     keys = ["NSE"]
     inds_test = subset_of_dict(self.inds, keys)
     plotCDF([inds_test[keys[0]]],
             ref=None,
             legendLst=["LSTM"],
             linespec=['-', '-', ':', ':', ':'])
Example #10
 def test_plot_pdf_cdf(self):
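     """Plot the PDF and CDF of the NSE values."""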
     t_s_dict = unserialize_json(self.t_s_dict_file)
     sites = np.array(t_s_dict["sites_id"])
     keys = ["NSE"]
     inds_test = subset_of_dict(self.inds, keys)
     x = pd.DataFrame(inds_test)
     # x = inds_test[keys[0]]
     # plot_dist(x)
     plot_pdf_cdf(x, keys[0])
Example #11
    def test_dam_test(self):
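        """Test the dam model, denormalize predictions and observations by basin area and mean precipitation, and save them."""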
        quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                      "quickdata")
        data_dir = os.path.join(quick_data_dir,
                                "conus-all_90-10_nan-0.0_00-1.0")
        data_model_train = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        data_model_test = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')

        gages_model_train = GagesModel.update_data_model(
            self.config_data, data_model_train)
        gages_model_test = GagesModel.update_data_model(
            self.config_data,
            data_model_test,
            train_stat_dict=gages_model_train.stat_dict)
        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "quickdata")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_file=self.nid_file,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                       gage_main_dam_purpose)
        gages_input = choose_which_purpose(data_input)
        pred, obs = master_test(gages_input)
        basin_area = gages_input.data_source.read_attr(
            gages_input.t_s_dict["sites_id"], ['DRAIN_SQKM'],
            is_return_dict=False)
        mean_prep = gages_input.data_source.read_attr(
            gages_input.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
            is_return_dict=False)
        mean_prep = mean_prep / 365 * 10
        pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
        obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
        save_result(gages_input.data_source.data_config.data_path['Temp'],
                    self.test_epoch, pred, obs)
Example #12
 def test_forecast(self):
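     """Rebuild the test data model from its serialized parts, run the test, and serialize the predictions and observations."""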
     source_data = unserialize_pickle(self.data_source_test_file)
     # the data model is stored in parts because serializing it as a whole would be slow: the dicts are serialized directly to JSON files and the array data to npy files
     stat_dict = unserialize_json(self.stat_file)
     data_flow = unserialize_numpy(self.flow_npy_file)
     data_forcing = unserialize_numpy(self.forcing_npy_file)
     data_attr = unserialize_numpy(self.attr_npy_file)
     # dictFactorize.json explains the values of the categorical variables
     var_dict = unserialize_json(self.var_dict_file)
     f_dict = unserialize_json(self.f_dict_file)
     t_s_dict = unserialize_json(self.t_s_dict_file)
     data_model_test = DataModel(source_data, data_flow, data_forcing,
                                 data_attr, var_dict, f_dict, stat_dict,
                                 t_s_dict)
     pred, obs = hydroDL.master_test(data_model_test)
     print(pred)
     print(obs)
     serialize_numpy(pred, self.flow_pred_file)
     serialize_numpy(obs, self.flow_obs_file)
Example #13
 def test_purposes_inds(self):
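     """Group the test sites by main dam purpose and report each group's median and mean statistics."""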
     quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                   "quickdata")
     data_dir = os.path.join(quick_data_dir,
                             "allnonref-dam_95-05_nan-0.1_00-1.0")
     data_model = GagesModel.load_datamodel(
         data_dir,
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow.npy',
         forcing_file_name='test_forcing.npy',
         attr_file_name='test_attr.npy',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')
     gages_data_model = GagesModel.update_data_model(
         self.config_data, data_model)
     nid_dir = os.path.join(
         "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
         "quickdata")
     gage_main_dam_purpose = unserialize_json(
         os.path.join(nid_dir, "dam_main_purpose_dict.json"))
     gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
     gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
     purpose_regions = {}
     for i in range(gage_main_dam_purpose_unique.size):
         sites_id = []
         for key, value in gage_main_dam_purpose.items():
             if value == gage_main_dam_purpose_unique[i]:
                 sites_id.append(key)
         assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
         purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
     id_regions_idx = []
     id_regions_sites_ids = []
     df_id_region = np.array(gages_data_model.t_s_dict["sites_id"])
     for key, value in purpose_regions.items():
         gages_id = value
         c, ind1, ind2 = np.intersect1d(df_id_region,
                                        gages_id,
                                        return_indices=True)
         assert (all(x < y for x, y in zip(ind1, ind1[1:])))
         assert (all(x < y for x, y in zip(c, c[1:])))
         id_regions_idx.append(ind1)
         id_regions_sites_ids.append(c)
     preds, obss, inds_dfs = split_results_to_regions(
         gages_data_model, self.test_epoch, id_regions_idx,
         id_regions_sites_ids)
     region_names = list(purpose_regions.keys())
     inds_medians = []
     inds_means = []
     for i in range(len(region_names)):
         inds_medians.append(inds_dfs[i].median(axis=0))
         inds_means.append(inds_dfs[i].mean(axis=0))
     print(inds_medians)
     print(inds_means)
Example #14
    def test_dam_train(self):
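        """Train the inverse LSTM for every dam purpose, pairing the data models of two configurations."""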
        quick_data_dir = os.path.join(self.config_data_1.data_path["DB"],
                                      "quickdata")
        data_dir = os.path.join(quick_data_dir,
                                "allnonref_85-05_nan-0.1_00-1.0")
        # for the inv model, the train and test data models are the same
        data_model_8595 = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        t_range1_train = self.config_data_1.model_dict["data"]["tRangeTrain"]
        gages_model1_train = GagesModel.update_data_model(
            self.config_data_1,
            data_model_8595,
            t_range_update=t_range1_train,
            data_attr_update=True)
        t_range2_train = self.config_data_2.model_dict["data"]["tRangeTrain"]
        gages_model2_train = GagesModel.update_data_model(
            self.config_data_2,
            data_model_8595,
            t_range_update=t_range2_train,
            data_attr_update=True)
        nid_dir = os.path.join(
            "/".join(self.config_data_1.data_path["DB"].split("/")[:-1]),
            "nid", "quickdata")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_file=self.nid_file,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
        gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)

        with torch.cuda.device(1):
            for i in range(0, gage_main_dam_purpose_unique.size):
                data_input1 = GagesDamDataModel(gages_model1_train, nid_input,
                                                True, gage_main_dam_purpose)
                gages_input1 = choose_which_purpose(
                    data_input1, purpose=gage_main_dam_purpose_unique[i])
                data_input2 = GagesDamDataModel(gages_model2_train, nid_input,
                                                True, gage_main_dam_purpose)
                gages_input2 = choose_which_purpose(
                    data_input2, purpose=gage_main_dam_purpose_unique[i])
                data_model = GagesInvDataModel(gages_input1, gages_input2)
                # pre_trained_model_epoch = 165
                train_lstm_inv(data_model)
Example #15
    def test_gages_dam_all_save(self):
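        """Attach NID dam-purpose data to the test data model and save the resulting data model."""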
        quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                      "quickdata")
        data_dir = os.path.join(quick_data_dir,
                                "conus-all_90-10_nan-0.0_00-1.0")
        data_model_train = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')

        gages_model_train = GagesModel.update_data_model(
            self.config_data, data_model_train)
        data_model_test = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        gages_model_test = GagesModel.update_data_model(
            self.config_data,
            data_model_test,
            train_stat_dict=gages_model_train.stat_dict)
        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "test")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        data_input = GagesDamDataModel(gages_model_test, nid_input,
                                       gage_main_dam_purpose)
        data_model_dam = choose_which_purpose(data_input)
        save_datamodel(data_model_dam,
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
Example #16
 def test_explore_damcls_datamodel(self):
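     """Read the normalized dam storage attribute (STOR_NOR_2009) of the chosen basins and write it to a CSV file."""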
     config_data = self.config_data
     sites_id_dict = unserialize_json(
         "/mnt/data/owen411/code/hydro-anthropogenic-lstm/example/data/gages/nid/test/dam_main_purpose_dict.json")
     sites_id = list(sites_id_dict.keys())
     source_data_dor1 = GagesSource.choose_some_basins(config_data,
                                                       config_data.model_dict["data"]["tRangeTrain"],
                                                       screen_basin_area_huc4=False,
                                                       sites_id=sites_id)
     norsto = source_data_dor1.read_attr(sites_id, ["STOR_NOR_2009"], is_return_dict=False)
     df = pd.DataFrame({"GAGE_ID": sites_id, "STOR_NOR": norsto.flatten()})
     # df.to_csv(os.path.join(source_data_dor1.all_configs["out_dir"], '3557basins_NORSTOR.csv'),
     #           quoting=csv.QUOTE_NONNUMERIC, index=None)
     df.to_csv(os.path.join(source_data_dor1.all_configs["out_dir"], '2909basins_NORSTOR.csv'),
               quoting=csv.QUOTE_NONNUMERIC, index=None)
Example #17
 def test_explore_(self):
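     """Pick the NSE values of the chosen basins out of an earlier result file and write them to a CSV file."""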
     config_data = self.config_data
     sites_id_dict = unserialize_json(
         "/mnt/data/owen411/code/hydro-anthropogenic-lstm/example/data/gages/nid/test/dam_main_purpose_dict.json")
     sites_id = list(sites_id_dict.keys())
     source_data_dor1 = GagesSource.choose_some_basins(config_data,
                                                       config_data.model_dict["data"]["tRangeTrain"],
                                                       screen_basin_area_huc4=False,
                                                       sites_id=sites_id)
     nse_all = pd.read_csv(
         "/mnt/data/owen411/code/hydro-anthropogenic-lstm/example/output/gages/basic/exp37/3557basins_ID_NSE_DOR.csv",
         dtype={0: str})
     sites_ids = nse_all["GAUGE ID"].values
     idx = [i for i in range(len(sites_ids)) if sites_ids[i] in sites_id]
     df = pd.DataFrame({"GAGE_ID": sites_id, "NSE": nse_all["NSE"].values[idx]})
     # df.to_csv(os.path.join(source_data_dor1.all_configs["out_dir"], '3557basins_NORSTOR.csv'),
     #           quoting=csv.QUOTE_NONNUMERIC, index=None)
     df.to_csv(os.path.join(source_data_dor1.all_configs["out_dir"], '2909basins_NSE.csv'),
               quoting=csv.QUOTE_NONNUMERIC, index=None)
Example #18
 def test_dam_train(self):
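     """Train a separate model for every dam-purpose data model saved under the Temp directory."""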
     with torch.cuda.device(0):
         nid_dir = os.path.join("/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid", "quickdata")
         gage_main_dam_purpose = unserialize_json(os.path.join(nid_dir, "dam_main_purpose_dict.json"))
         gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
         gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
         for i in range(0, gage_main_dam_purpose_unique.size):
             df = GagesModel.load_datamodel(self.config_data.data_path["Temp"], gage_main_dam_purpose_unique[i],
                                            data_source_file_name='data_source.txt',
                                            stat_file_name='Statistics.json', flow_file_name='flow.npy',
                                            forcing_file_name='forcing.npy', attr_file_name='attr.npy',
                                            f_dict_file_name='dictFactorize.json',
                                            var_dict_file_name='dictAttribute.json',
                                            t_s_dict_file_name='dictTimeSpace.json')
             new_temp_dir = os.path.join(df.data_source.data_config.model_dict["dir"]["Temp"],
                                         gage_main_dam_purpose_unique[i])
             new_out_dir = os.path.join(df.data_source.data_config.model_dict["dir"]["Out"],
                                        gage_main_dam_purpose_unique[i])
             df.update_datamodel_dir(new_temp_dir, new_out_dir)
             master_train(df)
Example #19
 def test_read_sites_id_see_dor(self):
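     """Check the site ids of several experiments and report each site's degree of regulation (DOR = normal storage / annual runoff)."""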
     exp_lst = ["exp18", "exp19", "exp20", "exp21", "exp22", "exp23"]
     sub_lst = ["0", "1"]
     diff_lst = [
         "dictTimeSpace.json", "test_dictTimeSpace.json",
         "test_dictTimeSpace_2.json"
     ]
     for exp_str in exp_lst:
         for sub_str in sub_lst:
             comp_sites = []
             for item in diff_lst:
                 gage_id_file = os.path.join(
                     self.config_data.config_file["ROOT_DIR"], "temp",
                     "gages", "ecoregion", exp_str, sub_str, item)
                 usgs_id = unserialize_json(gage_id_file)["sites_id"]
                 assert (all(x < y for x, y in zip(usgs_id, usgs_id[1:])))
                 comp_sites.append(usgs_id)
                  # RUNAVE7100: mm/year on a 1-km grid; STOR_*_2009: megaliters of total storage per sq km (1 megaliter = 1,000,000 liters = 1,000 cubic meters)
                  # attr_lst = ["RUNAVE7100", "STOR_NID_2009"]
                 attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
                 source_data = GagesSource.choose_some_basins(
                     self.config_data,
                     self.config_data.model_dict["data"]["tRangeTrain"],
                     screen_basin_area_huc4=False,
                     sites_id=usgs_id)
                 data_attr, var_dict, f_dict = source_data.read_attr(
                     usgs_id, attr_lst)
                  run_avg = data_attr[:, 0] * (10**(-3)) * (10**6)  # m^3 per year
                 nor_storage = data_attr[:, 1] * 1000  # m^3
                 dors = nor_storage / run_avg
                 results = [round(i, 3) for i in dors]
                 hydro_logger.info(
                     exp_str + "-" + sub_str + "-" + item + " DOR: %s",
                     results)
             hydro_logger.info(
                 "the intersection of each pair of sites: %s, %s, %s",
                 np.intersect1d(comp_sites[0], comp_sites[1]),
                 np.intersect1d(comp_sites[0], comp_sites[2]),
                 np.intersect1d(comp_sites[1], comp_sites[2]))
Example #20
    def test_damcls_test_datamodel(self):
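        """Split the test data model by main dam purpose and save one data model per purpose."""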
        quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
        data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")
        data_model_train = GagesModel.load_datamodel(data_dir,
                                                     data_source_file_name='data_source.txt',
                                                     stat_file_name='Statistics.json', flow_file_name='flow.npy',
                                                     forcing_file_name='forcing.npy', attr_file_name='attr.npy',
                                                     f_dict_file_name='dictFactorize.json',
                                                     var_dict_file_name='dictAttribute.json',
                                                     t_s_dict_file_name='dictTimeSpace.json')
        data_model_test = GagesModel.load_datamodel(data_dir,
                                                    data_source_file_name='test_data_source.txt',
                                                    stat_file_name='test_Statistics.json',
                                                    flow_file_name='test_flow.npy',
                                                    forcing_file_name='test_forcing.npy',
                                                    attr_file_name='test_attr.npy',
                                                    f_dict_file_name='test_dictFactorize.json',
                                                    var_dict_file_name='test_dictAttribute.json',
                                                    t_s_dict_file_name='test_dictTimeSpace.json')

        gages_model_train = GagesModel.update_data_model(self.config_data, data_model_train)
        df = GagesModel.update_data_model(self.config_data, data_model_test,
                                          train_stat_dict=gages_model_train.stat_dict)
        nid_dir = os.path.join("/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid", "quickdata")
        nid_input = NidModel.load_nidmodel(nid_dir, nid_file=self.nid_file,
                                           nid_source_file_name='nid_source.txt', nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
        gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
        data_input = GagesDamDataModel(df, nid_input, True, gage_main_dam_purpose)
        for i in range(gage_main_dam_purpose_unique.size):
            gages_input = choose_which_purpose(data_input, purpose=gage_main_dam_purpose_unique[i])
            save_datamodel(gages_input, gage_main_dam_purpose_unique[i], data_source_file_name='test_data_source.txt',
                           stat_file_name='test_Statistics.json', flow_file_name='test_flow',
                           forcing_file_name='test_forcing', attr_file_name='test_attr',
                           f_dict_file_name='test_dictFactorize.json', var_dict_file_name='test_dictAttribute.json',
                           t_s_dict_file_name='test_dictTimeSpace.json')
Example #21
    def test_gages_dam_stor_hist_basin(self):
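        """Plot histograms of the individual dam storages of four randomly chosen large-dam basins (screened by DOR = 0.02)."""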
        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "test")
        dam_storages = unserialize_json(
            os.path.join(nid_dir, "dam_storages_dict.json"))

        sites = np.array(list(dam_storages.keys()))

        dor_2 = 0.02
        source_data_dor2 = GagesSource.choose_some_basins(
            self.config_data,
            self.config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=dor_2)
        sites_id_largedam = source_data_dor2.all_configs['flow_screen_gage_id']
        c, ind1, idx_lst_nse_range = np.intersect1d(sites,
                                                    sites_id_largedam,
                                                    return_indices=True)

        num = 4
        num_lst = np.sort(np.random.choice(len(c), num, replace=False))
        chosen_sites = c[num_lst]
        hist_bins = 20

        fig = plt.figure(figsize=(8, 9))
        gs = gridspec.GridSpec(2, 2)

        for i in range(num):
            ax_k = plt.subplot(gs[int(i / 2), i % 2])
            ax_k.hist(dam_storages[chosen_sites[i]],
                      hist_bins,
                      orientation='vertical',
                      color='red',
                      alpha=0.5)
        plt.show()
Example #22
 def test_dam_test(self):
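     """Test the two-period inverse LSTM, denormalize the outputs, and save the results."""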
     quick_data_dir = os.path.join(self.config_data_1.data_path["DB"],
                                   "quickdata")
     data_dir = os.path.join(quick_data_dir,
                             "allnonref_85-05_nan-0.1_00-1.0")
      # for the inv model, the train and test data models are the same
     data_model_8595 = GagesModel.load_datamodel(
         data_dir,
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow.npy',
         forcing_file_name='forcing.npy',
         attr_file_name='attr.npy',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
      # for the 2nd model, the train and test data models both come from parts of the test period
     data_model_9505 = GagesModel.load_datamodel(
         data_dir,
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow.npy',
         forcing_file_name='test_forcing.npy',
         attr_file_name='test_attr.npy',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')
     t_range1_test = self.config_data_1.model_dict["data"]["tRangeTest"]
      # Because the data of period "90-95" is known, its statistics can be computed from that period
     gages_model1_test = GagesModel.update_data_model(
         self.config_data_1,
         data_model_8595,
         t_range_update=t_range1_test,
         data_attr_update=True)
     t_range2_train = self.config_data_2.model_dict["data"]["tRangeTrain"]
     t_range2_test = self.config_data_2.model_dict["data"]["tRangeTest"]
     gages_model2_train = GagesModel.update_data_model(
         self.config_data_2,
         data_model_8595,
         t_range_update=t_range2_train,
         data_attr_update=True)
     gages_model2_test = GagesModel.update_data_model(
         self.config_data_2,
         data_model_9505,
         t_range_update=t_range2_test,
         data_attr_update=True,
         train_stat_dict=gages_model2_train.stat_dict)
     nid_dir = os.path.join(
         "/".join(self.config_data_2.data_path["DB"].split("/")[:-1]),
         "nid", "quickdata")
     nid_input = NidModel.load_nidmodel(
         nid_dir,
         nid_file=self.nid_file,
         nid_source_file_name='nid_source.txt',
         nid_data_file_name='nid_data.shp')
     gage_main_dam_purpose = unserialize_json(
         os.path.join(nid_dir, "dam_main_purpose_dict.json"))
     data_input1 = GagesDamDataModel(gages_model1_test, nid_input, True,
                                     gage_main_dam_purpose)
     df1 = choose_which_purpose(data_input1)
     data_input2 = GagesDamDataModel(gages_model2_test, nid_input, True,
                                     gage_main_dam_purpose)
     df2 = choose_which_purpose(data_input2)
     with torch.cuda.device(2):
         data_model = GagesInvDataModel(df1, df2)
         pred, obs = test_lstm_inv(data_model, epoch=self.test_epoch)
         basin_area = df2.data_source.read_attr(df2.t_s_dict["sites_id"],
                                                ['DRAIN_SQKM'],
                                                is_return_dict=False)
         mean_prep = df2.data_source.read_attr(df2.t_s_dict["sites_id"],
                                               ['PPTAVG_BASIN'],
                                               is_return_dict=False)
         mean_prep = mean_prep / 365 * 10
         pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
         obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
         save_result(df2.data_source.data_config.data_path['Temp'],
                     self.test_epoch, pred, obs)
Example #23
                                    flow_file_name='flow.npy',
                                    forcing_file_name='forcing.npy',
                                    attr_file_name='attr.npy',
                                    f_dict_file_name='dictFactorize.json',
                                    var_dict_file_name='dictAttribute.json',
                                    t_s_dict_file_name='dictTimeSpace.json')
     nid_input = NidModel(cfg)
     nid_dir = os.path.join(cfg.NID.NID_DIR, "test")
     save_nidinput(nid_input,
                   nid_dir,
                   nid_source_file_name='nid_source.txt',
                   nid_data_file_name='nid_data.shp')
     data_input = GagesDamDataModel(df, nid_input)
     serialize_json(data_input.gage_main_dam_purpose,
                    os.path.join(nid_dir, "dam_main_purpose_dict.json"))
 gage_main_dam_purpose = unserialize_json(nid_gene_file)
 gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
 gage_main_dam_purpose_lst_merge = "".join(gage_main_dam_purpose_lst)
 gage_main_dam_purpose_unique = np.unique(
     list(gage_main_dam_purpose_lst_merge))
 # gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
 purpose_regions = {}
 for i in range(gage_main_dam_purpose_unique.size):
     sites_id = []
     for key, value in gage_main_dam_purpose.items():
         if gage_main_dam_purpose_unique[i] in value:
             sites_id.append(key)
     assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
     purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
 id_regions_idx = []
 id_regions_sites_ids = []
Example #24
 def test_purposes_seperate(self):
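     """Evaluate the test results of every dam-purpose group and save box, time-series, ECDF, and map figures."""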
     quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                   "quickdata")
     data_dir = os.path.join(quick_data_dir,
                             "allnonref-dam_95-05_nan-0.1_00-1.0")
     data_model_test = GagesModel.load_datamodel(
         data_dir,
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow.npy',
         forcing_file_name='test_forcing.npy',
         attr_file_name='test_attr.npy',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')
     data_model = GagesModel.update_data_model(self.config_data,
                                               data_model_test)
     nid_dir = os.path.join(
         "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
         "quickdata")
     gage_main_dam_purpose = unserialize_json(
         os.path.join(nid_dir, "dam_main_purpose_dict.json"))
     gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
     gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
     purpose_regions = {}
     for i in range(gage_main_dam_purpose_unique.size):
         sites_id = []
         for key, value in gage_main_dam_purpose.items():
             if value == gage_main_dam_purpose_unique[i]:
                 sites_id.append(key)
         assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
         purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
     id_regions_idx = []
     id_regions_sites_ids = []
     df_id_region = np.array(data_model.t_s_dict["sites_id"])
     for key, value in purpose_regions.items():
         gages_id = value
         c, ind1, ind2 = np.intersect1d(df_id_region,
                                        gages_id,
                                        return_indices=True)
         assert (all(x < y for x, y in zip(ind1, ind1[1:])))
         assert (all(x < y for x, y in zip(c, c[1:])))
         id_regions_idx.append(ind1)
         id_regions_sites_ids.append(c)
     pred_all, obs_all = load_result(self.config_data.data_path["Temp"],
                                     self.test_epoch)
     pred_all = pred_all.reshape(pred_all.shape[0], pred_all.shape[1])
     obs_all = obs_all.reshape(obs_all.shape[0], obs_all.shape[1])
     for i in range(9, len(gage_main_dam_purpose_unique)):
         pred = pred_all[id_regions_idx[i], :]
         obs = obs_all[id_regions_idx[i], :]
         inds = statError(obs, pred)
         inds['STAID'] = id_regions_sites_ids[i]
         inds_df = pd.DataFrame(inds)
         inds_df.to_csv(
             os.path.join(
                 self.config_data.data_path["Out"],
                 gage_main_dam_purpose_unique[i] + "epoch" +
                 str(self.test_epoch) + 'data_df.csv'))
         # plot boxplots with the seaborn library
         keys = ["Bias", "RMSE", "NSE"]
         inds_test = subset_of_dict(inds, keys)
         box_fig = plot_diff_boxes(inds_test)
         box_fig.savefig(
             os.path.join(
                 self.config_data.data_path["Out"],
                 gage_main_dam_purpose_unique[i] + "epoch" +
                 str(self.test_epoch) + "box_fig.png"))
         # plot ts
         sites = np.array(df_id_region[id_regions_idx[i]])
         t_range = np.array(data_model.t_s_dict["t_final_range"])
         show_me_num = 1
         ts_fig = plot_ts_obs_pred(obs, pred, sites, t_range, show_me_num)
         ts_fig.savefig(
             os.path.join(
                 self.config_data.data_path["Out"],
                 gage_main_dam_purpose_unique[i] + "epoch" +
                 str(self.test_epoch) + "ts_fig.png"))
         # plot nse ecdf
         sites_df_nse = pd.DataFrame({
             "sites": sites,
             keys[2]: inds_test[keys[2]]
         })
         plot_ecdf(
             sites_df_nse, keys[2],
             os.path.join(
                 self.config_data.data_path["Out"],
                 gage_main_dam_purpose_unique[i] + "epoch" +
                 str(self.test_epoch) + "ecdf_fig.png"))
         # plot map
         gauge_dict = data_model.data_source.gage_dict
         save_map_file = os.path.join(
             self.config_data.data_path["Out"],
             gage_main_dam_purpose_unique[i] + "epoch" +
             str(self.test_epoch) + "map_fig.png")
         plot_map(gauge_dict,
                  sites_df_nse,
                  save_file=save_map_file,
                  id_col="STAID",
                  lon_col="LNG_GAGE",
                  lat_col="LAT_GAGE")
Example #25
    def test_gages_nse_dam_attr(self):
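        """Explore how NSE relates to dam attributes: DOR, dam count, dam storage variability, and dam-gage dispersion."""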
        figure_dpi = 600
        config_data = self.config_data
        data_dir = config_data.data_path["Temp"]
        data_model = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        gages_id = data_model.t_s_dict["sites_id"]

        exp_lst = [
            "basic_exp37", "basic_exp39", "basic_exp40", "basic_exp41",
            "basic_exp42", "basic_exp43"
        ]
        self.inds_df, pred_mean, obs_mean = load_ensemble_result(
            config_data.config_file,
            exp_lst,
            config_data.config_file.TEST_EPOCH,
            return_value=True)
        show_ind_key = 'NSE'

        plt.rcParams['font.family'] = 'serif'
        plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']
        # plot NSE-DOR
        attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
        attrs_runavg_stor = data_model.data_source.read_attr(
            gages_id, attr_lst, is_return_dict=False)
        run_avg = attrs_runavg_stor[:, 0] * (10**(-3)) * (10**6)  # m^3 per year
        nor_storage = attrs_runavg_stor[:, 1] * 1000  # m^3
        dors = nor_storage / run_avg
        # dor == 0 is not exactly the same as dam_num == 0 (some dammed basins have a DOR of about 0.00);
        # zero-DOR basins are therefore identified mainly by dam_num == 0
        attr_dam_num = ["NDAMS_2009"]
        attrs_dam_num = data_model.data_source.read_attr(gages_id,
                                                         attr_dam_num,
                                                         is_return_dict=False)
        df = pd.DataFrame({
            "DOR": dors,
            "DAM_NUM": attrs_dam_num[:, 0],
            show_ind_key: self.inds_df[show_ind_key].values
        })
        hydro_logger.info("statistics of dors:\n %s", df.describe())
        hydro_logger.info("percentiles of dors:\n %s", df.quantile(q=0.95))
        hydro_logger.info("ecdf of dors:\n %s", ecdf(dors))

        # boxplot
        # add a column to represent the dor range for the df
        dor_value_range_lst = [[0, 0], [0, 0.02], [0.02, 0.05], [0.05, 0.1],
                               [0.1, 0.2], [0.2, 0.4], [0.4, 0.8],
                               [0.8, 10000]]
        dor_range_lst = ["0"] + [
            str(dor_value_range_lst[i][0]) + "-" +
            str(dor_value_range_lst[i][1])
            for i in range(1,
                           len(dor_value_range_lst) - 1)
        ] + [">" + str(dor_value_range_lst[-1][0])]

        # add a column to represent the dam_num range for the df
        dam_num_value_range_lst = [[0, 0], [0, 1], [1, 3], [3, 5], [5, 10],
                                   [10, 20], [20, 50], [50, 10000]]
        dam_num_range_lst = ["0", "1"] + [
            str(dam_num_value_range_lst[i][0]) + "-" +
            str(dam_num_value_range_lst[i][1])
            for i in range(2,
                           len(dam_num_value_range_lst) - 1)
        ] + [">" + str(dam_num_value_range_lst[-1][0])]

        def in_which_range(value_temp):
            if value_temp == 0:
                return "0"
            the_range = [
                a_range for a_range in dor_value_range_lst
                if a_range[0] < value_temp <= a_range[1]
            ]
            if the_range[0][0] == dor_value_range_lst[-1][0]:
                the_range_str = ">" + str(the_range[0][0])
            else:
                the_range_str = str(the_range[0][0]) + "-" + str(
                    the_range[0][1])
            return the_range_str

        def in_which_dam_num_range(value_tmp):
            if value_tmp == 0:
                return "0"
            if value_tmp == 1:
                return "1"
            the_ran = [
                a_ran for a_ran in dam_num_value_range_lst
                if a_ran[0] < value_tmp <= a_ran[1]
            ]
            if the_ran[0][0] == dam_num_value_range_lst[-1][0]:
                the_ran_str = ">" + str(the_ran[0][0])
            else:
                the_ran_str = str(the_ran[0][0]) + "-" + str(the_ran[0][1])
            return the_ran_str

        df["DOR_RANGE"] = df["DOR"].apply(in_which_range)
        df["DAM_NUM_RANGE"] = df["DAM_NUM"].apply(in_which_dam_num_range)
        df.loc[(df["DAM_NUM"] > 0) & (df["DOR_RANGE"] == "0"),
               "DOR_RANGE"] = dor_range_lst[1]
        shown_nse_range_boxplots = [-0.5, 1.0]
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        plot_boxs(df,
                  "DOR_RANGE",
                  show_ind_key,
                  ylim=shown_nse_range_boxplots,
                  order=dor_range_lst)
        plt.savefig(os.path.join(
            config_data.data_path["Out"],
            'NSE~DOR-boxplots-' + str(shown_nse_range_boxplots) + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")
        plt.figure()
        shown_nse_range_boxplots = [0, 1.0]
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        plot_boxs(df,
                  "DAM_NUM_RANGE",
                  show_ind_key,
                  ylim=shown_nse_range_boxplots,
                  order=dam_num_range_lst)
        plt.savefig(os.path.join(
            config_data.data_path["Out"],
            'NSE~DAM_NUM-boxplots-' + str(shown_nse_range_boxplots) + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")
        nums_in_dor_range = [
            df[df["DOR_RANGE"] == a_range_rmp].shape[0]
            for a_range_rmp in dor_range_lst
        ]
        ratios_in_dor_range = [
            a_num / df.shape[0] for a_num in nums_in_dor_range
        ]
        hydro_logger.info(
            "the number and ratio of basins in each dor range\n: %s \n %s",
            nums_in_dor_range, ratios_in_dor_range)

        nums_in_dam_num_range = [
            df[df["DAM_NUM_RANGE"] == a_range_rmp].shape[0]
            for a_range_rmp in dam_num_range_lst
        ]
        ratios_in_dam_num_range = [
            a_num / df.shape[0] for a_num in nums_in_dam_num_range
        ]
        hydro_logger.info(
            "the number and ratio of basins in each dam_num range\n: %s \n %s",
            nums_in_dam_num_range, ratios_in_dam_num_range)

        # regplot
        plt.figure()
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        sr = sns.regplot(x="DOR",
                         y=show_ind_key,
                         data=df[df[show_ind_key] >= 0],
                         scatter_kws={'s': 10})
        show_dor_max = df.quantile(q=0.95)["DOR"]  # other choices: 30, max(dors), 0.8, 10
        show_dor_min = min(dors)
        plt.ylim(0, 1)
        plt.xlim(show_dor_min, show_dor_max)
        plt.savefig(os.path.join(
            config_data.data_path["Out"],
            'NSE~DOR-shown-max-' + str(show_dor_max) + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")

        # jointplot
        # dor_range = [0.2, 0.9]
        dor_range = [0.002, 0.2]
        # plt.figure()
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        # g = sns.jointplot(x="DOR", y=show_ind_key, data=df[(df["DOR"] < 1) & (df[show_ind_key] >= 0)], kind="reg",
        #                   marginal_kws=dict(bins=25))
        # g = sns.jointplot(x="DOR", y=show_ind_key, data=df[(df["DOR"] < 1) & (df[show_ind_key] >= 0)], kind="hex",
        #                   color="b", marginal_kws=dict(bins=50))
        g = sns.jointplot(
            x="DOR",
            y=show_ind_key,
            data=df[(df["DOR"] < dor_range[1]) & (df["DOR"] > dor_range[0]) &
                    (df[show_ind_key] >= 0)],
            kind="hex",
            color="b")
        g.ax_marg_x.set_xlim(dor_range[0], dor_range[1])
        # g.ax_marg_y.set_ylim(-0.5, 1)
        plt.savefig(os.path.join(
            config_data.data_path["Out"],
            'NSE~DOR(range-)' + str(dor_range) + '-jointplot.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")

        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "test")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        data_input = GagesDamDataModel(data_model, nid_input,
                                       gage_main_dam_purpose)
        dam_coords = unserialize_json_ordered(
            os.path.join(nid_dir, "dam_points_dict.json"))
        dam_storages = unserialize_json_ordered(
            os.path.join(nid_dir, "dam_storages_dict.json"))
        dam_ids_1 = list(gage_main_dam_purpose.keys())
        dam_ids_2 = list(dam_coords.keys())
        dam_ids_3 = list(dam_storages.keys())
        assert (all(x < y for x, y in zip(dam_ids_1, dam_ids_1[1:])))
        assert (all(x < y for x, y in zip(dam_ids_2, dam_ids_2[1:])))
        assert (all(x < y for x, y in zip(dam_ids_3, dam_ids_3[1:])))

        sites = list(dam_coords.keys())
        c, ind1, idx_lst_nse_range = np.intersect1d(sites,
                                                    gages_id,
                                                    return_indices=True)

        std_storage_in_a_basin = list(map(np.std, dam_storages.values()))
        log_std_storage_in_a_basin = list(
            map(np.log,
                np.array(std_storage_in_a_basin) + 1))
        nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
        df = pd.DataFrame({
            "DAM_STORAGE_STD": log_std_storage_in_a_basin,
            show_ind_key: nse_values
        })
        plt.figure()
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        g = sns.regplot(x="DAM_STORAGE_STD",
                        y=show_ind_key,
                        data=df[df[show_ind_key] >= 0],
                        scatter_kws={'s': 10})
        show_max = max(log_std_storage_in_a_basin)
        show_min = min(log_std_storage_in_a_basin)
        if show_min < 0:
            show_min = 0
        # g.ax_marg_x.set_xlim(show_min, show_max)
        # g.ax_marg_y.set_ylim(0, 1)
        plt.ylim(0, 1)
        plt.xlim(show_min, show_max)
        plt.savefig(os.path.join(config_data.data_path["Out"],
                                 'NSE~' + "DAM_STORAGE_STD" + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")

        gages_loc_lat = data_model.data_source.gage_dict["LAT_GAGE"]
        gages_loc_lon = data_model.data_source.gage_dict["LNG_GAGE"]
        gages_loc = [[gages_loc_lat[i], gages_loc_lon[i]]
                     for i in range(len(gages_id))]
        # calculate the index of dispersion, then plot the NSE-dispersion scatterplot
        # the geographic coordinate systems of gages_loc and dam_coords are both NAD83
        coefficient_of_var = list(
            map(coefficient_of_variation, gages_loc, dam_coords.values()))
        coefficient_of_var_min = min(coefficient_of_var)
        coefficient_of_var_max = max(coefficient_of_var)
        dispersion_var = "DAM_GAGE_DIS_VAR"
        nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
        df = pd.DataFrame({
            dispersion_var: coefficient_of_var,
            show_ind_key: nse_values
        })
        plt.figure()
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        g = sns.regplot(x=dispersion_var,
                        y=show_ind_key,
                        data=df[df[show_ind_key] >= 0],
                        scatter_kws={'s': 10})
        show_max = coefficient_of_var_max
        show_min = coefficient_of_var_min
        if show_min < 0:
            show_min = 0
        # g.ax_marg_x.set_xlim(show_min, show_max)
        # g.ax_marg_y.set_ylim(0, 1)
        plt.ylim(0, 1)
        plt.xlim(show_min, show_max)
        plt.savefig(os.path.join(config_data.data_path["Out"],
                                 'NSE~' + dispersion_var + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")

        idx_dispersions = list(
            map(ind_of_dispersion, gages_loc, dam_coords.values()))
        idx_dispersion_min = min(idx_dispersions)
        idx_dispersion_max = max(idx_dispersions)
        dispersion_var = "DAM_DISPERSION_BASIN"
        # nse_range = [0, 1]
        # idx_lst_nse_range = inds_df_now[(inds_df_now[show_ind_key] >= nse_range[0]) & (inds_df_now[show_ind_key] < nse_range[1])].index.tolist()
        nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
        df = pd.DataFrame({
            dispersion_var: idx_dispersions,
            show_ind_key: nse_values
        })
        # g = sns.regplot(x=dispersion_var, y=show_ind_key, data=df[df[show_ind_key] >= 0], scatter_kws={'s': 10})
        if idx_dispersion_min < 0:
            idx_dispersion_min = 0
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        g = sns.jointplot(x=dispersion_var,
                          y=show_ind_key,
                          data=df[df[show_ind_key] >= 0],
                          kind="reg")
        # set axis limits after the jointplot exists; calling plt.xlim/plt.ylim
        # beforehand would act on the previous (already saved) figure
        g.ax_joint.set_xlim(idx_dispersion_min, idx_dispersion_max)
        g.ax_joint.set_ylim(0, 1)
        g.ax_marg_x.set_xlim(idx_dispersion_min, idx_dispersion_max)
        g.ax_marg_y.set_ylim(0, 1)
        plt.show()
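
Note: `coefficient_of_variation` and `ind_of_dispersion` above are project
utilities whose source is not shown here. A minimal sketch of what such a
gage-to-dam dispersion statistic could look like, assuming plain planar
distances on the NAD83 lat/lon pairs (the function and argument names are
hypothetical):

import numpy as np

def dam_gage_dispersion(gage_loc, dam_points):
    """Sketch: spread of dam-to-gage distances within one basin.

    gage_loc: [lat, lon] of the gage; dam_points: iterable of [lat, lon] dams.
    Returns std/mean of the distances, a coefficient-of-variation-like value.
    """
    dists = np.array([np.hypot(p[0] - gage_loc[0], p[1] - gage_loc[1])
                      for p in dam_points])
    mean_dist = dists.mean()
    return dists.std() / mean_dist if mean_dist > 0 else 0.0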
Exemple #26
0
 def test_dam_train(self):
     quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                   "quickdata")
     sim_data_dir = os.path.join(quick_data_dir,
                                 "allref_85-05_nan-0.1_00-1.0")
     data_dir = os.path.join(quick_data_dir,
                             "allnonref_85-05_nan-0.1_00-1.0")
     data_model_sim8595 = GagesModel.load_datamodel(
         sim_data_dir,
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow.npy',
         forcing_file_name='forcing.npy',
         attr_file_name='attr.npy',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
     data_model_8595 = GagesModel.load_datamodel(
         data_dir,
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow.npy',
         forcing_file_name='forcing.npy',
         attr_file_name='attr.npy',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
     sim_gages_model_train = GagesModel.update_data_model(
         self.sim_config_data, data_model_sim8595, data_attr_update=True)
     gages_model_train = GagesModel.update_data_model(self.config_data,
                                                      data_model_8595,
                                                      data_attr_update=True)
     nid_dir = os.path.join(
         "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
         "quickdata")
     gage_main_dam_purpose = unserialize_json(
         os.path.join(nid_dir, "dam_main_purpose_dict.json"))
     gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
     gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
     nid_input = NidModel.load_nidmodel(
         nid_dir,
         nid_file=self.nid_file,
         nid_source_file_name='nid_source.txt',
         nid_data_file_name='nid_data.shp')
     data_input = GagesDamDataModel(gages_model_train, nid_input, True,
                                    gage_main_dam_purpose)
     with torch.cuda.device(0):
         for i in range(0, gage_main_dam_purpose_unique.size):
             sim_gages_model_train.update_model_param('train', nEpoch=300)
             gages_input = choose_which_purpose(
                 data_input, purpose=gage_main_dam_purpose_unique[i])
             new_temp_dir = os.path.join(
                 gages_input.data_source.data_config.model_dict["dir"]
                 ["Temp"], gage_main_dam_purpose_unique[i])
             new_out_dir = os.path.join(
                 gages_input.data_source.data_config.model_dict["dir"]
                 ["Out"], gage_main_dam_purpose_unique[i])
             gages_input.update_datamodel_dir(new_temp_dir, new_out_dir)
             data_model = GagesSimDataModel(sim_gages_model_train,
                                            gages_input)
             # pre_trained_model_epoch = 25
             # master_train_natural_flow(data_model, pre_trained_model_epoch=pre_trained_model_epoch)
             master_train_natural_flow(data_model)
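
Note: `choose_which_purpose` subsets the dam data model to sites whose main dam
purpose matches. A minimal sketch of the underlying grouping idea, assuming a
{site_id: purpose} dict like dam_main_purpose_dict.json (the helper name is
hypothetical):

def sites_by_purpose(gage_main_dam_purpose):
    # invert {site_id: purpose} into {purpose: [site_id, ...]}
    groups = {}
    for site_id, purpose in gage_main_dam_purpose.items():
        groups.setdefault(purpose, []).append(site_id)
    return groups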
Exemple #27
0
    def test_dam_test(self):
        quick_data_dir = os.path.join(self.config_data.data_path["DB"],
                                      "quickdata")
        sim_data_dir = os.path.join(quick_data_dir,
                                    "allref_85-05_nan-0.1_00-1.0")
        data_dir = os.path.join(quick_data_dir,
                                "allnonref_85-05_nan-0.1_00-1.0")
        data_model_sim8595 = GagesModel.load_datamodel(
            sim_data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        data_model_8595 = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        data_model_sim9505 = GagesModel.load_datamodel(
            sim_data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        data_model_9505 = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')

        sim_gages_model_train = GagesModel.update_data_model(
            self.sim_config_data, data_model_sim8595, data_attr_update=True)
        gages_model_train = GagesModel.update_data_model(self.config_data,
                                                         data_model_8595,
                                                         data_attr_update=True)
        sim_gages_model_test = GagesModel.update_data_model(
            self.sim_config_data,
            data_model_sim9505,
            data_attr_update=True,
            train_stat_dict=sim_gages_model_train.stat_dict)
        gages_model_test = GagesModel.update_data_model(
            self.config_data,
            data_model_9505,
            data_attr_update=True,
            train_stat_dict=gages_model_train.stat_dict)
        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "quickdata")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_file=self.nid_file,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
        gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
        data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                       gage_main_dam_purpose)
        for i in range(0, gage_main_dam_purpose_unique.size):
            sim_gages_model_test.update_model_param('train', nEpoch=300)
            gages_input = choose_which_purpose(
                data_input, purpose=gage_main_dam_purpose_unique[i])
            new_temp_dir = os.path.join(
                gages_input.data_source.data_config.model_dict["dir"]["Temp"],
                gage_main_dam_purpose_unique[i])
            new_out_dir = os.path.join(
                gages_input.data_source.data_config.model_dict["dir"]["Out"],
                gage_main_dam_purpose_unique[i])
            gages_input.update_datamodel_dir(new_temp_dir, new_out_dir)
            model_input = GagesSimDataModel(sim_gages_model_test, gages_input)
            pred, obs = master_test_natural_flow(model_input,
                                                 epoch=self.test_epoch)
            basin_area = model_input.data_model2.data_source.read_attr(
                model_input.data_model2.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                is_return_dict=False)
            mean_prep = model_input.data_model2.data_source.read_attr(
                model_input.data_model2.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                is_return_dict=False)
            mean_prep = mean_prep / 365 * 10
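            # PPTAVG_BASIN is mean annual precipitation in cm;
            # /365 * 10 converts it to mm per day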
            pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
            obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
            save_result(
                model_input.data_model2.data_source.data_config.
                data_path['Temp'], str(self.test_epoch), pred, obs)
            plot_we_need(gages_input,
                         obs,
                         pred,
                         id_col="STAID",
                         lon_col="LNG_GAGE",
                         lat_col="LAT_GAGE")
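
Note: `_basin_norm` rescales between the normalized runoff the network is
trained on and streamflow, using basin area and mean precipitation. A rough
sketch of the inverse direction used above (to_norm=False); the shapes and
unit constants are assumptions, not necessarily the project's exact formula:

import numpy as np

def basin_denorm(runoff, basin_area, mean_prep):
    """Sketch: undo area/precipitation normalization (assumed units).

    runoff: (n_sites, n_time) dimensionless; basin_area: (n_sites, 1) in km^2;
    mean_prep: (n_sites, 1) in mm/day. Returns flow in ft^3/s.
    """
    area = np.tile(basin_area, (1, runoff.shape[1])) * 10**6    # km^2 -> m^2
    prep = np.tile(mean_prep, (1, runoff.shape[1])) * 10**(-3)  # mm/day -> m/day
    return runoff * area * prep / (0.0283168 * 24 * 3600)       # m^3/day -> cfs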
Exemple #28
0
    def test_3factors(self):
        data_model = self.data_model
        config_data = self.config_data
        test_epoch = self.test_epoch
        # plot three factors
        attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
        usgs_id = data_model.t_s_dict["sites_id"]
        attrs_runavg_stor = data_model.data_source.read_attr(
            usgs_id, attr_lst, is_return_dict=False)
        run_avg = attrs_runavg_stor[:, 0] * (10**(-3)) * (10**6)  # m^3 per year
        nor_storage = attrs_runavg_stor[:, 1] * 1000  # m^3
        dors_value = nor_storage / run_avg
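        # dor: degree of regulation, i.e. normalized reservoir storage divided
        # by mean annual runoff volume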
        dors = np.full(len(usgs_id), "dor<0.02")
        for i in range(len(usgs_id)):
            if dors_value[i] >= 0.02:
                dors[i] = "dor≥0.02"

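        # "no " is padded to three characters so the fixed-width numpy string
        # array can also hold "yes"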
        diversions = np.full(len(usgs_id), "no ")
        diversion_strs = ["diversion", "divert"]
        attr_lst = ["WR_REPORT_REMARKS", "SCREENING_COMMENTS"]
        data_attr = data_model.data_source.read_attr_origin(usgs_id, attr_lst)
        diversion_strs_lower = [elem.lower() for elem in diversion_strs]
        data_attr0_lower = np.array([
            elem.lower() if type(elem) == str else elem
            for elem in data_attr[0]
        ])
        data_attr1_lower = np.array([
            elem.lower() if type(elem) == str else elem
            for elem in data_attr[1]
        ])
        data_attr_lower = np.vstack((data_attr0_lower, data_attr1_lower)).T
        for i in range(len(usgs_id)):
            if is_any_elem_in_a_lst(diversion_strs_lower,
                                    data_attr_lower[i],
                                    include=True):
                diversions[i] = "yes"

        nid_dir = os.path.join(
            "/".join(config_data.data_path["DB"].split("/")[:-1]), "nid",
            "test")
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
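        # each value is a string of single-letter dam-purpose codes; join all
        # strings and take the unique characters to enumerate the codes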
        gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
        gage_main_dam_purpose_lst_merge = "".join(gage_main_dam_purpose_lst)
        gage_main_dam_purpose_unique = np.unique(
            list(gage_main_dam_purpose_lst_merge))
        # gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
        purpose_regions = {}
        for i in range(gage_main_dam_purpose_unique.size):
            sites_id = []
            for key, value in gage_main_dam_purpose.items():
                if gage_main_dam_purpose_unique[i] in value:
                    sites_id.append(key)
            assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
            purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
        id_regions_idx = []
        id_regions_sites_ids = []
        regions_name = []
        show_min_num = 10
        df_id_region = np.array(data_model.t_s_dict["sites_id"])
        for key, value in purpose_regions.items():
            gages_id = value
            c, ind1, ind2 = np.intersect1d(df_id_region,
                                           gages_id,
                                           return_indices=True)
            if c.size < show_min_num:
                continue
            assert (all(x < y for x, y in zip(ind1, ind1[1:])))
            assert (all(x < y for x, y in zip(c, c[1:])))
            id_regions_idx.append(ind1)
            id_regions_sites_ids.append(c)
            regions_name.append(key)
        preds, obss, inds_dfs = split_results_to_regions(
            data_model, test_epoch, id_regions_idx, id_regions_sites_ids)
        frames = []
        x_name = "purposes"
        y_name = "NSE"
        hue_name = "DOR"
        col_name = "diversion"
        for i in range(len(id_regions_idx)):
            # box plots drawn with the seaborn library
            keys = ["NSE"]
            inds_test = subset_of_dict(inds_dfs[i], keys)
            inds_test = inds_test[keys[0]].values
            df_dict_i = {}
            str_i = regions_name[i]
            df_dict_i[x_name] = np.full([inds_test.size], str_i)
            df_dict_i[y_name] = inds_test
            df_dict_i[hue_name] = dors[id_regions_idx[i]]
            df_dict_i[col_name] = diversions[id_regions_idx[i]]
            # df_dict_i[hue_name] = nor_storage[id_regions_idx[i]]
            df_i = pd.DataFrame(df_dict_i)
            frames.append(df_i)
        result = pd.concat(frames)
        plot_boxs(result, x_name, y_name, ylim=[0, 1.0])
        plt.savefig(os.path.join(config_data.data_path["Out"],
                                 'purpose_distribution.png'),
                    dpi=500,
                    bbox_inches="tight")
        # g = sns.catplot(x=x_name, y=y_name, hue=hue_name, col=col_name,
        #                 data=result, kind="swarm",
        #                 height=4, aspect=.7)
        sns.set(font_scale=1.5)
        g = sns.catplot(x=x_name,
                        y=y_name,
                        hue=hue_name,
                        col=col_name,
                        data=result,
                        palette="Set1",
                        kind="box",
                        dodge=True,
                        showfliers=False)
        # catplot is a figure-level function and ignores a passed-in ax, so
        # size the grid's own figure instead of creating one via plt.subplots
        g.fig.set_size_inches(11.7, 8.27)
        # g.set(ylim=(-1, 1))
        plt.savefig(os.path.join(config_data.data_path["Out"],
                                 '3factors_distribution.png'),
                    dpi=500,
                    bbox_inches="tight")
        plt.show()
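
Note: `is_any_elem_in_a_lst` with include=True above amounts to a substring
test of each search term against each attribute value. A minimal sketch of
that behavior (the signature is inferred from the call above):

def is_any_elem_in_a_lst(elems, lst, include=False):
    # include=True: substring containment; include=False: exact equality
    for elem in elems:
        for item in lst:
            if isinstance(item, str) and (elem in item if include else elem == item):
                return True
    return False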
Exemple #29
0
    def test_scatter_dam_purpose(self):
        attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
        sites_nonref = self.data_model.t_s_dict["sites_id"]
        attrs_runavg_stor = self.data_model.data_source.read_attr(
            sites_nonref, attr_lst, is_return_dict=False)
        run_avg = attrs_runavg_stor[:, 0] * (10**(-3)) * (10**6)  # m^3 per year
        nor_storage = attrs_runavg_stor[:, 1] * 1000  # m^3
        dors = nor_storage / run_avg

        nid_dir = os.path.join(self.config_data.data_path["DB"], "nid", "test")
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        gage_main_dam_purpose_lst = list(gage_main_dam_purpose.values())
        gage_main_dam_purpose_unique = np.unique(gage_main_dam_purpose_lst)
        purpose_regions = {}
        for i in range(gage_main_dam_purpose_unique.size):
            sites_id = []
            for key, value in gage_main_dam_purpose.items():
                if value == gage_main_dam_purpose_unique[i]:
                    sites_id.append(key)
            assert (all(x < y for x, y in zip(sites_id, sites_id[1:])))
            purpose_regions[gage_main_dam_purpose_unique[i]] = sites_id
        id_regions_idx = []
        id_regions_sites_ids = []
        regions_name = []
        show_min_num = 10
        df_id_region = np.array(self.data_model.t_s_dict["sites_id"])
        for key, value in purpose_regions.items():
            gages_id = value
            c, ind1, ind2 = np.intersect1d(df_id_region,
                                           gages_id,
                                           return_indices=True)
            if c.size < show_min_num:
                continue
            assert (all(x < y for x, y in zip(ind1, ind1[1:])))
            assert (all(x < y for x, y in zip(c, c[1:])))
            id_regions_idx.append(ind1)
            id_regions_sites_ids.append(c)
            regions_name.append(key)
        preds, obss, inds_dfs = split_results_to_regions(
            self.data_model, self.test_epoch, id_regions_idx,
            id_regions_sites_ids)
        frames = []
        x_name = "purposes"
        y_name = "NSE"
        hue_name = "DOR"
        # hue_name = "STOR"
        for i in range(len(id_regions_idx)):
            # box plots drawn with the seaborn library
            keys = ["NSE"]
            inds_test = subset_of_dict(inds_dfs[i], keys)
            inds_test = inds_test[keys[0]].values
            df_dict_i = {}
            str_i = regions_name[i]
            df_dict_i[x_name] = np.full([inds_test.size], str_i)
            df_dict_i[y_name] = inds_test
            df_dict_i[hue_name] = dors[id_regions_idx[i]]
            # df_dict_i[hue_name] = nor_storage[id_regions_idx[i]]
            df_i = pd.DataFrame(df_dict_i)
            frames.append(df_i)
        result = pd.concat(frames)
        # can remove high hue value to keep a good map
        plot_boxs(result, x_name, y_name, ylim=[-1.0, 1.0])
        plt.savefig(os.path.join(self.config_data.data_path["Out"],
                                 'purpose_distribution_test.png'),
                    dpi=500,
                    bbox_inches="tight")
        plt.show()
        # plot_boxs(result, x_name, y_name, uniform_color="skyblue", swarm_plot=True, hue=hue_name, colormap=True,
        #           ylim=[-1.0, 1.0])
        cmap_str = 'viridis'
        # cmap = plt.get_cmap('Spectral')
        cbar_label = hue_name

        plt.title('Distribution of different purposes')
        swarmplot_with_cbar(cmap_str,
                            cbar_label, [-1, 1.0],
                            x=x_name,
                            y=y_name,
                            hue=hue_name,
                            palette=cmap_str,
                            data=result)
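
Note: `swarmplot_with_cbar` is a project plotting helper. A rough sketch of the
idea, drawing a seaborn swarm plot with a continuous palette and attaching a
matching colorbar (the signature and all names here are assumptions):

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize

def swarmplot_with_cbar(cmap_str, cbar_label, clim, **swarm_kwargs):
    # draw the swarm; the hue column is mapped through the colormap
    ax = sns.swarmplot(**swarm_kwargs)
    if ax.legend_ is not None:
        ax.legend_.remove()  # the colorbar replaces the discrete hue legend
    sm = ScalarMappable(cmap=cmap_str, norm=Normalize(vmin=clim[0], vmax=clim[1]))
    sm.set_array([])
    cbar = plt.gcf().colorbar(sm, ax=ax)
    cbar.set_label(cbar_label)
    return ax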