def get_dataset_names(ref_key, datasets, n=3):
    """
    Get dataset names in correct order as used in the validation framework

    - reference dataset = ref
    - first other dataset = k1
    - second other dataset = k2

    This is important to correctly iterate through the H-SAF metrics and
    to save each metric with the name of the used datasets

    Parameters
    ----------
    ref_key: basestring
        Name of the reference dataset
    datasets: dict
        Dictionary of dictionaries as provided to the validation framework
        in order to perform the validation process.

    Returns
    -------
    dataset_names: list
        List of the dataset names in correct order
    """
    # Reduce each dataset configuration to its column list, as expected
    # by get_result_names.
    columns_per_ds = {key: cfg['columns'] for key, cfg in datasets.items()}
    combinations = get_result_names(columns_per_ds, ref_key, n)
    # The first combination already lists the datasets in the required
    # order; keep only the dataset name of each (dataset, column) pair.
    return [pair[0] for pair in combinations[0]]
def get_dataset_names(ref_key, datasets, n=3):
    """
    Get dataset names in correct order as used in the validation framework

    -) reference dataset = ref
    -) first other dataset = k1
    -) second other dataset = k2

    This is important to correctly iterate through the H-SAF metrics and
    to save each metric with the name of the used datasets

    Parameters
    ----------
    ref_key: basestring
        Name of the reference dataset
    datasets: dict
        Dictionary of dictionaries as provided to the validation framework
        in order to perform the validation process.

    Returns
    -------
    dataset_names: list
        List of the dataset names in correct order
    """
    # Build the {dataset: columns} mapping consumed by get_result_names.
    columns_by_dataset = {}
    for ds_name, ds_config in datasets.items():
        columns_by_dataset[ds_name] = ds_config['columns']

    combinations = get_result_names(columns_by_dataset, ref_key, n)

    # The first combination lists the datasets in the required order;
    # extract the dataset name from each (dataset, column) pair.
    dataset_names = []
    for ds_name, _column in combinations[0]:
        dataset_names.append(ds_name)

    return dataset_names
def test_get_result_names():
    """Check get_result_names for different references and tuple sizes."""
    tst_ds_dict = {'DS1': ['soil moisture'],
                   'DS2': ['sm'],
                   'DS3': ['sm', 'sm2']}

    # n=3 with DS1 as reference: one tuple per column combination of the
    # non-reference datasets, reference always first.
    expected_n3 = [(('DS1', 'soil moisture'), ('DS2', 'sm'), ('DS3', 'sm')),
                   (('DS1', 'soil moisture'), ('DS2', 'sm'), ('DS3', 'sm2'))]
    assert get_result_names(tst_ds_dict, 'DS1', 3) == expected_n3

    # n=2 with DS1 as reference: pair the reference with every other column.
    expected_pairs_ds1 = [(('DS1', 'soil moisture'), ('DS2', 'sm')),
                          (('DS1', 'soil moisture'), ('DS3', 'sm')),
                          (('DS1', 'soil moisture'), ('DS3', 'sm2'))]
    assert get_result_names(tst_ds_dict, 'DS1', 2) == expected_pairs_ds1

    # n=2 with DS2 as reference: same structure, different reference.
    expected_pairs_ds2 = [(('DS2', 'sm'), ('DS1', 'soil moisture')),
                          (('DS2', 'sm'), ('DS3', 'sm')),
                          (('DS2', 'sm'), ('DS3', 'sm2'))]
    assert get_result_names(tst_ds_dict, 'DS2', 2) == expected_pairs_ds2
def mask_dataset(self, ref_df, gpi_info):
    """
    Mask the temporal reference dataset with the data read
    through the masking datasets.

    Parameters
    ----------
    ref_df: pandas.DataFrame
        Temporal reference data read for this grid point index.
    gpi_info: tuple
        tuple of at least, (gpi, lon, lat)

    Returns
    -------
    masked_ref_df: pandas.DataFrame
        ``ref_df`` reduced to the rows that no masking dataset flagged
        (note: a DataFrame, not the boolean numpy array the original
        docstring claimed — the code returns ``ref_df[choosing]``).
    """
    matched_masking = self.temporal_match_masking_data(ref_df, gpi_info)
    # this will only be one element since n is the same as the
    # number of masking datasets
    result_names = get_result_names(self.masking_dm.ds_dict,
                                    '_reference',
                                    n=2)
    # One boolean column per masking series, aligned to the reference index.
    choose_all = pd.DataFrame(index=ref_df.index)
    for data, result in self.k_datasets_from(matched_masking,
                                             result_names,
                                             include_scaling_ref=False):
        if len(data) == 0:
            continue
        for key in result:
            # Skip the reference column itself; only masking columns count.
            if key[0] != '_reference':
                # this is necessary since the boolean datatype might have
                # been changed to float 1.0 and 0.0 issue with temporal
                # resampling that is not easily resolved since most
                # datatypes have no nan representation.
                choose = pd.Series((data[key] == False), index=data.index)
                # Timestamps absent from the masking data are kept (True).
                choose = choose.reindex(index=choose_all.index,
                                        fill_value=True)
                choose_all[key] = choose.copy()
    # A row survives only if every masking column allows it.
    choosing = choose_all.apply(np.all, axis=1)

    return ref_df[choosing]
def mask_dataset(self, ref_df, gpi_info):
    """
    Mask the temporal reference dataset with the data read
    through the masking datasets.

    Parameters
    ----------
    ref_df: pandas.DataFrame
        Temporal reference data read for this grid point index.
    gpi_info: tuple
        tuple of at least, (gpi, lon, lat)

    Returns
    -------
    masked_ref_df: pandas.DataFrame
        ``ref_df`` reduced to the rows that no masking dataset flagged
        (note: a DataFrame, not the boolean numpy array the original
        docstring claimed — the code returns ``ref_df[choosing]``).
    """
    matched_masking = self.temporal_match_masking_data(ref_df, gpi_info)
    # this will only be one element since n is the same as the
    # number of masking datasets
    result_names = get_result_names(self.masking_dm.ds_dict,
                                    '_reference',
                                    n=2)
    # One boolean column per masking series, aligned to the reference index.
    choose_all = pd.DataFrame(index=ref_df.index)
    for data, result in self.k_datasets_from(matched_masking,
                                             result_names):
        if len(data) == 0:
            continue
        for key in result:
            # Skip the reference column itself; only masking columns count.
            if key[0] != '_reference':
                # this is necessary since the boolean datatype might have
                # been changed to float 1.0 and 0.0 issue with temporal
                # resampling that is not easily resolved since most
                # datatypes have no nan representation.
                choose = pd.Series((data[key] == False), index=data.index)
                # Timestamps absent from the masking data are kept (True).
                choose = choose.reindex(index=choose_all.index,
                                        fill_value=True)
                choose_all[key] = choose.copy()
    # A row survives only if every masking column allows it.
    choosing = choose_all.apply(np.all, axis=1)

    return ref_df[choosing]
def perform_validation(self, df_dict, gpi_info):
    """
    Perform the validation for one grid point index and return the
    matched datasets as well as the calculated metrics.

    Parameters
    ----------
    df_dict: dict of pandas.DataFrames
        DataFrames read by the data readers for each dataset
    gpi_info: tuple
        tuple of at least, (gpi, lon, lat)

    Returns
    -------
    matched_n: dict of pandas.DataFrames
        temporally matched data stored by (n, k) tuples
    results: dict
        Dictionary of calculated metrics stored by dataset combinations
        tuples.
    used_data: dict
        The DataFrame used for calculation of each set of metrics.
    """
    results = {}
    used_data = {}
    matched_n = {}

    # Optionally restrict the temporal reference to unmasked rows before
    # temporal matching; bail out early if masking removed everything.
    if self.masking_dm is not None:
        ref_df = df_dict[self.temporal_ref]
        masked_ref_df = self.mask_dataset(ref_df, gpi_info)
        if len(masked_ref_df) == 0:
            return matched_n, results, used_data
        df_dict[self.temporal_ref] = masked_ref_df

    matched_n = self.temporal_match_datasets(df_dict)

    # One metrics calculator per (n, k): n-way matched data evaluated in
    # k-sized dataset combinations.
    for n, k in self.metrics_c:
        n_matched_data = matched_n[(n, k)]
        if len(n_matched_data) == 0:
            continue
        result_names = get_result_names(self.data_manager.ds_dict,
                                        self.temporal_ref,
                                        n=k)
        for data, result_key in self.k_datasets_from(
                n_matched_data, result_names):
            if len(data) == 0:
                continue

            # at this stage we can drop the column multiindex and just use
            # the dataset name; older pandas lacks the rename-callable
            # behavior used in the else branch.
            if LooseVersion(pd.__version__) < LooseVersion('0.23'):
                data.columns = data.columns.droplevel(level=1)
            else:
                data = data.rename(columns=lambda x: x[0])

            if self.scaling is not None:
                # get scaling index by finding the column in the
                # DataFrame that belongs to the scaling reference
                scaling_index = data.columns.tolist().index(
                    self.scaling_ref)
                try:
                    data = self.scaling.scale(data, scaling_index,
                                              gpi_info)
                except ValueError:
                    # Scaling can fail (e.g. too few points); skip this
                    # combination rather than abort the whole gpi.
                    continue

            # Rename the columns to 'ref', 'k1', 'k2', ...
            rename_dict = {}
            f = lambda x: "k{}".format(x) if x > 0 else 'ref'
            for i, r in enumerate(result_key):
                rename_dict[r[0]] = f(i)
            data.rename(columns=rename_dict, inplace=True)

            if result_key not in results.keys():
                results[result_key] = []

            metrics_calculator = self.metrics_c[(n, k)]
            used_data[result_key] = data
            metrics = metrics_calculator(data, gpi_info)
            results[result_key].append(metrics)

    return matched_n, results, used_data
def perform_validation(self, df_dict, gpi_info):
    """
    Perform the validation for one grid point index and return the
    matched datasets as well as the calculated metrics.

    Parameters
    ----------
    df_dict: dict of pandas.DataFrames
        DataFrames read by the data readers for each dataset
    gpi_info: tuple
        tuple of at least, (gpi, lon, lat)

    Returns
    -------
    matched_n: dict of pandas.DataFrames
        temporally matched data stored by (n, k) tuples
    results: dict
        Dictionary of calculated metrics stored by dataset combinations
        tuples.
    used_data: dict
        The DataFrame used for calculation of each set of metrics.
    """
    results = {}
    used_data = {}
    matched_n = {}

    # Optionally restrict the temporal reference to unmasked rows before
    # temporal matching; bail out early if masking removed everything.
    if self.masking_dm is not None:
        ref_df = df_dict[self.temporal_ref]
        masked_ref_df = self.mask_dataset(ref_df, gpi_info)
        if len(masked_ref_df) == 0:
            return matched_n, results, used_data
        df_dict[self.temporal_ref] = masked_ref_df

    matched_n = self.temporal_match_datasets(df_dict)

    # One metrics calculator per (n, k): n-way matched data evaluated in
    # k-sized dataset combinations.
    for n, k in self.metrics_c:
        n_matched_data = matched_n[(n, k)]
        if len(n_matched_data) == 0:
            continue
        result_names = get_result_names(self.data_manager.ds_dict,
                                        self.temporal_ref,
                                        n=k)
        for data, result_key in self.k_datasets_from(n_matched_data,
                                                     result_names):
            if len(data) == 0:
                continue

            # at this stage we can drop the column multiindex and just use
            # the dataset name; older pandas lacks the rename-callable
            # behavior used in the else branch.
            if LooseVersion(pd.__version__) < LooseVersion('0.23'):
                data.columns = data.columns.droplevel(level=1)
            else:
                data = data.rename(columns=lambda x: x[0])

            if self.scaling is not None:
                # get scaling index by finding the column in the
                # DataFrame that belongs to the scaling reference
                scaling_index = data.columns.tolist().index(self.scaling_ref)
                try:
                    data = self.scaling.scale(data, scaling_index, gpi_info)
                except ValueError:
                    # Scaling can fail (e.g. too few points); skip this
                    # combination rather than abort the whole gpi.
                    continue

            # Rename the columns to 'ref', 'k1', 'k2', ...
            rename_dict = {}
            f = lambda x: "k{}".format(x) if x > 0 else 'ref'
            for i, r in enumerate(result_key):
                rename_dict[r[0]] = f(i)
            data.rename(columns=rename_dict, inplace=True)

            if result_key not in results.keys():
                results[result_key] = []

            metrics_calculator = self.metrics_c[(n, k)]
            used_data[result_key] = data
            metrics = metrics_calculator(data, gpi_info)
            results[result_key].append(metrics)

    return matched_n, results, used_data