Example 1
def get_dataset_names(ref_key, datasets, n=3):
    """
    Get dataset names in correct order as used in the validation framework

    - reference dataset = ref
    - first other dataset = k1
    - second other dataset = k2

    This is important to correctly iterate through the H-SAF metrics and to
    save each metric with the names of the datasets used

    Parameters
    ----------
    ref_key: basestring
        Name of the reference dataset
    datasets: dict
        Dictionary of dictionaries as provided to the validation framework
        in order to perform the validation process.
    n: int, optional
        Number of datasets in each combination passed on to
        get_result_names (default: 3).

    Returns
    -------
    dataset_names: list
        List of the dataset names in correct order

    """
    ds_dict = {}
    for ds in datasets.keys():
        ds_dict[ds] = datasets[ds]['columns']
    ds_names = get_result_names(ds_dict, ref_key, n)
    dataset_names = []
    # each entry of a combination is a (dataset_name, column_name)
    # tuple; keep only the dataset names of the first combination
    for name in ds_names[0]:
        dataset_names.append(name[0])

    return dataset_names
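A minimal usage sketch; the dataset names and column names below are hypothetical, and only the 'columns' entry of each configuration dictionary is read by this function (in recent pytesmo versions get_result_names can be imported from pytesmo.validation_framework.data_manager):

datasets = {
    'ISMN': {'columns': ['soil moisture']},
    'ASCAT': {'columns': ['sm']},
    'ERA': {'columns': ['sm']},
}
# with 'ISMN' as reference and n=3 this returns
# ['ISMN', 'ASCAT', 'ERA'], i.e. the order mapped to ref, k1, k2
names = get_dataset_names('ISMN', datasets, n=3)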
Example 2
def test_get_result_names():

    tst_ds_dict = {'DS1': ['soil moisture'],
                   'DS2': ['sm'],
                   'DS3': ['sm', 'sm2']}
    result_names = get_result_names(tst_ds_dict, 'DS1', 3)
    assert result_names == [(('DS1', 'soil moisture'), ('DS2', 'sm'), ('DS3', 'sm')),
                            (('DS1', 'soil moisture'), ('DS2', 'sm'), ('DS3', 'sm2'))]

    result_names = get_result_names(tst_ds_dict, 'DS1', 2)
    assert result_names == [(('DS1', 'soil moisture'), ('DS2', 'sm')),
                            (('DS1', 'soil moisture'), ('DS3', 'sm')),
                            (('DS1', 'soil moisture'), ('DS3', 'sm2'))]

    result_names = get_result_names(tst_ds_dict, 'DS2', 2)
    assert result_names == [(('DS2', 'sm'), ('DS1', 'soil moisture')),
                            (('DS2', 'sm'), ('DS3', 'sm')),
                            (('DS2', 'sm'), ('DS3', 'sm2'))]
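The assertions above pin down the combinatorics: the reference dataset's columns are crossed with every combination of n-1 other datasets, expanded over all of their column names. A minimal sketch that reproduces the expected results (an illustration only, not pytesmo's actual implementation):

from itertools import combinations, product

def get_result_names_sketch(ds_dict, refkey, n):
    # reference columns come first in every result tuple
    others = [key for key in ds_dict if key != refkey]
    ref_cols = [(refkey, col) for col in ds_dict[refkey]]
    names = []
    for combo in combinations(others, n - 1):
        # one (dataset, column) list per chosen dataset
        per_ds = [[(ds, col) for col in ds_dict[ds]] for ds in combo]
        names.extend(product(ref_cols, *per_ds))
    return names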
Example 3
    def mask_dataset(self, ref_df, gpi_info):
        """
        Mask the temporal reference dataset with the data read
        through the masking datasets.

        Parameters
        ----------
        ref_df: pandas.DataFrame
            Data of the temporal reference dataset.
        gpi_info: tuple
            Tuple of at least (gpi, lon, lat).

        Returns
        -------
        masked_ref_df: pandas.DataFrame
            The temporal reference data with all masked samples removed.

        matched_masking = self.temporal_match_masking_data(ref_df, gpi_info)
        # with n=2 each result pairs the temporal reference with
        # exactly one masking dataset
        result_names = get_result_names(self.masking_dm.ds_dict,
                                        '_reference',
                                        n=2)
        choose_all = pd.DataFrame(index=ref_df.index)
        for data, result in self.k_datasets_from(matched_masking,
                                                 result_names,
                                                 include_scaling_ref=False):
            if len(data) == 0:
                continue

            for key in result:
                if key[0] != '_reference':
                    # this comparison is necessary since the boolean
                    # dtype may have been converted to float (1.0/0.0)
                    # during temporal resampling; this is not easily
                    # avoided since booleans have no NaN representation
                    choose = pd.Series((data[key] == False), index=data.index)
                    choose = choose.reindex(index=choose_all.index,
                                            fill_value=True)
                    choose_all[key] = choose.copy()
        choosing = choose_all.apply(np.all, axis=1)

        return ref_df[choosing]
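To make the masking logic above concrete, here is a small self-contained sketch of the reindex/fill_value/np.all pattern, with a hypothetical time index and mask names:

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=4, freq='D')
choose_all = pd.DataFrame(index=idx)

# masking series shorter than the reference index; True means
# 'mask this sample', so invert and keep unmatched stamps as True
frozen = pd.Series([True, False], index=idx[:2])
choose_all['frozen'] = (frozen == False).reindex(index=idx,
                                                 fill_value=True)

snow = pd.Series([False, False, True, False], index=idx)
choose_all['snow'] = (snow == False)

# keep only time stamps where no masking dataset raised a flag
choosing = choose_all.apply(np.all, axis=1)
# choosing -> [False, True, False, True]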
Example 4
    def mask_dataset(self, ref_df, gpi_info):
        """
        Mask the temporal reference dataset with the data read
        through the masking datasets.

        Parameters
        ----------
        ref_df: pandas.DataFrame
            Data of the temporal reference dataset.
        gpi_info: tuple
            Tuple of at least (gpi, lon, lat).

        Returns
        -------
        masked_ref_df: pandas.DataFrame
            The temporal reference data with all masked samples removed.

        matched_masking = self.temporal_match_masking_data(ref_df, gpi_info)
        # with n=2 each result pairs the temporal reference with
        # exactly one masking dataset
        result_names = get_result_names(self.masking_dm.ds_dict,
                                        '_reference',
                                        n=2)
        choose_all = pd.DataFrame(index=ref_df.index)
        for data, result in self.k_datasets_from(matched_masking,
                                                 result_names):
            if len(data) == 0:
                continue

            for key in result:
                if key[0] != '_reference':
                    # this comparison is necessary since the boolean
                    # dtype may have been converted to float (1.0/0.0)
                    # during temporal resampling; this is not easily
                    # avoided since booleans have no NaN representation
                    choose = pd.Series((data[key] == False), index=data.index)
                    choose = choose.reindex(index=choose_all.index,
                                            fill_value=True)
                    choose_all[key] = choose.copy()
        choosing = choose_all.apply(np.all, axis=1)

        return ref_df[choosing]
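For intuition on the n=2 call in mask_dataset: with '_reference' plus two hypothetical masking datasets, get_result_names pairs the reference with exactly one masking dataset per result:

ds_dict = {'_reference': ['sm'],
           'snow_mask': ['flag'],
           'frozen_mask': ['flag']}
# get_result_names(ds_dict, '_reference', n=2) yields
# [(('_reference', 'sm'), ('snow_mask', 'flag')),
#  (('_reference', 'sm'), ('frozen_mask', 'flag'))]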
Example 5
    def perform_validation(self, df_dict, gpi_info):
        """
        Perform the validation for one grid point index and return the
        matched datasets as well as the calculated metrics.

        Parameters
        ----------
        df_dict: dict of pandas.DataFrames
            DataFrames read by the data readers for each dataset
        gpi_info: tuple
            Tuple of at least (gpi, lon, lat).

        Returns
        -------
        matched_n: dict of pandas.DataFrames
            temporally matched data stored by (n, k) tuples
        results: dict
            Dictionary of calculated metrics stored by dataset combination tuples.
        used_data: dict
            The DataFrame used for calculation of each set of metrics.
        """
        results = {}
        used_data = {}
        matched_n = {}

        if self.masking_dm is not None:
            ref_df = df_dict[self.temporal_ref]
            masked_ref_df = self.mask_dataset(ref_df, gpi_info)
            if len(masked_ref_df) == 0:
                return matched_n, results, used_data

            df_dict[self.temporal_ref] = masked_ref_df

        matched_n = self.temporal_match_datasets(df_dict)

        for n, k in self.metrics_c:
            n_matched_data = matched_n[(n, k)]
            if len(n_matched_data) == 0:
                continue
            result_names = get_result_names(self.data_manager.ds_dict,
                                            self.temporal_ref,
                                            n=k)
            for data, result_key in self.k_datasets_from(
                    n_matched_data, result_names):

                if len(data) == 0:
                    continue

                # at this stage we can drop the column multiindex and just use
                # the dataset name
                if LooseVersion(pd.__version__) < LooseVersion('0.23'):
                    data.columns = data.columns.droplevel(level=1)
                else:
                    data = data.rename(columns=lambda x: x[0])

                if self.scaling is not None:
                    # get scaling index by finding the column in the
                    # DataFrame that belongs to the scaling reference
                    scaling_index = data.columns.tolist().index(
                        self.scaling_ref)
                    try:
                        data = self.scaling.scale(data, scaling_index,
                                                  gpi_info)
                    except ValueError:
                        continue
                # Rename the columns to 'ref', 'k1', 'k2', ...
                rename_dict = {}
                f = lambda x: "k{}".format(x) if x > 0 else 'ref'
                for i, r in enumerate(result_key):
                    rename_dict[r[0]] = f(i)
                data.rename(columns=rename_dict, inplace=True)

                if result_key not in results:
                    results[result_key] = []

                metrics_calculator = self.metrics_c[(n, k)]
                used_data[result_key] = data
                metrics = metrics_calculator(data, gpi_info)
                results[result_key].append(metrics)

        return matched_n, results, used_data
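For illustration, the renaming step above maps dataset names onto the generic column names expected by the metric calculators; the dataset and column names here are hypothetical:

import pandas as pd

data = pd.DataFrame({'ISMN': [0.1], 'ASCAT': [0.2], 'ERA': [0.3]})
result_key = (('ISMN', 'soil moisture'), ('ASCAT', 'sm'), ('ERA', 'sm'))

rename_dict = {}
f = lambda x: "k{}".format(x) if x > 0 else 'ref'
for i, r in enumerate(result_key):
    rename_dict[r[0]] = f(i)

# rename_dict == {'ISMN': 'ref', 'ASCAT': 'k1', 'ERA': 'k2'}
data = data.rename(columns=rename_dict)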