Example no. 1
def test_DataManager_get_data():

    datasets = setup_TestDatasets()
    dm = DataManager(datasets,
                     'DS1',
                     read_ts_names={f'DS{i}': 'read'
                                    for i in range(1, 4)})
    data = dm.get_data(1, 1, 1)
    assert sorted(list(data)) == ['DS1', 'DS2', 'DS3']
Example no. 2
    def __init__(self, datasets, spatial_ref, metrics_calculators,
                 temporal_matcher=None, temporal_window=1 / 24.0,
                 temporal_ref=None,
                 masking_datasets=None,
                 period=None,
                 scaling='lin_cdf_match', scaling_ref=None):

        if isinstance(datasets, DataManager):
            self.data_manager = datasets
        else:
            self.data_manager = DataManager(datasets, spatial_ref, period)

        self.temp_matching = temporal_matcher
        if self.temp_matching is None:
            warnings.warn(
                "You are using the default temporal matcher. If you are using one of the"
                " newer metric calculators (PairwiseIntercomparisonMetrics,"
                " TripleCollocationMetrics) you should probably use `make_combined_temporal_matcher`"
                " instead. Have a look at the documentation of the metric calculators for more info."
            )
            self.temp_matching = temporal_matchers.BasicTemporalMatching(
                window=temporal_window).combinatory_matcher

        self.temporal_ref = temporal_ref
        if self.temporal_ref is None:
            self.temporal_ref = self.data_manager.reference_name

        self.metrics_c = metrics_calculators
        for n, k in self.metrics_c:
            if n < len(self.data_manager.datasets.keys()):
                raise ValueError('n must be equal to the number of datasets')

        self.masking_dm = None
        if masking_datasets is not None:
            # add temporal reference dataset to the masking datasets since it
            # is necessary for temporally matching the masking datasets to the
            # common time stamps. Use _reference here to make a clash with the
            # names of the masking datasets unlikely
            masking_datasets.update(
                {'_reference': datasets[self.temporal_ref]})
            self.masking_dm = DataManager(masking_datasets, '_reference',
                                          period=period)

        if type(scaling) == str:
            self.scaling = DefaultScaler(scaling)
        else:
            self.scaling = scaling
        self.scaling_ref = scaling_ref
        if self.scaling_ref is None:
            self.scaling_ref = self.data_manager.reference_name

        self.luts = self.data_manager.get_luts()
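A minimal sketch of what the warning in the constructor above suggests: pairing one of the newer metric calculators with make_combined_temporal_matcher instead of the default combinatory matcher (Example no. 19 below does the same in production code). The import paths, the Validation import and the reuse of the setup_TestDatasets() helper from the tests are assumptions for illustration, not taken from this example.

import pandas as pd

from pytesmo.validation_framework.validation import Validation
from pytesmo.validation_framework.temporal_matchers import (
    make_combined_temporal_matcher)
from pytesmo.validation_framework.metric_calculators import (
    PairwiseIntercomparisonMetrics)

# three test datasets read via their 'read' method, as in the tests above
datasets = setup_TestDatasets()

val = Validation(
    datasets,
    spatial_ref='DS1',
    # match all datasets to common timestamps instead of pairwise matching
    temporal_matcher=make_combined_temporal_matcher(pd.Timedelta(12, "H")),
    metrics_calculators={
        (3, 2): PairwiseIntercomparisonMetrics().calc_metrics},
)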
Example no. 3
    def __init__(self,
                 datasets,
                 spatial_ref,
                 metrics_calculators,
                 temporal_matcher=None,
                 temporal_window=1 / 24.0,
                 temporal_ref=None,
                 masking_datasets=None,
                 period=None,
                 scaling='lin_cdf_match',
                 scaling_ref=None):

        if type(datasets) is DataManager:
            self.data_manager = datasets
        else:
            self.data_manager = DataManager(datasets, spatial_ref, period)

        self.temp_matching = temporal_matcher
        if self.temp_matching is None:
            self.temp_matching = temporal_matchers.BasicTemporalMatching(
                window=temporal_window).combinatory_matcher

        self.temporal_ref = temporal_ref
        if self.temporal_ref is None:
            self.temporal_ref = self.data_manager.reference_name

        self.metrics_c = metrics_calculators
        for n, k in self.metrics_c:
            if n < len(self.data_manager.datasets.keys()):
                raise ValueError('n must be equal to the number of datasets')

        self.masking_dm = None
        if masking_datasets is not None:
            # add temporal reference dataset to the masking datasets since it
            # is necessary for temporally matching the masking datasets to the
            # common time stamps. Use _reference here to make a clash with the
            # names of the masking datasets unlikely
            masking_datasets.update(
                {'_reference': datasets[self.temporal_ref]})
            self.masking_dm = DataManager(masking_datasets,
                                          '_reference',
                                          period=period)

        if type(scaling) == str:
            self.scaling = DefaultScaler(scaling)
        else:
            self.scaling = scaling
        self.scaling_ref = scaling_ref
        if self.scaling_ref is None:
            self.scaling_ref = self.data_manager.reference_name

        self.luts = self.data_manager.get_luts()
Example no. 4
def test_validation_error_n2_k2():

    datasets = setup_TestDatasets()

    dm = DataManager(
        datasets,
        "DS1",
        read_ts_names={d: "read" for d in ["DS1", "DS2", "DS3"]},
    )

    # n less than number of datasets is no longer allowed
    with pytest.raises(ValueError):
        Validation(
            dm,
            "DS1",
            temporal_matcher=temporal_matchers.BasicTemporalMatching(
                window=1 / 24.0
            ).combinatory_matcher,
            scaling="lin_cdf_match",
            metrics_calculators={
                (2, 2): metrics_calculators.BasicMetrics(
                    other_name="k1"
                ).calc_metrics
            },
        )
Example no. 5
def test_validation_n3_k2_temporal_matching_no_matches():

    tst_results = {}

    datasets = setup_two_without_overlap()

    dm = DataManager(
        datasets,
        "DS1",
        read_ts_names={d: "read" for d in ["DS1", "DS2", "DS3"]},
    )

    process = Validation(
        dm,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name="k1"
            ).calc_metrics
        },
    )

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst_results))
Example no. 6
def setup_TestDataManager():

    grid = grids.CellGrid(np.array([1, 2, 3, 4]), np.array([1, 2, 3, 4]),
                          np.array([4, 4, 2, 1]), gpis=np.array([1, 2, 3, 4]))

    ds1 = GriddedTsBase("", grid, TestDatasetRuntimeError)
    ds2 = GriddedTsBase("", grid, TestDatasetRuntimeError)
    ds3 = GriddedTsBase("", grid, TestDatasetRuntimeError,
                        ioclass_kws={'message': 'Other RuntimeError'})

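    # each entry maps a dataset name to its reader ('class'), the columns to
    # use and optional reader args/kwargs; 'grids_compatible' means the grid
    # point index can be used directly when reading (see the Validation
    # docstring in Example no. 16)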
    datasets = {
        'DS1': {
            'class': ds1,
            'columns': ['soil moisture'],
            'args': [],
            'kwargs': {}
        },
        'DS2': {
            'class': ds2,
            'columns': ['sm'],
            'args': [],
            'kwargs': {},
            'grids_compatible': True
        },
        'DS3': {
            'class': ds3,
            'columns': ['sm', 'sm2'],
            'args': [],
            'kwargs': {},
            'grids_compatible': True
        }
    }

    dm = DataManager(datasets, 'DS1')
    return dm
Example no. 7
def test_validation_n3_k2_temporal_matching_no_matches2():

    tst_results = {
        (("DS1", "x"), ("DS3", "y")): {
            "n_obs": np.array([1000], dtype=np.int32),
            "tau": np.array([np.nan], dtype=np.float32),
            "gpi": np.array([4], dtype=np.int32),
            "RMSD": np.array([0.0], dtype=np.float32),
            "lon": np.array([4.0]),
            "p_tau": np.array([np.nan], dtype=np.float32),
            "BIAS": np.array([0.0], dtype=np.float32),
            "p_rho": np.array([0.0], dtype=np.float32),
            "rho": np.array([1.0], dtype=np.float32),
            "lat": np.array([4.0]),
            "R": np.array([1.0], dtype=np.float32),
            "p_R": np.array([0.0], dtype=np.float32),
        },
        (("DS1", "x"), ("DS3", "x")): {
            "n_obs": np.array([1000], dtype=np.int32),
            "tau": np.array([np.nan], dtype=np.float32),
            "gpi": np.array([4], dtype=np.int32),
            "RMSD": np.array([0.0], dtype=np.float32),
            "lon": np.array([4.0]),
            "p_tau": np.array([np.nan], dtype=np.float32),
            "BIAS": np.array([0.0], dtype=np.float32),
            "p_rho": np.array([0.0], dtype=np.float32),
            "rho": np.array([1.0], dtype=np.float32),
            "lat": np.array([4.0]),
            "R": np.array([1.0], dtype=np.float32),
            "p_R": np.array([0.0], dtype=np.float32),
        },
    }

    datasets = setup_three_with_two_overlapping()
    dm = DataManager(
        datasets,
        "DS1",
        read_ts_names={d: "read" for d in ["DS1", "DS2", "DS3"]},
    )

    process = Validation(
        dm,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name="k1"
            ).calc_metrics
        },
    )

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst_results))
Example no. 8
def test_validation_n2_k2_data_manager_argument():

    tst_results = {
        (('DS1', 'x'), ('DS3', 'y')): {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32)},
        (('DS1', 'x'), ('DS2', 'y')): {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32)},
        (('DS1', 'x'), ('DS3', 'x')): {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32)}}

    datasets = setup_TestDatasets()
    dm = DataManager(datasets, 'DS1')

    process = Validation(dm, 'DS1',
                         temporal_matcher=temporal_matchers.BasicTemporalMatching(
                             window=1 / 24.0).combinatory_matcher,
                         scaling='lin_cdf_match',
                         metrics_calculators={
                             (2, 2): metrics_calculators.BasicMetrics(other_name='k1').calc_metrics})

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst_results))
Example no. 9
    def __init__(self, datasets, spatial_ref, metrics_calculators,
                 temporal_matcher=None, temporal_window=1 / 24.0,
                 temporal_ref=None,
                 masking_datasets=None,
                 period=None,
                 scaling='lin_cdf_match', scaling_ref=None):

        if type(datasets) is DataManager:
            self.data_manager = datasets
        else:
            self.data_manager = DataManager(datasets, spatial_ref, period)

        self.temp_matching = temporal_matcher
        if self.temp_matching is None:
            self.temp_matching = temporal_matchers.BasicTemporalMatching(
                window=temporal_window).combinatory_matcher

        self.temporal_ref = temporal_ref
        if self.temporal_ref is None:
            self.temporal_ref = self.data_manager.reference_name

        self.metrics_c = metrics_calculators

        self.masking_dm = None
        if masking_datasets is not None:
            # add temporal reference dataset to the masking datasets since it
            # is necessary for temporally matching the masking datasets to the
            # common time stamps. Use _reference here to make a clash with the
            # names of the masking datasets unlikely
            masking_datasets.update(
                {'_reference': datasets[self.temporal_ref]})
            self.masking_dm = DataManager(masking_datasets, '_reference',
                                          period=period)

        if type(scaling) == str:
            self.scaling = DefaultScaler(scaling)
        else:
            self.scaling = scaling
        self.scaling_ref = scaling_ref
        if self.scaling_ref is None:
            self.scaling_ref = self.data_manager.reference_name

        self.luts = self.data_manager.get_luts()
Example no. 10
def test_DataManager_read_ts_method_names():

    ds1 = TestDataset("")

    datasets = {
        'DS1': {
            'class': ds1,
            'columns': ['soil moisture'],
        },
        'DS2': {
            'class': ds1,
            'columns': ['soil moisture'],
        }
    }

    read_ts_method_names = {'DS1': 'read_ts', 'DS2': 'read_ts_other'}
    dm = DataManager(datasets, 'DS1', read_ts_names=read_ts_method_names)
    data = dm.read_ds('DS1', 1)
    data_other = dm.read_ds('DS2', 1)
    pdtest.assert_frame_equal(data, ds1.read_ts(1))
    pdtest.assert_frame_equal(data_other, ds1.read_ts_other(1))
Example no. 11
def test_DataManager_read_ts_method_names():

    ds1 = TestDataset("")

    datasets = {
        'DS1': {
            'class': ds1,
            'columns': ['soil moisture'],
        },
        'DS2': {
            'class': ds1,
            'columns': ['soil moisture'],
        }
    }

    read_ts_method_names = {'DS1': 'read_ts',
                            'DS2': 'read_ts_other'}
    dm = DataManager(datasets, 'DS1',
                     read_ts_names=read_ts_method_names)
    data = dm.read_ds('DS1', 1)
    data_other = dm.read_ds('DS2', 1)
    pdtest.assert_frame_equal(data, ds1.read_ts(1))
    pdtest.assert_frame_equal(data_other, ds1.read_ts_other(1))
Example no. 12
def test_validation_error_n2_k2():

    datasets = setup_TestDatasets()

    dm = DataManager(datasets, 'DS1', read_ts_names={d: 'read' for d in ['DS1', 'DS2', 'DS3']})

    # n less than number of datasets is no longer allowed
    with pytest.raises(ValueError):
        process = Validation(
            dm, 'DS1',
            temporal_matcher=temporal_matchers.BasicTemporalMatching(
                window=1 / 24.0).combinatory_matcher,
            scaling='lin_cdf_match',
            metrics_calculators={
                (2, 2): metrics_calculators.BasicMetrics(other_name='k1').calc_metrics})
Example no. 13
def test_DataManager_default_add():

    grid = grids.CellGrid(np.array([1, 2, 3, 4]),
                          np.array([1, 2, 3, 4]),
                          np.array([4, 4, 2, 1]),
                          gpis=np.array([1, 2, 3, 4]))

    ds1 = GriddedTsBase("", grid, TestDataset)

    datasets = {
        'DS1': {
            'class': ds1,
            'columns': ['soil moisture'],
        },
        'DS2': {
            'class': ds1,
            'columns': ['soil moisture'],
        }
    }

    dm = DataManager(datasets, 'DS1')
    assert dm.datasets == {
        'DS1': {
            'class': ds1,
            'columns': ['soil moisture'],
            'args': [],
            'kwargs': {},
            'use_lut': False,
            'lut_max_dist': None,
            'grids_compatible': False
        },
        'DS2': {
            'class': ds1,
            'columns': ['soil moisture'],
            'args': [],
            'kwargs': {},
            'use_lut': False,
            'lut_max_dist': None,
            'grids_compatible': False
        }
    }
Example no. 14
    def __init__(self, datasets, temporal_matcher, metrics_calculator,
                 data_prep=None, data_post=None, period=None,
                 scaling='lin_cdf_match', scale_to_other=False,
                 cell_based_jobs=True):
        """
        Initialize parameters.
        """
        self.data_manager = DataManager(datasets, data_prep, period)

        self.temp_matching = temporal_matcher.match
        self.calc_metrics = metrics_calculator.calc_metrics
        self.data_postproc = data_post

        self.scaling = scaling
        self.scale_to_index = 0
        if scale_to_other:
            self.scale_to_index = 1

        self.cell_based_jobs = cell_based_jobs

        self.luts = self.data_manager.get_luts()
Example no. 15
def test_DataManager_get_data():

    datasets = setup_TestDatasets()
    dm = DataManager(datasets, 'DS1')
    data = dm.get_data(1, 1, 1)
    assert sorted(list(data)) == ['DS1', 'DS2', 'DS3']
Example no. 16
class Validation(object):
    """
    Class for the validation process.

    Parameters
    ----------
    datasets : dict of dicts, or :py:class:`pytesmo.validation_framework.data_manager.DataManager`
        :Keys: string, dataset names
        :Values: dict, containing the following fields

            'class': object
                Class containing the method read_ts for reading the data.
            'columns': list
                List of columns which will be used in the validation process.
            'args': list, optional
                Args for reading the data.
            'kwargs': dict, optional
                Kwargs for reading the data
            'grids_compatible': boolean, optional
                If set to True the grid point index is used directly when
                reading other, if False then lon, lat is used and a nearest
                neighbour search is necessary.
            'use_lut': boolean, optional
                If set to True the grid point index (obtained from a
                calculated lut between reference and other) is used when
                reading other, if False then lon, lat is used and a
                nearest neighbour search is necessary.
            'lut_max_dist': float, optional
                Maximum allowed distance in meters for the lut calculation.
    spatial_ref: string
        Name of the dataset used as a spatial, temporal and scaling reference.
        Temporal and scaling references can be changed if needed. See the optional parameters
        ``temporal_ref`` and ``scaling_ref``.
    metrics_calculators : dict of functions
        The keys of the dict are tuples with the following structure: (n, k) with n >= 2
        and n >= k. n must now be equal to the number of datasets.
        n is the number of datasets that should be temporally matched to the
        reference dataset and k is how many columns the metric calculator will get at once.
        What this means is that it is e.g. possible to temporally match 3 datasets with
        3 columns in total and then give the combinations of these columns to the metric
        calculator in sets of 2 by specifying the dictionary like:

        .. code::

            { (3, 2): metric_calculator}

        The values are functions that take an input DataFrame with the columns 'ref'
        for the reference and 'k1', 'k2' and
        so on for other datasets as well as a dictionary mapping the column names
        to the names of the original datasets. In this way multiple metric calculators
        can be applied to different combinations of n input datasets.
    temporal_matcher: function, optional
        function that takes a dict of dataframes and a reference_key.
        It performs the temporal matching on the data and returns a dictionary
        of matched DataFrames that should be evaluated together by the metric calculator.
    temporal_window: float, optional
        Window to allow in temporal matching in days. The window is allowed on both
        sides of the timestamp of the temporal reference data.
        Only used with the standard temporal matcher.
    temporal_ref: string, optional
        If the temporal matching should use another dataset than the spatial reference
        as a reference dataset then give the dataset name here.
    period : list, optional
        Of type [datetime start, datetime end]. If given then the two input
        datasets will be truncated to start <= dates <= end.
    masking_datasets : dict of dictionaries
        Same format as the datasets with the difference that the read_ts method of these
        datasets has to return pandas.DataFrames with only boolean columns. True means that the
        observations at this timestamp should be masked and False means that they should be kept.
    scaling : string, None or class instance
        - If set then the data will be scaled into the reference space using the
          method specified by the string using the
          :py:class:`pytesmo.validation_framework.data_scalers.DefaultScaler` class.
        - If set to None then no scaling will be performed.
        - It can also be set to a class instance that implements a
          ``scale(self, data, reference_index, gpi_info)`` method. See
          :py:class:`pytesmo.validation_framework.data_scalers.DefaultScaler` for an example.
    scaling_ref : string, optional
        If the scaling should be done to another dataset than the spatial reference then
        give the dataset name here.

    Methods
    -------
    calc(gpis, lons, lats, *args)
        Performs the validation for the given grid point information.
    get_processing_jobs()
        Returns processing jobs that this process can understand.
    """
    def __init__(self,
                 datasets,
                 spatial_ref,
                 metrics_calculators,
                 temporal_matcher=None,
                 temporal_window=1 / 24.0,
                 temporal_ref=None,
                 masking_datasets=None,
                 period=None,
                 scaling='lin_cdf_match',
                 scaling_ref=None):

        if type(datasets) is DataManager:
            self.data_manager = datasets
        else:
            self.data_manager = DataManager(datasets, spatial_ref, period)

        self.temp_matching = temporal_matcher
        if self.temp_matching is None:
            self.temp_matching = temporal_matchers.BasicTemporalMatching(
                window=temporal_window).combinatory_matcher

        self.temporal_ref = temporal_ref
        if self.temporal_ref is None:
            self.temporal_ref = self.data_manager.reference_name

        self.metrics_c = metrics_calculators
        for n, k in self.metrics_c:
            if n < len(self.data_manager.datasets.keys()):
                raise ValueError('n must be equal to the number of datasets')

        self.masking_dm = None
        if masking_datasets is not None:
            # add temporal reference dataset to the masking datasets since it
            # is necessary for temporally matching the masking datasets to the
            # common time stamps. Use _reference here to make a clash with the
            # names of the masking datasets unlikely
            masking_datasets.update(
                {'_reference': datasets[self.temporal_ref]})
            self.masking_dm = DataManager(masking_datasets,
                                          '_reference',
                                          period=period)

        if type(scaling) == str:
            self.scaling = DefaultScaler(scaling)
        else:
            self.scaling = scaling
        self.scaling_ref = scaling_ref
        if self.scaling_ref is None:
            self.scaling_ref = self.data_manager.reference_name

        self.luts = self.data_manager.get_luts()

    def calc(self, gpis, lons, lats, *args):
        """
        The argument iterables (lists or numpy.ndarrays) are processed one after the other in
        tuples of the form (gpis[n], lons[n], lats[n], arg1[n], ..).

        Parameters
        ----------
        gpis: iterable
            Grid point indices: identifiers by which the spatial reference
            dataset can be read. This is either a list, a numpy.ndarray or
            any other iterable containing these indices.
        lons: iterable
            Longitudes of the points identified by the gpis. Has to be the same size as gpis.
        lats: iterable
            Latitudes of the points identified by the gpis. Has to be the same size as gpis.
        args: iterables
            Any additional arguments have to have the same size as the gpis iterable. They are
            given to the metrics calculators as metadata. Common usage is e.g. the long name
            or network name of an in situ station.

        Returns
        -------
        compact_results : dict of dicts
            :Keys: result names, combinations of
                  (referenceDataset.column, otherDataset.column)
            :Values: dict containing the elements returned by metrics_calculator

        """
        results = {}
        if len(args) > 0:
            gpis, lons, lats, args = args_to_iterable(gpis,
                                                      lons,
                                                      lats,
                                                      *args,
                                                      n=3)
        else:
            gpis, lons, lats = args_to_iterable(gpis, lons, lats)

        for gpi_info in zip(gpis, lons, lats, *args):

            df_dict = self.data_manager.get_data(gpi_info[0], gpi_info[1],
                                                 gpi_info[2])

            # if no data is available continue with the next gpi
            if len(df_dict) == 0:
                continue
            matched_data, result, used_data = self.perform_validation(
                df_dict, gpi_info)

            # add result of one gpi to global results dictionary
            for r in result:
                if r not in results:
                    results[r] = []
                results[r] = results[r] + result[r]

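        # compact the per-gpi results: for each dataset/column combination and
        # metric field, gather the single values returned per gpi into one
        # numpy array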
        compact_results = {}

        for key in results.keys():
            compact_results[key] = {}
            for field_name in results[key][0].keys():
                entries = []
                for result in results[key]:
                    entries.append(result[field_name][0])
                compact_results[key][field_name] = \
                    np.array(entries, dtype=results[key][0][field_name].dtype)

        return compact_results

    def perform_validation(self, df_dict, gpi_info):
        """
        Perform the validation for one grid point index and return the
        matched datasets as well as the calculated metrics.

        Parameters
        ----------
        df_dict: dict of pandas.DataFrames
            DataFrames read by the data readers for each dataset
        gpi_info: tuple
            tuple of at least, (gpi, lon, lat)

        Returns
        -------
        matched_n: dict of pandas.DataFrames
            temporally matched data stored by (n, k) tuples
        results: dict
            Dictionary of calculated metrics stored by dataset combination tuples.
        used_data: dict
            The DataFrame used for calculation of each set of metrics.
        """
        results = {}
        used_data = {}
        matched_n = {}

        if self.masking_dm is not None:
            ref_df = df_dict[self.temporal_ref]
            masked_ref_df = self.mask_dataset(ref_df, gpi_info)
            if len(masked_ref_df) == 0:
                return matched_n, results, used_data

            df_dict[self.temporal_ref] = masked_ref_df

        matched_n = self.temporal_match_datasets(df_dict)

        for n, k in self.metrics_c:
            n_matched_data = matched_n[(n, k)]
            if len(n_matched_data) == 0:
                continue
            result_names = get_result_combinations(self.data_manager.ds_dict,
                                                   n=k)
            for data, result_key in self.k_datasets_from(
                    n_matched_data, result_names):

                if len(data) == 0:
                    continue

                # at this stage we can drop the column multiindex and just use
                # the dataset name
                if LooseVersion(pd.__version__) < LooseVersion('0.23'):
                    data.columns = data.columns.droplevel(level=1)
                else:
                    data = data.rename(columns=lambda x: x[0])

                if self.scaling is not None:
                    # get scaling index by finding the column in the
                    # DataFrame that belongs to the scaling reference
                    scaling_index = data.columns.tolist().index(
                        self.scaling_ref)
                    try:
                        data = self.scaling.scale(data, scaling_index,
                                                  gpi_info)
                    except ValueError:
                        continue
                    # Drop the scaling reference if it was not in the intended
                    # results
                    if self.scaling_ref not in [key[0] for key in result_key]:
                        data = data.drop(columns=[self.scaling_ref])

                # Rename the columns to 'ref', 'k1', 'k2', ...
                rename_dict = {}
                f = lambda x: "k{}".format(x) if x > 0 else 'ref'
                for i, r in enumerate(result_key):
                    rename_dict[r[0]] = f(i)
                data.rename(columns=rename_dict, inplace=True)

                if result_key not in results.keys():
                    results[result_key] = []

                metrics_calculator = self.metrics_c[(n, k)]
                used_data[result_key] = data
                metrics = metrics_calculator(data, gpi_info)
                results[result_key].append(metrics)

        return matched_n, results, used_data

    def mask_dataset(self, ref_df, gpi_info):
        """
        Mask the temporal reference dataset with the data read
        through the masking datasets.

        Parameters
        ----------
        ref_df: pandas.DataFrame
            Data of the temporal reference dataset.
        gpi_info: tuple
            tuple of at least, (gpi, lon, lat)

        Returns
        -------
        masked_ref_df: pandas.DataFrame
            Temporal reference data restricted to the time stamps that are
            not masked by any of the masking datasets.
        """

        matched_masking = self.temporal_match_masking_data(ref_df, gpi_info)
        # this will only be one element since n is the same as the
        # number of masking datasets
        result_names = get_result_names(self.masking_dm.ds_dict,
                                        '_reference',
                                        n=2)
        choose_all = pd.DataFrame(index=ref_df.index)
        for data, result in self.k_datasets_from(matched_masking,
                                                 result_names,
                                                 include_scaling_ref=False):
            if len(data) == 0:
                continue

            for key in result:
                if key[0] != '_reference':
                    # this is necessary since the boolean datatype might have
                    # been changed to float 1.0 and 0.0 during temporal
                    # resampling; this is not easily resolved since most
                    # datatypes have no NaN representation
                    choose = pd.Series((data[key] == False), index=data.index)
                    choose = choose.reindex(index=choose_all.index,
                                            fill_value=True)
                    choose_all[key] = choose.copy()
        choosing = choose_all.apply(np.all, axis=1)

        return ref_df[choosing]

    def temporal_match_masking_data(self, ref_df, gpi_info):
        """
        Temporal match the masking data to the reference DataFrame

        Parameters
        ----------
        ref_df: pandas.DataFrame
            Reference data
        gpi_info: tuple or list
            contains, (gpi, lon, lat)

        Returns
        -------
        matched_masking: dict of pandas.DataFrames
            Contains temporally matched masking data. This dict has only one key
            being a tuple that contains the matched datasets.
        """

        # read only masking datasets and use the already read reference
        masking_df_dict = self.masking_dm.get_other_data(
            gpi_info[0], gpi_info[1], gpi_info[2])
        masking_df_dict.update({'_reference': ref_df})
        matched_masking = self.temp_matching(masking_df_dict,
                                             '_reference',
                                             n=2)
        return matched_masking

    def temporal_match_datasets(self, df_dict):
        """
        Temporally match all the requested combinations of datasets.

        Parameters
        ----------
        df_dict: dict of pandas.DataFrames
            DataFrames read by the data readers for each dataset

        Returns
        -------
        matched_n: dict of pandas.DataFrames
            for each (n, k) in the metrics calculators the n temporally
            matched dataframes
        """

        matched_n = {}
        for n, k in self.metrics_c:
            matched_data = self.temp_matching(df_dict, self.temporal_ref, n=n)

            matched_n[(n, k)] = matched_data

        return matched_n

    def k_datasets_from(self,
                        n_matched_data,
                        result_names,
                        include_scaling_ref=True):
        """
        Extract k datasets from n temporally matched ones.

        This is used to send combinations of k datasets to
        metrics calculators expecting only k datasets.

        Parameters
        ----------
        n_matched_data: dict of pandas.DataFrames
            DataFrames in which n datasets were temporally matched.
            The key is a tuple of the dataset names.
        result_names: list
            result names to extract
        include_scaling_ref: boolean, optional
            if set the scaling reference will always be included.
            Should only be disabled for getting the masking datasets

        Yields
        ------
        data: pd.DataFrame
            pandas DataFrame with k columns extracted from the
            temporally matched datasets
        result: tuple
            Tuple describing which datasets and columns are in
            the returned data. ((dataset_name, column_name), (dataset_name2, column_name2))
        """

        for result in result_names:
            result_extract = result
            if self.scaling is not None and include_scaling_ref:
                # always make sure the scaling reference is included in the results
                # otherwise the scaling will fail
                scaling_ref_column = self.data_manager.datasets[
                    self.scaling_ref]['columns'][0]
                scaling_result_name = (self.scaling_ref, scaling_ref_column)
                if scaling_result_name not in result:
                    result_extract = result + (scaling_result_name, )
            data = self.get_data_for_result_tuple(n_matched_data,
                                                  result_extract)
            yield data, result

    def get_data_for_result_tuple(self, n_matched_data, result_tuple):
        """
        Extract a dataframe for a given result tuple from the
        matched dataframes.

        Parameters
        ----------
        n_matched_data: dict of pandas.DataFrames
            DataFrames in which n datasets were temporally matched.
            The key is a tuple of the dataset names.
        result_tuple: tuple
            Tuple describing which datasets and columns should be
            extracted. ((dataset_name, column_name), (dataset_name2, column_name2))

        Returns
        -------
        data: pd.DataFrame
            pandas DataFrame with columns extracted from the
            temporally matched datasets
        """
        # find the key into the temporally matched dataset by combining the
        # dataset parts of the result_names
        dskey = []
        for i, r in enumerate(result_tuple):
            dskey.append(r[0])

        dskey = tuple(dskey)
        if len(list(n_matched_data)[0]) == len(dskey):
            # we should have an exact match of datasets and
            # temporal matches

            try:
                # still need to make sure that dskey is in the right order and
                # contains all the same datasets as the n_matched_data
                if sorted(dskey) == sorted(list(n_matched_data.keys())[0]):
                    dskey = list(n_matched_data.keys())[0]

                data = n_matched_data[dskey]
            except KeyError:
                # if not then temporal matching between two datasets was
                # unsuccessful
                return []
        else:
            # more datasets were temporally matched than are
            # requested now so we select a temporally matched
            # dataset that has the first key in common with the
            # temporal reference.

            # This guarantees that we only select columns from dataframes for
            # which the temporal reference dataset was included in the temporal
            # matching

            first_match = [
                key for key in n_matched_data if self.temporal_ref == key[0]
            ]
            found_key = None
            for key in first_match:
                for dsk in dskey:
                    if dsk not in key:
                        continue
                found_key = key
            data = n_matched_data[found_key]

        # extract only the relevant columns from matched DataFrame
        data = data[[x for x in result_tuple]]
        # drop values if one column is NaN
        data = data.dropna()
        return data

    def get_processing_jobs(self):
        """
        Returns processing jobs that this process can understand.

        Returns
        -------
        jobs : list
            List of cells or gpis to process.
        """
        jobs = []
        if self.data_manager.reference_grid is not None:
            if type(self.data_manager.reference_grid) is CellGrid:
                cells = self.data_manager.reference_grid.get_cells()
                for cell in cells:
                    (cell_gpis, cell_lons, cell_lats
                     ) = self.data_manager.reference_grid.grid_points_for_cell(
                         cell)
                    jobs.append([cell_gpis, cell_lons, cell_lats])
            else:
                gpis, lons, lats = self.data_manager.reference_grid.get_grid_points(
                )
                jobs = [gpis, lons, lats]

        return jobs
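The class docstring above also allows `scaling` to be a class instance implementing scale(self, data, reference_index, gpi_info). Below is a minimal, hypothetical sketch of such a scaler; only the method signature and the DataFrame-in/DataFrame-out contract are taken from the code above, while the mean/standard-deviation matching rule is an illustrative assumption.

class MeanStdScaler(object):
    """Hypothetical scaler: match mean and standard deviation of every column
    to the column identified by reference_index (the scaling reference)."""

    def scale(self, data, reference_index, gpi_info):
        ref_col = data.columns[reference_index]
        ref = data[ref_col]
        scaled = data.copy()
        for col in data.columns:
            if col == ref_col:
                continue
            src = data[col]
            if src.std() == 0:
                # nothing to rescale against, keep the column unchanged
                continue
            scaled[col] = (src - src.mean()) / src.std() * ref.std() + ref.mean()
        return scaled

# it can then be passed instead of the 'lin_cdf_match' string, e.g.
# Validation(datasets, 'DS1', metrics_calculators=..., scaling=MeanStdScaler())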
Example no. 17
def test_DataManager_get_data():

    datasets = setup_TestDatasets()
    dm = DataManager(datasets, 'DS1')
    data = dm.get_data(1, 1, 1)
    assert sorted(list(data)) == ['DS1', 'DS2', 'DS3']
Example no. 18
class Validation(object):

    """
    Class for the validation process.

    Parameters
    ----------
    datasets : dict of dicts
        Keys: string, dataset names
        Values: dict, containing the following fields
            'class': object
                Class containing the method read_ts for reading the data.
            'columns': list
                List of columns which will be used in the validation process.
            'type': string
                'reference' or 'other'.
            'args': list, optional
                Args for reading the data.
            'kwargs': dict, optional
                Kwargs for reading the data
            'grids_compatible': boolean, optional
                If set to True the grid point index is used directly when
                reading other, if False then lon, lat is used and a nearest
                neighbour search is necessary.
            'use_lut': boolean, optional
                If set to True the grid point index (obtained from a
                calculated lut between reference and other) is used when
                reading other, if False then lon, lat is used and a
                nearest neighbour search is necessary.
            'lut_max_dist': float, optional
                Maximum allowed distance in meters for the lut calculation.
    temporal_matcher: object
        Class instance that has a match method that takes a reference and
        another DataFrame. Its match method should return a DataFrame with the
        index of the reference DataFrame and all columns of both DataFrames.
    metrics_calculator : object
        Class that has a calc_metrics method that takes a pandas.DataFrame
        with 2 columns named 'ref' and 'other' and returns a dictionary with
        the calculated metrics.
    data_prep: object
        Object that provides the methods prep_reference and prep_other
        which take the pandas.DataFrame provided by the read_ts methods (plus
        other_name for prep_other) and do some data preparation on it before
        temporal matching etc. Can be used e.g. for special masking or anomaly
        calculations.
    period : list, optional
        Of type [datetime start, datetime end]. If given then the two input
        datasets will be truncated to start <= dates <= end.
    scaling : string
        If set then the data will be scaled into the reference space using the
        method specified by the string.
    scale_to_other : boolean, optional
        If True the reference dataset is scaled to the other dataset instead
        of the default behavior.
    cell_based_jobs : boolean, optional
        If True then the jobs will be cell based, if False jobs will be tuples
        of (gpi, lon, lat).

    Methods
    -------
    calc(job)
        Takes either a cell or a gpi_info tuple and performs the validation.
    get_processing_jobs()
        Returns processing jobs that this process can understand.
    """

    def __init__(self, datasets, temporal_matcher, metrics_calculator,
                 data_prep=None, data_post=None, period=None,
                 scaling='lin_cdf_match', scale_to_other=False,
                 cell_based_jobs=True):
        """
        Initialize parameters.
        """
        self.data_manager = DataManager(datasets, data_prep, period)

        self.temp_matching = temporal_matcher.match
        self.calc_metrics = metrics_calculator.calc_metrics
        self.data_postproc = data_post

        self.scaling = scaling
        self.scale_to_index = 0
        if scale_to_other:
            self.scale_to_index = 1

        self.cell_based_jobs = cell_based_jobs

        self.luts = self.data_manager.get_luts()

    def calc(self, job):
        """
        Takes either a cell or a gpi_info tuple and performs the validation.

        Parameters
        ----------
        job : object
            Job of type that self.get_processing_jobs() returns.

        Returns
        -------
        compact_results : dict of dicts
            Keys: result names, combinations of
                  (referenceDataset.column, otherDataset.column)
            Values: dict containing the elements returned by metrics_calculator
        """
        result_names = self.data_manager.get_results_names()
        results = {}

        if self.cell_based_jobs:
            process_gpis, process_lons, process_lats = self.data_manager.\
                reference_grid.grid_points_for_cell(job)
        else:
            process_gpis, process_lons, process_lats = [
                job[0]], [job[1]], [job[2]]

        for gpi_info in zip(process_gpis, process_lons, process_lats):
            # if processing is cell based gpi_metainfo is limited to gpi, lon,
            # lat at the moment
            if self.cell_based_jobs:
                gpi_meta = gpi_info
            else:
                gpi_meta = job

            ref_dataframe = self.data_manager.read_reference(gpi_info[0])
            # if no reference data available continue with the next gpi
            if ref_dataframe is None:
                continue

            other_dataframes = {}
            for other_name in self.data_manager.other_name:
                grids_compatible = self.data_manager.datasets[
                    other_name]['grids_compatible']
                if grids_compatible:
                    other_dataframe = self.data_manager.read_other(
                        other_name, gpi_info[0])
                elif self.luts[other_name] is not None:
                    other_gpi = self.luts[other_name][gpi_info[0]]
                    if other_gpi == -1:
                        continue
                    other_dataframe = self.data_manager.read_other(
                        other_name, other_gpi)
                else:
                    other_dataframe = self.data_manager.read_other(
                        other_name, gpi_info[1], gpi_info[2])

                if other_dataframe is not None:
                    other_dataframes[other_name] = other_dataframe

            # if no other data available continue with the next gpi
            if len(other_dataframes) == 0:
                continue

            joined_data = {}
            for other in other_dataframes.keys():
                joined = self.temp_matching(ref_dataframe,
                                            other_dataframes[other])

                if len(joined) != 0:
                    joined_data[other] = joined

            if len(joined_data) == 0:
                continue

            # compute results for each combination of (ref, other) columns
            rescaled_data = {}
            for result in result_names:
                ref_col = result[0].split('.')[1]
                other_col = result[1].split('.')[1]
                other_name = result[1].split('.')[0]

                try:
                    data = joined_data[other_name][
                        [ref_col, other_col]].dropna()
                except KeyError:
                    continue

                data.rename(
                    columns={ref_col: 'ref', other_col: 'other'}, inplace=True)

                if len(data) == 0:
                    continue

                if self.scaling is not None:
                    try:
                        data = scaling.scale(
                            data, method=self.scaling, reference_index=self.scale_to_index)
                        rescaled_data[other_name] = data
                    except ValueError:
                        continue

                if result not in results.keys():
                    results[result] = []

                results[result].append(self.calc_metrics(data, gpi_meta))

        compact_results = {}
        for key in results.keys():
            compact_results[key] = {}
            for field_name in results[key][0].keys():
                entries = []
                for result in results[key]:
                    entries.append(result[field_name][0])
                compact_results[key][field_name] = \
                    np.array(entries, dtype=results[key][0][field_name].dtype)

        if self.data_postproc is not None:
            self.data_postproc(compact_results, rescaled_data)

        return compact_results

    def get_processing_jobs(self):
        """
        Returns processing jobs that this process can understand.

        Returns
        -------
        jobs : list
            List of cells or gpis to process.
        """
        if self.data_manager.reference_grid is not None:
            if self.cell_based_jobs:
                return self.data_manager.reference_grid.get_cells()
            else:
                return zip(self.data_manager.reference_grid.get_grid_points())
        else:
            return []
Example no. 19
def create_pytesmo_validation(validation_run):
    ds_list = []
    ref_name = None
    scaling_ref_name = None

    ds_num = 1
    for dataset_config in validation_run.dataset_configurations.all():
        reader = create_reader(dataset_config.dataset, dataset_config.version)
        reader = setup_filtering(
            reader, list(dataset_config.filters.all()),
            list(dataset_config.parametrisedfilter_set.all()),
            dataset_config.dataset, dataset_config.variable)

        if validation_run.anomalies == ValidationRun.MOVING_AVG_35_D:
            reader = AnomalyAdapter(
                reader,
                window_size=35,
                columns=[dataset_config.variable.pretty_name])
        if validation_run.anomalies == ValidationRun.CLIMATOLOGY:
            # make sure our baseline period is in UTC and without timezone information
            anomalies_baseline = [
                validation_run.anomalies_from.astimezone(tz=pytz.UTC).replace(
                    tzinfo=None),
                validation_run.anomalies_to.astimezone(tz=pytz.UTC).replace(
                    tzinfo=None)
            ]
            reader = AnomalyClimAdapter(
                reader,
                columns=[dataset_config.variable.pretty_name],
                timespan=anomalies_baseline)

        if (validation_run.reference_configuration and
            (dataset_config.id == validation_run.reference_configuration.id)):
            # reference is always named "0-..."
            dataset_name = '{}-{}'.format(0, dataset_config.dataset.short_name)
        else:
            dataset_name = '{}-{}'.format(ds_num,
                                          dataset_config.dataset.short_name)
            ds_num += 1

        ds_list.append((dataset_name, {
            'class': reader,
            'columns': [dataset_config.variable.pretty_name]
        }))

        if (validation_run.reference_configuration and
            (dataset_config.id == validation_run.reference_configuration.id)):
            ref_name = dataset_name
            ref_short_name = validation_run.reference_configuration.dataset.short_name

        if (validation_run.scaling_ref
                and (dataset_config.id == validation_run.scaling_ref.id)):
            scaling_ref_name = dataset_name

    datasets = dict(ds_list)
    ds_num = len(ds_list)

    period = None
    if validation_run.interval_from is not None and validation_run.interval_to is not None:
        # while pytesmo can't deal with timezones, normalise the validation period to utc; can be removed once pytesmo can do timezones
        startdate = validation_run.interval_from.astimezone(UTC).replace(
            tzinfo=None)
        enddate = validation_run.interval_to.astimezone(UTC).replace(
            tzinfo=None)
        period = [startdate, enddate]

    upscale_parms = None
    if validation_run.upscaling_method != "none":
        __logger.debug("Upscaling option is active")
        upscale_parms = {
            "upscaling_method": validation_run.upscaling_method,
            "temporal_stability": validation_run.temporal_stability,
        }
        upscaling_lut = create_upscaling_lut(
            validation_run=validation_run,
            datasets=datasets,
            ref_name=ref_name,
        )
        upscale_parms["upscaling_lut"] = upscaling_lut
        __logger.debug("Lookup table for non-reference datasets " +
                       ", ".join(upscaling_lut.keys()) + " created")
        __logger.debug("{}".format(upscaling_lut))

    datamanager = DataManager(
        datasets,
        ref_name=ref_name,
        period=period,
        read_ts_names='read',
        upscale_parms=upscale_parms,
    )
    ds_names = get_dataset_names(datamanager.reference_name,
                                 datamanager.datasets,
                                 n=ds_num)

    # set value of the metadata template according to what reference dataset is used
    if ref_short_name == 'ISMN':
        metadata_template = METADATA_TEMPLATE['ismn_ref']
    else:
        metadata_template = METADATA_TEMPLATE['other_ref']

    pairwise_metrics = PairwiseIntercomparisonMetrics(
        metadata_template=metadata_template,
        calc_kendall=False,
    )

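    # keys follow the (n, k) convention from the Validation docstring: all
    # ds_num datasets are temporally matched, then handed to the calculator
    # in pairs (k=2); the triple collocation metrics below use k=3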
    metric_calculators = {(ds_num, 2): pairwise_metrics.calc_metrics}

    if (len(ds_names) >= 3) and (validation_run.tcol is True):
        tcol_metrics = TripleCollocationMetrics(
            ref_name,
            metadata_template=metadata_template,
        )
        metric_calculators.update({(ds_num, 3): tcol_metrics.calc_metrics})

    if validation_run.scaling_method == validation_run.NO_SCALING:
        scaling_method = None
    else:
        scaling_method = validation_run.scaling_method

    __logger.debug(f"Scaling method: {scaling_method}")
    __logger.debug(f"Scaling dataset: {scaling_ref_name}")

    val = Validation(datasets=datamanager,
                     temporal_matcher=make_combined_temporal_matcher(
                         pd.Timedelta(12, "H")),
                     spatial_ref=ref_name,
                     scaling=scaling_method,
                     scaling_ref=scaling_ref_name,
                     metrics_calculators=metric_calculators,
                     period=period)

    return val
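A short sketch of how the returned Validation object is typically driven, mirroring the test examples above; `validation_run` is assumed to be an already configured ValidationRun instance.

val = create_pytesmo_validation(validation_run)
for job in val.get_processing_jobs():
    # each result dict is keyed by (dataset, column) combination tuples
    results = val.calc(*job)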
Example no. 20
class Validation(object):

    """
    Class for the validation process.

    Parameters
    ----------
    datasets : dict of dicts, or pytesmo.validation_framework.data_manager.DataManager
        Keys: string, dataset names
        Values: dict, containing the following fields
            'class': object
                Class containing the method read_ts for reading the data.
            'columns': list
                List of columns which will be used in the validation process.
            'args': list, optional
                Args for reading the data.
            'kwargs': dict, optional
                Kwargs for reading the data
            'grids_compatible': boolean, optional
                If set to True the grid point index is used directly when
                reading other, if False then lon, lat is used and a nearest
                neighbour search is necessary.
            'use_lut': boolean, optional
                If set to True the grid point index (obtained from a
                calculated lut between reference and other) is used when
                reading other, if False then lon, lat is used and a
                nearest neighbour search is necessary.
            'lut_max_dist': float, optional
                Maximum allowed distance in meters for the lut calculation.
    spatial_ref: string
        Name of the dataset used as a spatial, temporal and scaling reference.
        Temporal and scaling references can be changed if needed. See the optional parameters
        ``temporal_ref`` and ``scaling_ref``.
    metrics_calculators : dict of functions
        The keys of the dict are tuples with the following structure: (n, k) with n >= 2
        and n>=k. n is the number of datasets that should be temporally matched to the
        reference dataset and k is how many columns the metric calculator will get at once.
        What this means is that it is e.g. possible to temporally match 3 datasets with
        3 columns in total and then give the combinations of these columns to the metric
        calculator in sets of 2 by specifying the dictionary like:

        .. code::

            { (3, 2): metric_calculator}

        The values are functions that take an input DataFrame with the columns 'ref'
        for the reference and 'k1', 'k2' and
        so on for other datasets as well as a dictionary mapping the column names
        to the names of the original datasets. In this way multiple metric calculators
        can be applied to different combinations of n input datasets.
    temporal_matcher: function, optional
        function that takes a dict of dataframes and a reference_key.
        It performs the temporal matching on the data and returns a dictionary
        of matched DataFrames that should be evaluated together by the metric calculator.
    temporal_window: float, optional
        Window to allow in temporal matching in days. The window is allowed on both
        sides of the timestamp of the temporal reference data.
        Only used with the standard temporal matcher.
    temporal_ref: string, optional
        If the temporal matching should use another dataset than the spatial reference
        as a reference dataset then give the dataset name here.
    period : list, optional
        Of type [datetime start, datetime end]. If given, the input
        datasets will be truncated to start <= dates <= end.
    masking_datasets : dict of dictionaries
        Same format as the datasets, with the difference that the read_ts method of these
        datasets has to return pandas.DataFrames with only boolean columns. True means that
        the observations at this timestamp should be masked and False means that they
        should be kept.
    scaling : string, None or class instance
        - If set then the data will be scaled into the reference space using the
          method specified by the string using the
          :py:class:`pytesmo.validation_framework.data_scalers.DefaultScaler` class.
        - If set to None then no scaling will be performed.
        - It can also be set to a class instance that implements a
          ``scale(self, data, reference_index, gpi_info)`` method. See
          :py:class:`pytesmo.validation_framework.data_scalers.DefaultScaler` for an example.
    scaling_ref : string, optional
        If the scaling should be done to another dataset than the spatial reference then
        give the dataset name here.

    Methods
    -------
    calc(job)
        Takes either a cell or a gpi_info tuple and performs the validation.
    get_processing_jobs()
        Returns processing jobs that this process can understand.
    """

    def __init__(self, datasets, spatial_ref, metrics_calculators,
                 temporal_matcher=None, temporal_window=1 / 24.0,
                 temporal_ref=None,
                 masking_datasets=None,
                 period=None,
                 scaling='lin_cdf_match', scaling_ref=None):

        if type(datasets) is DataManager:
            self.data_manager = datasets
        else:
            self.data_manager = DataManager(datasets, spatial_ref, period)

        self.temp_matching = temporal_matcher
        if self.temp_matching is None:
            self.temp_matching = temporal_matchers.BasicTemporalMatching(
                window=temporal_window).combinatory_matcher

        self.temporal_ref = temporal_ref
        if self.temporal_ref is None:
            self.temporal_ref = self.data_manager.reference_name

        self.metrics_c = metrics_calculators

        self.masking_dm = None
        if masking_datasets is not None:
            # add temporal reference dataset to the masking datasets since it
            # is necessary for temporally matching the masking datasets to the
            # common time stamps. Use _reference here to make a clash with the
            # names of the masking datasets unlikely
            masking_datasets.update(
                {'_reference': datasets[self.temporal_ref]})
            self.masking_dm = DataManager(masking_datasets, '_reference',
                                          period=period)

        if type(scaling) == str:
            self.scaling = DefaultScaler(scaling)
        else:
            self.scaling = scaling
        self.scaling_ref = scaling_ref
        if self.scaling_ref is None:
            self.scaling_ref = self.data_manager.reference_name

        self.luts = self.data_manager.get_luts()

    def calc(self, gpis, lons, lats, *args):
        """
        The argument iterables (lists or numpy.ndarrays) are processed one after the other in
        tuples of the form (gpis[n], lons[n], lats[n], arg1[n], ..).

        Parameters
        ----------
        gpis : iterable
            Grid point indices: identifiers by which the spatial reference
            dataset can be read. This is either a list, a numpy.ndarray or any
            other iterable containing these indices.
        lons: iterable
            Longitudes of the points identified by the gpis. Has to be the same size as gpis.
        lats: iterable
            Latitudes of the points identified by the gpis. Has to be the same size as gpis.
        args: iterables
            Any additional arguments have to have the same size as the gpis iterable. They are
            given to the metrics calculators as metadata. Common usage is e.g. the long name
            or network name of an in situ station.

        Returns
        -------
        compact_results : dict of dicts
            Keys: result names, combinations of
                  (referenceDataset.column, otherDataset.column)
            Values: dict containing the elements returned by metrics_calculator
        """
        results = {}
        if len(args) > 0:
            gpis, lons, lats, args = args_to_iterable(gpis,
                                                      lons,
                                                      lats,
                                                      *args,
                                                      n=3)
        else:
            gpis, lons, lats = args_to_iterable(gpis, lons, lats)

        for gpi_info in zip(gpis, lons, lats, *args):

            df_dict = self.data_manager.get_data(gpi_info[0],
                                                 gpi_info[1],
                                                 gpi_info[2])

            # if no data is available continue with the next gpi
            if len(df_dict) == 0:
                continue
            matched_data, result, used_data = self.perform_validation(
                df_dict, gpi_info)

            # add result of one gpi to global results dictionary
            for r in result:
                if r not in results:
                    results[r] = []
                results[r] = results[r] + result[r]

        compact_results = {}
        for key in results.keys():
            compact_results[key] = {}
            for field_name in results[key][0].keys():
                entries = []
                for result in results[key]:
                    entries.append(result[field_name][0])
                compact_results[key][field_name] = \
                    np.array(entries, dtype=results[key][0][field_name].dtype)

        return compact_results

    def perform_validation(self,
                           df_dict,
                           gpi_info):
        """
        Perform the validation for one grid point index and return the
        matched datasets as well as the calculated metrics.

        Parameters
        ----------
        df_dict: dict of pandas.DataFrames
            DataFrames read by the data readers for each dataset
        gpi_info: tuple
            tuple of at least, (gpi, lon, lat)

        Returns
        -------
        matched_n: dict of pandas.DataFrames
            temporally matched data stored by (n, k) tuples
        results: dict
            Dictionary of calculated metrics stored by dataset combination tuples.
        used_data: dict
            The DataFrame used for calculation of each set of metrics.
        """
        results = {}
        used_data = {}
        matched_n = {}

        if self.masking_dm is not None:
            ref_df = df_dict[self.temporal_ref]
            masked_ref_df = self.mask_dataset(ref_df,
                                              gpi_info)
            if len(masked_ref_df) == 0:
                return matched_n, results, used_data

            df_dict[self.temporal_ref] = masked_ref_df

        matched_n = self.temporal_match_datasets(df_dict)

        for n, k in self.metrics_c:
            n_matched_data = matched_n[(n, k)]
            if len(n_matched_data) == 0:
                continue
            result_names = get_result_names(self.data_manager.ds_dict,
                                            self.temporal_ref,
                                            n=k)
            for data, result_key in self.k_datasets_from(n_matched_data,
                                                         result_names):

                if len(data) == 0:
                    continue

                # at this stage we can drop the column multiindex and just use
                # the dataset name
                if LooseVersion(pd.__version__) < LooseVersion('0.23'):
                    data.columns = data.columns.droplevel(level=1)
                else:
                    data = data.rename(columns=lambda x: x[0])

                if self.scaling is not None:
                    # get scaling index by finding the column in the
                    # DataFrame that belongs to the scaling reference
                    scaling_index = data.columns.tolist().index(self.scaling_ref)
                    try:
                        data = self.scaling.scale(data,
                                                  scaling_index,
                                                  gpi_info)
                    except ValueError:
                        continue
                # Rename the columns to 'ref', 'k1', 'k2', ...
                rename_dict = {}
                f = lambda x: "k{}".format(x) if x > 0 else 'ref'
                for i, r in enumerate(result_key):
                    rename_dict[r[0]] = f(i)
                data.rename(columns=rename_dict, inplace=True)

                if result_key not in results.keys():
                    results[result_key] = []

                metrics_calculator = self.metrics_c[(n, k)]
                used_data[result_key] = data
                metrics = metrics_calculator(data, gpi_info)
                results[result_key].append(metrics)

        return matched_n, results, used_data

    def mask_dataset(self, ref_df, gpi_info):
        """
        Mask the temporal reference dataset with the data read
        through the masking datasets.

        Parameters
        ----------
        ref_df: pandas.DataFrame
            Data of the temporal reference dataset that should be masked.
        gpi_info: tuple
            tuple of at least, (gpi, lon, lat)

        Returns
        -------
        masked_ref_df: pandas.DataFrame
            Temporal reference data with all time stamps removed for which
            the masking datasets indicate that they should be masked.
        """

        matched_masking = self.temporal_match_masking_data(ref_df, gpi_info)
        # this will only be one element since n is the same as the
        # number of masking datasets
        result_names = get_result_names(self.masking_dm.ds_dict,
                                        '_reference',
                                        n=2)
        choose_all = pd.DataFrame(index=ref_df.index)
        for data, result in self.k_datasets_from(matched_masking,
                                                 result_names):
            if len(data) == 0:
                continue

            for key in result:
                if key[0] != '_reference':
                    # this is necessary since the boolean datatype might have
                    # been changed to float (1.0 and 0.0) during temporal
                    # resampling; this is not easily avoided since the boolean
                    # datatype has no nan representation.
                    choose = pd.Series((data[key] == False), index=data.index)
                    choose = choose.reindex(index=choose_all.index,
                                            fill_value=True)
                    choose_all[key] = choose.copy()
        choosing = choose_all.apply(np.all, axis=1)

        return ref_df[choosing]

    def temporal_match_masking_data(self, ref_df, gpi_info):
        """
        Temporal match the masking data to the reference DataFrame

        Parameters
        ----------
        ref_df: pandas.DataFrame
            Reference data
        gpi_info: tuple or list
            contains, (gpi, lon, lat)

        Returns
        -------
        matched_masking: dict of pandas.DataFrames
            Contains temporally matched masking data. This dict has only one key
            being a tuple that contains the matched datasets.
        """

        # read only masking datasets and use the already read reference
        masking_df_dict = self.masking_dm.get_other_data(gpi_info[0],
                                                         gpi_info[1],
                                                         gpi_info[2])
        masking_df_dict.update({'_reference': ref_df})
        matched_masking = self.temp_matching(masking_df_dict,
                                             '_reference',
                                             n=2)
        return matched_masking

    def temporal_match_datasets(self, df_dict):
        """
        Temporally match all the requested combinations of datasets.

        Parameters
        ----------
        df_dict: dict of pandas.DataFrames
            DataFrames read by the data readers for each dataset

        Returns
        -------
        matched_n: dict of pandas.DataFrames
            for each (n, k) in the metrics calculators the n temporally
            matched dataframes
        """

        matched_n = {}
        for n, k in self.metrics_c:
            matched_data = self.temp_matching(df_dict,
                                              self.temporal_ref,
                                              n=n)

            matched_n[(n, k)] = matched_data

        return matched_n

    def k_datasets_from(self, n_matched_data, result_names):
        """
        Extract k datasets from n temporally matched ones.

        This is used to send combinations of k datasets to
        metrics calculators expecting only k datasets.

        Parameters
        ----------
        n_matched_data: dict of pandas.DataFrames
            DataFrames in which n datasets were temporally matched.
            The key is a tuple of the dataset names.
        result_names: list
            result names to extract

        Yields
        ------
        data: pd.DataFrame
            pandas DataFrame with k columns extracted from the
            temporally matched datasets
        result: tuple
            Tuple describing which datasets and columns are in
            the returned data. ((dataset_name, column_name), (dataset_name2, column_name2))
        """

        for result in result_names:
            data = self.get_data_for_result_tuple(n_matched_data, result)
            yield data, result

    def get_data_for_result_tuple(self, n_matched_data, result_tuple):
        """
        Extract a dataframe for a given result tuple from the
        matched dataframes.

        Parameters
        ----------
        n_matched_data: dict of pandas.DataFrames
            DataFrames in which n datasets were temporally matched.
            The key is a tuple of the dataset names.
        result_tuple: tuple
            Tuple describing which datasets and columns should be
            extracted. ((dataset_name, column_name), (dataset_name2, column_name2))

        Returns
        -------
        data: pd.DataFrame
            pandas DataFrame with columns extracted from the
            temporally matched datasets
        """
        # find the key into the temporally matched dataset by combining the
        # dataset parts of the result_names
        dskey = []
        for i, r in enumerate(result_tuple):
            dskey.append(r[0])

        dskey = tuple(dskey)
        if len(list(n_matched_data)[0]) == len(dskey):
            # we should have an exact match of datasets and
            # temporal matches
            try:
                data = n_matched_data[dskey]
            except KeyError:
                # if not then temporal matching between two datasets was
                # unsuccessful
                return []
        else:
            # more datasets were temporally matched than are
            # requested now so we select a temporally matched
            # dataset that has the first key in common with the
            # requested one ensuring that it was used as a
            # reference and also has the rest of the requested
            # datasets in the key
            first_match = [
                key for key in n_matched_data if dskey[0] == key[0]]
            found_key = None
            for key in first_match:
                # only accept keys that also contain all remaining requested
                # datasets
                if all(dsk in key for dsk in dskey[1:]):
                    found_key = key
            data = n_matched_data[found_key]

        # extract only the relevant columns from matched DataFrame
        data = data[[x for x in result_tuple]]
        # drop values if one column is NaN
        data = data.dropna()
        return data

    def get_processing_jobs(self):
        """
        Returns processing jobs that this process can understand.

        Returns
        -------
        jobs : list
            List of cells or gpis to process.
        """
        jobs = []
        if self.data_manager.reference_grid is not None:
            if type(self.data_manager.reference_grid) is CellGrid:
                cells = self.data_manager.reference_grid.get_cells()
                for cell in cells:
                    (cell_gpis,
                     cell_lons,
                     cell_lats) = self.data_manager.reference_grid.grid_points_for_cell(cell)
                    jobs.append([cell_gpis, cell_lons, cell_lats])
            else:
                gpis, lons, lats = self.data_manager.reference_grid.get_grid_points()
                jobs = [gpis, lons, lats]

        return jobs
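A minimal driver sketch for the class above, following the pattern used by the tests later in this collection; the `datasets` dict is assumed to follow the format described in the class docstring, and BasicMetrics is assumed to come from pytesmo.validation_framework.metric_calculators.

from pytesmo.validation_framework.metric_calculators import BasicMetrics

# `datasets` is assumed to be a readers dict as described in the docstring,
# with 'ISMN' acting as spatial (and, by default, temporal/scaling) reference.
process = Validation(
    datasets,
    spatial_ref='ISMN',
    metrics_calculators={(2, 2): BasicMetrics(other_name='k1').calc_metrics},
    scaling='lin_cdf_match')

# get_processing_jobs() returns one (gpis, lons, lats) job per cell for
# CellGrid references; calc() returns the compacted metrics per dataset pair.
for job in process.get_processing_jobs():
    results = process.calc(*job)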
Example n. 21
def test_ascat_ismn_validation_metadata_rolling(ascat_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    # Initialize ISMN reader
    ismn_data_folder = os.path.join(
        os.path.dirname(__file__),
        "..",
        "test-data",
        "ismn",
        "multinetwork",
        "header_values",
    )
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(
        variable="soil moisture", min_depth=0, max_depth=0.1
    )

    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        metadata_dict = [
            {
                "network": metadata["network"],
                "station": metadata["station"],
                "landcover": metadata["landcover_2010"],
                "climate": metadata["climate"],
            }
        ]
        jobs.append(
            (idx, metadata["longitude"], metadata["latitude"], metadata_dict)
        )

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ISMN": {"class": ismn_reader, "columns": ["soil moisture"]},
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        datasets, "ISMN", period, read_ts_names=read_ts_names
    )

    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={
            (2, 2): metrics_calculators.RollingMetrics(
                other_name="k1", metadata_template=metadata_dict_template
            ).calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(
            results, save_path, ts_vars=["R", "p_R", "RMSD"]
        )

    results_fname = os.path.join(
        save_path, "ASCAT.sm_with_ISMN.soil moisture.nc"
    )

    vars_should = [
        u"gpi",
        u"lon",
        u"lat",
        u"R",
        u"p_R",
        u"time",
        u"idx",
        u"_row_size",
    ]

    for key, value in metadata_dict_template.items():
        vars_should.append(key)

    network_should = np.array(
        [
            "MAQU",
            "MAQU",
            "SCAN",
            "SCAN",
            "SCAN",
            "SOILSCAPE",
            "SOILSCAPE",
            "SOILSCAPE",
        ],
        dtype="U256",
    )

    reader = PointDataResults(results_fname, read_only=True)
    df = reader.read_loc(None)
    nptest.assert_equal(sorted(network_should), sorted(df["network"].values))
    assert np.all(df.gpi.values == np.arange(8))
    assert reader.read_ts(0).index.size == 357
    assert np.all(
        reader.read_ts(1).columns.values == np.array(["R", "p_R", "RMSD"])
    )
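In the test above each job carries a fourth element, the per-station metadata, which Validation.calc() forwards to the metric calculator through gpi_info (see the calc() docstring in the previous example). A toy calculator sketch, with illustrative names only, showing where that metadata arrives:

import numpy as np

def toy_calc_metrics(data, gpi_info):
    # data: DataFrame with columns 'ref' and 'k1' after Validation's renaming
    # gpi_info: (gpi, lon, lat, metadata) -- the extra job element ends up here
    gpi, lon, lat, metadata = gpi_info
    if isinstance(metadata, list):  # the job may carry the metadata wrapped in a list
        metadata = metadata[0]
    return {
        "gpi": np.array([gpi]),
        "lon": np.array([lon]),
        "lat": np.array([lat]),
        "n_obs": np.array([len(data)], dtype=np.int32),
        "network": np.atleast_1d(metadata["network"]),
    }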
Example n. 22
def test_ascat_ismn_validation(ascat_reader, ismn_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    jobs = []

    ids = ismn_reader.get_dataset_ids(variable="soil moisture",
                                      min_depth=0,
                                      max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata["longitude"], metadata["latitude"]))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"]
        },
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(datasets,
                           "ISMN",
                           period,
                           read_ts_names=read_ts_names)

    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={
            (2, 2):
            metrics_calculators.BasicMetrics(other_name="k1").calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(save_path,
                                 "ASCAT.sm_with_ISMN.soil moisture.nc")
    # targets
    target_vars = {
        "n_obs": [357, 384, 1646, 1875, 1915, 467, 141, 251],
        "rho":
        np.array([
            0.53934574, 0.7002289, 0.62200236, 0.53647155, 0.30413666,
            0.6740655, 0.8418981, 0.74206454
        ],
                 dtype=np.float32),
        "RMSD":
        np.array([
            11.583476, 7.729667, 17.441547, 21.125721, 14.31557, 14.187225,
            13.0622425, 12.903898
        ],
                 dtype=np.float32)
    }

    check_results(
        filename=results_fname,
        target_vars=target_vars,
    )
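check_results itself is not shown in this collection. A plausible minimal implementation, assuming it only opens the result netCDF file and compares the listed variables against the targets (the tolerance value is an assumption):

import netCDF4 as nc
import numpy as np
import numpy.testing as nptest

def check_results(filename, target_vars, variables=None, rtol=1e-4):
    """Compare variables in a results netCDF file against expected values."""
    with nc.Dataset(filename, mode="r") as results:
        if variables is not None:
            assert sorted(results.variables.keys()) == sorted(variables)
        for name, should in target_vars.items():
            values = results.variables[name][:]
            if np.issubdtype(np.asarray(should).dtype, np.number):
                nptest.assert_allclose(sorted(values), sorted(should), rtol=rtol)
            else:
                nptest.assert_equal(sorted(values), sorted(should))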
Example n. 23
def test_ascat_ismn_validation_metadata(ascat_reader, ismn_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    jobs = []

    ids = ismn_reader.get_dataset_ids(variable="soil moisture",
                                      min_depth=0,
                                      max_depth=0.1)

    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        metadata_dict = [{
            "network": metadata["network"],
            "station": metadata["station"],
            "landcover": metadata["landcover_2010"],
            "climate": metadata["climate"],
        }]
        jobs.append(
            (idx, metadata["longitude"], metadata["latitude"], metadata_dict))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"],
        },
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(datasets,
                           "ISMN",
                           period,
                           read_ts_names=read_ts_names)
    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={
            (2, 2):
            metrics_calculators.BasicMetrics(
                other_name="k1",
                metadata_template=metadata_dict_template).calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(save_path,
                                 "ASCAT.sm_with_ISMN.soil moisture.nc")
    target_vars = {
        "n_obs": [357, 384, 1646, 1875, 1915, 467, 141, 251],
        "rho":
        np.array([
            0.53934574,
            0.7002289,
            0.62200236,
            0.53647155,
            0.30413666,
            0.6740655,
            0.8418981,
            0.74206454,
        ],
                 dtype=np.float32),
        "RMSD":
        np.array([
            11.583476,
            7.729667,
            17.441547,
            21.125721,
            14.31557,
            14.187225,
            13.0622425,
            12.903898,
        ],
                 dtype=np.float32),
        "network":
        np.array(
            [
                "MAQU",
                "MAQU",
                "SCAN",
                "SCAN",
                "SCAN",
                "SOILSCAPE",
                "SOILSCAPE",
                "SOILSCAPE",
            ],
            dtype="U256",
        )
    }
    vars_should = [
        'BIAS', 'R', 'RMSD', '_row_size', 'climate', 'gpi', 'idx', 'landcover',
        'lat', 'lon', 'n_obs', 'network', 'p_R', 'p_rho', 'p_tau', 'rho',
        'station', 'tau', 'time'
    ]

    check_results(filename=results_fname,
                  target_vars=target_vars,
                  variables=vars_should)
Example n. 24
def test_validation_with_averager(ascat_reader, ismn_reader):
    """
    Test the processing framework with the averaging module. ASCAT and ISMN data
    are used here without geographical considerations (the lut is provided
    upstream and already contains this information).
    """
    while hasattr(ascat_reader, 'cls'):
        ascat_reader = ascat_reader.cls
    # lookup table between the ascat and ismn points - not geographically correct
    upscaling_lut = {
        "ISMN": {
            1814367: [(0, 102.1333, 33.8833), (1, 102.1333, 33.6666)],
            1803695: [(2, -86.55, 34.783), (3, -97.083, 37.133),
                      (4, -105.417, 34.25)],
            1856312: [(5, -120.9675, 38.43003), (6, -120.78559, 38.14956),
                      (7, -120.80639, 38.17353)]
        }
    }
    gpis = (1814367, 1803695, 1856312)
    lons, lats = [], []
    for gpi in gpis:
        lon, lat = ascat_reader.grid.gpi2lonlat(gpi)
        lons.append(lon)
        lats.append(lat)

    jobs = [(gpis, lons, lats)]

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            }
        },
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"],
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        datasets,
        "ASCAT",
        period,
        read_ts_names=read_ts_names,
        upscale_parms={
            "upscaling_method": "average",
            "temporal_stability": True,
            "upscaling_lut": upscaling_lut,
        },
    )
    process = Validation(
        datasets,
        "ASCAT",
        temporal_ref="ISMN",
        scaling="lin_cdf_match",
        scaling_ref="ISMN",
        metrics_calculators={
            (2, 2):
            metrics_calculators.BasicMetrics(other_name="k1").calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(save_path,
                                 "ASCAT.sm_with_ISMN.soil moisture.nc")

    target_vars = {
        "n_obs": [764, 2392, 904],
        "rho": np.array([-0.012487, 0.255156, 0.635517], dtype=np.float32),
        "RMSD": np.array([0.056428, 0.056508, 0.116294], dtype=np.float32),
        "R": np.array([-0.012335, 0.257671, 0.657239], dtype=np.float32)
    }

    check_results(
        filename=results_fname,
        target_vars=target_vars,
    )
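The upscaling_lut above maps each ASCAT grid point to the ISMN points whose time series should be averaged onto it; the actual averaging happens inside DataManager via upscale_parms. As a rough illustration only (not pytesmo's implementation), a plain mean over the looked-up points could look like this:

import pandas as pd

def naive_upscale(ismn_reader, lut_entry, column="soil moisture"):
    """Toy spatial average over one lut entry, i.e. a list of
    (dataset_id, lon, lat) tuples as in upscaling_lut above."""
    series = []
    for ds_id, lon, lat in lut_entry:
        series.append(ismn_reader.read_ts(ds_id)[column])
    # align on the union of timestamps and average whatever is available
    return pd.concat(series, axis=1).mean(axis=1)

# e.g. naive_upscale(ismn_reader, upscaling_lut["ISMN"][1814367])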
Example n. 25
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    ascat_data_folder = os.path.join(os.path.dirname(__file__), '..',
                                     'test-data', 'sat', 'ascat', 'netcdf',
                                     '55R22')

    ascat_grid_folder = os.path.join(os.path.dirname(__file__), '..',
                                     'test-data', 'sat', 'ascat', 'netcdf',
                                     'grid')

    static_layers_folder = os.path.join(os.path.dirname(__file__), '..',
                                        'test-data', 'sat', 'h_saf',
                                        'static_layer')

    ascat_reader = AscatSsmCdr(ascat_data_folder,
                               ascat_grid_folder,
                               grid_filename='TUW_WARP5_grid_info_2_1.nc',
                               static_layer_path=static_layers_folder)
    ascat_reader.read_bulk = True

    # Initialize ISMN reader

    ismn_data_folder = os.path.join(os.path.dirname(__file__), '..',
                                    'test-data', 'ismn', 'multinetwork',
                                    'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(variable='soil moisture',
                                      min_depth=0,
                                      max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata['longitude'], metadata['latitude']))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {
                'mask_frozen_prob': 80,
                'mask_snow_prob': 80,
                'mask_ssf': True
            }
        }
    }

    read_ts_names = {'ASCAT': 'read', 'ISMN': 'read_ts'}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(datasets,
                           'ISMN',
                           period,
                           read_ts_names=read_ts_names)

    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={
            (2, 2):
            metrics_calculators.BasicMetrics(other_name='k1').calc_metrics
        },
        period=period)

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(save_path,
                                 'ASCAT.sm_with_ISMN.soil moisture.nc')

    vars_should = [
        u'n_obs', u'tau', u'gpi', u'RMSD', u'lon', u'p_tau', u'BIAS', u'p_rho',
        u'rho', u'lat', u'R', u'p_R', u'time', u'idx', u'_row_size'
    ]
    n_obs_should = [384, 357, 482, 141, 251, 1927, 1887, 1652]
    rho_should = np.array([
        0.70022893, 0.53934574, 0.69356072, 0.84189808, 0.74206454, 0.30299741,
        0.53143877, 0.62204134
    ],
                          dtype=np.float32)

    rmsd_should = np.array([
        7.72966719, 11.58347607, 14.57700157, 13.06224251, 12.90389824,
        14.24668026, 21.19682884, 17.3883934
    ],
                           dtype=np.float32)
    with nc.Dataset(results_fname, mode='r') as results:
        assert sorted(list(results.variables.keys())) == sorted(vars_should)
        assert sorted(
            results.variables['n_obs'][:].tolist()) == sorted(n_obs_should)
        nptest.assert_allclose(sorted(rho_should),
                               sorted(results.variables['rho'][:]),
                               rtol=1e-4)
        nptest.assert_allclose(sorted(rmsd_should),
                               sorted(results.variables['RMSD'][:]),
                               rtol=1e-4)
Example n. 26
def create_pytesmo_validation(validation_run):
    ds_list = []
    ref_name = None
    scaling_ref_name = None

    ds_num = 1
    for dataset_config in validation_run.dataset_configurations.all():
        reader = create_reader(dataset_config.dataset, dataset_config.version)
        reader = setup_filtering(
            reader, list(dataset_config.filters.all()),
            list(dataset_config.parametrisedfilter_set.all()),
            dataset_config.dataset, dataset_config.variable)

        if validation_run.anomalies == ValidationRun.MOVING_AVG_35_D:
            reader = AnomalyAdapter(
                reader,
                window_size=35,
                columns=[dataset_config.variable.pretty_name])
        if validation_run.anomalies == ValidationRun.CLIMATOLOGY:
            # make sure our baseline period is in UTC and without timezone information
            anomalies_baseline = [
                validation_run.anomalies_from.astimezone(tz=pytz.UTC).replace(
                    tzinfo=None),
                validation_run.anomalies_to.astimezone(tz=pytz.UTC).replace(
                    tzinfo=None)
            ]
            reader = AnomalyClimAdapter(
                reader,
                columns=[dataset_config.variable.pretty_name],
                timespan=anomalies_baseline)

        if ((validation_run.reference_configuration) and
            (dataset_config.id == validation_run.reference_configuration.id)):
            # reference is always named "0-..."
            dataset_name = '{}-{}'.format(0, dataset_config.dataset.short_name)
        else:
            dataset_name = '{}-{}'.format(ds_num,
                                          dataset_config.dataset.short_name)
            ds_num += 1

        ds_list.append((dataset_name, {
            'class': reader,
            'columns': [dataset_config.variable.pretty_name]
        }))

        if ((validation_run.reference_configuration) and
            (dataset_config.id == validation_run.reference_configuration.id)):
            ref_name = dataset_name
        if ((validation_run.scaling_ref)
                and (dataset_config.id == validation_run.scaling_ref.id)):
            scaling_ref_name = dataset_name

    datasets = dict(ds_list)
    ds_num = len(ds_list)

    period = None
    if validation_run.interval_from is not None and validation_run.interval_to is not None:
        # While pytesmo can't deal with timezones, normalise the validation period to UTC; this can be removed once pytesmo can handle timezones.
        startdate = validation_run.interval_from.astimezone(UTC).replace(
            tzinfo=None)
        enddate = validation_run.interval_to.astimezone(UTC).replace(
            tzinfo=None)
        period = [startdate, enddate]

    datamanager = DataManager(datasets,
                              ref_name=ref_name,
                              period=period,
                              read_ts_names='read')
    ds_names = get_dataset_names(datamanager.reference_name,
                                 datamanager.datasets,
                                 n=ds_num)

    if (len(ds_names) >= 3) and (validation_run.tcol is True):
        # if there are 3 or more datasets, do TC; exclude ref metrics
        metrics = TCMetrics(
            dataset_names=ds_names,
            tc_metrics_for_ref=False,
            other_names=['k{}'.format(i + 1) for i in range(ds_num - 1)])
    else:
        metrics = IntercomparisonMetrics(
            dataset_names=ds_names,
            other_names=['k{}'.format(i + 1) for i in range(ds_num - 1)])

    if validation_run.scaling_method == validation_run.NO_SCALING:
        scaling_method = None
    else:
        scaling_method = validation_run.scaling_method

    __logger.debug(f"Scaling method: {scaling_method}")
    __logger.debug(f"Scaling dataset: {scaling_ref_name}")

    val = Validation(datasets=datamanager,
                     spatial_ref=ref_name,
                     temporal_window=0.5,
                     scaling=scaling_method,
                     scaling_ref=scaling_ref_name,
                     metrics_calculators={
                         (ds_num, ds_num): metrics.calc_metrics
                     },
                     period=period)

    return val
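The other_names=['k1', 'k2', ...] arguments above mirror the column renaming that Validation.perform_validation applies before calling the metric calculators: the reference column becomes 'ref' and the remaining datasets become 'k1', 'k2', and so on. The renaming rule in isolation:

# same rule as in Validation.perform_validation
name_for = lambda i: "k{}".format(i) if i > 0 else "ref"
print([name_for(i) for i in range(4)])  # ['ref', 'k1', 'k2', 'k3']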
Example n. 27
def test_ascat_ismn_validation_metadata_rolling():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    ascat_data_folder = os.path.join(os.path.dirname(__file__), '..',
                                     'test-data', 'sat', 'ascat', 'netcdf',
                                     '55R22')

    ascat_grid_folder = os.path.join(os.path.dirname(__file__), '..',
                                     'test-data', 'sat', 'ascat', 'netcdf',
                                     'grid')

    static_layers_folder = os.path.join(os.path.dirname(__file__), '..',
                                        'test-data', 'sat', 'h_saf',
                                        'static_layer')

    ascat_reader = AscatSsmCdr(ascat_data_folder,
                               ascat_grid_folder,
                               grid_filename='TUW_WARP5_grid_info_2_1.nc',
                               static_layer_path=static_layers_folder)
    ascat_reader.read_bulk = True

    # Initialize ISMN reader

    ismn_data_folder = os.path.join(os.path.dirname(__file__), '..',
                                    'test-data', 'ismn', 'multinetwork',
                                    'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(variable='soil moisture',
                                      min_depth=0,
                                      max_depth=0.1)

    metadata_dict_template = {
        'network': np.array(['None'], dtype='U256'),
        'station': np.array(['None'], dtype='U256'),
        'landcover': np.float32([np.nan]),
        'climate': np.array(['None'], dtype='U4')
    }

    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        metadata_dict = [{
            'network': metadata['network'],
            'station': metadata['station'],
            'landcover': metadata['landcover_2010'],
            'climate': metadata['climate']
        }]
        jobs.append(
            (idx, metadata['longitude'], metadata['latitude'], metadata_dict))

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {
                'mask_frozen_prob': 80,
                'mask_snow_prob': 80,
                'mask_ssf': True
            }
        }
    }

    read_ts_names = {'ASCAT': 'read', 'ISMN': 'read_ts'}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(datasets,
                           'ISMN',
                           period,
                           read_ts_names=read_ts_names)

    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={
            (2, 2):
            metrics_calculators.RollingMetrics(
                other_name='k1',
                metadata_template=metadata_dict_template).calc_metrics
        },
        period=period)

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results,
                               save_path,
                               ts_vars=['R', 'p_R', 'RMSD'])

    results_fname = os.path.join(save_path,
                                 'ASCAT.sm_with_ISMN.soil moisture.nc')

    vars_should = [
        u'gpi', u'lon', u'lat', u'R', u'p_R', u'time', u'idx', u'_row_size'
    ]

    for key, value in metadata_dict_template.items():
        vars_should.append(key)

    network_should = np.array([
        'MAQU', 'MAQU', 'SCAN', 'SCAN', 'SCAN', 'SOILSCAPE', 'SOILSCAPE',
        'SOILSCAPE'
    ],
                              dtype='U256')

    reader = PointDataResults(results_fname, read_only=True)
    df = reader.read_loc(None)
    nptest.assert_equal(sorted(network_should), sorted(df['network'].values))
    assert np.all(df.gpi.values == np.arange(8))
    assert (reader.read_ts(0).index.size == 357)
    assert np.all(
        reader.read_ts(1).columns.values == np.array(['R', 'p_R', 'RMSD']))
Example n. 28
def test_ascat_ismn_validation_metadata(ascat_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    # Initialize ISMN reader

    ismn_data_folder = os.path.join(
        os.path.dirname(__file__),
        "..",
        "test-data",
        "ismn",
        "multinetwork",
        "header_values",
    )
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(
        variable="soil moisture", min_depth=0, max_depth=0.1
    )

    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        metadata_dict = [
            {
                "network": metadata["network"],
                "station": metadata["station"],
                "landcover": metadata["landcover_2010"],
                "climate": metadata["climate"],
            }
        ]
        jobs.append(
            (idx, metadata["longitude"], metadata["latitude"], metadata_dict)
        )

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"],
        },
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        datasets, "ISMN", period, read_ts_names=read_ts_names
    )
    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(
                other_name="k1", metadata_template=metadata_dict_template
            ).calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(
        save_path, "ASCAT.sm_with_ISMN.soil moisture.nc"
    )

    vars_should = [
        u"n_obs",
        u"tau",
        u"gpi",
        u"RMSD",
        u"lon",
        u"p_tau",
        u"BIAS",
        u"p_rho",
        u"rho",
        u"lat",
        u"R",
        u"p_R",
        u"time",
        u"idx",
        u"_row_size",
    ]
    for key, value in metadata_dict_template.items():
        vars_should.append(key)

    n_obs_should = [357, 384, 1646, 1875, 1915, 467, 141, 251]
    rho_should = np.array(
        [
            0.53934574,
            0.7002289,
            0.62200236,
            0.53647155,
            0.30413666,
            0.6740655,
            0.8418981,
            0.74206454,
        ],
        dtype=np.float32,
    )
    rmsd_should = np.array(
        [
            11.583476,
            7.729667,
            17.441547,
            21.125721,
            14.31557,
            14.187225,
            13.0622425,
            12.903898,
        ],
        dtype=np.float32,
    )

    network_should = np.array(
        [
            "MAQU",
            "MAQU",
            "SCAN",
            "SCAN",
            "SCAN",
            "SOILSCAPE",
            "SOILSCAPE",
            "SOILSCAPE",
        ],
        dtype="U256",
    )

    with nc.Dataset(results_fname, mode="r") as results:
        vars = results.variables.keys()
        n_obs = results.variables["n_obs"][:].tolist()
        rho = results.variables["rho"][:]
        rmsd = results.variables["RMSD"][:]
        network = results.variables["network"][:]

    assert sorted(vars) == sorted(vars_should)
    assert sorted(n_obs) == sorted(n_obs_should)
    nptest.assert_allclose(sorted(rho), sorted(rho_should), rtol=1e-4)
    nptest.assert_allclose(sorted(rmsd), sorted(rmsd_should), rtol=1e-4)
    nptest.assert_equal(sorted(network), sorted(network_should))
Example n. 29
def getdata():
    """
    Handles the GET request, which should contain the arguments listed under
    Parameters.

    Parameters
    ----------
    station_id: int
        id of the station in the database
    scaling: string
        chosen scaling method; for available choices see general.time_series.scaling
    snow_depth: float
        mask snow depth greater than this value
    st_l1: float
        mask surface temperature layer 1 lower than this value
    air_temp: float
        mask 2 m air temperature lower than this value
    ssf_masking: boolean
        use SSF for masking, true or false
    """
    station_id = request.args.get('station_id')
    scaling = request.args.get('scaling')
    if scaling == 'noscale':
        scaling = None
    masking_ids = request.args.getlist('masking_ds[]')
    masking_ops = request.args.getlist('masking_op[]')
    masking_values = request.args.getlist('masking_values[]')
    masking_values = [float(x) for x in masking_values]

    anomaly = request.args.get('anomaly')
    if anomaly == 'none':
        anomaly = None

    (depth_from,
     depth_to,
     sensor_id) = get_station_first_sm_layer(app.config['ISMN_PATH'],
                                             station_id)
    lon, lat = get_station_lonlat(app.config['ISMN_PATH'],
                                  station_id)
    start, end = get_station_start_end(app.config['ISMN_PATH'],
                                       station_id, "soil moisture",
                                       depth_from, depth_to)
    period = [start, end]

    masking_data = {'labels': [], 'data': []}
    masking_meta = get_masking_metadata()
    masking_masked_dict = None
    if len(masking_ids) > 0:
        # prepare masking datasets
        masking_ds_dict = get_masking_ds_dict(masking_ids)
        masking_masked_dict = {}
        for masking_ds, masking_op, masking_value in zip(masking_ids,
                                                         masking_ops,
                                                         masking_values):

            masking_masked_dict[masking_ds] = dict(masking_ds_dict[masking_ds])
            new_cls = MaskingAdapter(masking_masked_dict[masking_ds]['class'],
                                     masking_op,
                                     masking_value)
            masking_masked_dict[masking_ds]['class'] = new_cls

        # use DataManager for reading masking datasets
        masking_dm = DataManager(masking_ds_dict, masking_ids[0],
                                 period=period)
        masking_data = {}
        valid_masking_ids = []
        for mds in masking_ids:
            mdata = masking_dm.read_ds(mds, lon, lat)
            if mdata is not None:
                masking_data[mds] = mdata
                valid_masking_ids.append(mds)
            else:
                masking_data[mds] = pd.DataFrame()
        if len(valid_masking_ids) > 1:
            masking_data = BasicTemporalMatching(window=1.0).combinatory_matcher(
                masking_data, masking_ids[0], n=len(masking_ids))

            if len(masking_data) > 0:
                labels, values = masking_data[
                    list(masking_data.keys())[0]].to_dygraph_format()
        elif len(valid_masking_ids) == 1:
            masking_data = masking_data[valid_masking_ids[0]]
            labels, values = masking_data.to_dygraph_format()
        else:
            labels = [None]
            values = None

        for i, label in enumerate(labels):
            for mid in masking_meta:
                if masking_meta[mid]['variable']['name'] in label:
                    labels[i] = masking_meta[mid]['long_name']

        masking_data = {'labels': labels, 'data': values}

    ismn_iface = prepare_station_interface(app.config['ISMN_PATH'],
                                           station_id,
                                           "soil moisture",
                                           depth_from, depth_to, sensor_id)

    validation_ds_dict = get_validation_ds_dict()
    validation_ds_dict.update({'ISMN': {'class': ismn_iface,
                                        'columns': ['soil moisture']}})

    if anomaly is not None:
        adapter = {'climatology': AnomalyClimAdapter,
                   'average': AnomalyAdapter}
        for dataset in validation_ds_dict:
            validation_ds_dict[dataset]['class'] = adapter[
                anomaly](validation_ds_dict[dataset]['class'],
                         columns=validation_ds_dict[dataset]['columns'])

    mcalc = BasicMetricsPlusMSE(other_name='k1',
                                calc_tau=True).calc_metrics
    process = Validation(validation_ds_dict, 'ISMN',
                         temporal_ref='cci',
                         scaling=scaling,
                         metrics_calculators={(2, 2): mcalc},
                         masking_datasets=masking_masked_dict,
                         period=period,
                         temporal_window=1)

    df_dict = process.data_manager.get_data(1,
                                            lon,
                                            lat)

    matched_data, result, used_data = process.perform_validation(
        df_dict, (1, lon, lat))

    res_key = list(result)[0]
    data = used_data[res_key]
    result = result[res_key][0]

    # rename data to original names
    rename_dict = {}
    f = lambda x: "k{}".format(x) if x > 0 else 'ref'
    for i, r in enumerate(res_key):
        rename_dict[f(i)] = " ".join(r)

    data.rename(columns=rename_dict, inplace=True)

    labels, values = data.to_dygraph_format()

    validation_datasets = {'labels': labels, 'data': values}

    statistics = {'kendall': {'v': '%.2f' % result['tau'], 'p': '%.4f' % result['p_tau']},
                  'spearman': {'v': '%.2f' % result['rho'], 'p': '%.4f' % result['p_rho']},
                  'pearson': {'v': '%.2f' % result['R'], 'p': '%.4f' % result['p_R']},
                  'bias': '%.4f' % result['BIAS'],
                  'rmsd': {'rmsd': '%.4f' % np.sqrt(result['mse']),
                           'rmsd_corr': '%.4f' % np.sqrt(result['mse_corr']),
                           'rmsd_bias': '%.4f' % np.sqrt(result['mse_bias']),
                           'rmsd_var': '%.4f' % np.sqrt(result['mse_var'])},
                  'mse': {'mse': '%.4f' % result['mse'],
                          'mse_corr': '%.4f' % result['mse_corr'],
                          'mse_bias': '%.4f' % result['mse_bias'],
                          'mse_var': '%.4f' % result['mse_var']}}

    scaling_options = {'noscale': 'No scaling',
                       'porosity': 'Scale using porosity',
                       'linreg': 'Linear Regression',
                       'mean_std': 'Mean - standard deviation',
                       'min_max': 'Minimum,maximum',
                       'lin_cdf_match': 'Piecewise <br> linear CDF matching',
                       'cdf_match': 'CDF matching'}

    if scaling is None:
        scaling = 'noscale'

    masking_option_return = {}
    for mid, mops, mval in zip(masking_ids,
                               masking_ops,
                               masking_values):
        masking_option_return[mid] = {'op': mops,
                                      'val': mval,
                                      'name': masking_meta[mid]['long_name']}

    settings = {'scaling': scaling_options[scaling],
                'masking': masking_option_return}

    output_data = {'validation_data': validation_datasets, 'masking_data': masking_data,
                   'statistics': statistics, 'settings': settings}
    status = 1
    if status == -1:
        data = 'Error'
    else:
        data = jsonify(output_data)

    resp = make_response(data)
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp
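The masking branch above is the general recipe for Validation's masking_datasets parameter: wrap each masking reader in a MaskingAdapter with a comparison operator and a threshold, and pass the resulting dict (same format as the regular datasets dict) to Validation. Condensed into a standalone sketch; the snow_depth_reader object, the '>' operator, column name and threshold are placeholders, while validation_ds_dict, mcalc and period stand for the objects built in the function above.

from pytesmo.validation_framework.adapters import MaskingAdapter

masking_datasets = {
    'snow_depth': {
        # True (i.e. "mask this timestamp") wherever snow depth exceeds 0.0
        'class': MaskingAdapter(snow_depth_reader, '>', 0.0),
        'columns': ['snow_depth'],
    }
}

process = Validation(validation_ds_dict, 'ISMN',
                     metrics_calculators={(2, 2): mcalc},
                     masking_datasets=masking_datasets,
                     period=period)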