def test_combinatory_matcher_n2():
    """The combinatory matcher pairs the reference frame with every other frame."""
    n = 1000
    index = pd.date_range(start="2000-01-01", periods=n, freq="D")

    # Three identical frames; only the dict keys differ.
    frames = {}
    for name in ("data1", "data2", "data3"):
        frames[name] = pd.DataFrame(
            {"x": np.arange(n), "y": np.arange(n) * 0.5},
            columns=["x", "y"],
            index=index,
        )

    temp_matcher = temporal_matchers.BasicTemporalMatching()
    matched = temp_matcher.combinatory_matcher(frames, "data1")

    # One result per (reference, other) combination.
    assert sorted(list(matched)) == sorted(
        [("data1", "data2"), ("data1", "data3")]
    )

    # Each matched frame carries the columns of both members of the pair.
    for other in ("data2", "data3"):
        expected_columns = [
            ("data1", "x"), ("data1", "y"), (other, "x"), (other, "y")
        ]
        assert sorted(
            list(matched[("data1", other)].columns)
        ) == sorted(expected_columns)
def test_validation_n3_k2_temporal_matching_no_matches():
    """With no temporal overlap between datasets, calc yields no result tuples."""
    datasets = setup_two_without_overlap()
    dm = DataManager(
        datasets,
        "DS1",
        read_ts_names={name: "read" for name in ("DS1", "DS2", "DS3")},
    )

    process = Validation(
        dm,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name="k1"
            ).calc_metrics
        },
    )

    expected = {}  # nothing overlaps, so nothing should be computed
    for job in process.get_processing_jobs():
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(expected))
def test_validation_error_n2_k2():
    """Validation must reject a metrics calculator requesting n=2 with three datasets."""
    datasets = setup_TestDatasets()
    dm = DataManager(
        datasets,
        "DS1",
        read_ts_names={name: "read" for name in ("DS1", "DS2", "DS3")},
    )

    # n less than number of datasets is no longer allowed
    with pytest.raises(ValueError):
        Validation(
            dm,
            "DS1",
            temporal_matcher=temporal_matchers.BasicTemporalMatching(
                window=1 / 24.0
            ).combinatory_matcher,
            scaling="lin_cdf_match",
            metrics_calculators={
                (2, 2): metrics_calculators.BasicMetrics(
                    other_name="k1"
                ).calc_metrics
            },
        )
def test_validation_n3_k2_temporal_matching_no_matches2():
    """DS2 has no temporal overlap, so only DS1/DS3 combinations produce results."""

    def perfect_match_metrics():
        # DS1 and DS3 agree perfectly at gpi 4 after scaling, so correlation
        # metrics are exact and tau is not computed (NaN).
        return {
            "n_obs": np.array([1000], dtype=np.int32),
            "tau": np.array([np.nan], dtype=np.float32),
            "gpi": np.array([4], dtype=np.int32),
            "RMSD": np.array([0.0], dtype=np.float32),
            "lon": np.array([4.0]),
            "p_tau": np.array([np.nan], dtype=np.float32),
            "BIAS": np.array([0.0], dtype=np.float32),
            "p_rho": np.array([0.0], dtype=np.float32),
            "rho": np.array([1.0], dtype=np.float32),
            "lat": np.array([4.0]),
            "R": np.array([1.0], dtype=np.float32),
            "p_R": np.array([0.0], dtype=np.float32),
        }

    tst_results = {
        (("DS1", "x"), ("DS3", "y")): perfect_match_metrics(),
        (("DS1", "x"), ("DS3", "x")): perfect_match_metrics(),
    }

    datasets = setup_three_with_two_overlapping()
    dm = DataManager(
        datasets,
        "DS1",
        read_ts_names={name: "read" for name in ("DS1", "DS2", "DS3")},
    )

    process = Validation(
        dm,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name="k1"
            ).calc_metrics
        },
    )

    for job in process.get_processing_jobs():
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst_results))
def test_validation_n3_k2():
    """Full validation over three datasets with pairwise (k=2) metric calculators."""

    def perfect_match_metrics():
        # Every pair agrees perfectly at gpi 4 after scaling; the expected
        # metric dict is therefore identical for all result keys.
        return {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32),
        }

    tst_results = {
        (('DS1', 'x'), ('DS3', 'y')): perfect_match_metrics(),
        (('DS1', 'x'), ('DS2', 'y')): perfect_match_metrics(),
        (('DS1', 'x'), ('DS3', 'x')): perfect_match_metrics(),
    }

    datasets = setup_TestDatasets()

    process = Validation(
        datasets, 'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name='k1').calc_metrics})

    for job in process.get_processing_jobs():
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst_results))
def test_validation_n3_k2_masking_no_data_remains():
    """When the masking datasets remove every observation, calc returns no results.

    ``masking2`` uses ``limit=1000`` which masks the full 1000-sample series,
    so the masked reference frame is empty and no metrics can be computed.
    """
    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(np.array([1, 2, 3, 4]), np.array([1, 2, 3, 4]),
                          np.array([4, 4, 2, 1]), gpis=np.array([1, 2, 3, 4]))

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1, 'columns': ['x'], 'args': [],
            'kwargs': {'limit': 500}, 'use_lut': False,
            'grids_compatible': True},
        'masking2': {
            'class': mds2, 'columns': ['x'], 'args': [],
            'kwargs': {'limit': 1000}, 'use_lut': False,
            'grids_compatible': True}
    }

    process = Validation(
        datasets, 'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name='k1').calc_metrics},
        masking_datasets=mds)

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read(1)
    with warnings.catch_warnings():
        # read_ts is hard coded inside mask_dataset and warns on newer pygeobase
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 0
    # empty-vs-empty comparison documents that nothing survives the mask
    nptest.assert_allclose(new_ref_df.x.values, np.arange(1000, 1000))

    # BUGFIX: `tst` was a list but is indexed by key (`tst[tst_key]`) in the
    # comparison loop below; a dict keeps that (currently empty) loop valid.
    tst = {}
    for job in process.get_processing_jobs():
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results), sorted(tst)):
            nptest.assert_almost_equal(results[key]['n_obs'],
                                       tst[tst_key]['n_obs'])
def __init__(self, datasets, spatial_ref, metrics_calculators,
             temporal_matcher=None, temporal_window=1 / 24.0,
             temporal_ref=None, masking_datasets=None,
             period=None, scaling='lin_cdf_match', scaling_ref=None):
    """Set up a validation run.

    Parameters
    ----------
    datasets : dict or DataManager
        Dataset definitions, or an already-constructed DataManager.
    spatial_ref : str
        Name of the spatial reference dataset.
    metrics_calculators : dict
        Maps ``(n, k)`` tuples to metric calculator callables; each ``n``
        must not be smaller than the number of datasets.
    temporal_matcher : callable, optional
        Temporal matching function; defaults to
        ``BasicTemporalMatching(window=temporal_window).combinatory_matcher``.
    temporal_window : float, optional
        Matching window passed to the default matcher (in days; 1/24 = 1 hour).
    temporal_ref : str, optional
        Temporal reference dataset name; defaults to the DataManager reference.
    masking_datasets : dict, optional
        Datasets used to mask the reference before metric calculation.
    period : list, optional
        Period restriction forwarded to the DataManager(s).
    scaling : str or object, optional
        Name of a scaling method (wrapped in DefaultScaler) or a scaler object.
    scaling_ref : str, optional
        Scaling reference dataset name; defaults to the DataManager reference.

    Raises
    ------
    ValueError
        If any metrics calculator key has n smaller than the dataset count.
    """
    # isinstance instead of `type(...) is` so DataManager subclasses work too
    if isinstance(datasets, DataManager):
        self.data_manager = datasets
    else:
        self.data_manager = DataManager(datasets, spatial_ref, period)

    self.temp_matching = temporal_matcher
    if self.temp_matching is None:
        self.temp_matching = temporal_matchers.BasicTemporalMatching(
            window=temporal_window).combinatory_matcher

    self.temporal_ref = temporal_ref
    if self.temporal_ref is None:
        self.temporal_ref = self.data_manager.reference_name

    self.metrics_c = metrics_calculators
    for n, k in self.metrics_c:
        if n < len(self.data_manager.datasets.keys()):
            raise ValueError('n must be equal to the number of datasets')

    self.masking_dm = None
    if masking_datasets is not None:
        # add temporal reference dataset to the masking datasets since it
        # is necessary for temporally matching the masking datasets to the
        # common time stamps. Use _reference here to make a clash with the
        # names of the masking datasets unlikely.
        # Work on a copy so the caller's dict is not mutated.
        masking_datasets = dict(masking_datasets)
        masking_datasets.update(
            {'_reference': datasets[self.temporal_ref]})
        self.masking_dm = DataManager(masking_datasets, '_reference',
                                      period=period)

    # isinstance instead of `type(...) ==` (handles str subclasses as well)
    if isinstance(scaling, str):
        self.scaling = DefaultScaler(scaling)
    else:
        self.scaling = scaling

    self.scaling_ref = scaling_ref
    if self.scaling_ref is None:
        self.scaling_ref = self.data_manager.reference_name

    self.luts = self.data_manager.get_luts()
def __init__(self, datasets, spatial_ref, metrics_calculators,
             temporal_matcher=None, temporal_window=1 / 24.0,
             temporal_ref=None, masking_datasets=None,
             period=None, scaling='lin_cdf_match', scaling_ref=None):
    """Set up a validation run.

    Parameters
    ----------
    datasets : dict or DataManager
        Dataset definitions, or an already-constructed DataManager.
    spatial_ref : str
        Name of the spatial reference dataset.
    metrics_calculators : dict
        Maps ``(n, k)`` tuples to metric calculator callables; each ``n``
        must not be smaller than the number of datasets.
    temporal_matcher : callable, optional
        Temporal matching function. If omitted, the combinatory matcher is
        used and a warning suggests ``make_combined_temporal_matcher`` for
        the newer metric calculators.
    temporal_window : float, optional
        Matching window passed to the default matcher (in days; 1/24 = 1 hour).
    temporal_ref : str, optional
        Temporal reference dataset name; defaults to the DataManager reference.
    masking_datasets : dict, optional
        Datasets used to mask the reference before metric calculation.
    period : list, optional
        Period restriction forwarded to the DataManager(s).
    scaling : str or object, optional
        Name of a scaling method (wrapped in DefaultScaler) or a scaler object.
    scaling_ref : str, optional
        Scaling reference dataset name; defaults to the DataManager reference.

    Raises
    ------
    ValueError
        If any metrics calculator key has n smaller than the dataset count.
    """
    if isinstance(datasets, DataManager):
        self.data_manager = datasets
    else:
        self.data_manager = DataManager(datasets, spatial_ref, period)

    self.temp_matching = temporal_matcher
    if self.temp_matching is None:
        warnings.warn(
            "You are using the default temporal matcher. If you are using one of the"
            " newer metric calculators (PairwiseIntercomparisonMetrics,"
            " TripleCollocationMetrics) you should probably use `make_combined_temporal_matcher`"
            " instead. Have a look at the documentation of the metric calculators for more info."
        )
        self.temp_matching = temporal_matchers.BasicTemporalMatching(
            window=temporal_window).combinatory_matcher

    self.temporal_ref = temporal_ref
    if self.temporal_ref is None:
        self.temporal_ref = self.data_manager.reference_name

    self.metrics_c = metrics_calculators
    for n, k in self.metrics_c:
        if n < len(self.data_manager.datasets.keys()):
            raise ValueError('n must be equal to the number of datasets')

    self.masking_dm = None
    if masking_datasets is not None:
        # add temporal reference dataset to the masking datasets since it
        # is necessary for temporally matching the masking datasets to the
        # common time stamps. Use _reference here to make a clash with the
        # names of the masking datasets unlikely.
        # Work on a copy so the caller's dict is not mutated.
        masking_datasets = dict(masking_datasets)
        masking_datasets.update(
            {'_reference': datasets[self.temporal_ref]})
        self.masking_dm = DataManager(masking_datasets, '_reference',
                                      period=period)

    # isinstance instead of `type(...) ==` (handles str subclasses as well)
    if isinstance(scaling, str):
        self.scaling = DefaultScaler(scaling)
    else:
        self.scaling = scaling

    self.scaling_ref = scaling_ref
    if self.scaling_ref is None:
        self.scaling_ref = self.data_manager.reference_name

    self.luts = self.data_manager.get_luts()
def test_validation_error_n2_k2():
    """Validation must reject a metrics calculator requesting n=2 with three datasets."""
    datasets = setup_TestDatasets()

    dm = DataManager(datasets, 'DS1', read_ts_names={
        d: 'read' for d in ['DS1', 'DS2', 'DS3']})

    # n less than number of datasets is no longer allowed
    with pytest.raises(ValueError):
        # FIX: dropped the unused `process = ` binding — the constructor is
        # expected to raise, so the result was never used (F841).
        Validation(
            dm, 'DS1',
            temporal_matcher=temporal_matchers.BasicTemporalMatching(
                window=1 / 24.0).combinatory_matcher,
            scaling='lin_cdf_match',
            metrics_calculators={
                (2, 2): metrics_calculators.BasicMetrics(
                    other_name='k1').calc_metrics})
def test_validation_n2_k2_temporal_matching_no_matches():
    """Two datasets without temporal overlap must produce an empty result set."""
    datasets = setup_two_without_overlap()

    process = Validation(
        datasets, 'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(
                other_name='k1').calc_metrics})

    expected = {}  # no overlap -> nothing computed
    for job in process.get_processing_jobs():
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(expected))
def test_validation_n3_k2_masking():
    """Masked validation: expected n_obs depends on how many gpis a cell holds."""
    # test result for one gpi in a cell
    tst_results_one = {
        (('DS1', 'x'), ('DS3', 'y')): {
            'n_obs': np.array([250], dtype=np.int32)},
        (('DS1', 'x'), ('DS2', 'y')): {
            'n_obs': np.array([250], dtype=np.int32)},
        (('DS1', 'x'), ('DS3', 'x')): {
            'n_obs': np.array([250], dtype=np.int32)}}

    # test result for two gpis in a cell
    tst_results_two = {
        (('DS1', 'x'), ('DS3', 'y')): {
            'n_obs': np.array([250, 250], dtype=np.int32)},
        (('DS1', 'x'), ('DS2', 'y')): {
            'n_obs': np.array([250, 250], dtype=np.int32)},
        (('DS1', 'x'), ('DS3', 'x')): {
            'n_obs': np.array([250, 250], dtype=np.int32)}}

    # cell 4 in this example has two gpis so it returns different results.
    # FIX: the dict literal previously repeated key 1 twice (duplicate dict
    # key, the second silently overwrote the first).
    tst_results = {1: tst_results_one, 2: tst_results_two}

    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(np.array([1, 2, 3, 4]), np.array([1, 2, 3, 4]),
                          np.array([4, 4, 2, 1]), gpis=np.array([1, 2, 3, 4]))

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1, 'columns': ['x'], 'args': [],
            'kwargs': {'limit': 500}, 'use_lut': False,
            'grids_compatible': True},
        'masking2': {
            'class': mds2, 'columns': ['x'], 'args': [],
            'kwargs': {'limit': 750}, 'use_lut': False,
            'grids_compatible': True}
    }

    process = Validation(
        datasets, 'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name='k1').calc_metrics},
        masking_datasets=mds)

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read_ts(1)
    new_ref_df = process.mask_dataset(ref_df, gpi_info)
    # masking1 (limit 500) plus masking2 (limit 750) leave samples 750..999
    assert len(new_ref_df) == 250
    nptest.assert_allclose(new_ref_df.x.values, np.arange(750, 1000))

    for job in process.get_processing_jobs():
        results = process.calc(*job)
        tst = tst_results[len(job[0])]
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results), sorted(tst)):
            nptest.assert_almost_equal(results[key]['n_obs'],
                                       tst[tst_key]['n_obs'])
def test_validation_n3_k2_masking():
    """Masked validation with all pairwise combinations (including DS2/DS3)."""
    # test result for one gpi in a cell
    tst_results_one = {
        (("DS1", "x"), ("DS3", "y")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
        (("DS1", "x"), ("DS2", "y")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
        (("DS1", "x"), ("DS3", "x")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
        (("DS2", "y"), ("DS3", "x")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
        (("DS2", "y"), ("DS3", "y")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
    }

    # test result for two gpis in a cell
    tst_results_two = {
        (("DS1", "x"), ("DS3", "y")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
        (("DS1", "x"), ("DS2", "y")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
        (("DS1", "x"), ("DS3", "x")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
        (("DS2", "y"), ("DS3", "x")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
        (("DS2", "y"), ("DS3", "y")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
    }

    # cell 4 in this example has two gpis so it returns different results.
    # FIX: the dict literal previously repeated key 1 twice (duplicate dict
    # key, the second silently overwrote the first).
    tst_results = {1: tst_results_one, 2: tst_results_two}

    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        "masking1": {
            "class": mds1,
            "columns": ["x"],
            "args": [],
            "kwargs": {"limit": 500},
            "use_lut": False,
            "grids_compatible": True,
        },
        "masking2": {
            "class": mds2,
            "columns": ["x"],
            "args": [],
            "kwargs": {"limit": 750},
            "use_lut": False,
            "grids_compatible": True,
        },
    }

    process = Validation(
        datasets,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name="k1"
            ).calc_metrics
        },
        masking_datasets=mds,
    )

    gpi_info = (1, 1, 1)
    ref_df = datasets["DS1"]["class"].read(1)
    with warnings.catch_warnings():
        warnings.simplefilter(
            "ignore", category=DeprecationWarning
        )  # read_ts is hard coded when using mask_data
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    # masking1 (limit 500) plus masking2 (limit 750) leave samples 750..999
    assert len(new_ref_df) == 250
    nptest.assert_allclose(new_ref_df.x.values, np.arange(750, 1000))

    for job in process.get_processing_jobs():
        with warnings.catch_warnings():
            # most warnings here are caused by the read_ts function that
            # cannot be changed when using a masking data set
            warnings.simplefilter("ignore", category=DeprecationWarning)
            results = process.calc(*job)
        tst = tst_results[len(job[0])]
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results), sorted(tst)):
            nptest.assert_almost_equal(
                results[key]["n_obs"], tst[tst_key]["n_obs"]
            )
def test_validation_n3_k2_masking_no_data_remains():
    """When the masking datasets remove every observation, calc returns no results.

    ``masking2`` uses ``limit=1000`` which masks the full 1000-sample series,
    so the masked reference frame is empty and no metrics can be computed.
    """
    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        "masking1": {
            "class": mds1,
            "columns": ["x"],
            "args": [],
            "kwargs": {"limit": 500},
            "use_lut": False,
            "grids_compatible": True,
        },
        "masking2": {
            "class": mds2,
            "columns": ["x"],
            "args": [],
            "kwargs": {"limit": 1000},
            "use_lut": False,
            "grids_compatible": True,
        },
    }

    process = Validation(
        datasets,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name="k1"
            ).calc_metrics
        },
        masking_datasets=mds,
    )

    gpi_info = (1, 1, 1)
    ref_df = datasets["DS1"]["class"].read(1)
    with warnings.catch_warnings():
        # read_ts is hard coded inside mask_dataset and warns on newer pygeobase
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 0
    # empty-vs-empty comparison documents that nothing survives the mask
    nptest.assert_allclose(new_ref_df.x.values, np.arange(1000, 1000))

    # BUGFIX: `tst` was a list but is indexed by key (`tst[tst_key]`) in the
    # comparison loop below; a dict keeps that (currently empty) loop valid.
    tst = {}
    for job in process.get_processing_jobs():
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results), sorted(tst)):
            nptest.assert_almost_equal(
                results[key]["n_obs"], tst[tst_key]["n_obs"]
            )
def test_dfdict_combined_temporal_collocation():
    """Combined temporal collocation on synthetic frames and real ASCAT/ISMN data.

    The combined matcher keeps only timestamps present in *all* frames,
    whereas the old combinatory matcher matches pairwise and needs an extra
    ``dropna`` to reach the same result.
    """
    ref_dr = pd.date_range("2000", "2020", freq="YS")
    dr1 = pd.date_range("2000", "2015", freq="YS")
    dr2 = pd.date_range("2005", "2020", freq="YS")

    ref_df = pd.DataFrame({"ref": np.arange(len(ref_dr))}, index=ref_dr)
    df1 = pd.DataFrame({
        "k1": np.arange(len(dr1)), "k2": np.arange(len(dr1))
    }, index=dr1)
    df2 = pd.DataFrame({
        "k1": np.arange(len(dr2)), "k2": np.arange(len(dr2))
    }, index=dr2)
    dfs = {"refkey": ref_df, "df1key": df1, "df2key": df2}

    window = pd.Timedelta(days=300)
    matched = temporal_matchers.dfdict_combined_temporal_collocation(
        dfs, "refkey", 2, window=window, n=3, combined_dropna=True)

    # keys are the same, only refkey is missing
    key = ("refkey", "df1key", "df2key")
    assert list(matched.keys()) == [key]

    # overlap is only 11 timestamps
    assert matched[key].shape == (11, 5)
    overlap_dr = pd.date_range("2005", "2015", freq="YS")
    assert np.all(matched[key].index == overlap_dr)

    # test with ASCAT and ISMN data
    here = Path(__file__).resolve().parent
    ascat = pd.read_csv(here / "ASCAT.csv", index_col=0, parse_dates=True)
    ismn = pd.read_csv(here / "ISMN.csv", index_col=0, parse_dates=True)

    dfs = {"ASCAT": ascat[["sm"]], "ISMN": ismn[["soil_moisture"]]}
    refname = "ISMN"
    # FIX: use lowercase "h" — the uppercase hour alias is deprecated in
    # pandas >= 2.2; "h" is accepted by all supported pandas versions.
    window = pd.Timedelta(12, "h")

    old_matcher = temporal_matchers.BasicTemporalMatching().combinatory_matcher
    new_matcher = temporal_matchers.make_combined_temporal_matcher(window)

    expected = old_matcher(dfs, refname, k=2, n=2)
    new = new_matcher(dfs, refname, k=2, n=2)

    key = ("ISMN", "ASCAT")
    assert list(expected.keys()) == [key]
    assert list(new.keys()) == [key]

    # We have to do an extra dropna for the old matcher, because the old
    # matcher doesn't do this by itself.
    # This is normally done within validation.py, `get_data_for_result_tuple`,
    # but since the combined matcher should exclude all data where even a
    # single entry misses (so that all only have common data) this is done
    # before in the new matcher (the combined matcher, whereas the old one is
    # the combinatory matcher)
    exp = expected[key].dropna()
    assert exp.shape == new[key].shape
    for col in new[key]:
        np.testing.assert_equal(exp[col].values, new[key][col].values)