def test_recombination_CRDS():
    """Check that stored-then-recombined CRDS ch4 data equals a direct parse of the file."""
    get_local_bucket(empty=True)

    crds_path = get_datapath(filename="hfd.picarro.1minute.100m.min.dat", data_type="CRDS")

    # Push the file through the object store first ...
    ObsSurface.read_file(crds_path, data_type="CRDS", site="hfd", network="DECC")

    # ... then parse the same file directly to get the reference dataset
    expected = parse_crds(data_filepath=crds_path, site="HFD", network="AGAGE")["ch4"]["data"]

    species, site, inlet = "ch4", "hfd", "100m"

    found = search(species=species, site=site, inlet=inlet)
    recombined = recombine_datasets(keys=found.keys(site=site, species=species, inlet=inlet))

    # Attributes are reset on the recombined dataset before comparing
    recombined.attrs = {}

    assert expected.time.equals(recombined.time)
    assert expected["ch4"].equals(recombined["ch4"])
def test_delete_Datasource():
    """Check that deleting a Datasource removes its UUID and its first data key."""
    bucket = get_local_bucket(empty=True)

    thames_path = get_datapath(filename="thames_test_20190707.csv", data_type="THAMESBARRIER")

    ObsSurface.read_file(
        filepath=thames_path,
        data_type="THAMESBARRIER",
        site="tmb",
        network="LGHG",
        sampling_period=60,
    )

    obs = ObsSurface.load()

    # Use the first Datasource and the first of its data keys as the probe
    uuid = obs.datasources()[0]
    key = Datasource.load(uuid=uuid).data_keys()[0]

    assert exists(bucket=bucket, key=key)

    obs.delete(uuid=uuid)

    assert uuid not in obs.datasources()
    assert not exists(bucket=bucket, key=key)
def test_read_thames_barrier():
    """Read the Thames Barrier test file and check the stored CO2 data."""
    get_local_bucket(empty=True)

    thames_path = get_datapath(filename="thames_test_20190707.csv", data_type="THAMESBARRIER")

    results = ObsSurface.read_file(
        filepath=thames_path,
        data_type="THAMESBARRIER",
        site="TMB",
        network="LGHG",
        sampling_period=3600,
    )

    expected_keys = sorted(["CH4", "CO2", "CO"])
    processed = results["processed"]["thames_test_20190707.csv"]

    assert sorted(processed.keys()) == expected_keys

    co2_uuid = results["processed"]["thames_test_20190707.csv"]["CO2"]

    stored = Datasource.load(uuid=co2_uuid, shallow=False).data()
    co2 = stored["2019-07-01-00:39:55+00:00_2019-08-01-00:10:30+00:00"]

    # First and last timestamps of the stored date range
    assert co2.time[0] == Timestamp("2019-07-01T00:39:55")
    assert co2.time[-1] == Timestamp("2019-08-01T00:10:30")

    # First and last measurement values
    assert co2["co2"][0] == pytest.approx(417.97344761)
    assert co2["co2"][-1] == pytest.approx(417.80000653)
    assert co2["co2_variability"][0] == 0
    assert co2["co2_variability"][-1] == 0

    obs = ObsSurface.load()

    assert sorted(obs._datasource_uuids.values()) == expected_keys
def data_read():
    """Data set up for running tests for these sets of modules.

    Reads one observation file, one emissions file and one footprint file
    for the TAC site into the object store.
    """
    get_local_bucket(empty=True)

    # Values shared between the three reads below
    site = "tac"
    network = "DECC"
    domain = "EUROPE"

    # Files for creating forward model (mf_mod) for methane at TAC site

    # Observation data
    #   - TAC at 100m for 201208
    obs_path = get_datapath(filename="tac.picarro.1minute.100m.201208.dat", data_type="CRDS")
    ObsSurface.read_file(filepath=obs_path, data_type="CRDS", site=site, network=network)

    # Emissions data
    #   - Anthropogenic ch4 (methane) data from 2012 for EUROPE
    flux_path = get_emissions_datapath("ch4-anthro_EUROPE_2012.nc")
    Emissions.read_file(
        filepath=flux_path,
        species="ch4",
        source="anthro",
        date="2012",
        domain=domain,
        high_time_resolution=False,
    )

    # Footprint data
    #   - TAC footprint from 2012-08 - 2012-09 at 100m
    footprint_path = get_footprint_datapath("TAC-100magl_EUROPE_201208.nc")
    Footprints.read_file(
        filepath=footprint_path,
        site=site,
        model="NAME",
        network=network,
        height="100m",
        domain=domain,
    )
def _get_sources_local(self, site: str, species: str) -> Dict:
    """Find the stored Datasources matching a site and species.

    The verified site and the species are saved on the instance. If any
    Datasource matches, ``self._user_info`` and ``self._key_lookup`` are
    rebuilt and ``self._needs_update`` is reset to False.

    Args:
        site: Site code or name, passed through verify_site
        species: Species name
    Returns:
        dict: Rank data and date range string keyed by inlet, or an empty
        dictionary if no Datasource matches
    """
    site = verify_site(site=site)

    # Save these
    self.site, self.species = site, species

    obs = ObsSurface.load()
    uuids = obs.datasources()
    ranks = obs.rank_data()

    matches = []
    for uid in uuids:
        # Shallow load the Datasource (only get its JSON metadata)
        source = Datasource.load(uuid=uid, shallow=True)
        if source.search_metadata(site=site, species=species):
            matches.append(source)

    if not matches:
        return {}

    info = {}
    for source in matches:
        inlet = source.inlet()
        info[inlet] = {
            "rank_data": ranks.get(source.uuid(), "NA"),
            "data_range": source.daterange_str(),
        }
    self._user_info = info

    lookup = {}
    for source in matches:
        inlet = source.inlet()
        lookup[inlet] = source.uuid()
    self._key_lookup = lookup

    self._needs_update = False

    return self._user_info
def query_store() -> Dict:
    """Create a dictionary that can be used to visualise the object store

    Returns:
        dict: Metadata summary (site, species, instrument, network, inlet)
        keyed by Datasource UUID, for data to be shown in force graph
    """
    from openghg.store.base import Datasource
    from openghg.store import ObsSurface

    # These fields fall back to "Unknown" when absent; site and species are required
    optional_fields = ("instrument", "network", "inlet")

    store_graph = {}
    for uid in ObsSurface.load().datasources():
        # Shallow load is used as only the metadata is read here
        source = Datasource.load(uuid=uid, shallow=True)
        meta = source.metadata()

        node = {"site": meta["site"], "species": meta["species"]}
        for field in optional_fields:
            node[field] = meta.get(field, "Unknown")

        store_graph[source.uuid()] = node

    return store_graph
def get_sources(args: Dict) -> Dict:
    """Find the stored Datasources matching a site and species.

    Args:
        args: Dictionary containing the keys "site" and "species"
    Returns:
        dict: Empty if nothing matches, otherwise "user_info" (rank data and
        date range string keyed by inlet) and "key_lookup" (UUID keyed by inlet)
    """
    obs = ObsSurface.load()
    uuids = obs.datasources()
    ranks = obs.rank_data()

    site = args["site"]
    species = args["species"]

    matches = []
    for uid in uuids:
        # Shallow load the Datasource (only get its JSON metadata)
        source = Datasource.load(uuid=uid, shallow=True)
        if source.search_metadata(site=site, species=species):
            matches.append(source)

    if not matches:
        return {}

    user_info = {}
    for source in matches:
        inlet = source.inlet()
        user_info[inlet] = {
            "rank_data": ranks.get(source.uuid(), "NA"),
            "data_range": source.daterange_str(),
        }

    key_lookup = {}
    for source in matches:
        inlet = source.inlet()
        key_lookup[inlet] = source.uuid()

    return {"user_info": user_info, "key_lookup": key_lookup}
def test_read_noaa_obspack():
    """Read a NOAA ObsPack flask file and check the stored ch4 date ranges and values."""
    obspack_name = "ch4_esp_surface-flask_2_representative.nc"
    obspack_path = get_datapath(filename=obspack_name, data_type="NOAA")

    results = ObsSurface.read_file(
        filepath=obspack_path,
        inlet="flask",
        data_type="NOAA",
        site="esp",
        network="NOAA",
        overwrite=True,
    )

    ch4_uuid = results["processed"][obspack_name]["ch4"]
    stored = Datasource.load(uuid=ch4_uuid, shallow=False).data()

    expected_ranges = [
        "1993-06-17-00:12:30+00:00_1993-11-20-21:50:00+00:00",
        "1994-01-02-22:10:00+00:00_1994-12-24-22:15:00+00:00",
        "1995-02-06-12:00:00+00:00_1995-11-08-19:55:00+00:00",
        "1996-01-21-22:10:00+00:00_1996-12-01-20:00:00+00:00",
        "1997-02-12-19:00:00+00:00_1997-12-20-20:15:00+00:00",
        "1998-01-01-23:10:00+00:00_1998-12-31-19:50:00+00:00",
        "1999-01-14-22:15:00+00:00_1999-12-31-23:35:00+00:00",
        "2000-03-05-00:00:00+00:00_2000-11-04-22:30:00+00:00",
        "2001-01-05-21:45:00+00:00_2001-12-06-12:00:00+00:00",
        "2002-01-12-12:00:00+00:00_2002-01-12-12:00:00+00:00",
    ]
    assert sorted(stored.keys()) == expected_ranges

    # Spot check the first record of the 1998 chunk
    chunk = stored["1998-01-01-23:10:00+00:00_1998-12-31-19:50:00+00:00"]

    assert chunk.time[0] == Timestamp("1998-01-01T23:10:00")
    assert chunk["ch4"][0] == pytest.approx(1.83337e-06)
    assert chunk["ch4_number_of_observations"][0] == 2.0
    assert chunk["ch4_variability"][0] == pytest.approx(2.093036e-09)
def test_read_cranfield():
    """Read the Cranfield hourly means test file and check the stored ch4 data."""
    get_local_bucket(empty=True)

    cranfield_path = get_datapath(filename="THB_hourly_means_test.csv", data_type="Cranfield_CRDS")

    results = ObsSurface.read_file(
        filepath=cranfield_path,
        data_type="CRANFIELD",
        site="THB",
        network="CRANFIELD",
    )

    processed = results["processed"]["THB_hourly_means_test.csv"]
    assert sorted(processed.keys()) == ["ch4", "co", "co2"]

    ch4_uuid = results["processed"]["THB_hourly_means_test.csv"]["ch4"]

    stored = Datasource.load(uuid=ch4_uuid, shallow=False).data()
    ch4 = stored["2018-05-05-00:00:00+00:00_2018-05-13-16:00:00+00:00"]

    # First and last timestamps of the stored date range
    assert ch4.time[0] == Timestamp("2018-05-05")
    assert ch4.time[-1] == Timestamp("2018-05-13T16:00:00")

    # First and last values; note the variability variable name contains a space
    assert ch4["ch4"][0] == pytest.approx(2585.651)
    assert ch4["ch4"][-1] == pytest.approx(1999.018)
    assert ch4["ch4 variability"][0] == pytest.approx(75.50218)
    assert ch4["ch4 variability"][-1] == pytest.approx(6.48413)
def test_add_new_data_correct_datasource():
    """Check that a second GCWERKS file maps shared keys to the same Datasource UUIDs."""
    get_local_bucket(empty=True)

    def read_medusa(data_name, precision_name):
        # Read one data/precision file pair and return its processed results
        data_path = get_datapath(filename=data_name, data_type="GC")
        precision_path = get_datapath(filename=precision_name, data_type="GC")

        outcome = ObsSurface.read_file(
            filepath=(data_path, precision_path),
            data_type="GCWERKS",
            site="CGO",
            network="AGAGE",
        )

        return outcome["processed"][data_name]

    first_results = read_medusa("capegrim-medusa.05.C", "capegrim-medusa.05.precisions.C")

    sorted_keys = sorted(first_results.keys())

    assert sorted_keys[:4] == ['c2cl4_10m', 'c2cl4_70m', 'c2f6_10m', 'c2f6_70m']
    assert sorted_keys[-4:] == ['hfc32_70m', 'sf6_70m', 'so2f2_10m', 'so2f2_70m']
    assert len(sorted_keys) == 69

    second_results = read_medusa("capegrim-medusa.06.C", "capegrim-medusa.06.precisions.C")

    shared_keys = []
    for key in first_results:
        if key in second_results:
            shared_keys.append(key)

    assert len(shared_keys) == 67

    # Every key present in both reads must have been assigned the same value
    for key in shared_keys:
        assert first_results[key] == second_results[key]
def test_set_rank_overwrite():
    """Check how set_rank with overwrite=True splits and replaces stored date ranges."""
    o = ObsSurface.load()
    o._rank_data.clear()

    test_uid = "test-uid-123"

    def rank_between(start, end, rank, **extra):
        # Build the daterange string and set the rank for the test UUID
        daterange = create_daterange_str(start=start, end=end)
        o.set_rank(uuid=test_uid, rank=rank, date_range=daterange, **extra)

    rank_between("2007-01-01", "2015-01-01", 1)

    assert o._rank_data["test-uid-123"] == {"2007-01-01-00:00:00+00:00_2015-01-01-00:00:00+00:00": 1}

    # Overwrite a daterange inside the existing one
    rank_between("2008-01-01", "2009-01-01", 2, overwrite=True)

    expected_ranking = {
        "2007-01-01-00:00:00+00:00_2007-12-31-23:59:59+00:00": 1,
        "2008-01-01-00:00:00+00:00_2008-12-31-23:59:59+00:00": 2,
        "2009-01-01-00:00:01+00:00_2015-01-01-00:00:00+00:00": 1,
    }

    assert o._rank_data["test-uid-123"] == expected_ranking

    # Overwrite with a daterange covering everything stored so far
    rank_between("1994-01-01", "2023-01-01", 2, overwrite=True)

    assert o._rank_data["test-uid-123"] == {"1994-01-01-00:00:00+00:00_2023-01-01-00:00:00+00:00": 2}

    o._rank_data.clear()

    rank_between("2001-01-01", "2021-01-01", 1)

    assert o._rank_data["test-uid-123"] == {"2001-01-01-00:00:00+00:00_2021-01-01-00:00:00+00:00": 1}

    # Two separate overwrites inside the same existing daterange
    rank_between("2007-01-01", "2009-01-01", 2, overwrite=True)
    rank_between("2015-01-01", "2016-01-01", 2, overwrite=True)

    expected = {
        "2001-01-01-00:00:00+00:00_2006-12-31-23:59:59+00:00": 1,
        "2007-01-01-00:00:00+00:00_2008-12-31-23:59:59+00:00": 2,
        "2009-01-01-00:00:01+00:00_2014-12-31-23:59:59+00:00": 1,
        "2015-01-01-00:00:00+00:00_2015-12-31-23:59:59+00:00": 2,
        "2016-01-01-00:00:01+00:00_2021-01-01-00:00:00+00:00": 1,
    }

    assert o._rank_data["test-uid-123"] == expected
def co2_setup():
    """Read the TAC co2 test observations, footprint and emissions into the object store."""
    get_local_bucket(empty=True)

    # Resolve the three input files
    obs_file = get_datapath(filename="tac.picarro.hourly.100m.test.dat", data_type="CRDS")
    footprint_file = get_fp_datapath("TAC-100magl_UKV_co2_TEST_201407.nc")
    emissions_file = get_flux_datapath("co2-rtot-cardamom-2hr_TEST_2014.nc")

    # Values shared by more than one of the reads below
    shared = {"site": "tac", "species": "co2", "height": "100m", "domain": "TEST"}

    ObsSurface.read_file(
        filepath=obs_file,
        data_type="CRDS",
        site=shared["site"],
        network="DECC",
        inlet=shared["height"],
    )

    Footprints.read_file(
        filepath=footprint_file,
        site=shared["site"],
        height=shared["height"],
        domain=shared["domain"],
        model="NAME",
        metmodel="UKV",
        species=shared["species"],
    )

    Emissions.read_file(
        filepath=emissions_file,
        species=shared["species"],
        source="rtot-cardamom",
        domain=shared["domain"],
        date="2014",
        high_time_resolution=True,
    )
def test_upload_same_file_twice_raises():
    """Check that reading the same file a second time raises ValueError."""
    get_local_bucket(empty=True)

    read_kwargs = {
        "filepath": get_datapath(filename="thames_test_20190707.csv", data_type="THAMESBARRIER"),
        "data_type": "THAMESBARRIER",
        "site": "tmb",
        "network": "LGHG",
        "sampling_period": 60,
    }

    # First read is expected to succeed
    ObsSurface.read_file(**read_kwargs)

    # An identical second read must be rejected
    with pytest.raises(ValueError):
        ObsSurface.read_file(**read_kwargs)
def _process_files_local(
    files: filepathType,
    data_type: str,
    site: str,
    network: str,
    inlet: str = None,
    instrument: str = None,
    overwrite: bool = False,
) -> Dict:
    """Process the passed file(s)

    Args:
        files: Path of files to be processed. A single item is wrapped in a list.
            For GCWERKS data each item must be a (data, precision) tuple.
        data_type: Type of data to be processed (CRDS, GC etc)
        site: Site code or name
        network: Network name
        inlet: Inlet, passed through to ObsSurface.read_file
        instrument: Instrument name
        overwrite: Should this data overwrite data stored for these datasources for existing dateranges
    Returns:
        dict: New dictionary populated from the result of ObsSurface.read_file
    Raises:
        TypeError: If the data type is GCWERKS and any item in files is not a tuple
    """
    data_type = DataTypes[data_type.upper()].name

    file_list = files if isinstance(files, list) else [files]

    obs = ObsSurface.load()

    # Ensure we have Paths
    # TODO: Delete this, as we already have the same warning in read_file?
    if data_type != "GCWERKS":
        paths = [Path(item) for item in file_list]
    else:
        for item in file_list:
            if not isinstance(item, tuple):
                raise TypeError(
                    "If data type is GC, a list of tuples for data and precision filenames must be passed"
                )

        paths = [(Path(data_file), Path(precision_file)) for data_file, precision_file in file_list]

    processed = {}
    processed.update(
        obs.read_file(
            filepath=paths,
            data_type=data_type,
            site=site,
            network=network,
            instrument=instrument,
            inlet=inlet,
            overwrite=overwrite,
        )
    )

    return processed
def set_rank(args: Dict) -> None:
    """Set the rank of a Datasource from a dictionary of arguments.

    Args:
        args: Dictionary containing the keys "rank", "uuid", "dateranges"
            and "overwrite"
    Returns:
        None
    """
    obs = ObsSurface.load()

    # Read the arguments in a fixed order so a missing key fails the same way each time
    rank, uuid, dateranges, overwrite = (args[name] for name in ("rank", "uuid", "dateranges", "overwrite"))

    obs.set_rank(uuid=uuid, rank=rank, date_range=dateranges, overwrite=overwrite)
def test_recombination_GC():
    """Check that stored-then-recombined GCWERKS toluene data equals a direct parse."""
    get_local_bucket(empty=True)

    data_path = get_datapath(filename="capegrim-medusa.18.C", data_type="GC")
    precision_path = get_datapath(filename="capegrim-medusa.18.precisions.C", data_type="GC")

    ObsSurface.read_file((data_path, precision_path), data_type="GCWERKS", site="cgo", network="agage")

    # Parse the same file pair directly to obtain the reference dataset
    parsed = parse_gcwerks(
        data_filepath=data_path,
        precision_filepath=precision_path,
        site="CGO",
        instrument="medusa",
        network="AGAGE",
    )

    expected = parsed["c6h5ch3_70m"]["data"]

    species, site, inlet = "c6h5ch3", "CGO", "70m"

    found = search(species=species, site=site, inlet=inlet)
    recombined = recombine_datasets(keys=found.keys(site=site, species=species, inlet=inlet))

    # Attributes are reset on both datasets before comparing
    expected.attrs = {}
    recombined.attrs = {}

    assert expected.time.equals(recombined.time)

    compared_variables = (
        "c6h5ch3",
        "c6h5ch3_repeatability",
        "c6h5ch3_status_flag",
        "c6h5ch3_integration_flag",
    )
    for variable in compared_variables:
        assert expected[variable].equals(recombined[variable])
def _clear_rank_local(self, inlet: str) -> None:
    """Clear the ranking data for a Datasource

    Args:
        inlet: Inlet of the source; lowercased and looked up in
            self._key_lookup to find the Datasource UUID
    Returns:
        None
    """
    obs = ObsSurface.load()

    uuid = self._key_lookup[inlet.lower()]
    obs.clear_rank(uuid=uuid)

    # Flag that the stored ranking has changed
    self._needs_update = True
def test_rank_overlapping_dateranges():
    """Check that ranking already-ranked dateranges with a different rank raises ValueError."""
    o = ObsSurface.load()
    o._rank_data.clear()

    test_uid = "test-uid-123"
    dateranges = ["2014-01-01_2099-06-06", "2014-06-07_2015-09-09", "2015-09-10_2019-01-06"]

    o.set_rank(uuid=test_uid, rank=1, date_range=dateranges)

    # Same dateranges, different rank, no overwrite
    with pytest.raises(ValueError):
        o.set_rank(uuid=test_uid, rank=2, date_range=dateranges)
def _set_rank_local(
    self,
    inlet: str,
    rank: Union[int, str],
    start_date: str,
    end_date: str,
    overwrite: bool = False,
) -> None:
    """Set the rank of the Datasource for an inlet over a date range.

    Args:
        inlet: Inlet of the source; lowercased and looked up in
            self._key_lookup to find the Datasource UUID
        rank: Rank to set
        start_date: Start of the date range
        end_date: End of the date range
        overwrite: Passed through to ObsSurface.set_rank
    Returns:
        None
    """
    obs = ObsSurface.load()

    uuid = self._key_lookup[inlet.lower()]

    obs.set_rank(
        uuid=uuid,
        rank=rank,
        date_range=create_daterange_str(start=start_date, end=end_date),
        overwrite=overwrite,
    )

    # Flag that the stored ranking has changed
    self._needs_update = True
def test_read_beaco2n():
    """Read a BEACO2N file and check the first stored co2 record."""
    beaco2n_name = "Charlton_Community_Center.csv"
    beaco2n_path = get_datapath(filename=beaco2n_name, data_type="BEACO2N")

    results = ObsSurface.read_file(
        filepath=beaco2n_path,
        data_type="BEACO2N",
        site="CCC",
        network="BEACO2N",
        overwrite=True,
    )

    co2_uuid = results["processed"][beaco2n_name]["co2"]

    stored = Datasource.load(uuid=co2_uuid, shallow=False).data()
    co2 = stored["2015-04-18-04:00:00+00:00_2015-04-18-10:00:00+00:00"]

    assert co2.time[0] == Timestamp("2015-04-18T04:00:00")
    assert co2["co2"][0] == 410.4
    assert co2["co2_qc"][0] == 2
def test_rank_same_daterange_doesnt_change():
    """Check that setting the same rank for the same daterange twice leaves the rank data unchanged."""
    o = ObsSurface.load()
    o._rank_data.clear()

    test_uid = "test-uid-123"
    expected = {"test-uid-123": {"2012-01-01-00:00:00+00:00_2012-06-01-00:00:00+00:00": 1}}

    # Set the identical rank twice; the stored data must be the same after each call
    for _ in range(2):
        o.set_rank(uuid=test_uid, rank=1, date_range="2012-01-01_2012-06-01")

        assert o._rank_data == expected
def test_read_multiside_aqmesh():
    """Read multi-site AQMesh CO2 data and check the data stored for the Raith site.

    Checks the first and last timestamps and co2 values of the stored date
    range, and the full set of attributes attached to the data.
    """
    datafile = get_datapath(filename="co2_data.csv", data_type="AQMESH")
    metafile = get_datapath(filename="co2_metadata.csv", data_type="AQMESH")

    result = ObsSurface.read_multisite_aqmesh(data_filepath=datafile, metadata_filepath=metafile, overwrite=True)

    # This crazy structure will be fixed when add_datsources is updated
    raith_uuid = result["raith"]["raith"]

    d = Datasource.load(uuid=raith_uuid, shallow=False)

    data = d.data()["2021-06-18-05:00:00+00:00_2021-06-21-13:00:00+00:00"]

    # These comparisons were bare expressions before, so their results were
    # discarded and nothing was checked - they need to be asserted
    assert data.time[0] == Timestamp("2021-06-18T05:00:00")
    assert data.co2[0] == 442.64
    assert data.time[-1] == Timestamp("2021-06-21T13:00:00")
    assert data.co2[-1] == 404.84

    expected_attrs = {
        "site": "raith",
        "pod_id": 39245,
        "start_date": "2021-06-15 01:00:00",
        "end_date": "2021-10-04 00:59:00",
        "relocate_date": "NA",
        "long_name": "Raith",
        "borough": "Glasgow",
        "site_type": "Roadside",
        "in_ulez": "No",
        "latitude": 55.798813,
        "longitude": -4.058363,
        "inlet": 1,
        "network": "aqmesh_glasgow",
        "sampling_period": "NOT_SET",
        "species": "co2",
        "units": "ppm",
    }

    assert data.attrs == expected_attrs
def test_set_rank():
    """Check set_rank without overwrite: new ranges, conflicting ranks and same-rank extension."""
    o = ObsSurface.load()
    o._rank_data.clear()

    test_uid = "test-uid-123"

    first_range = create_daterange_str(start="2001-01-01", end="2005-01-01")
    o.set_rank(uuid=test_uid, rank=1, date_range=first_range)

    assert o._rank_data == {"test-uid-123": {"2001-01-01-00:00:00+00:00_2005-01-01-00:00:00+00:00": 1}}

    second_range = create_daterange_str(start="2007-01-01", end="2009-01-01")
    o.set_rank(uuid=test_uid, rank=1, date_range=second_range)

    assert o._rank_data["test-uid-123"] == {
        "2001-01-01-00:00:00+00:00_2005-01-01-00:00:00+00:00": 1,
        "2007-01-01-00:00:00+00:00_2009-01-01-00:00:00+00:00": 1,
    }

    # Make sure we can't set another rank for the same daterange
    with pytest.raises(ValueError):
        o.set_rank(uuid=test_uid, rank=2, date_range=second_range)

    # Nor for a daterange that falls inside an already ranked one
    inner_range = create_daterange_str(start="2008-01-01", end="2009-01-01")

    with pytest.raises(ValueError):
        o.set_rank(uuid=test_uid, rank=3, date_range=inner_range)

    # Same rank over a longer daterange with the same start replaces the stored range
    longer_range = create_daterange_str(start="2007-01-01", end="2015-01-01")
    o.set_rank(uuid=test_uid, rank=1, date_range=longer_range)

    assert o._rank_data["test-uid-123"] == {
        "2001-01-01-00:00:00+00:00_2005-01-01-00:00:00+00:00": 1,
        "2007-01-01-00:00:00+00:00_2015-01-01-00:00:00+00:00": 1,
    }
def test_rank_daterange_start_overlap_overwrite():
    """Check overwriting a rank over a daterange that shares its start with the stored one."""
    o = ObsSurface.load()
    o._rank_data.clear()

    test_uid = "test-uid-123"
    full_year = {"test-uid-123": {"2012-01-01-00:00:00+00:00_2013-01-01-00:00:00+00:00": 1}}

    o.set_rank(uuid=test_uid, rank=1, date_range="2012-01-01_2013-01-01")

    assert o._rank_data == full_year

    # Overwrite the first part of the year with a different rank
    o.set_rank(uuid=test_uid, rank=2, date_range="2012-01-01_2012-06-01", overwrite=True)

    split_year = {
        "test-uid-123": {
            "2012-06-01-00:00:01+00:00_2013-01-01-00:00:00+00:00": 1,
            "2012-01-01-00:00:00+00:00_2012-06-01-00:00:00+00:00": 2,
        }
    }
    assert o._rank_data == split_year

    # Overwriting the whole year again restores a single daterange
    o.set_rank(uuid=test_uid, rank=1, date_range="2012-01-01_2013-01-01", overwrite=True)

    expected = {"test-uid-123": {"2012-01-01-00:00:00+00:00_2013-01-01-00:00:00+00:00": 1}}

    assert o._rank_data == expected
def test_read_noaa_raw():
    """Read a raw NOAA flask file and check the stored co date ranges and values."""
    get_local_bucket(empty=True)

    noaa_name = "co_pocn25_surface-flask_1_ccgg_event.txt"
    noaa_path = get_datapath(filename=noaa_name, data_type="NOAA")

    results = ObsSurface.read_file(
        filepath=noaa_path,
        data_type="NOAA",
        site="POCN25",
        network="NOAA",
        inlet="flask",
    )

    co_uuid = results["processed"][noaa_name]["co"]
    stored = Datasource.load(uuid=co_uuid, shallow=False).data()

    expected_ranges = [
        "1990-06-29-05:00:00+00:00_1990-07-10-21:28:00+00:00",
        "2009-06-13-16:32:00+00:00_2009-12-03-00:30:00+00:00",
        "2010-01-10-00:13:00+00:00_2010-12-09-16:05:00+00:00",
        "2011-01-27-04:55:00+00:00_2011-11-11-14:45:00+00:00",
        "2016-12-03-12:37:00+00:00_2016-12-18-05:30:00+00:00",
        "2017-01-27-19:10:00+00:00_2017-07-15-04:15:00+00:00",
    ]
    assert sorted(stored.keys()) == expected_ranges

    # Spot check the first and last records of the 1990 chunk
    chunk = stored["1990-06-29-05:00:00+00:00_1990-07-10-21:28:00+00:00"]

    assert chunk["co"][0] == pytest.approx(94.9)
    assert chunk["co"][-1] == pytest.approx(95.65)

    assert chunk["co_repeatability"][0] == pytest.approx(-999.99)
    assert chunk["co_repeatability"][-1] == pytest.approx(-999.99)

    assert chunk["co_selection_flag"][0] == 0
    assert chunk["co_selection_flag"][-1] == 0
def load_CRDS():
    """Read the TAC, HFD and BSD one-minute CRDS test files into the object store."""
    get_local_bucket(empty=True)

    def crds_path(name):
        # Resolve a CRDS test file name to its path
        return get_datapath(name, data_type="CRDS")

    tac_100m = crds_path("tac.picarro.1minute.100m.min.dat")
    hfd_50m = crds_path("hfd.picarro.1minute.50m.min.dat")
    bsd_files = [
        crds_path("bsd.picarro.1minute.42m.min.dat"),
        crds_path("bsd.picarro.1minute.108m.min.dat"),
        crds_path("bsd.picarro.1minute.248m.min.dat"),
    ]

    # One read per site; the three BSD inlets are passed together as a list
    for site, filepath in (("tac", tac_100m), ("hfd", hfd_50m), ("bsd", bsd_files)):
        ObsSurface.read_file(filepath=filepath, data_type="CRDS", site=site, network="DECC")
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to the
    function and these keywords will be used to search the metadata associated
    with each Datasource.

    Keywords with a value of None are dropped and the remaining values are passed
    through clean_string before being given to Datasource.search_metadata.

    Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources. May be a single value or a list,
        any translations found in species_translator.json are added to the search terms
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
        If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
        If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
        data_type: One of "timeseries", "footprints", "emissions" or "eulerian_model",
        defaults to "timeseries"
    Returns:
        SearchResults or dict: For timeseries data a SearchResults object. For footprints,
        emissions and eulerian_model data a dictionary keyed by Datasource UUID
        containing the "keys" and "metadata" of each matching Datasource.
    Raises:
        ValueError: If data_type is not one of the valid data types, or if ranking is
        used and none of the matching Datasources have ranking data for the daterange
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource
    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    # skip_ranking controls this function only, so it's removed from the
    # keywords that are used to search the metadata
    skip_ranking = kwargs_copy.get("skip_ranking", False)

    try:
        del kwargs_copy["skip_ranking"]
    except KeyError:
        pass

    # As we might have kwargs that are None we want to get rid of those
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation
    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        # Keep each species we were given and add any translations of it
        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    # NOTE - this is a generator so it can only be iterated over once
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResult object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()
        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and species then just return the data
    # TODO - should instrument be added here
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()
        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            # Datasources without data in the search daterange are left out of the results
            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    # From here on we use the ranking data. For each site and species we first find
    # the best rank and the Datasources (and dateranges) that hold that rank
    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()
        # Get the site and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))
        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            # First ranked Datasource we've seen for this site and species
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            (
                "No ranking data set for the given search parameters."
                " Please refine your search to include a specific site, species and inlet."
            )
        )

    # Now we have the highest ranked data and the dateranges there are ranks for
    # we want to fill in the gaps using the inlet that is ranked for the closest daterange
    # We just want some rank_metadata to go along with the final data scheme
    # Can key a key of date - inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        # NOTE - species here is the dictionary of species data for this site
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if species_keys:
                data_keys[site][sp]["keys"] = species_keys
                data_keys[site][sp]["rank_metadata"] = species_rank_data
                data_keys[site][sp]["metadata"] = species_metadata
            else:
                continue

            # We now need to retrieve data for the dateranges for which we don't have ranking data
            # To do this find the gaps in the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                # As site, species and inlet are all passed this recursive call
                # returns without using the ranking data
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            # NOTE - going through a set means the order of the keys isn't kept
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
def test_read_CRDS():
    """Read CRDS files for BSD through ObsSurface and check what was stored.

    Starts from a bucket requested with ``empty=True`` and then checks:

    - the species keys reported under ``results["processed"]`` for the file,
    - a few values from the stored ch4 data for the 2014 daterange key,
    - the daterange keys held by the first Datasource listed by ObsSurface,
    - that reading a second ("future") file leaves the existing daterange
      keys on that Datasource in place and adds a 2023 key,
    - that the datasource table has entries for ch4, co2 and co at 248m.
    """
    get_local_bucket(empty=True)

    filename = "bsd.picarro.1minute.248m.min.dat"
    filepath = get_datapath(filename=filename, data_type="CRDS")
    results = ObsSurface.read_file(filepath=filepath, data_type="CRDS", site="bsd", network="DECC")

    processed = results["processed"][filename]
    assert sorted(processed.keys()) == ["ch4", "co", "co2"]

    # Load up the assigned Datasource and check it contains the correct data
    ch4_data = Datasource.load(uuid=processed["ch4"]).data()
    ch4_data = ch4_data["2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00"]

    assert ch4_data.time[0] == Timestamp("2014-01-30T11:12:30")
    # These are floating point values, so compare with a tolerance rather
    # than exact equality (same approach as test_read_thames_barrier)
    assert ch4_data["ch4"][0] == pytest.approx(1959.55)
    assert ch4_data["ch4"][-1] == pytest.approx(1962.8)
    assert ch4_data["ch4_variability"][-1] == pytest.approx(1.034)
    assert ch4_data["ch4_number_of_observations"][-1] == pytest.approx(26.0)

    obs = ObsSurface.load()
    uuid_one = obs.datasources()[0]
    datasource = Datasource.load(uuid=uuid_one)
    # Sorted so the check does not depend on the order the keys come back in;
    # the comparison after the second read is sorted in the same way
    data_keys = sorted(datasource.data().keys())

    expected_keys = [
        "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00",
        "2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00",
        "2018-02-18-15:42:30+00:00_2018-12-18-15:42:30+00:00",
        "2019-02-03-17:38:30+00:00_2019-12-09-10:47:30+00:00",
        "2020-02-01-18:08:30+00:00_2020-12-01-22:31:30+00:00",
    ]

    assert data_keys == expected_keys

    # Now read a second file for the same site and check the new daterange
    # shows up alongside the existing ones
    filepath = get_datapath(filename="bsd.picarro.1minute.248m.future.dat", data_type="CRDS")
    ObsSurface.read_file(filepath=filepath, data_type="CRDS", site="bsd", network="DECC")

    uuid_one = obs.datasources()[0]
    datasource = Datasource.load(uuid=uuid_one)
    data_keys = sorted(datasource.data().keys())

    new_expected_keys = [
        *expected_keys,
        "2023-01-30-13:56:30+00:00_2023-01-30-14:20:30+00:00",
    ]

    assert data_keys == new_expected_keys

    table = obs._datasource_table

    assert table["bsd"]["decc"]["ch4"]["248m"]
    assert table["bsd"]["decc"]["co2"]["248m"]
    assert table["bsd"]["decc"]["co"]["248m"]
def test_read_GC():
    """Read GCWERKS data/precision file pairs through ObsSurface and check the results.

    Starts from a bucket requested with ``empty=True`` and then checks:

    - the ``<species>_<inlet>`` keys reported for the Cape Grim file,
    - first/last times, values, repeatability and flags of the hfc152a data,
    - that ObsSurface holds the same set of labels in ``_datasource_uuids``,
    - the attributes of the hfc152a data via ``attributes_checker_obssurface``,
    - that reading a second ("future") file pair adds a 2023 daterange key to
      the Datasource that already held the 2018 key,
    - that, after also reading a Trinidad Head file pair, the datasource table
      has entries for both CGO (70m) and THD (10m).
    """
    get_local_bucket(empty=True)

    data_filepath = get_datapath(filename="capegrim-medusa.18.C", data_type="GC")
    precision_filepath = get_datapath(filename="capegrim-medusa.18.precisions.C", data_type="GC")

    results = ObsSurface.read_file(
        filepath=(data_filepath, precision_filepath),
        data_type="GCWERKS",
        site="CGO",
        network="AGAGE",
    )

    # 30/11/2021: Species labels were updated to be standardised in line with variable naming
    # This list of expected labels was updated.
    expected_keys = [
        "c2cl4_70m",
        "c2f6_70m",
        "c2h2_70m",
        "c2h6_70m",
        "c2hcl3_70m",
        "c3f8_70m",
        "c3h8_70m",
        "c4f10_70m",
        "c4f8_70m",
        "c6f14_70m",
        "c6h5ch3_70m",
        "c6h6_70m",
        "cc3h8_70m",
        "ccl4_70m",
        "cf4_70m",
        "cfc112_70m",
        "cfc113_70m",
        "cfc114_70m",
        "cfc115_70m",
        "cfc11_70m",
        "cfc12_70m",
        "cfc13_70m",
        "ch2br2_70m",
        "ch2cl2_70m",
        "ch3br_70m",
        "ch3ccl3_70m",
        "ch3cl_70m",
        "ch3i_70m",
        "chbr3_70m",
        "chcl3_70m",
        "cos_70m",
        "desflurane_70m",
        "halon1211_70m",
        "halon1301_70m",
        "halon2402_70m",
        "hcfc124_70m",
        "hcfc132b_70m",
        "hcfc133a_70m",
        "hcfc141b_70m",
        "hcfc142b_70m",
        "hcfc22_70m",
        "hfc125_70m",
        "hfc134a_70m",
        "hfc143a_70m",
        "hfc152a_70m",
        "hfc227ea_70m",
        "hfc236fa_70m",
        "hfc23_70m",
        "hfc245fa_70m",
        "hfc32_70m",
        "hfc365mfc_70m",
        "hfc4310mee_70m",
        "nf3_70m",
        "sf5cf3_70m",
        "sf6_70m",
        "so2f2_70m",
    ]

    processed = results["processed"]["capegrim-medusa.18.C"]
    assert sorted(processed.keys()) == expected_keys

    # Load in some data
    uuid = processed["hfc152a_70m"]

    hfc152a_data = Datasource.load(uuid=uuid, shallow=False).data()
    hfc152a_data = hfc152a_data["2018-01-01-02:24:00+00:00_2018-01-31-23:33:00+00:00"]

    assert hfc152a_data.time[0] == Timestamp("2018-01-01T02:24:00")
    assert hfc152a_data.time[-1] == Timestamp("2018-01-31T23:33:00")

    # Measured values and repeatabilities are floats, so compare with a
    # tolerance rather than exact equality
    assert hfc152a_data["hfc152a"][0] == pytest.approx(4.409)
    assert hfc152a_data["hfc152a"][-1] == pytest.approx(4.262)

    assert hfc152a_data["hfc152a_repeatability"][0] == pytest.approx(0.03557)
    assert hfc152a_data["hfc152a_repeatability"][-1] == pytest.approx(0.03271)

    # The flags are compared against an exact zero
    assert hfc152a_data["hfc152a_status_flag"][0] == 0
    assert hfc152a_data["hfc152a_status_flag"][-1] == 0

    assert hfc152a_data["hfc152a_integration_flag"][0] == 0
    assert hfc152a_data["hfc152a_integration_flag"][-1] == 0

    # Check we have the Datasource info saved
    obs = ObsSurface.load()

    assert sorted(obs._datasource_uuids.values()) == expected_keys

    attrs = hfc152a_data.attrs

    assert attributes_checker_obssurface(attrs=attrs, species="hfc152a")

    # Now test that if we add more data it adds it to the same Datasource
    uuid_one = obs.datasources()[0]

    datasource = Datasource.load(uuid=uuid_one)
    data_one = datasource.data()

    assert list(data_one.keys()) == ["2018-01-01-02:24:00+00:00_2018-01-31-23:33:00+00:00"]

    data_filepath = get_datapath(filename="capegrim-medusa.future.C", data_type="GC")
    precision_filepath = get_datapath(
        filename="capegrim-medusa.future.precisions.C", data_type="GC"
    )

    ObsSurface.read_file(
        filepath=(data_filepath, precision_filepath),
        data_type="GCWERKS",
        site="CGO",
        network="AGAGE",
    )

    # Reload the same Datasource: it should now hold both dateranges
    datasource = Datasource.load(uuid=uuid_one)
    data_one = datasource.data()

    assert sorted(data_one.keys()) == [
        "2018-01-01-02:24:00+00:00_2018-01-31-23:33:00+00:00",
        "2023-01-01-02:24:00+00:00_2023-01-31-23:33:00+00:00",
    ]

    data_filepath = get_datapath(filename="trinidadhead.01.C", data_type="GC")
    precision_filepath = get_datapath(filename="trinidadhead.01.precisions.C", data_type="GC")

    ObsSurface.read_file(
        filepath=(data_filepath, precision_filepath),
        data_type="GCWERKS",
        site="THD",
        instrument="gcmd",
        network="AGAGE",
    )

    obs = ObsSurface.load()
    table = obs._datasource_table

    assert table["cgo"]["agage"]["nf3"]["70m"]
    assert table["cgo"]["agage"]["hfc236fa"]["70m"]
    assert table["cgo"]["agage"]["halon1211"]["70m"]

    assert table["thd"]["agage"]["cfc11"]["10m"]
    assert table["thd"]["agage"]["n2o"]["10m"]
    assert table["thd"]["agage"]["ccl4"]["10m"]
def data_read():
    """Fill the local bucket with the observation, emissions and footprint test data.

    Steps, in order:

    1. Request the local bucket with ``empty=True``.
    2. Read CRDS files for bsd (248m, 108m, 42m), hfd (100m, 50m) and tac
       under the DECC network.
    3. Read GCWERKS data/precision file pairs for CGO and MHD under the
       AGAGE network.
    4. Set rank 1 on the BSD ch4 Datasources over a series of dateranges,
       alternating between the 248m, 108m and 42m inlets.
    5. Read one emissions file and one footprint file.
    """
    get_local_bucket(empty=True)

    # --- CRDS data (DECC network sites) ---
    decc = "DECC"

    bsd_filenames = (
        "bsd.picarro.1minute.248m.min.dat",
        "bsd.picarro.1minute.108m.min.dat",
        "bsd.picarro.1minute.42m.min.dat",
    )
    bsd_results = ObsSurface.read_file(
        filepath=[get_datapath(filename=name, data_type="CRDS") for name in bsd_filenames],
        data_type="CRDS",
        site="bsd",
        network=decc,
    )

    hfd_filenames = (
        "hfd.picarro.1minute.100m.min.dat",
        "hfd.picarro.1minute.50m.min.dat",
    )
    ObsSurface.read_file(
        filepath=[get_datapath(filename=name, data_type="CRDS") for name in hfd_filenames],
        data_type="CRDS",
        site="hfd",
        network=decc,
    )

    ObsSurface.read_file(
        filepath=get_datapath(filename="tac.picarro.1minute.100m.test.dat", data_type="CRDS"),
        data_type="CRDS",
        site="tac",
        network=decc,
    )

    # --- GCWERKS data (AGAGE network sites) ---
    cgo_pair = (
        get_datapath(filename="capegrim-medusa.18.C", data_type="GC"),
        get_datapath(filename="capegrim-medusa.18.precisions.C", data_type="GC"),
    )
    ObsSurface.read_file(filepath=cgo_pair, site="CGO", data_type="GCWERKS", network="AGAGE")

    mhd_pair = (
        get_datapath(filename="macehead.12.C", data_type="GC"),
        get_datapath(filename="macehead.12.precisions.C", data_type="GC"),
    )
    ObsSurface.read_file(
        filepath=mhd_pair,
        site="MHD",
        data_type="GCWERKS",
        network="AGAGE",
        instrument="GCMD",
    )

    # --- Ranking information for the BSD ch4 Datasources ---
    obs = ObsSurface.load()

    name_248m, name_108m, name_42m = bsd_filenames
    bsd_processed = bsd_results["processed"]
    uid_248 = bsd_processed[name_248m]["ch4"]
    uid_108 = bsd_processed[name_108m]["ch4"]
    uid_42 = bsd_processed[name_42m]["ch4"]

    # (Datasource UUID, daterange) pairs, applied in this order with rank 1
    ch4_rankings = (
        (uid_248, "2012-01-01_2013-01-01"),
        (uid_108, "2014-09-02_2014-11-01"),
        (uid_248, "2015-01-01_2015-11-01"),
        (uid_108, "2016-09-02_2018-11-01"),
        (uid_42, "2019-01-02_2021-01-01"),
    )
    for ranked_uuid, ranked_daterange in ch4_rankings:
        obs.set_rank(uuid=ranked_uuid, rank=1, date_range=ranked_daterange)

    # --- Emissions data ---
    Emissions.read_file(
        filepath=get_emissions_datapath("co2-gpp-cardamom-mth_EUROPE_2012.nc"),
        species="co2",
        source="gpp-cardamom",
        date="2012",
        domain="europe",
        high_time_resolution=False,
    )

    # --- Footprint data ---
    Footprints.read_file(
        filepath=get_footprint_datapath("footprint_test.nc"),
        site="TMB",
        model="test_model",
        network="LGHG",
        height="10m",
        domain="EUROPE",
    )