def test_read_same_footprint_twice_raises():
    datapath = get_footprint_datapath("footprint_test.nc")

    # This footprint file has already been added to the object store, so
    # attempting to add it again should raise a ValueError
    with pytest.raises(ValueError):
        Footprints.read_file(
            filepath=datapath,
            site="TMB",
            model="test_model",
            network="LGHG",
            domain="EUROPE",
            height="10magl",
        )
def data_read():
    '''
    Data set up for running the tests in these modules.
    '''
    get_local_bucket(empty=True)

    # Files for creating forward model (mf_mod) for methane at TAC site

    # Observation data
    # - TAC at 100m for 201208
    site = "tac"
    network = "DECC"
    data_type = "CRDS"

    tac_path = get_datapath(filename="tac.picarro.1minute.100m.201208.dat", data_type="CRDS")
    ObsSurface.read_file(filepath=tac_path, data_type=data_type, site=site, network=network)

    # Emissions data
    # Anthropogenic ch4 (methane) data from 2012 for EUROPE
    species = "ch4"
    source = "anthro"
    domain = "EUROPE"

    emissions_datapath = get_emissions_datapath("ch4-anthro_EUROPE_2012.nc")

    Emissions.read_file(
        filepath=emissions_datapath,
        species=species,
        source=source,
        date="2012",
        domain=domain,
        high_time_resolution=False,
    )

    # Footprint data
    # TAC footprint from 2012-08 - 2012-09 at 100m
    height = "100m"
    model = "NAME"

    fp_datapath = get_footprint_datapath("TAC-100magl_EUROPE_201208.nc")

    Footprints.read_file(
        filepath=fp_datapath, site=site, model=model, network=network, height=height, domain=domain
    )
def test_set_lookup_uuids():
    f = Footprints()

    fake_uuid = "123456789"

    site = "test_site"
    domain = "test_domain"
    model = "test_model"
    height = "test_height"

    f.set_uuid(site=site, domain=domain, model=model, height=height, uuid=fake_uuid)

    found_uid = f.lookup_uuid(site=site, domain=domain, model=model, height=height)

    assert f._datasource_table[site][domain][model][height] == found_uid == fake_uuid
def co2_setup():
    get_local_bucket(empty=True)

    data_type = "CRDS"
    tac_file = get_datapath(filename="tac.picarro.hourly.100m.test.dat", data_type=data_type)
    tac_footprint = get_fp_datapath("TAC-100magl_UKV_co2_TEST_201407.nc")
    co2_emissions = get_flux_datapath("co2-rtot-cardamom-2hr_TEST_2014.nc")

    site = "tac"
    species = "co2"
    network = "DECC"
    height = "100m"
    domain = "TEST"
    model = "NAME"
    metmodel = "UKV"
    source = "rtot-cardamom"
    date = "2014"

    ObsSurface.read_file(filepath=tac_file, data_type=data_type, site=site, network=network, inlet=height)

    Footprints.read_file(
        filepath=tac_footprint,
        site=site,
        height=height,
        domain=domain,
        model=model,
        metmodel=metmodel,
        species=species,
    )

    Emissions.read_file(
        filepath=co2_emissions,
        species=species,
        source=source,
        domain=domain,
        date=date,
        high_time_resolution=True,
    )
def test_wrong_uuid_raises():
    f = Footprints()

    fake_datasource = {"tmb_lghg_10m_europe": "mock-uuid-123456"}
    fake_metadata = {
        "tmb_lghg_10m_europe": {
            "data_type": "footprints",
            "site": "tmb",
            "height": "10m",
            "domain": "europe",
            "model": "test_model",
            "network": "lghg",
        }
    }

    f.add_datasources(datasource_uuids=fake_datasource, metadata=fake_metadata)

    assert f.datasources() == ["mock-uuid-123456"]

    changed_datasource = {"tmb_lghg_10m_europe": "mock-uuid-8888888"}

    with pytest.raises(ValueError):
        f.add_datasources(datasource_uuids=changed_datasource, metadata=fake_metadata)
def test_datasource_add_lookup():
    f = Footprints()

    fake_datasource = {"tmb_lghg_10m_europe": "mock-uuid-123456"}
    fake_metadata = {
        "tmb_lghg_10m_europe": {
            "data_type": "footprints",
            "site": "tmb",
            "height": "10m",
            "domain": "europe",
            "model": "test_model",
            "network": "lghg",
        }
    }

    f.add_datasources(datasource_uuids=fake_datasource, metadata=fake_metadata)

    assert f.datasources() == ["mock-uuid-123456"]

    lookup = f.datasource_lookup(fake_metadata)

    assert lookup == fake_datasource
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to the
    function and these keywords will be used to search the metadata associated
    with each Datasource. Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
            If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
            If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
    Returns:
        dict: List of keys of Datasources matching the search parameters
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource

    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    skip_ranking = kwargs_copy.get("skip_ranking", False)

    try:
        del kwargs_copy["skip_ranking"]
    except KeyError:
        pass

    # As we might have kwargs that are None we want to get rid of those
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation
    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResult object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()

        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and instrument then just return the data
    # TODO - should instrument be added here
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()

        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()

        # Get the site inlet and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))

        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            (
                "No ranking data set for the given search parameters."
                " Please refine your search to include a specific site, species and inlet."
            )
        )

    # Now we have the highest ranked data and the dateranges there are ranks for,
    # we want to fill in the gaps with (currently) the highest inlet from that site

    # We just want some rank_metadata to go along with the final data scheme
    # Keyed by date - inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if species_keys:
                data_keys[site][sp]["keys"] = species_keys
                data_keys[site][sp]["rank_metadata"] = species_rank_data
                data_keys[site][sp]["metadata"] = species_metadata
            else:
                continue

            # We now need to retrieve data for the dateranges for which we don't have ranking data
            # To do this find the gaps in the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
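# A minimal usage sketch of the search function above, not part of the module itself.
# It assumes observation data (e.g. the BSD CRDS files read by the data_read fixture
# below) has already been stored; the site / species / inlet values are illustrative only.
def _example_search_usage():
    # Ranked timeseries search over a year of data
    results = search(site="bsd", species="ch4", start_date="2012-01-01", end_date="2013-01-01")

    if not results:
        return []

    # Retrieve the object store keys for a specific site / species / inlet combination
    return results.keys(site="bsd", species="ch4", inlet="248m")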
def data_read():
    get_local_bucket(empty=True)

    # DECC network sites
    network = "DECC"

    bsd_248_path = get_datapath(filename="bsd.picarro.1minute.248m.min.dat", data_type="CRDS")
    bsd_108_path = get_datapath(filename="bsd.picarro.1minute.108m.min.dat", data_type="CRDS")
    bsd_42_path = get_datapath(filename="bsd.picarro.1minute.42m.min.dat", data_type="CRDS")

    bsd_paths = [bsd_248_path, bsd_108_path, bsd_42_path]

    bsd_results = ObsSurface.read_file(filepath=bsd_paths, data_type="CRDS", site="bsd", network=network)

    hfd_100_path = get_datapath(filename="hfd.picarro.1minute.100m.min.dat", data_type="CRDS")
    hfd_50_path = get_datapath(filename="hfd.picarro.1minute.50m.min.dat", data_type="CRDS")
    hfd_paths = [hfd_100_path, hfd_50_path]

    ObsSurface.read_file(filepath=hfd_paths, data_type="CRDS", site="hfd", network=network)

    tac_path = get_datapath(filename="tac.picarro.1minute.100m.test.dat", data_type="CRDS")
    ObsSurface.read_file(filepath=tac_path, data_type="CRDS", site="tac", network=network)

    # GCWERKS data (AGAGE network sites)
    data_filepath = get_datapath(filename="capegrim-medusa.18.C", data_type="GC")
    prec_filepath = get_datapath(filename="capegrim-medusa.18.precisions.C", data_type="GC")

    ObsSurface.read_file(
        filepath=(data_filepath, prec_filepath), site="CGO", data_type="GCWERKS", network="AGAGE"
    )

    mhd_data_filepath = get_datapath(filename="macehead.12.C", data_type="GC")
    mhd_prec_filepath = get_datapath(filename="macehead.12.precisions.C", data_type="GC")

    ObsSurface.read_file(
        filepath=(mhd_data_filepath, mhd_prec_filepath),
        site="MHD",
        data_type="GCWERKS",
        network="AGAGE",
        instrument="GCMD",
    )

    # Set ranking information for BSD
    obs = ObsSurface.load()

    uid_248 = bsd_results["processed"]["bsd.picarro.1minute.248m.min.dat"]["ch4"]
    obs.set_rank(uuid=uid_248, rank=1, date_range="2012-01-01_2013-01-01")

    uid_108 = bsd_results["processed"]["bsd.picarro.1minute.108m.min.dat"]["ch4"]
    obs.set_rank(uuid=uid_108, rank=1, date_range="2014-09-02_2014-11-01")

    obs.set_rank(uuid=uid_248, rank=1, date_range="2015-01-01_2015-11-01")

    obs.set_rank(uuid=uid_108, rank=1, date_range="2016-09-02_2018-11-01")

    uid_42 = bsd_results["processed"]["bsd.picarro.1minute.42m.min.dat"]["ch4"]
    obs.set_rank(uuid=uid_42, rank=1, date_range="2019-01-02_2021-01-01")

    # Emissions data
    test_datapath = get_emissions_datapath("co2-gpp-cardamom-mth_EUROPE_2012.nc")

    Emissions.read_file(
        filepath=test_datapath,
        species="co2",
        source="gpp-cardamom",
        date="2012",
        domain="europe",
        high_time_resolution=False,
    )

    # Footprint data
    datapath = get_footprint_datapath("footprint_test.nc")

    site = "TMB"
    network = "LGHG"
    height = "10m"
    domain = "EUROPE"
    model = "test_model"

    Footprints.read_file(
        filepath=datapath, site=site, model=model, network=network, height=height, domain=domain
    )
def test_read_footprint():
    get_local_bucket(empty=True)

    datapath = get_footprint_datapath("footprint_test.nc")

    # model_params = {"simulation_params": "123"}

    site = "TMB"
    network = "LGHG"
    height = "10m"
    domain = "EUROPE"
    model = "test_model"

    Footprints.read_file(
        filepath=datapath, site=site, model=model, network=network, height=height, domain=domain
    )

    # Get the footprints data
    footprint_results = search(site=site, domain=domain, data_type="footprints")

    fp_site_key = list(footprint_results.keys())[0]

    footprint_keys = footprint_results[fp_site_key]["keys"]
    footprint_data = recombine_datasets(keys=footprint_keys, sort=False)

    footprint_coords = list(footprint_data.coords.keys())
    footprint_dims = list(footprint_data.dims)

    # Sorting to allow comparison - coords / dims can be stored in different orders
    # depending on how the Dataset has been manipulated
    footprint_coords.sort()
    footprint_dims.sort()

    assert footprint_coords == ["height", "lat", "lat_high", "lev", "lon", "lon_high", "time"]
    assert footprint_dims == ["height", "index", "lat", "lat_high", "lev", "lon", "lon_high", "time"]

    assert (
        footprint_data.attrs["heights"]
        == [
            500.0,
            1500.0,
            2500.0,
            3500.0,
            4500.0,
            5500.0,
            6500.0,
            7500.0,
            8500.0,
            9500.0,
            10500.0,
            11500.0,
            12500.0,
            13500.0,
            14500.0,
            15500.0,
            16500.0,
            17500.0,
            18500.0,
            19500.0,
        ]
    ).all()

    assert footprint_data.attrs["variables"] == [
        "fp",
        "temperature",
        "pressure",
        "wind_speed",
        "wind_direction",
        "PBLH",
        "release_lon",
        "release_lat",
        "particle_locations_n",
        "particle_locations_e",
        "particle_locations_s",
        "particle_locations_w",
        "mean_age_particles_n",
        "mean_age_particles_e",
        "mean_age_particles_s",
        "mean_age_particles_w",
        "fp_low",
        "fp_high",
        "index_lons",
        "index_lats",
    ]

    del footprint_data.attrs["processed"]
    del footprint_data.attrs["heights"]
    del footprint_data.attrs["variables"]

    expected_attrs = {
        "author": "OpenGHG Cloud",
        "data_type": "footprints",
        "site": "tmb",
        "network": "lghg",
        "height": "10m",
        "model": "test_model",
        "domain": "europe",
        "start_date": "2020-08-01 00:00:00+00:00",
        "end_date": "2020-08-01 00:00:00+00:00",
        "max_longitude": 39.38,
        "min_longitude": -97.9,
        "max_latitude": 79.057,
        "min_latitude": 10.729,
        "time_resolution": "standard_time_resolution",
    }

    assert footprint_data.attrs == expected_attrs

    assert footprint_data["fp_low"].max().values == pytest.approx(0.43350983)
    assert footprint_data["fp_high"].max().values == pytest.approx(0.11853027)
    assert footprint_data["pressure"].max().values == pytest.approx(1011.92)
    assert footprint_data["fp_low"].min().values == 0.0
    assert footprint_data["fp_high"].min().values == 0.0
    assert footprint_data["pressure"].min().values == pytest.approx(1011.92)