Example 1
def test_read_same_footprint_twice_raises():
    datapath = get_footprint_datapath("footprint_test.nc")

    # The first read of the file should succeed
    Footprints.read_file(
        filepath=datapath,
        site="TMB",
        model="test_model",
        network="LGHG",
        domain="EUROPE",
        height="10magl",
    )

    # Reading the same footprint file a second time should raise
    with pytest.raises(ValueError):
        Footprints.read_file(
            filepath=datapath,
            site="TMB",
            model="test_model",
            network="LGHG",
            domain="EUROPE",
            height="10magl",
        )
Example 2
def data_read():
    '''
    Set up the data needed to run the tests in this set of modules.
    '''
    get_local_bucket(empty=True)

    # Files for creating forward model (mf_mod) for methane at TAC site

    # Observation data
    #  - TAC at 100m for 201208
    site = "tac"
    network = "DECC"
    data_type = "CRDS"

    tac_path = get_datapath(filename="tac.picarro.1minute.100m.201208.dat",
                            data_type="CRDS")
    ObsSurface.read_file(filepath=tac_path,
                         data_type=data_type,
                         site=site,
                         network=network)

    # Emissions data
    # Anthropogenic ch4 (methane) data from 2012 for EUROPE
    species = "ch4"
    source = "anthro"
    domain = "EUROPE"

    emissions_datapath = get_emissions_datapath("ch4-anthro_EUROPE_2012.nc")

    Emissions.read_file(
        filepath=emissions_datapath,
        species=species,
        source=source,
        date="2012",
        domain=domain,
        high_time_resolution=False,
    )

    # Footprint data
    # TAC footprint from 2012-08 - 2012-09 at 100m
    height = "100m"
    model = "NAME"

    fp_datapath = get_footprint_datapath("TAC-100magl_EUROPE_201208.nc")

    Footprints.read_file(filepath=fp_datapath,
                         site=site,
                         model=model,
                         network=network,
                         height=height,
                         domain=domain)
Example 3
def test_set_lookup_uuids():
    f = Footprints()

    fake_uuid = "123456789"

    site = "test_site"
    domain = "test_domain"
    model = "test_model"
    height = "test_height"

    f.set_uuid(site=site, domain=domain, model=model, height=height, uuid=fake_uuid)

    found_uid = f.lookup_uuid(site=site, domain=domain, model=model, height=height)

    assert f._datasource_table[site][domain][model][height] == found_uid == fake_uuid
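
The assertion above implies a nested site -> domain -> model -> height keying scheme for the UUID lookup table. Below is a self-contained sketch of that scheme; the helper nested_set_uuid and the plain-dict table are illustrative only, inferred from the assertion rather than taken from the Footprints implementation:

def nested_set_uuid(table, site, domain, model, height, uuid):
    # Build the nested levels on demand, mirroring the path checked in
    # f._datasource_table[site][domain][model][height]
    table.setdefault(site, {}).setdefault(domain, {}).setdefault(model, {})[height] = uuid

table = {}
nested_set_uuid(table, "test_site", "test_domain", "test_model", "test_height", "123456789")
assert table["test_site"]["test_domain"]["test_model"]["test_height"] == "123456789"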
Example 4
def co2_setup():
    get_local_bucket(empty=True)

    data_type = "CRDS"

    tac_file = get_datapath(filename="tac.picarro.hourly.100m.test.dat",
                            data_type=data_type)
    tac_footprint = get_fp_datapath("TAC-100magl_UKV_co2_TEST_201407.nc")
    co2_emissions = get_flux_datapath("co2-rtot-cardamom-2hr_TEST_2014.nc")

    site = "tac"
    species = "co2"
    network = "DECC"
    height = "100m"

    domain = "TEST"
    model = "NAME"
    metmodel = "UKV"

    source = "rtot-cardamom"
    date = "2014"

    ObsSurface.read_file(filepath=tac_file,
                         data_type=data_type,
                         site=site,
                         network=network,
                         inlet=height)

    Footprints.read_file(filepath=tac_footprint,
                         site=site,
                         height=height,
                         domain=domain,
                         model=model,
                         metmodel=metmodel,
                         species=species)

    Emissions.read_file(filepath=co2_emissions,
                        species=species,
                        source=source,
                        domain=domain,
                        date=date,
                        high_time_resolution=True)
Example 5
def test_wrong_uuid_raises():
    f = Footprints()

    fake_datasource = {"tmb_lghg_10m_europe": "mock-uuid-123456"}

    fake_metadata = {
        "tmb_lghg_10m_europe": {
            "data_type": "footprints",
            "site": "tmb",
            "height": "10m",
            "domain": "europe",
            "model": "test_model",
            "network": "lghg",
        }
    }

    f.add_datasources(datasource_uuids=fake_datasource, metadata=fake_metadata)

    assert f.datasources() == ["mock-uuid-123456"]

    changed_datasource = {"tmb_lghg_10m_europe": "mock-uuid-8888888"}

    with pytest.raises(ValueError):
        f.add_datasources(datasource_uuids=changed_datasource, metadata=fake_metadata)
Example 6
def test_datasource_add_lookup():
    f = Footprints()

    fake_datasource = {"tmb_lghg_10m_europe": "mock-uuid-123456"}

    fake_metadata = {
        "tmb_lghg_10m_europe": {
            "data_type": "footprints",
            "site": "tmb",
            "height": "10m",
            "domain": "europe",
            "model": "test_model",
            "network": "lghg",
        }
    }

    f.add_datasources(datasource_uuids=fake_datasource, metadata=fake_metadata)

    assert f.datasources() == ["mock-uuid-123456"]

    lookup = f.datasource_lookup(fake_metadata)

    assert lookup == fake_datasource
Example 7
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to the
    the function and these keywords will be used to search the metadata associated
    with each Datasource.

    Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
        If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
        If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
    Returns:
        dict: List of keys of Datasources matching the search parameters
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain
    from typing import Dict, Union

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource

    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    skip_ranking = kwargs_copy.pop("skip_ranking", False)

    # Remove any kwargs whose value is None
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation

    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResults object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()
        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and species then just return the data
    # TODO - should instrument be added here
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()
        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()
        # Get the site inlet and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))
        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            (
                "No ranking data set for the given search parameters."
                " Please refine your search to include a specific site, species and inlet."
            )
        )
    # Now that we have the highest ranked data and the dateranges it covers,
    # we want to fill in the gaps with (currently) the highest inlet from that site

    # We just want some rank_metadata to go along with the final data scheme
    # Keyed by daterange -> inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if not species_keys:
                continue

            data_keys[site][sp]["keys"] = species_keys
            data_keys[site][sp]["rank_metadata"] = species_rank_data
            data_keys[site][sp]["metadata"] = species_metadata

            # We now need to retrieve data for the dateranges for which we don't have ranking data
            # To do this find the gaps in the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
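
For reference, a short usage sketch of search(). The site, species and inlet values are assumptions borrowed from the test data loaded in Example 8 below, and the results.keys(...) call mirrors the one search() itself makes when filling ranking gaps:

results = search(site="bsd", species="ch4", inlet="248m", start_date="2012-01-01", end_date="2013-01-01")

if results:
    # A timeseries search returns a SearchResults object; footprint, emissions
    # and eulerian_model searches return a plain dict keyed by Datasource UUID
    data_keys = results.keys(site="bsd", species="ch4", inlet="248m")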
Example 8
def data_read():
    get_local_bucket(empty=True)

    # DECC network sites
    network = "DECC"
    bsd_248_path = get_datapath(filename="bsd.picarro.1minute.248m.min.dat", data_type="CRDS")
    bsd_108_path = get_datapath(filename="bsd.picarro.1minute.108m.min.dat", data_type="CRDS")
    bsd_42_path = get_datapath(filename="bsd.picarro.1minute.42m.min.dat", data_type="CRDS")

    bsd_paths = [bsd_248_path, bsd_108_path, bsd_42_path]

    bsd_results = ObsSurface.read_file(filepath=bsd_paths, data_type="CRDS", site="bsd", network=network)

    hfd_100_path = get_datapath(filename="hfd.picarro.1minute.100m.min.dat", data_type="CRDS")
    hfd_50_path = get_datapath(filename="hfd.picarro.1minute.50m.min.dat", data_type="CRDS")
    hfd_paths = [hfd_100_path, hfd_50_path]

    ObsSurface.read_file(filepath=hfd_paths, data_type="CRDS", site="hfd", network=network)

    tac_path = get_datapath(filename="tac.picarro.1minute.100m.test.dat", data_type="CRDS")
    ObsSurface.read_file(filepath=tac_path, data_type="CRDS", site="tac", network=network)

    # GCWERKS data (AGAGE network sites)
    data_filepath = get_datapath(filename="capegrim-medusa.18.C", data_type="GC")
    prec_filepath = get_datapath(filename="capegrim-medusa.18.precisions.C", data_type="GC")

    ObsSurface.read_file(filepath=(data_filepath, prec_filepath), site="CGO", data_type="GCWERKS", network="AGAGE")

    mhd_data_filepath = get_datapath(filename="macehead.12.C", data_type="GC")
    mhd_prec_filepath = get_datapath(filename="macehead.12.precisions.C", data_type="GC")

    ObsSurface.read_file(filepath=(mhd_data_filepath, mhd_prec_filepath), site="MHD", data_type="GCWERKS", network="AGAGE", instrument="GCMD")

    # Set ranking information for BSD
    obs = ObsSurface.load()

    uid_248 = bsd_results["processed"]["bsd.picarro.1minute.248m.min.dat"]["ch4"]
    obs.set_rank(uuid=uid_248, rank=1, date_range="2012-01-01_2013-01-01")

    uid_108 = bsd_results["processed"]["bsd.picarro.1minute.108m.min.dat"]["ch4"]
    obs.set_rank(uuid=uid_108, rank=1, date_range="2014-09-02_2014-11-01")

    obs.set_rank(uuid=uid_248, rank=1, date_range="2015-01-01_2015-11-01")

    obs.set_rank(uuid=uid_108, rank=1, date_range="2016-09-02_2018-11-01")

    uid_42 = bsd_results["processed"]["bsd.picarro.1minute.42m.min.dat"]["ch4"]
    obs.set_rank(uuid=uid_42, rank=1, date_range="2019-01-02_2021-01-01")

    # Emissions data
    test_datapath = get_emissions_datapath("co2-gpp-cardamom-mth_EUROPE_2012.nc")

    Emissions.read_file(
        filepath=test_datapath,
        species="co2",
        source="gpp-cardamom",
        date="2012",
        domain="europe",
        high_time_resolution=False,
    )

    # Footprint data
    datapath = get_footprint_datapath("footprint_test.nc")

    site = "TMB"
    network = "LGHG"
    height = "10m"
    domain = "EUROPE"
    model = "test_model"

    Footprints.read_file(
        filepath=datapath, site=site, model=model, network=network, height=height, domain=domain
    )
Example 9
def test_read_footprint():
    get_local_bucket(empty=True)

    datapath = get_footprint_datapath("footprint_test.nc")
    # model_params = {"simulation_params": "123"}

    site = "TMB"
    network = "LGHG"
    height = "10m"
    domain = "EUROPE"
    model = "test_model"

    Footprints.read_file(
        filepath=datapath, site=site, model=model, network=network, height=height, domain=domain
    )

    # Get the footprints data
    footprint_results = search(site=site, domain=domain, data_type="footprints")

    fp_site_key = list(footprint_results.keys())[0]

    footprint_keys = footprint_results[fp_site_key]["keys"]
    footprint_data = recombine_datasets(keys=footprint_keys, sort=False)

    footprint_coords = list(footprint_data.coords.keys())
    footprint_dims = list(footprint_data.dims)

    # Sorting to allow comparison - coords / dims can be stored in different orders
    # depending on how the Dataset has been manipulated
    footprint_coords.sort()
    footprint_dims.sort()

    assert footprint_coords == ["height", "lat", "lat_high", "lev", "lon", "lon_high", "time"]
    assert footprint_dims == ["height", "index", "lat", "lat_high", "lev", "lon", "lon_high", "time"]

    assert (
        footprint_data.attrs["heights"]
        == [
            500.0,
            1500.0,
            2500.0,
            3500.0,
            4500.0,
            5500.0,
            6500.0,
            7500.0,
            8500.0,
            9500.0,
            10500.0,
            11500.0,
            12500.0,
            13500.0,
            14500.0,
            15500.0,
            16500.0,
            17500.0,
            18500.0,
            19500.0,
        ]
    ).all()

    assert footprint_data.attrs["variables"] == [
        "fp",
        "temperature",
        "pressure",
        "wind_speed",
        "wind_direction",
        "PBLH",
        "release_lon",
        "release_lat",
        "particle_locations_n",
        "particle_locations_e",
        "particle_locations_s",
        "particle_locations_w",
        "mean_age_particles_n",
        "mean_age_particles_e",
        "mean_age_particles_s",
        "mean_age_particles_w",
        "fp_low",
        "fp_high",
        "index_lons",
        "index_lats",
    ]

    del footprint_data.attrs["processed"]
    del footprint_data.attrs["heights"]
    del footprint_data.attrs["variables"]

    expected_attrs = {
        "author": "OpenGHG Cloud",
        "data_type": "footprints",
        "site": "tmb",
        "network": "lghg",
        "height": "10m",
        "model": "test_model",
        "domain": "europe",
        "start_date": "2020-08-01 00:00:00+00:00",
        "end_date": "2020-08-01 00:00:00+00:00",
        "max_longitude": 39.38,
        "min_longitude": -97.9,
        "max_latitude": 79.057,
        "min_latitude": 10.729,
        "time_resolution": "standard_time_resolution",
    }

    assert footprint_data.attrs == expected_attrs

    footprint_data["fp_low"].max().values == pytest.approx(0.43350983)
    footprint_data["fp_high"].max().values == pytest.approx(0.11853027)
    footprint_data["pressure"].max().values == pytest.approx(1011.92)
    footprint_data["fp_low"].min().values == 0.0
    footprint_data["fp_high"].min().values == 0.0
    footprint_data["pressure"].min().values == pytest.approx(1011.92)