Example #1
# Imports assumed for this excerpt; the module paths match those used in
# Example #3 and elsewhere in the OpenGHG test suite.
import os

import xarray as xr

from openghg.objectstore import get_local_bucket
from openghg.store.base import Datasource


def test_load_dataset():
    filename = "WAO-20magl_EUROPE_201306_small.nc"
    dir_path = os.path.dirname(__file__)
    test_data = "../data/emissions"
    filepath = os.path.join(dir_path, test_data, filename)

    ds = xr.load_dataset(filepath)

    metadata = {"some": "metadata"}

    d = Datasource()

    d.add_data(metadata=metadata, data=ds, data_type="footprints")

    d.save()

    # Retrieve the object store keys for the latest version of the stored data
    keys = d._data_keys["latest"]["keys"]

    key = list(keys.values())[0]

    bucket = get_local_bucket()

    loaded_ds = Datasource.load_dataset(bucket=bucket, key=key)

    assert loaded_ds.equals(ds)
Example #2
# Imports assumed for this excerpt; `data` is a pytest fixture (defined
# elsewhere in the test suite) that supplies parsed CH4 data and metadata.
import pytest
import xarray as xr

from openghg.objectstore import get_local_bucket
from openghg.store.base import Datasource


def test_add_data(data):
    d = Datasource()

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    assert ch4_data["ch4"][0] == pytest.approx(1959.55)
    assert ch4_data["ch4_variability"][0] == pytest.approx(0.79)
    assert ch4_data["ch4_number_of_observations"][0] == pytest.approx(26.0)

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")
    d.save()
    bucket = get_local_bucket()

    data_chunks = [
        Datasource.load_dataset(bucket=bucket, key=k) for k in d.data_keys()
    ]

    # Now read it out and make sure it's what we expect
    combined = xr.concat(data_chunks, dim="time")

    assert combined.equals(ch4_data)

    expected_metadata = {
        "site": "bsd",
        "instrument": "picarro",
        "sampling_period": "60",
        "inlet": "248m",
        "port": "9",
        "type": "air",
        "network": "decc",
        "species": "ch4",
        "scale": "wmo-x2004a",
        "long_name": "bilsdale",
        "data_owner": "simon o'doherty",
        "data_owner_email": "*****@*****.**",
        "inlet_height_magl": "248m",
        "comment": "cavity ring-down measurements. output from gcwerks",
        "source": "in situ measurements of air",
        "conventions": "cf-1.6",
        "calibration_scale": "wmo-x2004a",
        "station_longitude": -1.15033,
        "station_latitude": 54.35858,
        "station_long_name": "bilsdale, uk",
        "station_height_masl": 380.0,
        "data_type": "timeseries",
    }

    assert d.metadata() == expected_metadata
Example #3
# Assumed imports for the type hints in this excerpt; `elevate_duplicate_attrs`
# is a helper defined alongside this function in the OpenGHG source and is not
# shown here.
from typing import Dict, List, Optional, Union

from xarray import Dataset


def recombine_datasets(
    keys: List[str],
    sort: Optional[bool] = True,
    attrs_to_check: Union[str, List[str], Dict[str, str], None] = None,
) -> Dataset:
    """Combines datasets stored separately in the object store
    into a single dataset

    Args:
        keys: List of object store keys
        sort: Sort the resulting Dataset by the time dimension. Default = True
        attrs_to_check: Attributes to check for duplicates. If duplicates are present,
            a new data variable will be created containing the values from each dataset.
            If a dictionary is passed, the attribute(s) will be retained and the new value assigned;
            if a list/string is passed, the attribute(s) will be removed.
    Returns:
        xarray.Dataset: Combined Dataset
    """
    from xarray import concat as xr_concat
    from openghg.store.base import Datasource
    from openghg.objectstore import get_bucket

    if not keys:
        raise ValueError("No data keys passed.")

    bucket = get_bucket()

    data = [Datasource.load_dataset(bucket=bucket, key=k) for k in keys]

    if attrs_to_check is None:
        attrs_to_check = {"inlet": "multiple"}

    # For specified attributes (e.g. "inlet")
    # elevate duplicates to data variables within each Dataset
    if attrs_to_check:
        if isinstance(attrs_to_check, dict):
            attributes = list(attrs_to_check.keys())
            replace_values = list(attrs_to_check.values())
        elif isinstance(attrs_to_check, str):
            attributes = [attrs_to_check]
            replace_values = [""]
        else:
            attributes = attrs_to_check
            replace_values = [""] * len(attributes)

        data = elevate_duplicate_attrs(ds_list=data, attributes=attributes)

    # Concatenate datasets along time dimension
    combined = xr_concat(data, dim="time")

    # Replace/remove attributes that may now be incorrect
    #  - xr.concat only keeps the first dataset's value for a duplicated attribute
    if attrs_to_check:
        for attr, value in zip(attributes, replace_values):
            if attr in combined:  # Only update if attr was elevated to a data variable
                if value:
                    combined.attrs[attr] = value
                else:
                    combined.attrs.pop(attr)

    if sort:
        combined = combined.sortby("time")

    # Check for duplicates?
    # This is taken from https://stackoverflow.com/questions/51058379/drop-duplicate-times-in-xarray
    # _, index = np.unique(combined['time'], return_index=True)
    # combined = combined.isel(time=index)

    return combined
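
A minimal usage sketch of recombine_datasets follows. The object-store key strings below are hypothetical placeholders; in practice they would be returned by a Datasource, as in Example #2.

# Hypothetical keys for illustration only; real keys would come from
# e.g. Datasource.data_keys() as in Example #2
keys = [
    "data/uuid/<datasource-uuid>/v1/2013-06-01_2013-06-30",
    "data/uuid/<datasource-uuid>/v1/2013-07-01_2013-07-31",
]

# Default behaviour: if the "inlet" attribute differs between datasets it is
# elevated to a data variable and the combined attribute is set to "multiple"
combined = recombine_datasets(keys=keys)

# Passing a list/string instead removes the checked attribute(s) from the
# combined dataset rather than assigning a replacement value
combined = recombine_datasets(keys=keys, attrs_to_check=["inlet"])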