Example 1
def _get_site_data(site: str,
                   network: str) -> Tuple[float, float, float, List]:
    """Extract site location data from site attributes file.

    Args:
        site: Site code
        network: Network name
    Returns:
        tuple: Latitude, longitude, site height (m asl) and list of inlet heights
    """
    from openghg.util import load_json

    network = network.upper()
    site = site.upper()

    site_info = load_json("acrg_site_info.json")

    try:
        site_data = site_info[site][network]
        latitude = float(site_data["latitude"])
        longitude = float(site_data["longitude"])
        site_height = float(site_data["height_station_masl"])
        inlet_heights = site_data["height_name"]
    except KeyError as e:
        raise KeyError(f"Incorrect site or network : {e}")

    return latitude, longitude, site_height, inlet_heights
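
A minimal usage sketch for _get_site_data, assuming it is called from within its defining module and that the "MHD"/"AGAGE" site/network pair (illustrative values, not taken from the source) exists in the bundled acrg_site_info.json:

# Illustrative only: substitute a site/network pair present in your copy of acrg_site_info.json
latitude, longitude, site_height, inlet_heights = _get_site_data(site="mhd", network="agage")
print(f"lat={latitude}, lon={longitude}, height={site_height} m asl, inlets={inlet_heights}")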
Example 2
def _altitude_to_ecmwf_pressure(measure_pressure: List[float]) -> List[str]:
    """Find out what pressure levels are required from ERA5.

    Args:
        measure_pressure: List of pressures
    Returns:
        list: List of desired pressures
    """
    import numpy as np
    from openghg.util import load_json

    ecmwf_metadata = load_json("ecmwf_dataset_info.json")
    dataset_metadata = ecmwf_metadata["datasets"]
    valid_levels = dataset_metadata["reanalysis_era5_pressure_levels"][
        "valid_levels"]

    # Available ERA5 pressure levels
    era5_pressure_levels = np.array(valid_levels)

    # Match pressure to ERA5 pressure levels
    ecmwf_pressure_indices = np.zeros(len(measure_pressure) * 2)

    for index, m in enumerate(measure_pressure):
        ecmwf_pressure_indices[index * 2 : index * 2 + 2] = _two_closest_values(m - era5_pressure_levels)

    desired_era5_pressure = era5_pressure_levels[np.unique(ecmwf_pressure_indices).astype(int)]

    pressure_levels: List = desired_era5_pressure.astype(str).tolist()

    return pressure_levels
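
The function above depends on a _two_closest_values helper that is not shown in these examples. A minimal sketch of what such a helper might look like, under the assumption that it returns the indices of the two ERA5 levels nearest to the measured pressure:

import numpy as np

def _two_closest_values(differences: np.ndarray) -> np.ndarray:
    # Indices of the two smallest absolute differences, i.e. the two
    # pressure levels closest to the measured value (assumed behaviour)
    return np.argsort(np.abs(differences))[:2]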
Example 3
def verify_site(site: str) -> str:
    """Check if the passed site is a valid one and returns the three
    letter site code if found. Otherwise we use fuzzy text matching to suggest
    sites with similar names.

    Args:
        site: Three letter site code or site name
    Returns:
        str: Verified three letter site code if valid site
    """
    from openghg.util import load_json, remove_punctuation
    from openghg.types import InvalidSiteError

    site_data = load_json("site_lookup.json")

    if site.upper() in site_data:
        return site.lower()
    else:
        site = remove_punctuation(site)
        name_lookup: Dict[str, str] = {
            value["short_name"]: code
            for code, value in site_data.items()
        }

        try:
            return name_lookup[site].lower()
        except KeyError:
            long_names = {
                value["long_name"]: code
                for code, value in site_data.items()
            }
            message = find_matching_site(site_name=site,
                                         possible_sites=long_names)
            raise InvalidSiteError(message)
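
A brief usage sketch, assuming the function is called from its defining module and that site_lookup.json contains the illustrative site name used here:

from openghg.types import InvalidSiteError

try:
    print(verify_site("tacolneston"))  # illustrative long site name
except InvalidSiteError as err:
    # For an unrecognised name the raised error suggests close matches
    print(err)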
Example 4
def _synonyms(species: str) -> str:
    """
    Check to see if there are other names that we should be using for
    a particular input. E.g. If CFC-11 or CFC11 was input, go on to use cfc-11,
    as this is used in species_info.json

    Args:
        species (str): Input string that you're trying to match
    Returns:
        str: Matched species string
    """

    from openghg.util import load_json

    # Load in the species data
    species_data = load_json(filename="acrg_species_info.json")

    # First test whether species matches a key (case insensitive)
    matched_strings = [k for k in species_data if k.upper() == species.upper()]

    # Used to access the alternative names in species_data
    alt_label = "alt"

    # If not found, search synonyms
    if not matched_strings:
        for key in species_data:
            # Iterate over the alternative labels and check for a match
            matched_strings = [
                s for s in species_data[key][alt_label]
                if s.upper() == species.upper()
            ]

            if matched_strings:
                matched_strings = [key]
                break

    if matched_strings:
        updated_species = str(matched_strings[0])
        return updated_species
    else:
        raise ValueError(f"Unable to find synonym for species {species}")
Example 5
def _site_info_attributes(site: str, network: Optional[str] = None) -> Dict:
    """Reads site attributes from JSON

    Args:
        site: Site code
        network: Network name
    Returns:
        dict: Dictionary of site attributes
    """
    from openghg.util import load_json

    site = site.upper()

    # Read site info file
    data_filename = "acrg_site_info.json"
    site_params = load_json(filename=data_filename)

    if site not in site_params:
        raise ValueError(f"Invalid site {site} passed. Please use a valid site code such as BSD for Bilsdale")

    if network is None:
        network = next(iter(site_params[site]))
    else:
        network = network.upper()

    attributes_dict = {
        "longitude": "station_longitude",
        "latitude": "station_latitude",
        "long_name": "station_long_name",
        "height_station_masl": "station_height_masl",
    }

    attributes = {}
    for attr, attr_key in attributes_dict.items():
        if attr in site_params[site][network]:
            attributes[attr_key] = site_params[site][network][attr]

    return attributes
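
A usage sketch; BSD (Bilsdale) is taken from the error message above and DECC is an assumed network for that site:

attrs = _site_info_attributes(site="bsd", network="decc")
print(attrs)  # e.g. station_longitude, station_latitude, station_long_name, station_height_masl

# With no network given, the first network listed for the site is used
print(_site_info_attributes(site="bsd"))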
Example 6
def multiple_inlets(site: str) -> bool:
    """Check if the passed site has more than one inlet

    Args:
        site: Three letter site code
    Returns:
        bool: True if multiple inlets
    """
    from openghg.util import load_json

    site_data = load_json("acrg_site_info.json")

    site = site.upper()
    network = next(iter(site_data[site]))

    try:
        heights = set(site_data[site][network]["height"])
    except KeyError:
        try:
            heights = set(site_data[site][network]["height_name"])
        except KeyError:
            return True

    return len(heights) > 1
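
A short usage sketch, again assuming BSD is present in acrg_site_info.json:

if multiple_inlets(site="bsd"):
    print("BSD has more than one inlet - specify an inlet when retrieving data")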
Example 7
def parse_btt(
    data_filepath: Union[str, Path],
    site: Optional[str] = "BTT",
    network: Optional[str] = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dict:
    """Reads NPL data files and returns the UUIDS of the Datasources
    the processed data has been assigned to

    Args:
        data_filepath: Path of file to load
        site: Site name
    Returns:
        dict: Dictionary of gas data
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes
    from pandas import read_csv, Timestamp, to_timedelta, isnull
    from numpy import nan as np_nan
    from openghg.util import clean_string, load_json

    # TODO: Decide what to do about inputs which aren't used anywhere
    # at present - inlet, instrument, sampling_period, measurement_type

    data_filepath = Path(data_filepath)

    site = "BTT"

    # Rename these columns
    rename_dict = {"co2.cal": "CO2", "ch4.cal.ppb": "CH4"}
    # We only want these species
    species_extract = ["CO2", "CH4"]
    # Take std-dev measurements from these columns for these species
    species_sd = {"CO2": "co2.sd.ppm", "CH4": "ch4.sd.ppb"}

    param_data = load_json(filename="attributes.json")
    network_params = param_data["BTT"]

    sampling_period = int(network_params["sampling_period"])
    sampling_period_seconds = str(sampling_period) + "s"

    data = read_csv(data_filepath)
    data["time"] = Timestamp("2019-01-01 00:00") + to_timedelta(
        data["DOY"] - 1, unit="D")
    data["time"] = data["time"].dt.round(sampling_period_seconds)
    data = data[~isnull(data.time)]

    data = data.rename(columns=rename_dict)
    data = data.set_index("time")

    gas_data = {}
    for species in species_extract:
        processed_data = data.loc[:, [species]].sort_index()
        # Create a variability column
        species_stddev_label = species_sd[species]
        processed_data[species][f"{species} variability"] = data[
            species_stddev_label]

        # Replace any values below zero with NaNs
        processed_data[processed_data < 0] = np_nan
        # Drop NaNs
        processed_data = processed_data.dropna()
        # Convert to a Dataset
        processed_data = processed_data.to_xarray()

        site_attributes = network_params["global_attributes"]
        site_attributes["inlet_height_magl"] = network_params["inlet"]
        site_attributes["instrument"] = network_params["instrument"]
        site_attributes["sampling_period"] = sampling_period

        # TODO - add in better metadata reading
        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "BTT",
        }

        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
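
A usage sketch showing the shape of the returned dictionary; the filename is hypothetical and assumes a BTT file with the expected DOY, co2.cal and ch4.cal.ppb columns:

gas_data = parse_btt(data_filepath="btt_data.csv")  # hypothetical file

for species, entry in gas_data.items():
    # Each entry holds an xarray Dataset plus its metadata and site attributes
    print(species, entry["metadata"], entry["data"])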
Example 8
def get_obs_surface(
    site: str,
    species: str,
    inlet: Optional[str] = None,
    start_date: Optional[Union[str, Timestamp]] = None,
    end_date: Optional[Union[str, Timestamp]] = None,
    average: Optional[str] = None,
    network: Optional[str] = None,
    instrument: Optional[str] = None,
    calibration_scale: Optional[str] = None,
    keep_missing: Optional[bool] = False,
    skip_ranking: Optional[bool] = False,
) -> ObsData:
    """Get measurements from one site.

    Args:
        site: Site of interest e.g. MHD for the Mace Head site.
        species: Species identifier e.g. ch4 for methane.
        start_date: Output start date in a format that Pandas can interpret
        end_date: Output end date in a format that Pandas can interpret
        inlet: Inlet label
        average: Averaging period for each dataset. Each value should be a string of
            the form e.g. "2H", "30min" (should match pandas offset aliases format).
        keep_missing: Keep missing data points or drop them.
        network: Network for the site/instrument (must match number of sites).
        instrument: Specific instrument for the site (must match number of sites).
        calibration_scale: Convert to this calibration scale
    Returns:
        ObsData: ObsData object
    """
    from pandas import Timestamp, Timedelta
    import numpy as np
    from xarray import concat as xr_concat
    from openghg.retrieve import search
    from openghg.store import recombine_datasets
    from openghg.util import clean_string, load_json, timestamp_tzaware

    site_info = load_json(filename="acrg_site_info.json")
    site = site.upper()

    if site not in site_info:
        raise ValueError(
            f"No site called {site}, please enter a valid site name.")

    # Find the correct synonym for the passed species
    species = clean_string(_synonyms(species))

    # Get the observation data
    obs_results = search(
        site=site,
        species=species,
        inlet=inlet,
        start_date=start_date,
        end_date=end_date,
        instrument=instrument,
        find_all=True,
        skip_ranking=skip_ranking,
    )

    if not obs_results:
        raise ValueError(f"Unable to find results for {species} at {site}")

    # TODO - for some reason mypy doesn't pick up the ObsData being returned here, look into this
    # GJ - 2021-07-19
    retrieved_data: ObsData = obs_results.retrieve(site=site,
                                                   species=species,
                                                   inlet=inlet)  # type: ignore
    data = retrieved_data.data

    if data.attrs["inlet"] == "multiple":
        data.attrs["inlet_height_magl"] = "multiple"
        retrieved_data.metadata["inlet"] = "multiple"

    if start_date is not None and end_date is not None:
        start_date_tzaware = timestamp_tzaware(start_date)
        end_date_tzaware = timestamp_tzaware(end_date)
        end_date_tzaware_exclusive = end_date_tzaware - Timedelta(
            1, unit="nanosecond"
        )  # Deduct 1 ns to make the end day (date) exclusive.

        # Slice the data to only cover the dates we're interested in
        data = data.sel(
            time=slice(start_date_tzaware, end_date_tzaware_exclusive))

    try:
        start_date_data = timestamp_tzaware(data.time[0].values)
        end_date_data = timestamp_tzaware(data.time[-1].values)
    except AttributeError:
        raise AttributeError(
            "This dataset does not have a time attribute, unable to read date range"
        )

    if average is not None:
        # GJ - 2021-03-09
        # TODO - check by RT

        # # Average the Dataset over a given period
        # if keep_missing is True:
        #     # Create a dataset with one element and NaNs to prepend or append
        #     ds_single_element = data[{"time": 0}]

        #     for v in ds_single_element.variables:
        #         if v != "time":
        #             ds_single_element[v].values = np.nan

        #     ds_concat = []

        #     # Pad with an empty entry at the start date
        #     if timestamp_tzaware(data.time.min()) > start_date:
        #         ds_single_element_start = ds_single_element.copy()
        #         ds_single_element_start.time.values = Timestamp(start_date)
        #         ds_concat.append(ds_single_element_start)

        #     ds_concat.append(data)

        #     # Pad with an empty entry at the end date
        #     if data.time.max() < Timestamp(end_date):
        #         ds_single_element_end = ds_single_element.copy()
        #         ds_single_element_end.time.values = Timestamp(end_date) - Timedelta("1ns")
        #         ds_concat.append(ds_single_element_end)

        #     data = xr_concat(ds_concat, dim="time")

        #     # Now sort to get everything in the right order
        #     data = data.sortby("time")

        # First do a mean resample on all variables
        ds_resampled = data.resample(time=average).mean(skipna=False,
                                                        keep_attrs=True)
        # keep_attrs doesn't seem to work for some reason, so manually copy
        ds_resampled.attrs = data.attrs.copy()

        average_in_seconds = Timedelta(average).total_seconds()
        ds_resampled.attrs["averaged_period"] = average_in_seconds
        ds_resampled.attrs["averaged_period_str"] = average

        # For some variables, need a different type of resampling
        data_variables: List[str] = [str(v) for v in data.variables]

        for var in data_variables:
            if "repeatability" in var:
                ds_resampled[var] = (np.sqrt(
                    (data[var]**2).resample(time=average).sum()) /
                                     data[var].resample(time=average).count())

            # Copy over some attributes
            if "long_name" in data[var].attrs:
                ds_resampled[var].attrs["long_name"] = data[var].attrs[
                    "long_name"]

            if "units" in data[var].attrs:
                ds_resampled[var].attrs["units"] = data[var].attrs["units"]

        # Create a new variability variable, containing the standard deviation within the resampling period
        ds_resampled[f"{species}_variability"] = (data[species].resample(
            time=average).std(skipna=False, keep_attrs=True))

        # If there are any periods where only one measurement was resampled, just use the median variability
        ds_resampled[f"{species}_variability"][
            ds_resampled[f"{species}_variability"] ==
            0.0] = ds_resampled[f"{species}_variability"].median()

        # Create attributes for variability variable
        ds_resampled[f"{species}_variability"].attrs[
            "long_name"] = f"{data.attrs['long_name']}_variability"

        ds_resampled[f"{species}_variability"].attrs["units"] = data[
            species].attrs["units"]

        # Resampling may introduce NaNs, so remove, if not keep_missing
        if keep_missing is False:
            ds_resampled = ds_resampled.dropna(dim="time")

        data = ds_resampled

    # Rename variables
    rename: Dict[str, str] = {}

    data_variables = [str(v) for v in data.variables]
    for var in data_variables:
        if var.lower() == species.lower():
            rename[var] = "mf"
        if "repeatability" in var:
            rename[var] = "mf_repeatability"
        if "variability" in var:
            rename[var] = "mf_variability"
        if "number_of_observations" in var:
            rename[var] = "mf_number_of_observations"
        if "status_flag" in var:
            rename[var] = "status_flag"
        if "integration_flag" in var:
            rename[var] = "integration_flag"

    data = data.rename_vars(rename)  # type: ignore

    data.attrs["species"] = species

    if "calibration_scale" in data.attrs:
        data.attrs["scale"] = data.attrs.pop("calibration_scale")

    if calibration_scale is not None:
        data = _scale_convert(data, species, calibration_scale)

    metadata = retrieved_data.metadata
    metadata.update(data.attrs)

    obs_data = ObsData(data=data, metadata=metadata)

    # It doesn't make sense to do this now as we've only got a single Dataset
    # # Now check if the units match for each of the observation Datasets
    # units = set((f.data.mf.attrs["units"] for f in obs_files))
    # scales = set((f.data.attrs["scale"] for f in obs_files))

    # if len(units) > 1:
    #     raise ValueError(
    #         f"Units do not match for these observation Datasets {[(f.mf.attrs['station_long_name'],f.attrs['units']) for f in obs_files]}"
    #     )

    # if len(scales) > 1:
    #     print(
    #         f"Scales do not match for these observation Datasets {[(f.mf.attrs['station_long_name'],f.attrs['units']) for f in obs_files]}"
    #     )
    #     print("Suggestion: set calibration_scale to convert scales")

    return obs_data
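
A usage sketch, assuming methane observations for Mace Head have already been standardised into the object store; all parameter values are illustrative:

obs = get_obs_surface(
    site="MHD",
    species="ch4",
    inlet="10m",
    start_date="2016-01-01",
    end_date="2016-12-31",
    average="1H",  # hourly averaging, pandas offset alias
)

print(obs.metadata["species"])
print(obs.data.mf)  # mole fraction variable after renaming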
Example 9
def _read_data(
    data_filepath: Path,
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Read the datafile passed in and extract the data we require.

    Args:
        data_filepath: Path to file
        site: Three letter site code
        network: Network name
        inlet: Inlet height
        instrument: Instrument name
        sampling_period: Sampling period including the unit (using pandas frequency aliases like '1H' or '1min')
        measurement_type: Measurement type e.g. insitu, flask
    Returns:
        dict: Dictionary of gas data
    """
    from datetime import datetime
    from pandas import RangeIndex, Timedelta, read_csv, NaT
    import warnings
    from openghg.util import clean_string, load_json

    split_fname = data_filepath.stem.split(".")
    site = site.lower()

    try:
        site_fname = clean_string(split_fname[0])
        inlet_fname = clean_string(split_fname[3])
    except IndexError:
        raise ValueError(
            "Error reading metadata from filename, we expect a form hfd.picarro.1minute.100m.dat"
        )

    if site_fname != site:
        raise ValueError(
            "Site mismatch between passed site code and that read from filename."
        )

    if "m" not in inlet_fname:
        raise ValueError(
            "No inlet found, we expect filenames such as: bsd.picarro.1minute.108m.dat"
        )

    if inlet is not None and inlet != inlet_fname:
        raise ValueError(
            "Inlet mismatch between passed inlet and that read from filename.")
    else:
        inlet = inlet_fname

    # Function to parse the datetime format found in the datafile
    def parse_date(date: str):  # type: ignore
        try:
            return datetime.strptime(date, "%y%m%d %H%M%S")
        except ValueError:
            return NaT

    # Catch dtype warnings
    # TODO - look at setting dtypes - read header and data separately?
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        data = read_csv(
            data_filepath,
            header=None,
            skiprows=1,
            sep=r"\s+",
            index_col=["0_1"],
            parse_dates=[[0, 1]],
            date_parser=parse_date,
        )

    data.index.name = "time"

    # Drop any rows with NaNs
    # This is now done before creating metadata
    data = data.dropna(axis="rows", how="any")

    # Get the number of gases in dataframe and number of columns of data present for each gas
    n_gases, n_cols = _gas_info(data=data)

    header = data.head(2)
    skip_cols = sum([header[column][0] == "-" for column in header.columns])

    metadata = _read_metadata(filepath=data_filepath, data=data)

    if network is not None:
        metadata["network"] = network

    if sampling_period is not None:
        # Compare against value extracted from the file name
        file_sampling_period = Timedelta(seconds=metadata["sampling_period"])

        comparison_seconds = abs(Timedelta(sampling_period) -
                                 file_sampling_period).total_seconds()
        tolerance_seconds = 1

        if comparison_seconds > tolerance_seconds:
            raise ValueError(
                f"Input sampling period {sampling_period} does not match to value "
                f"extracted from the file name of {metadata['sampling_period']} seconds."
            )

    # Read the scale from JSON
    # I'll leave this here for the possible future movement from class to functions
    network_metadata = load_json(filename="process_gcwerks_parameters.json")
    crds_metadata = network_metadata["CRDS"]

    # This dictionary is used to store the gas data and its associated metadata
    combined_data = {}

    for n in range(n_gases):
        # Slice the columns
        gas_data = data.iloc[:, skip_cols + n * n_cols : skip_cols + (n + 1) * n_cols]

        # Reset the column numbers
        gas_data.columns = RangeIndex(gas_data.columns.size)
        species = gas_data[0][0]
        species = species.lower()

        column_labels = [
            species,
            f"{species}_variability",
            f"{species}_number_of_observations",
        ]

        # Name columns
        gas_data = gas_data.set_axis(column_labels,
                                     axis="columns",
                                     inplace=False)

        header_rows = 2
        # Drop the first two rows now we have the name
        gas_data = gas_data.drop(index=gas_data.head(header_rows).index,
                                 inplace=False)
        # Cast data to float64 / double
        gas_data = gas_data.astype("float64")

        # Here we can convert the Dataframe to a Dataset and then write the attributes
        gas_data = gas_data.to_xarray()

        site_attributes = _get_site_attributes(site=site,
                                               inlet=inlet,
                                               crds_metadata=crds_metadata)

        scale = crds_metadata["default_scales"].get(species.upper(), "NA")

        # Create a copy of the metadata dict
        species_metadata = metadata.copy()
        species_metadata["species"] = clean_string(species)
        species_metadata["inlet"] = inlet
        species_metadata["scale"] = scale
        species_metadata["long_name"] = site_attributes["long_name"]

        combined_data[species] = {
            "metadata": species_metadata,
            "data": gas_data,
            "attributes": site_attributes,
        }

    return combined_data
Example 10
def parse_gcwerks(
    data_filepath: Union[str, Path],
    precision_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads a GC data file by creating a GC object and associated datasources

    Args:
        data_filepath: Path of data file
        precision_filepath: Path of precision file
        site: Three letter code or name for site
        instrument: Instrument name
        network: Network name
    Returns:
        dict: Dictionary of gas data
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_json

    data_filepath = Path(data_filepath)
    precision_filepath = Path(precision_filepath)

    # Do some setup for processing
    # Load site data
    gcwerks_data = load_json(filename="process_gcwerks_parameters.json")
    gc_params = gcwerks_data["GCWERKS"]

    network = clean_string(network)
    # We don't currently do anything with inlet here as it's always read from data
    # or taken from process_gcwerks_parameters.json
    if inlet is not None:
        inlet = clean_string(inlet)
    if instrument is not None:
        instrument = clean_string(instrument)

    # Check if the site code passed matches that read from the filename
    site = _check_site(
        filepath=data_filepath,
        site_code=site,
        gc_params=gc_params,
    )

    # If we're not passed the instrument name and we can't find it raise an error
    if instrument is None:
        instrument = _check_instrument(filepath=data_filepath,
                                       gc_params=gc_params,
                                       should_raise=True)
    else:
        fname_instrument = _check_instrument(filepath=data_filepath,
                                             gc_params=gc_params,
                                             should_raise=False)

        if fname_instrument is not None and instrument != fname_instrument:
            raise ValueError(
                f"Mismatch between instrument passed as argument {instrument} and instrument read from filename {fname_instrument}"
            )

    instrument = str(instrument)

    gas_data = _read_data(
        data_filepath=data_filepath,
        precision_filepath=precision_filepath,
        site=site,
        instrument=instrument,
        network=network,
        sampling_period=sampling_period,
        gc_params=gc_params,
    )

    # Assign attributes to the data for CF compliant NetCDFs
    gas_data = assign_attributes(data=gas_data, site=site)

    return gas_data
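
A usage sketch for the GCWERKS parser; the file paths are hypothetical and assume data and precision files in the standard GCWERKS export format:

gas_data = parse_gcwerks(
    data_filepath="capegrim-medusa.18.C",                   # hypothetical data file
    precision_filepath="capegrim-medusa.18.precisions.C",   # hypothetical precision file
    site="CGO",
    network="AGAGE",
)

print(list(gas_data.keys()))  # one entry per species read from the file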
Example 11
def get_attributes(
    ds: Dataset,
    species: str,
    site: str,
    network: Optional[str] = None,
    global_attributes: Optional[Dict[str, str]] = None,
    units: Optional[str] = None,
    scale: Optional[str] = None,
    sampling_period: Optional[str] = None,
    date_range: Optional[List[str]] = None,
) -> Dataset:
    """
    This function writes attributes to an xarray.Dataset so that they conform with
    the CF Convention v1.6

    Attributes of the xarray DataSet are modified, and variable names are changed

    If the species is a standard mole fraction then either:
        - species name will used in lower case in the file and variable names
            but with any hyphens taken out
        - name will be changed according to the species_translator dictionary

    If the species is isotopic data or a non-standard variable (e.g. APO):
        - Isotopes species names should begin with a "D"
            (Annoyingly, the code currently picks up "Desflurane" too. I've
             fixed this for now, but if we get a lot of other "D" species, we
             should make this better)
        - I suggest naming for isotopologues should be d<species><isotope>, e.g.
            dCH4C13, or dCO2C14
        - Any non-standard variables should be listed in the species_translator
            dictionary

    Args:
        ds: Should contain variables such as "ch4", "ch4 repeatability".
            Must have a "time" dimension.
        species: Species name. e.g. "CH4", "HFC-134a", "dCH4C13"
        site: Three-letter site code
        network: Network site is associated with
        global_attributes: Dictionary containing any info you want to
            add to the file header (e.g. {"Contact": "Contact_Name"})
        units: This routine will try to guess the units
            unless this is specified. Options are in units_interpret
        scale: Calibration scale for species.
        sampling_period: Number of seconds for which air
            sample is taken. Only for time variable attribute
        date_range: Start and end date for output
            If you only want an end date, just put a very early start date
            (e.g. ["1900-01-01", "2010-01-01"])
    """
    from pandas import Timestamp as pd_Timestamp
    from openghg.util import clean_string, load_json, timestamp_now

    # from numpy import unique as np_unique

    if not isinstance(ds, Dataset):
        raise TypeError("This function only accepts xarray Datasets")

    # Current CF Conventions (v1.7) demand that valid variable names
    # begin with a letter and be composed of letters, digits and underscores
    # Here variable names are also made lowercase to enable easier matching below

    # TODO - could I just cast ds.variables as as type for mypy instead of doing this?
    # variable_names = [str(v) for v in ds.variables]
    # Is this better?
    variable_names = cast(Dict[str, Any], ds.variables)
    to_underscores = {var: var.lower().replace(" ", "_") for var in variable_names}
    ds = ds.rename(to_underscores)  # type: ignore

    species_attrs = load_json(filename="species_attributes.json")
    attributes_data = load_json("attributes.json")

    species_translator = attributes_data["species_translation"]
    unit_species = attributes_data["unit_species"]
    unit_species_long = attributes_data["unit_species_long"]
    unit_interpret = attributes_data["unit_interpret"]

    species_upper = species.upper()
    species_lower = species.lower()

    variable_names = cast(Dict[str, Any], ds.variables)
    matched_keys = [var for var in variable_names if species_lower in var]

    # If we don't have any variables to rename, raise an error
    if not matched_keys:
        raise NameError(f"Cannot find species {species} in Dataset variables")

    species_rename = {}
    for var in matched_keys:
        try:
            species_label = species_translator[species_upper]["chem"]
        except KeyError:
            species_label = clean_string(species_lower)

        species_rename[var] = var.replace(species_lower, species_label)

    ds = ds.rename(species_rename)  # type: ignore

    # Global attributes
    global_attributes_default = {
        "conditions_of_use": "Ensure that you contact the data owner at the outset of your project.",
        "source": "In situ measurements of air",
        "Conventions": "CF-1.6",
    }

    if global_attributes is not None:
        # TODO - for some reason mypy doesn't see a Dict[str,str] as a valid Mapping[Hashable, Any] type
        global_attributes.update(global_attributes_default)  # type: ignore
    else:
        global_attributes = global_attributes_default

    global_attributes["file_created"] = str(timestamp_now())
    global_attributes["processed_by"] = "OpenGHG_Cloud"
    global_attributes["species"] = species_label

    if scale is None:
        global_attributes["calibration_scale"] = "unknown"
    else:
        global_attributes["calibration_scale"] = scale

    # Update the Dataset attributes
    ds.attrs.update(global_attributes)  # type: ignore

    # Add some site attributes
    site_attributes = _site_info_attributes(site.upper(), network)
    ds.attrs.update(site_attributes)

    # Species-specific attributes
    # Long name
    if species_upper.startswith("D") and species_upper != "DESFLURANE" or species_upper == "APD":
        sp_long = species_translator[species_upper]["name"]
    elif species_upper == "RN":
        sp_long = "radioactivity_concentration_of_222Rn_in_air"
    elif species_upper in species_translator:
        name = species_translator[species_upper]["name"]
        sp_long = f"mole_fraction_of_{name}_in_air"
    else:
        sp_long = f"mole_fraction_of_{species_label}_in_air"

    ancillary_variables = []

    variable_names = cast(Dict[str, Any], ds.variables)
    matched_keys = [var for var in variable_names if species_lower in var.lower()]

    # Write units as attributes to variables containing any of these
    match_words = ["variability", "repeatability", "stdev", "count"]

    for key in variable_names:
        key = key.lower()

        if species_label.lower() in key:
            # Standard name attribute
            # ds[key].attrs["standard_name"]=key.replace(species_label, sp_long)
            ds[key].attrs["long_name"] = key.replace(species_label, sp_long)

            # If units are required for variable, add attribute
            if key == species_label or any(word in key for word in match_words):
                if units is not None:
                    if units in unit_interpret:
                        ds[key].attrs["units"] = unit_interpret[units]
                    else:
                        ds[key].attrs["units"] = unit_interpret["else"]
                else:
                    # TODO - merge these species attributes into a single simpler JSON
                    try:
                        ds[key].attrs["units"] = unit_species[species_upper]
                    except KeyError:
                        try:
                            ds[key].attrs["units"] = species_attrs[species_label.upper()]["units"]
                        except KeyError:
                            ds[key].attrs["units"] = "NA"

                # If units are non-standard, add explanation
                if species_upper in unit_species_long:
                    ds[key].attrs["units_description"] = unit_species_long[species_upper]

            # Add to list of ancillary variables
            if key != species_label:
                ancillary_variables.append(key)

    # TODO - for the moment skip this step - check status of ancillary variables in standard
    # Write ancillary variable list
    # ds[species_label].attrs["ancilliary_variables"] = ", ".join(ancillary_variables)

    # Add quality flag attributes
    # NOTE - I've removed the whitespace before status_flag and integration_flag here
    variable_names = cast(Dict[str, Any], ds.variables)
    quality_flags = [key for key in variable_names if "status_flag" in key]

    # Not getting long_name for c2f6

    for key in quality_flags:
        ds[key] = ds[key].astype(int)
        try:
            long_name = ds[species_label].attrs["long_name"]
        except KeyError:
            raise KeyError(key, quality_flags)

        ds[key].attrs = {
            "flag_meaning": "0 = unflagged, 1 = flagged",
            "long_name": f"{long_name} status_flag",
        }

    variable_names = cast(Dict[str, Any], ds.variables)
    # Add integration flag attributes
    integration_flags = [key for key in variable_names if "integration_flag" in key]

    for key in integration_flags:
        ds[key] = ds[key].astype(int)
        long_name = ds[species_label].attrs["long_name"]
        ds[key].attrs = {
            "flag_meaning": "0 = area, 1 = height",
            "standard_name": f"{long_name} integration_flag",
            "comment": "GC peak integration method (by height or by area). Does not indicate data quality",
        }

    # Set time encoding
    # Check if there are duplicate time stamps

    # I feel there should be a more pandas way of doing this
    # but xarray doesn't currently have a duplicates method
    # See this https://github.com/pydata/xarray/issues/2108

    # if len(set(ds.time.values)) < len(ds.time.values):
    # if len(np_unique(ds.time.values)) < len(ds.time.values):
    #     print("WARNING. Duplicate time stamps")
    first_year = pd_Timestamp(str(ds.time[0].values)).year

    ds.time.encoding = {"units": f"seconds since {str(first_year)}-01-01 00:00:00"}

    time_attributes: Dict[str, str] = {}
    time_attributes["label"] = "left"
    time_attributes["standard_name"] = "time"
    time_attributes["comment"] = (
        "Time stamp corresponds to beginning of sampling period. "
        + "Time since midnight UTC of reference date. "
        + "Note that sampling periods are approximate."
    )

    if sampling_period is not None:
        time_attributes["sampling_period_seconds"] = sampling_period

    ds.time.attrs.update(time_attributes)

    # If a date range is specified, slice dataset
    if date_range:
        ds = ds.loc[dict(time=slice(*date_range))]

    return ds
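
A self-contained sketch that builds a small Dataset and passes it through get_attributes; the measurement values are synthetic and TAC / DECC are assumed to be valid entries in the bundled attribute JSON files:

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2020-01-01", periods=3, freq="1H")
ds = xr.Dataset(
    {
        "ch4": ("time", np.array([1900.0, 1905.2, 1898.7])),
        "ch4 repeatability": ("time", np.array([0.4, 0.5, 0.4])),
    },
    coords={"time": times},
)

ds = get_attributes(ds=ds, species="ch4", site="TAC", network="DECC", sampling_period="60")
print(ds.attrs["species"], list(ds.data_vars))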
Example 12
def _read_raw_data(
    data_filepath: Path,
    species: str,
    inlet: str,
    sampling_period: str,
    measurement_type: str = "flask",
) -> Dict:
    """Separates the gases stored in the dataframe in
    separate dataframes and returns a dictionary of gases
    with an assigned UUID as gas:UUID and a list of the processed
    dataframes

    Args:
        data_filepath: Path of datafile
        species: Species string such as CH4, CO
        measurement_type: Type of measurements e.g. flask
    Returns:
        dict: Dictionary containing attributes, data and metadata keys
    """
    from openghg.util import clean_string, read_header, load_json
    from pandas import read_csv, Timestamp

    header = read_header(filepath=data_filepath)

    column_names = header[-1][14:].split()

    def date_parser(year: str, month: str, day: str, hour: str, minute: str,
                    second: str) -> Timestamp:
        return Timestamp(year, month, day, hour, minute, second)

    date_parsing = {
        "time": [
            "sample_year",
            "sample_month",
            "sample_day",
            "sample_hour",
            "sample_minute",
            "sample_seconds",
        ]
    }

    data_types = {
        "sample_year": int,
        "sample_month": int,
        "sample_day": int,
        "sample_hour": int,
        "sample_minute": int,
        "sample_seconds": int,
    }

    # Number of header lines to skip
    n_skip = len(header)

    data = read_csv(
        data_filepath,
        skiprows=n_skip,
        names=column_names,
        sep=r"\s+",
        dtype=data_types,
        parse_dates=date_parsing,
        date_parser=date_parser,
        index_col="time",
        skipinitialspace=True,
    )

    # Drop duplicates
    data = data.loc[~data.index.duplicated(keep="first")]

    # Check if the index is sorted
    if not data.index.is_monotonic_increasing:
        data = data.sort_index()

    # Read the site code from the Dataframe
    site = str(data["sample_site_code"][0]).upper()

    site_data = load_json("acrg_site_info.json")
    # If this isn't a site we recognize try and read it from the filename
    if site not in site_data:
        site = str(data_filepath.name).split("_")[1].upper()

        if site not in site_data:
            raise ValueError(f"The site {site} is not recognized.")

    if species is not None:
        # If we're passed a species ensure that it is in fact the correct species
        data_species = str(data["parameter_formula"].values[0]).lower()

        passed_species = species.lower()
        if data_species != passed_species:
            raise ValueError(
                f"Mismatch between passed species ({passed_species}) and species read from data ({data_species})"
            )

    species = species.upper()

    flag = []
    selection_flag = []
    for flag_str in data.analysis_flag:
        flag.append(flag_str[0] == ".")
        selection_flag.append(int(flag_str[1] != "."))

    combined_data = {}

    data[species + "_status_flag"] = flag
    data[species + "_selection_flag"] = selection_flag

    data = data[data[species + "_status_flag"]]

    data = data[[
        "sample_latitude",
        "sample_longitude",
        "sample_altitude",
        "analysis_value",
        "analysis_uncertainty",
        species + "_selection_flag",
    ]]

    rename_dict = {
        "analysis_value": species,
        "analysis_uncertainty": species + "_repeatability",
        "sample_longitude": "longitude",
        "sample_latitude": "latitude",
        "sample_altitude": "altitude",
    }

    data = data.rename(columns=rename_dict, inplace=False)
    data = data.to_xarray()

    # TODO  - this could do with a better name
    noaa_params = load_json("attributes.json")["NOAA"]

    site_attributes = noaa_params["global_attributes"]
    site_attributes["inlet_height_magl"] = "NA"
    site_attributes["instrument"] = noaa_params["instrument"][species.upper()]
    site_attributes["sampling_period"] = sampling_period

    metadata = {}
    metadata["species"] = clean_string(species)
    metadata["site"] = site
    metadata["measurement_type"] = measurement_type
    metadata["network"] = "NOAA"
    metadata["inlet"] = inlet
    metadata["sampling_period"] = sampling_period
    metadata["instrument"] = noaa_params["instrument"][species.upper()]

    combined_data[species.lower()] = {
        "metadata": metadata,
        "data": data,
        "attributes": site_attributes,
    }

    return combined_data
Example 13
def parse_npl(
    data_filepath: pathType,
    site: str = "NPL",
    network: str = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads NPL data files and returns the UUIDS of the Datasources
    the processed data has been assigned to

    Args:
        data_filepath: Path of file to load
        site: Site name
    Returns:
        list: UUIDs of Datasources data has been assigned to
    """

    from datetime import datetime
    from pathlib import Path
    from pandas import read_csv, NaT
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_json

    if sampling_period is None:
        sampling_period = "NOT_SET"

    data_filepath = Path(data_filepath)

    site = "NPL"

    attributes_data = load_json(filename="attributes.json")
    npl_params = attributes_data["NPL"]

    # mypy doesn't like NaT or NaNs - look into this
    def parser(date: str):  # type: ignore
        try:
            return datetime.strptime(str(date), "%d/%m/%Y %H:%M")
        except ValueError:
            return NaT

    data = read_csv(data_filepath, index_col=0, date_parser=parser)

    # Drop the NaT/NaNs
    data = data.loc[data.index.dropna()]

    # Rename columns
    rename_dict = {"Cal_CO2_dry": "CO2", "Cal_CH4_dry": "CH4"}

    data = data.rename(columns=rename_dict)
    data.index.name = "time"

    if inlet is None:
        inlet = "NA"

    gas_data = {}
    for species in data.columns:
        processed_data = data.loc[:, [species]].sort_index().to_xarray()

        # Convert methane to ppb
        if species == "CH4":
            processed_data[species] *= 1000

        # No averaging applied to raw obs, set variability to 0 to allow get_obs to calculate
        # when averaging
        processed_data["{} variability".format(
            species)] = processed_data[species] * 0.0

        site_attributes = npl_params["global_attributes"]
        site_attributes["inlet_height_magl"] = npl_params["inlet"]
        site_attributes["instrument"] = npl_params["instrument"]

        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "NPL",
            "network": "LGHG",
            "inlet": inlet,
        }

        # TODO - add in better metadata reading
        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
Example 14
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to the
    the function and these keywords will be used to search the metadata associated
    with each Datasource.

    Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
            If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
            If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
    Returns:
        dict: List of keys of Datasources matching the search parameters
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource

    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    skip_ranking = kwargs_copy.get("skip_ranking", False)

    try:
        del kwargs_copy["skip_ranking"]
    except KeyError:
        pass

    # As we might have kwargs that are None we want to get rid of those
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation

    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResult object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()
        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and instrument then just return the data
    # TODO - should instrument be added here
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()
        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()
        # Get the site inlet and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))
        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            (
                "No ranking data set for the given search parameters."
                " Please refine your search to include a specific site, species and inlet."
            )
        )
    # Now we have the highest ranked data the dateranges there are ranks for
    # we want to fill in the gaps with (currently) the highest inlet from that site

    # We just want some rank_metadata to go along with the final data scheme
    # Keyed by daterange: inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if species_keys:
                data_keys[site][sp]["keys"] = species_keys
                data_keys[site][sp]["rank_metadata"] = species_rank_data
                data_keys[site][sp]["metadata"] = species_metadata
            else:
                continue

            # We now need to retrieve data for the dateranges for which we don't have ranking data
            # To do this find the gaps in the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
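
A usage sketch for search; the keyword values are illustrative and assume matching surface observations already exist in the object store:

results = search(site="mhd", species="ch4", inlet="10m", start_date="2016-01-01", end_date="2017-01-01")

if results:
    obs_data = results.retrieve(site="mhd", species="ch4", inlet="10m")
    print(obs_data.metadata)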
Example 15
def parse_beaco2n(
    data_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: str,
    instrument: Optional[str] = "shinyei",
    sampling_period: Optional[str] = None,
) -> Dict:
    """Read BEACO2N data files

    Args:
        data_filepath: Data filepath
        site: Site name
        network: Network name
        inlet: Inlet height in metres
        instrument: Instrument name
        sampling_period: Measurement sampling period
    Returns:
        dict: Dictionary of data
    """
    import pandas as pd
    from openghg.util import load_json
    from collections import defaultdict
    from openghg.util import clean_string

    if sampling_period is None:
        sampling_period = "NOT_SET"

    data_filepath = Path(data_filepath)
    datetime_columns = {"time": ["datetime"]}
    use_cols = [1, 5, 6, 7, 8, 9, 10]
    na_values = [-999.0]

    site = clean_string(site)

    try:
        data = pd.read_csv(
            data_filepath,
            index_col="time",
            usecols=use_cols,
            parse_dates=datetime_columns,
            na_values=na_values,
        )
    except ValueError as e:
        raise ValueError(
            f"Unable to read data file, please make sure it is in the standard BEACO2N format.\nError: {e}"
        )

    beaco2n_site_data = load_json("beaco2n_site_data.json")

    try:
        site_metadata = beaco2n_site_data[site.upper()]
    except KeyError:
        raise ValueError(f"Site {site} not recognized.")

    site_metadata["comment"] = "Retrieved from http://beacon.berkeley.edu/"

    # Check which columns we have in the data and build the rename dict
    possible_rename_cols = {
        "PM_ug/m3": "pm",
        "PM_ug/m3_QC_level": "pm_qc",
        "co2_ppm": "co2",
        "co2_ppm_QC_level": "co2_qc",
        "co_ppm": "co",
        "co_ppm_QC_level": "co_qc",
    }
    # Not every column is present in data from every site, e.g. only some sites (such as Glasgow) report CO
    rename_cols = {k: v for k, v in possible_rename_cols.items() if k in data}
    data = data.rename(columns=rename_cols)

    # Read the columns available and make sure we have them to iterate over
    possible_measurement_types = ["pm", "co", "co2"]
    measurement_types = [c for c in possible_measurement_types if c in data]

    units = {"pm": "ug/m3", "co2": "ppm", "co": "ppm"}

    gas_data: DefaultDict[str, Dict[str, Union[DataFrame, Dict]]] = defaultdict(dict)
    for mt in measurement_types:
        m_data = data[[mt, f"{mt}_qc"]]
        m_data = m_data.dropna(axis="rows", subset=[mt])

        # Some sites don't have data for each type, skip that type if all NaNs
        if m_data.index.empty:
            continue

        m_data = m_data.to_xarray()

        species_metadata = {
            "units": units[mt],
            "site": site,
            "species": clean_string(mt),
            "inlet": clean_string(inlet),
            "network": "beaco2n",
            "sampling_period": str(sampling_period),
            "instrument": instrument,
        }

        gas_data[mt]["data"] = m_data
        gas_data[mt]["metadata"] = species_metadata
        gas_data[mt]["attributes"] = site_metadata

    # TODO - add CF Compliant attributes?

    return gas_data
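A possible call to parse_beaco2n is sketched below; the filename and site code are hypothetical and the file is assumed to follow the standard BEACO2N node CSV layout expected by the parser.

data = parse_beaco2n(
    data_filepath="node_123_example.csv",  # hypothetical BEACO2N node export
    site="EXAMPLESITE",                    # hypothetical; must appear in beaco2n_site_data.json
    network="BEACO2N",
    inlet="10m",
)

for species, entry in data.items():
    # Each entry holds an xarray Dataset plus measurement metadata and site attributes
    print(species, entry["metadata"]["units"], entry["data"])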
Example n. 16
0
def parse_eurocom(
    data_filepath: Union[str, Path],
    site: str,
    sampling_period: str,
    network: Optional[str] = None,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dict:
    """Parses EUROCOM data files into a format expected by OpenGHG

    Args:
        data_filepath: Path of file to read
        site: Site code
        sampling_period: Sampling period in seconds
        network: Network name
        inlet: Inlet height in metres
        instrument: Instrument name
    Returns:
        dict: Dictionary of measurement data
    """
    from openghg.standardise.meta import assign_attributes, get_attributes
    from pandas import read_csv, Timestamp
    from openghg.util import read_header, load_json

    data_filepath = Path(data_filepath)

    if site is None:
        site = data_filepath.stem.split("_")[0]

    if sampling_period is None:
        sampling_period = "NOT_SET"

    filename = data_filepath.name
    inlet_height = filename.split("_")[1]

    if "m" not in inlet_height:
        inlet_height = "NA"

    # This dictionary is used to store the gas data and its associated metadata
    combined_data = {}

    # Read the header as lines starting with #
    header = read_header(data_filepath, comment_char="#")
    n_skip = len(header) - 1
    species = "co2"

    def date_parser(year: str, month: str, day: str, hour: str,
                    minute: str) -> Timestamp:
        return Timestamp(year=year,
                         month=month,
                         day=day,
                         hour=hour,
                         minute=minute)

    datetime_columns = {"time": ["Year", "Month", "Day", "Hour", "Minute"]}
    use_cols = [
        "Day",
        "Month",
        "Year",
        "Hour",
        "Minute",
        str(species.lower()),
        "SamplingHeight",
        "Stdev",
        "NbPoints",
    ]

    dtypes = {
        "Day": int,
        "Month": int,
        "Year": int,
        "Hour": int,
        "Minute": int,
        species.lower(): float,
        "Stdev": float,
        "SamplingHeight": float,
        "NbPoints": int,
    }

    data = read_csv(
        data_filepath,
        skiprows=n_skip,
        parse_dates=datetime_columns,
        date_parser=date_parser,
        index_col="time",
        sep=";",
        usecols=use_cols,
        dtype=dtypes,
        na_values="-999.99",
    )

    data = data[data[species.lower()] >= 0.0]
    data = data.dropna(axis="rows", how="any")
    # Drop duplicate indices
    data = data.loc[~data.index.duplicated(keep="first")]
    # Convert to xarray Dataset
    data = data.to_xarray()

    attributes_data = load_json(filename="attributes.json")
    eurocom_attributes = attributes_data["EUROCOM"]
    global_attributes = eurocom_attributes["global_attributes"]

    if inlet_height == "NA":
        try:
            inlet = eurocom_attributes["intake_height"][site]
            global_attributes["inlet_height_m"] = inlet
            calibration_scale = eurocom_attributes["calibration"][site]
        except KeyError:
            raise ValueError(
                f"Unable to find inlet from filename or attributes file for {site}"
            )
    else:
        # Use the inlet height parsed from the filename and fall back to an
        # empty calibration scale so both values are always defined below
        global_attributes["inlet_height_m"] = inlet_height
        calibration_scale = eurocom_attributes["calibration"].get(site, {})

    gas_data = get_attributes(
        ds=data,
        species=species,
        site=site,
        global_attributes=global_attributes,
        units="ppm",
    )

    # Create a copy of the metadata dict
    metadata = {}
    metadata["site"] = site
    metadata["species"] = species
    metadata["inlet_height"] = global_attributes["inlet_height_m"]
    metadata["calibration_scale"] = calibration_scale
    metadata["network"] = "EUROCOM"
    metadata["sampling_period"] = str(sampling_period)

    combined_data[species] = {
        "metadata": metadata,
        "data": gas_data,
        "attributes": global_attributes,
    }

    combined_data = assign_attributes(data=combined_data,
                                      site=site,
                                      sampling_period=sampling_period)

    return combined_data
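A sketch of how parse_eurocom might be invoked; the filename is hypothetical but keeps the SITE_HEIGHT... pattern the parser uses to infer the inlet height, and the file is assumed to be a semicolon-separated EUROCOM release file.

result = parse_eurocom(
    data_filepath="TRN_180m_example.csv",  # hypothetical EUROCOM file for site TRN
    site="TRN",
    sampling_period="3600",
)

co2_entry = result["co2"]
# Inspect the inlet height and calibration scale recorded in the metadata
print(co2_entry["metadata"]["inlet_height"], co2_entry["metadata"]["calibration_scale"])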
Example n. 17
0
def _split_species(
    data: DataFrame,
    site: str,
    instrument: str,
    species: List,
    metadata: Dict,
    units: Dict,
    scale: Dict,
    gc_params: Dict,
) -> Dict:
    """Splits the species into separate dataframe into sections to be stored within individual Datasources

    Args:
        data: DataFrame of raw data
        site: Name of site from which this data originates
        instrument: Name of instrument
        species: List of species contained in data
        metadata: Dictionary of metadata
        units: Dictionary of units for each species
        scale: Dictionary of scales for each species
        gc_params: GCWERKS parameter dictionary
    Returns:
        dict: Dictionary of gas data, metadata and attributes keyed by species and inlet
    """
    from addict import Dict as aDict
    from fnmatch import fnmatch
    from openghg.util import load_json, clean_string

    # Load species translator so we can keep species names consistent
    attributes_data = load_json("attributes.json")
    species_translator = attributes_data["species_translation"]

    # Read inlets from the parameters
    expected_inlets = _get_inlets(site_code=site, gc_params=gc_params)

    try:
        data_inlets = data["Inlet"].unique().tolist()
    except KeyError:
        raise KeyError(
            "Unable to read inlets from data, please ensure this data is of the GC type expected by this retrieve module"
        )

    combined_data = aDict()

    for spec in species:
        # Skip this species if the data is all NaNs
        if data[spec].isnull().all():
            continue

        # Here inlet is the inlet in the data and inlet_label is the label we want to use as metadata
        for inlet, inlet_label in expected_inlets.items():
            # Create a copy of metadata for local modification
            spec_metadata = metadata.copy()
            spec_metadata["units"] = units[spec]
            spec_metadata["scale"] = scale[spec]

            # If we've only got a single inlet
            if inlet == "any" or inlet == "air":
                spec_data = data[[
                    spec,
                    spec + " repeatability",
                    spec + " status_flag",
                    spec + " integration_flag",
                    "Inlet",
                ]]
                spec_data = spec_data.dropna(axis="index", how="any")
                spec_metadata["inlet"] = inlet_label
            elif "date" in inlet:
                dates = inlet.split("_")[1:]
                data_sliced = data.loc[dates[0]:dates[1]]

                spec_data = data_sliced[[
                    spec,
                    spec + " repeatability",
                    spec + " status_flag",
                    spec + " integration_flag",
                    "Inlet",
                ]]
                spec_data = spec_data.dropna(axis="index", how="any")
                spec_metadata["inlet"] = inlet_label
            else:
                # Find the inlet
                matching_inlets = [i for i in data_inlets if fnmatch(i, inlet)]

                if not matching_inlets:
                    continue

                # Only set the inlet label in metadata once we know the inlet matches
                spec_metadata["inlet"] = inlet_label
                # There should only be one matching inlet
                select_inlet = matching_inlets[0]
                # Take only data for this inlet from the dataframe
                inlet_data = data.loc[data["Inlet"] == select_inlet]

                spec_data = inlet_data[[
                    spec,
                    spec + " repeatability",
                    spec + " status_flag",
                    spec + " integration_flag",
                    "Inlet",
                ]]

                spec_data = spec_data.dropna(axis="index", how="any")

            # Now we drop the inlet column
            spec_data = spec_data.drop("Inlet", axis="columns")

            # Check that the Dataframe has something in it
            if spec_data.empty:
                continue

            attributes = _get_site_attributes(site=site,
                                              inlet=inlet_label,
                                              instrument=instrument,
                                              gc_params=gc_params)
            attributes = attributes.copy()

            # We want an xarray Dataset
            spec_data = spec_data.to_xarray()

            # Create a standardised / cleaned species label
            try:
                comp_species = species_translator[spec.upper()]["chem"]
            except KeyError:
                comp_species = clean_string(spec.lower())

            # Add the cleaned species name to the metadata and alternative name if present
            spec_metadata["species"] = comp_species
            if comp_species != spec.lower() and comp_species != spec.upper():
                spec_metadata["species_alt"] = spec

            # Rename variables so they use the cleaned, lowercase species name
            to_rename = {}
            for var in spec_data.variables:
                if spec in var:
                    new_name = var.replace(spec, comp_species)
                    to_rename[var] = new_name

            spec_data = spec_data.rename(to_rename)

            # As a single species may have measurements from multiple inlets we
            # use the species and inlet as a key
            data_key = f"{comp_species}_{inlet_label}"

            combined_data[data_key]["metadata"] = spec_metadata
            combined_data[data_key]["data"] = spec_data
            combined_data[data_key]["attributes"] = attributes

    to_return: Dict = combined_data.to_dict()

    return to_return
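For orientation, the dictionary returned by _split_species is keyed by "{species}_{inlet}"; a roughly shaped, purely illustrative entry might look like the following (all values are placeholders, not real measurements).

example_output = {
    "ch4_70m": {
        "metadata": {
            "species": "ch4",      # cleaned species name
            "inlet": "70m",        # inlet label from the GCWERKS parameters
            "units": "ppb",        # illustrative values only
            "scale": "WMO-X2004A", # illustrative calibration scale
        },
        "data": None,        # in practice an xarray Dataset of the species, repeatability and flags
        "attributes": None,  # in practice the site attributes from _get_site_attributes
    },
}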