Example #1
def parse_crds(
    data_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Creates a CRDS object holding data stored within Datasources

    Args:
        data_filepath: Path to file
        site: Three letter site code
        network: Network name
        inlet: Inlet height
        instrument: Instrument name
        sampling_period: Sampling period e.g. 2 hour: 2H, 2 minute: 2m
        measurement_type: Measurement type e.g. insitu, flask
    Returns:
        dict: Dictionary of gas data
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes

    if not isinstance(data_filepath, Path):
        data_filepath = Path(data_filepath)

    # This may seem like an almost pointless wrapper, as this is all we do,
    # but it makes it much easier to test assign_attributes
    gas_data = _read_data(
        data_filepath=data_filepath,
        site=site,
        network=network,
        inlet=inlet,
        instrument=instrument,
        sampling_period=sampling_period,
        measurement_type=measurement_type,
    )

    # Ensure the data is CF compliant
    gas_data = assign_attributes(data=gas_data,
                                 site=site,
                                 sampling_period=sampling_period)

    return gas_data
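
A minimal usage sketch for parse_crds; the file name, site and inlet values below are hypothetical, and the returned dictionary is assumed to follow the same {species: {"data", "metadata", "attributes"}} layout as the other parsers in this section:

# Hypothetical values - a real call needs a genuine CRDS minute-mean data file
gas_data = parse_crds(
    data_filepath="bsd.picarro.1minute.248m.dat",
    site="BSD",
    network="DECC",
    inlet="248m",
)

for species, entry in gas_data.items():
    print(species, entry["metadata"])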
Example #2
def _read_raw_file(
    data_filepath: Union[str, Path],
    site: str,
    inlet: str,
    sampling_period: str,
    measurement_type: str,
    instrument: Optional[str] = None,
) -> Dict:
    """Reads NOAA data files and returns a dictionary of processed
    data and metadata.

    Args:
        data_filepath: Path of file to load
        species: Species name
        site: Site name
    Returns:
        list: UUIDs of Datasources data has been assigned to
    """
    from openghg.standardise.meta import assign_attributes
    from pathlib import Path

    data_filepath = Path(data_filepath)
    filename = data_filepath.name

    species = filename.split("_")[0].lower()

    source_name = data_filepath.stem
    source_name = source_name.split("-")[0]

    gas_data = _read_raw_data(
        data_filepath=data_filepath,
        inlet=inlet,
        species=species,
        measurement_type=measurement_type,
        sampling_period=sampling_period,
    )

    gas_data = assign_attributes(data=gas_data, site=site, network="NOAA")

    return gas_data
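
A minimal usage sketch for _read_raw_file; the species is parsed from the filename prefix, so the hypothetical file name below must start with a species label:

# Hypothetical NOAA flask file - the "ch4" prefix becomes the species
gas_data = _read_raw_file(
    data_filepath="ch4_brw_surface-flask_1_ccgg_event.txt",
    site="BRW",
    inlet="flask",
    sampling_period="NOT_SET",
    measurement_type="flask",
)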
Example #3
def parse_eurocom(
    data_filepath: Union[str, Path],
    site: Optional[str] = None,
    sampling_period: Optional[str] = None,
    network: Optional[str] = None,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dict:
    """Parses EUROCOM data files into a format expected by OpenGHG

    Args:
        data_filepath: Path of file to read
        site: Site code
        sampling_period: Sampling period in seconds
        network: Network name
        inlet: Inlet height in metres
        instrument: Instrument name
    Returns:
        dict: Dictionary of measurement data
    """
    from pathlib import Path
    from pandas import read_csv, Timestamp
    from openghg.standardise.meta import assign_attributes, get_attributes
    from openghg.util import read_header, load_json

    data_filepath = Path(data_filepath)

    if site is None:
        site = data_filepath.stem.split("_")[0]

    if sampling_period is None:
        sampling_period = "NOT_SET"

    filename = data_filepath.name
    inlet_height = filename.split("_")[1]

    if "m" not in inlet_height:
        inlet_height = "NA"

    # This dictionary is used to store the gas data and its associated metadata
    combined_data = {}

    # Read the header as lines starting with #
    header = read_header(data_filepath, comment_char="#")
    n_skip = len(header) - 1
    species = "co2"

    def date_parser(year: str, month: str, day: str, hour: str,
                    minute: str) -> Timestamp:
        # Cast to int as the column values may arrive as strings
        return Timestamp(
            year=int(year),
            month=int(month),
            day=int(day),
            hour=int(hour),
            minute=int(minute),
        )

    datetime_columns = {"time": ["Year", "Month", "Day", "Hour", "Minute"]}
    use_cols = [
        "Day",
        "Month",
        "Year",
        "Hour",
        "Minute",
        str(species.lower()),
        "SamplingHeight",
        "Stdev",
        "NbPoints",
    ]

    dtypes = {
        "Day": int,
        "Month": int,
        "Year": int,
        "Hour": int,
        "Minute": int,
        species.lower(): float,
        "Stdev": float,
        "SamplingHeight": float,
        "NbPoints": int,
    }

    data = read_csv(
        data_filepath,
        skiprows=n_skip,
        parse_dates=datetime_columns,
        date_parser=date_parser,
        index_col="time",
        sep=";",
        usecols=use_cols,
        dtype=dtypes,
        na_values="-999.99",
    )

    data = data[data[species.lower()] >= 0.0]
    data = data.dropna(axis="rows", how="any")
    # Drop duplicate indices
    data = data.loc[~data.index.duplicated(keep="first")]
    # Convert to xarray Dataset
    data = data.to_xarray()

    attributes_data = load_json(filename="attributes.json")
    eurocom_attributes = attributes_data["EUROCOM"]
    global_attributes = eurocom_attributes["global_attributes"]

    if inlet_height == "NA":
        try:
            inlet = eurocom_attributes["intake_height"][site]
            global_attributes["inlet_height_m"] = inlet
            calibration_scale = eurocom_attributes["calibration"][site]
        except KeyError:
            calibration_scale = {}
            raise ValueError(
                f"Unable to find inlet from filename or attributes file for {site}"
            )

    gas_data = get_attributes(
        ds=data,
        species=species,
        site=site,
        global_attributes=global_attributes,
        units="ppm",
    )

    # Build the metadata dictionary
    metadata = {}
    metadata["site"] = site
    metadata["species"] = species
    metadata["inlet_height"] = global_attributes["inlet_height_m"]
    metadata["calibration_scale"] = calibration_scale
    metadata["network"] = "EUROCOM"
    metadata["sampling_period"] = str(sampling_period)

    combined_data[species] = {
        "metadata": metadata,
        "data": gas_data,
        "attributes": global_attributes,
    }

    combined_data = assign_attributes(data=combined_data,
                                      site=site,
                                      sampling_period=sampling_period)

    return combined_data
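
A minimal usage sketch for parse_eurocom; the file name is hypothetical, but the second underscore-separated field must be the inlet height (e.g. "24m") for it to be read from the filename:

# Hypothetical EUROCOM file - inlet height "24m" is parsed from the filename
combined_data = parse_eurocom(
    data_filepath="MHD_24m_co2_2018.csv",
    site="MHD",
    sampling_period="3600",
)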
Example #4
def parse_btt(
    data_filepath: Union[str, Path],
    site: str = "BTT",
    network: str = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dict:
    """Reads NPL data files and returns the UUIDS of the Datasources
    the processed data has been assigned to

    Args:
        data_filepath: Path of file to load
        site: Site name
    Returns:
        dict: Dictionary of gas data
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes
    from pandas import read_csv, Timestamp, to_timedelta, isnull
    from numpy import nan as np_nan
    from openghg.util import clean_string, load_json

    # TODO: Decide what to do about inputs which aren't used anywhere
    # at present - inlet, instrument, sampling_period, measurement_type

    data_filepath = Path(data_filepath)

    site = "BTT"

    # Rename these columns
    rename_dict = {"co2.cal": "CO2", "ch4.cal.ppb": "CH4"}
    # We only want these species
    species_extract = ["CO2", "CH4"]
    # Take std-dev measurements from these columns for these species
    species_sd = {"CO2": "co2.sd.ppm", "CH4": "ch4.sd.ppb"}

    param_data = load_json(filename="attributes.json")
    network_params = param_data["BTT"]

    sampling_period = int(network_params["sampling_period"])
    sampling_period_seconds = str(sampling_period) + "s"

    data = read_csv(data_filepath)
    data["time"] = Timestamp("2019-01-01 00:00") + to_timedelta(
        data["DOY"] - 1, unit="D")
    data["time"] = data["time"].dt.round(sampling_period_seconds)
    data = data[~isnull(data.time)]

    data = data.rename(columns=rename_dict)
    data = data.set_index("time")

    gas_data = {}
    for species in species_extract:
        processed_data = data.loc[:, [species]].sort_index()
        # Create a variability column from the matching std-dev column
        species_stddev_label = species_sd[species]
        processed_data[f"{species} variability"] = data[species_stddev_label]

        # Replace any values below zero with NaNs
        processed_data[processed_data < 0] = np_nan
        # Drop NaNs
        processed_data = processed_data.dropna()
        # Convert to a Dataset
        processed_data = processed_data.to_xarray()

        site_attributes = network_params["global_attributes"]
        site_attributes["inlet_height_magl"] = network_params["inlet"]
        site_attributes["instrument"] = network_params["instrument"]
        site_attributes["sampling_period"] = sampling_period

        # TODO - add in better metadata reading
        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "BTT",
        }

        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
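
A minimal usage sketch for parse_btt; the file name is hypothetical, and the CSV must contain the DOY, co2.cal, ch4.cal.ppb and std-dev columns referenced above:

# Hypothetical BTT file - site and network already default to BTT / LGHG
gas_data = parse_btt(data_filepath="BTT_2019_ghg.csv")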
Example #5
def _read_obspack(
    data_filepath: Union[str, Path],
    site: str,
    inlet: str,
    sampling_period: str,
    measurement_type: str,
    instrument: Optional[str] = None,
) -> Dict[str, Dict]:
    """Read NOAA ObsPack NetCDF files

    Args:
        data_filepath: Path to file
        site: Three letter site code
        inlet: Inlet height, if no height use measurement type e.g. flask
        sampling_period: Sampling period
        measurement_type: One of flask, insitu or pfp
        instrument: Instrument name
    Returns:
        dict: Dictionary of results
    """
    import xarray as xr
    from openghg.util import clean_string
    from openghg.standardise.meta import assign_attributes

    valid_types = ("flask", "insitu", "pfp")

    if measurement_type not in valid_types:
        raise ValueError(f"measurement_type must be one of {valid_types}")

    obspack_ds = xr.open_dataset(data_filepath)

    # Want to find and drop any duplicate time values for the original dataset
    # Using xarray directly we have to do in a slightly convoluted way as this is not well built
    # into the xarray workflow yet - https://github.com/pydata/xarray/pull/5239
    # - can use da.drop_duplicates() but only on one variable at a time and not on the whole Dataset
    # This method keeps attributes for each of the variables including units

    # The dimension within the original dataset is called "obs" and has no associated coordinates
    # Extract time from original Dataset (dimension is "obs")
    time = obspack_ds.time

    # To keep associated "obs" dimension, need to assign coordinate values to this (just 0, len(obs))
    time = time.assign_coords(obs=obspack_ds.obs)

    # Make "time" the primary dimension (while retaining "obs") and add "time" values as coordinates
    time = time.swap_dims(dims_dict={"obs": "time"})
    time = time.assign_coords(time=time)

    # Drop any duplicate time values and extract the associated "obs" values
    # TODO: Work out what to do with duplicates - may be genuine multiple measurements
    time_unique = time.drop_duplicates(dim="time", keep="first")
    obs_unique = time_unique.obs

    # If no sampling period was passed, estimate one from the dataset metadata
    if sampling_period == "NOT_SET":
        sampling_period_estimate = _estimate_sampling_period(obspack_ds)
    else:
        # A negative value flags that no estimate was needed
        sampling_period_estimate = -1.0

    species = clean_string(obspack_ds.attrs["dataset_parameter"])
    network = "NOAA"

    # Use these obs values to filter the original dataset to remove any repeated times
    processed_ds = obspack_ds.sel(obs=obs_unique)
    processed_ds = processed_ds.set_coords(["time"])

    # Rename variables to match our internal standard
    # "value_std_dev" --> f"{species}_variability"
    # "value_unc" --> ??
    # TODO: Clarify what "value_unc" should be renamed to

    variable_names = {
        "value": species,
        "value_std_dev": f"{species}_variability",
        "value_unc": f"{species}_variability",  # May need to be updated
        "nvalue": f"{species}_number_of_observations"
    }

    to_extract = [
        name for name in variable_names.keys() if name in processed_ds
    ]
    name_dict = {
        name: key
        for name, key in variable_names.items() if name in to_extract
    }

    if not to_extract:
        wanted = list(variable_names.keys())
        raise ValueError(
            f"No valid data variables found. We expect the following data variables in the passed NetCDF: {wanted}"
        )

    processed_ds = processed_ds[to_extract]
    processed_ds = processed_ds.rename(name_dict)
    processed_ds = processed_ds.sortby("time")

    # Map mole fraction units onto the scaling factors we store
    unit_mapping = {
        "mol mol-1": "1",
        "millimol mol-1": "1e-3",
        "micromol mol-1": "1e-6",
        "nmol mol-1": "1e-9",
        "pmol mol-1": "1e-12",
    }

    try:
        # Extract the units attribute from the species data variable
        units = processed_ds[species].units
    except (KeyError, AttributeError):
        print("Unable to extract units from 'value' within input dataset")
        units = "NOT_SET"
    else:
        try:
            units = unit_mapping[units]
        except KeyError:
            print(f"Using unit {units} directly")

    metadata = {}
    metadata["site"] = site
    metadata["inlet"] = inlet
    metadata["network"] = network
    metadata["measurement_type"] = measurement_type
    metadata["species"] = species
    metadata["units"] = units
    metadata["sampling_period"] = sampling_period

    if instrument is not None:
        metadata["instrument"] = instrument
    else:
        try:
            metadata["instrument"] = obspack_ds.attrs["instrument"]
        except KeyError:
            pass

    if sampling_period_estimate >= 0.0:
        metadata["sampling_period_estimate"] = str(
            sampling_period_estimate
        )  # convert to string to keep consistent with "sampling_period"

    # TODO: At the moment all attributes from the NOAA ObsPack are being copied
    # plus any variables we're adding.
    # - decide if we want to reduce this
    attributes = obspack_ds.attrs
    attributes["sampling_period"] = sampling_period
    if sampling_period_estimate >= 0.0:
        attributes["sampling_period_estimate"] = str(sampling_period_estimate)

    gas_data = {
        species: {
            "data": processed_ds,
            "metadata": metadata,
            "attributes": attributes
        }
    }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
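
A minimal usage sketch for _read_obspack; the NetCDF path is hypothetical, and measurement_type must be one of flask, insitu or pfp:

# Hypothetical ObsPack NetCDF file
gas_data = _read_obspack(
    data_filepath="ch4_esp_surface-flask_2_obspack.nc",
    site="ESP",
    inlet="flask",
    sampling_period="NOT_SET",
    measurement_type="flask",
)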
Example #6
def parse_npl(
    data_filepath: pathType,
    site: str = "NPL",
    network: str = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads NPL data files and returns the UUIDS of the Datasources
    the processed data has been assigned to

    Args:
        data_filepath: Path of file to load
        site: Site name
    Returns:
        list: UUIDs of Datasources data has been assigned to
    """

    if sampling_period is None:
        sampling_period = "NOT_SET"

    data_filepath = Path(data_filepath)

    site = "NPL"

    attributes_data = load_json(filename="attributes.json")
    npl_params = attributes_data["NPL"]

    # mypy doesn't like NaT or NaNs - look into this
    def parser(date: str):  # type: ignore
        try:
            return datetime.strptime(str(date), "%d/%m/%Y %H:%M")
        except ValueError:
            return NaT

    data = read_csv(data_filepath, index_col=0, date_parser=parser)

    # Drop the NaT/NaNs
    data = data.loc[data.index.dropna()]

    # Rename columns
    rename_dict = {"Cal_CO2_dry": "CO2", "Cal_CH4_dry": "CH4"}

    data = data.rename(columns=rename_dict)
    data.index.name = "time"

    if inlet is None:
        inlet = "NA"

    gas_data = {}
    for species in data.columns:
        processed_data = data.loc[:, [species]].sort_index().to_xarray()

        # Convert methane to ppb
        if species == "CH4":
            processed_data[species] *= 1000

        # No averaging is applied to the raw obs, so set variability to 0 to
        # allow get_obs to calculate it when averaging
        processed_data[f"{species} variability"] = processed_data[species] * 0.0

        site_attributes = npl_params["global_attributes"]
        site_attributes["inlet_height_magl"] = npl_params["inlet"]
        site_attributes["instrument"] = npl_params["instrument"]

        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "NPL",
            "network": "LGHG",
            "inlet": inlet,
        }

        # TODO - add in better metadata reading
        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
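
A minimal usage sketch for parse_npl; the CSV path is hypothetical, and the file is expected to contain the Cal_CO2_dry and Cal_CH4_dry columns renamed above:

# Hypothetical NPL file - site and network are fixed to NPL / LGHG internally
gas_data = parse_npl(data_filepath="NPL_picarro_2020.csv", inlet="17m")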
Example #7
def parse_gcwerks(
    data_filepath: Union[str, Path],
    precision_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads a GC data file by creating a GC object and associated datasources

    Args:
        data_filepath: Path of data file
        precision_filepath: Path of precision file
        site: Three letter code or name for site
        instrument: Instrument name
        network: Network name
    Returns:
        dict: Dictionary of source_name : UUIDs
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_json

    data_filepath = Path(data_filepath)
    precision_filepath = Path(precision_filepath)

    # Do some setup for processing
    # Load site data
    gcwerks_data = load_json(filename="process_gcwerks_parameters.json")
    gc_params = gcwerks_data["GCWERKS"]

    network = clean_string(network)
    # We don't currently do anything with inlet here as it's always read from data
    # or taken from process_gcwerks_parameters.json
    if inlet is not None:
        inlet = clean_string(inlet)
    if instrument is not None:
        instrument = clean_string(instrument)

    # Check if the site code passed matches that read from the filename
    site = _check_site(
        filepath=data_filepath,
        site_code=site,
        gc_params=gc_params,
    )

    # If we're not passed the instrument name and we can't find it raise an error
    if instrument is None:
        instrument = _check_instrument(filepath=data_filepath,
                                       gc_params=gc_params,
                                       should_raise=True)
    else:
        fname_instrument = _check_instrument(filepath=data_filepath,
                                             gc_params=gc_params,
                                             should_raise=False)

        if fname_instrument is not None and instrument != fname_instrument:
            raise ValueError(
                f"Mismatch between instrument passed as argument {instrument} and instrument read from filename {fname_instrument}"
            )

    instrument = str(instrument)

    gas_data = _read_data(
        data_filepath=data_filepath,
        precision_filepath=precision_filepath,
        site=site,
        instrument=instrument,
        network=network,
        sampling_period=sampling_period,
        gc_params=gc_params,
    )

    # Assign attributes to the data for CF compliant NetCDFs
    gas_data = assign_attributes(data=gas_data, site=site)

    return gas_data
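
A minimal usage sketch for parse_gcwerks; both file paths are hypothetical, and the site and instrument are cross-checked against the filename as shown above:

# Hypothetical GCWERKS data and precision files
gas_data = parse_gcwerks(
    data_filepath="capegrim-medusa.18.C",
    precision_filepath="capegrim-medusa.18.precisions.C",
    site="CGO",
    network="AGAGE",
)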