Example #1
    def __init__(self) -> None:
        from openghg.util import timestamp_now
        from addict import Dict as aDict

        self._creation_datetime = timestamp_now()
        self._stored = False

        # Use an addict Dict here for easy nested data storage
        self._datasource_table = aDict()
        # Keyed by Datasource UUID
        self._datasource_uuids: Dict[str, str] = {}
        # Hashes of previously uploaded files
        self._file_hashes: Dict[str, str] = {}
        # Keyed by UUID
        self._rank_data = aDict()
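A minimal sketch of the addict behaviour the constructor above relies on: nested keys can be assigned without creating the intermediate dictionaries first, and to_dict() converts back to a plain dict. The UUID and nesting levels below are illustrative only, not the real OpenGHG layout.

from addict import Dict as aDict

table = aDict()
# Intermediate levels are created automatically on assignment
table["uuid-1234"]["rank_data"]["2016-01-01/2017-01-01"] = 1
print(table.to_dict())
# {'uuid-1234': {'rank_data': {'2016-01-01/2017-01-01': 1}}}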
Example #2
def to_dashboard_mobile(data: Dict, filename: Optional[Union[str, Path]] = None) -> Union[Dict, None]:
    """Export the Glasgow LICOR data to JSON for the dashboard

    Args:
        data: Data dictionary
        filename: Filename for export of JSON
    Returns:
        dict or None: Dictionary if no filename given
    """
    to_export = aDict()

    for species, species_data in data.items():
        spec_data = species_data["data"]
        metadata = species_data["metadata"]

        latitude = spec_data["latitude"].values.tolist()
        longitude = spec_data["longitude"].values.tolist()
        ch4 = spec_data["ch4"].values.tolist()

        to_export[species]["data"] = {
            "lat": latitude,
            "lon": longitude,
            "z": ch4
        }
        to_export[species]["metadata"] = metadata

    if filename is not None:
        with open(filename, "w") as f:
            dump(to_export, f)
        return None
    else:
        to_return: Dict = to_export.to_dict()
        return to_return
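A hypothetical usage sketch for the function above, assuming its module-level imports (aDict, json's dump and the typing names) are present. The dataset variables match the names the function reads ("latitude", "longitude", "ch4"); the values are invented.

import xarray as xr

ds = xr.Dataset(
    {
        "latitude": ("time", [55.85, 55.86]),
        "longitude": ("time", [-4.29, -4.30]),
        "ch4": ("time", [1900.1, 1910.4]),
    }
)
data = {"ch4": {"data": ds, "metadata": {"units": "ppb"}}}

exported = to_dashboard_mobile(data=data)               # returns a plain dict
to_dashboard_mobile(data=data, filename="mobile.json")  # writes JSON and returns None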
Example #3
def init_cfg(prgname, prgdir, libdir, dbg):
    """  This procedure returns a dictionary containing the important parts of
  configuration
  """
    #dbg.dprint(256, "in init_cfg")
    cfg = aDict()
    cfg.prgname = prgname
    cfg.prgdir = prgdir
    if prgdir not in sys.path:
        sys.path.insert(0, prgdir)
    files = [
        os.path.join(prgdir, prgname + '_imp.py'),
        os.path.join(prgdir, prgname + "_cfg.py"),
        os.path.join(prgdir, prgname + "_usg.py")
    ]
    for f in files:
        res = None
        if f.endswith("imp.py"):
            res = try_import_rx(f, cfg, 'imports')
        elif f.endswith("cfg.py"):
            res = try_import_rx(f,
                                cfg,
                                'data',
                                'argdefaults',
                                'guidefs',
                                tp='rx')
        elif f.endswith("usg.py"):
            res = try_import_rx(f, cfg, 'usage')

        if not isinstance(res, list):
            dbg.exitf(res, "in", f)
    return cfg
Example #4
def parse_glasow_picarro(
    data_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: str,
    instrument: str = "picarro",
    sampling_period: Optional[str] = None,
    measurement_type: str = "timeseries",
) -> Dict:
    """Read the Glasgow Science Tower Picarro data

    Args:
        data_filepath: Path to data file
        site: Site code
        network: Network name
        inlet: Inlet height
        instrument: Instrument name
        sampling_period: Measurement sampling period
        measurement_type: Type of measurement e.g. timeseries
    Returns:
        dict: Dictionary of processed data
    """
    warn("Temporary function used to read Glasgow Science Tower Picarro data")

    df = pd.read_csv(data_filepath, index_col=[0], parse_dates=True)
    df = df.dropna(axis="rows", how="any")
    # We just want the concentration values for now
    species = ["co2", "ch4"]
    rename_cols = {f" {s}_C": s for s in species}
    df = df.rename(columns=rename_cols)

    site = "GST"
    long_site_name = "Glasgow Science Centre Tower"

    units = {"ch4": "ppb", "co2": "ppm"}

    if sampling_period is None:
        sampling_period = "NOT_SET"

    gas_data = aDict()
    for s in species:
        gas_data[s]["data"] = df[[s]].to_xarray()

        gas_data[s]["metadata"] = {
            "species": s,
            "long_name": long_site_name,
            "latitude": 55.859238,
            "longitude": -4.296180,
            "network": "npl_picarro",
            "inlet": "124m",
            "sampling_period": sampling_period,
            "site": site,
            "instrument": "picarro",
            "units": units[s],
        }

    # TODO - remove this once mypy stubs for addict are added
    to_return: Dict = gas_data.to_dict()

    return to_return
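A hypothetical usage sketch: it writes a two-row CSV using the " co2_C" / " ch4_C" column naming (note the leading space) that the rename step above expects, then parses it. The file name and values are invented, and the function's module-level imports (pandas as pd, warn, aDict) are assumed to be in place.

import pandas as pd

demo = pd.DataFrame(
    {
        "time": ["2021-10-01 00:00:00", "2021-10-01 00:01:00"],
        " co2_C": [415.2, 415.6],
        " ch4_C": [1901.3, 1902.8],
    }
).set_index("time")
demo.to_csv("gst_demo.csv")

result = parse_glasow_picarro(
    data_filepath="gst_demo.csv", site="GST", network="npl_picarro", inlet="124m"
)
print(result["ch4"]["metadata"]["units"])  # ppb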
Example #5
    def rankings(self) -> Dict:
        """Return the rank metadata for each site and species in the search results

        Returns:
            dict: Dictionary of rank metadata keyed by site and species
        """
        if not self.ranked_data:
            print("No rank data")

        rank_result = aDict()

        for site, species_data in self.results.items():
            for species, data in species_data.items():
                rank_result[site][species] = data["rank_metadata"]

        to_return: Dict = rank_result.to_dict()

        return to_return
Example #6
    def from_data(cls: Type[T], data: Dict) -> T:
        """Create an object from data

        Args:
            data: JSON data
        Returns:
            cls: Instance of cls created from the data
        """
        from openghg.util import timestamp_tzaware
        from addict import Dict as aDict

        if not data:
            raise ValueError("Unable to create object with empty dictionary")

        c = cls()
        c._creation_datetime = timestamp_tzaware(data["creation_datetime"])
        c._datasource_uuids = data["datasource_uuids"]
        c._file_hashes = data["file_hashes"]
        c._datasource_table = aDict(data["datasource_table"])
        c._rank_data = aDict(data["rank_data"])
        c._stored = False

        return c
Example #7
def parse_glasow_licor(filepath: Path,
                       sampling_period: Optional[str] = None) -> Dict:
    """Read the Glasgow LICOR data from NPL

    Args:
        filepath: Path to data file
        sampling_period: Measurement sampling period
    Returns:
        dict: Dictionary of data
    """
    date_index = {"time": ["DATE", "TIME"]}
    use_cols = [0, 1, 3, 4, 5]
    nan_values = [",,,"]
    df = read_csv(
        filepath,
        parse_dates=date_index,
        na_values=nan_values,
        infer_datetime_format=True,
        index_col="time",
        usecols=use_cols,
    )

    rename_cols = {
        "LAT": "latitude",
        "LON": "longitude",
        "Methane_Enhancement_Over_Background(ppb)": "ch4",
    }
    df = df.rename(columns=rename_cols).dropna(axis="rows", how="any")
    df.index = to_datetime(df.index)

    ds = df.to_xarray()

    if sampling_period is None:
        sampling_period = "NOT_SET"

    metadata = {
        "units": "ppb",
        "notes": "measurement value is methane enhancement over background",
        "sampling_period": sampling_period,
    }

    data = aDict()
    data["ch4"]["metadata"] = metadata
    data["ch4"]["data"] = ds

    to_return: Dict = data.to_dict()
    return to_return
Example #8
    def retrieve_all(self) -> Dict:
        """Retrieve all the data found during the serch

        Returns:
            dict: Dictionary of all data
        """
        data = aDict()

        # Can we just traverse the dict without looping?
        for site, species_data in self.results.items():
            for species, inlet_data in species_data.items():
                for inlet, keys in inlet_data.items():
                    data[site][species][inlet] = self._create_obsdata(site=site, species=species, inlet=inlet)

        # TODO - update this once addict is stubbed
        data_dict: Dict = data.to_dict()
        return data_dict
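A hypothetical access pattern for the nested dictionary returned above; the site, species and inlet names are illustrative and assume a populated SearchResults-style object.

all_data = search_results.retrieve_all()
# Each leaf is whatever _create_obsdata returns for that site / species / inlet
obs = all_data["tac"]["ch4"]["100m"]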
Example #9
def inittk():
    top = tk.Tk()
    top.option_add('*tearoff', 0)
    top.option_add('*tearOff', False)
    if pydevprog:
        dbg.entersub()
        cfg.widgets = aDict()
        cfg.widgets['MainWindow'] = top

    import mygui.themestuff as themestuff
    themes = themestuff.load_all_themes(top)

    if pydevprog:
        #dbg.dprint(4,"Available themes", themes)
        cfg.guidefs.available_themes = themes
        cfg.guidefs.loaded = True
        dbg.leavesub()

    return top
Example #10
def _parse_metadata(filepath: pathType) -> Dict:
    """Parse AQMesh metadata

    Args:
        filepath: Path to metadata CSV
    Returns:
        dict: Dictionary of metadata
    """
    from addict import Dict as aDict
    from pandas import read_csv
    from openghg.util import check_date

    filepath = Path(filepath)
    raw_metadata = read_csv(filepath)

    site_metadata = aDict()

    for _, row in raw_metadata.iterrows():
        site_name = row["location_name"].replace(" ", "").lower()
        site_data = site_metadata[site_name]

        site_data["site"] = site_name
        site_data["pod_id"] = row["pod_id_location"]
        site_data["start_date"] = check_date(row["start_date_UTC"])
        site_data["end_date"] = check_date(row["end_date_UTC"])
        site_data["relocate_date"] = check_date(row["relocate_date_UTC"])
        site_data["long_name"] = row["location_name"]
        site_data["borough"] = row["Borough"]
        site_data["site_type"] = row["Type"]
        site_data["in_ulez"] = row["ULEZ"]
        site_data["latitude"] = row["Latitude"]
        site_data["longitude"] = row["Longitude"]
        site_data["inlet"] = row["Height"]
        site_data["network"] = "aqmesh_glasgow"
        site_data["sampling_period"] = "NA"

    # TODO - I feel this is a bit clunky
    dict_metadata: Dict = site_metadata.to_dict()
    return dict_metadata
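A hypothetical usage sketch: a single-row metadata CSV containing the column names the parser reads, with invented values; it assumes check_date accepts these date strings.

import pandas as pd

meta = pd.DataFrame(
    [
        {
            "location_name": "Glasgow Central",
            "pod_id_location": 11245,
            "start_date_UTC": "2021-09-01 00:00:00",
            "end_date_UTC": "2021-11-30 23:59:59",
            "relocate_date_UTC": "2021-10-15 00:00:00",
            "Borough": "Glasgow City",
            "Type": "Roadside",
            "ULEZ": "yes",
            "Latitude": 55.8587,
            "Longitude": -4.2580,
            "Height": "2m",
        }
    ]
)
meta.to_csv("aqmesh_metadata.csv", index=False)

site_metadata = _parse_metadata(filepath="aqmesh_metadata.csv")
print(site_metadata["glasgowcentral"]["inlet"])  # 2m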
Example #11
def parse_aqmesh(
    data_filepath: pathType,
    metadata_filepath: pathType,
    sampling_period: Optional[str] = None,
) -> Dict:
    """Read AQMesh data files

    Args:
        data_filepath: Data filepath
        metadata_filepath: Metadata filepath
        sampling_period: Measurement sampling period (str)
    Returns:
        dict: Dictionary of data
    """
    from addict import Dict as aDict
    from pandas import read_csv

    if sampling_period is None:
        sampling_period = "NOT_SET"

    use_cols = [0, 1, 4, 6]
    datetime_cols = {"time": ["date_UTC"]}
    na_values = [-999, -999.0]

    df = read_csv(
        data_filepath,
        index_col="time",
        usecols=use_cols,
        parse_dates=datetime_cols,
        na_values=na_values,
    )

    # This might change so we'll read it each time for now
    metadata = _parse_metadata(filepath=metadata_filepath)

    # Species is given in the data column
    orig_species = df.columns[0]
    species_split = orig_species.split("_")

    species = species_split[0]
    units = species_split[1]

    species_lower = species.lower()
    rename_cols = {orig_species: species_lower, "location_name": "site"}
    df = df.rename(columns=rename_cols)
    df = df.dropna(axis="rows", subset=[species_lower])

    # TODO - add in assignment of attributes
    # assign_attributes

    site_groups = df.groupby(df["site"])
    site_data = aDict()
    for site, site_df in site_groups:
        site_name = site.replace(" ", "").lower()
        site_df = site_df.drop("site", axis="columns")
        site_data[site_name]["data"] = site_df.to_xarray()
        site_data[site_name]["metadata"] = metadata[site_name]
        # Add in the species to the metadata
        site_data[site_name]["metadata"]["species"] = species_lower
        site_data[site_name]["metadata"]["units"] = units
        site_data[site_name]["metadata"]["sampling_period"] = sampling_period

    site_dict: Dict = site_data.to_dict()
    return site_dict
Example #12
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to the
    function and these keywords will be used to search the metadata associated
    with each Datasource.

    Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
            If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
            If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
    Returns:
        dict: List of keys of Datasources matching the search parameters
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource

    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    skip_ranking = kwargs_copy.get("skip_ranking", False)

    try:
        del kwargs_copy["skip_ranking"]
    except KeyError:
        pass

    # As we might have kwargs that are None we want to get rid of those
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation

    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResult object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()
        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and instrument then just return the data
    # TODO - should instrument be added here
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()
        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()
        # Get the site inlet and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))
        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            (
                "No ranking data set for the given search parameters."
                " Please refine your search to include a specific site, species and inlet."
            )
        )
    # Now that we have the highest ranked data and the dateranges it covers,
    # we want to fill in the gaps with (currently) the highest inlet from that site

    # We just want some rank_metadata to go along with the final data scheme
    # Keyed by daterange -> inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if species_keys:
                data_keys[site][sp]["keys"] = species_keys
                data_keys[site][sp]["rank_metadata"] = species_rank_data
                data_keys[site][sp]["metadata"] = species_metadata
            else:
                continue

            # We now need to retrieve data for the dateranges for which we don't have ranking data
            # To do this find the gaps in the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
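A hypothetical call to the search function above; the site, species, inlet and dates are illustrative and assume matching data has already been added to the object store.

results = search(
    site="tac",
    species="ch4",
    inlet="100m",
    start_date="2016-01-01",
    end_date="2017-01-01",
)
# With ranking data present and no inlet given, ranked results are returned instead
ranked = search(site="tac", species="ch4", start_date="2016-01-01", end_date="2017-01-01")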
Example #13
def to_dashboard(
    data: Dict, selected_vars: List, downsample_n: int = 3, filename: Optional[str] = None
) -> Union[Dict, None]:
    """Takes a Dataset produced by OpenGHG and outputs it into a JSON
    format readable by the OpenGHG dashboard or a related project.

    This also exports a separate file with the locations of the sites
    for use with map selector component.

    Note - this function does not currently support export of data from multiple
    inlets.

    Args:
        data: Dictionary of retrieved data
        selected_vars: The variables we want to export
        downsample_n: Take every nth value from the data
        filename: filename to write output to
    Returns:
        dict or None: Dictionary if no filename given
    """
    to_export = aDict()

    if not isinstance(selected_vars, list):
        selected_vars = [selected_vars]

    selected_vars = [str(c).lower() for c in selected_vars]

    for site, species_data in data.items():
        for species, inlet_data in species_data.items():
            measurement_data: ObsData
            for inlet, measurement_data in inlet_data.items():
                dataset = measurement_data.data
                metadata = measurement_data.metadata
                attributes = dataset.attrs

                df = dataset.to_dataframe()

                rename_lower = {c: str(c).lower() for c in df.columns}
                df = df.rename(columns=rename_lower)
                # We just want the selected variables
                to_extract = [c for c in df.columns if c in selected_vars]

                if not to_extract:
                    continue

                df = df[to_extract]

                # Downsample the data
                if downsample_n > 1:
                    df = df.iloc[::downsample_n]

                network = metadata["network"]
                instrument = metadata["instrument"]

                # TODO - remove this if we add site location to standard metadata
                location = {
                    "latitude": attributes["station_latitude"],
                    "longitude": attributes["station_longitude"],
                }
                metadata.update(location)

                json_data = loads(df.to_json())

                to_export[species][network][site][inlet][instrument] = {
                    "data": json_data,
                    "metadata": metadata,
                }

    if filename is not None:
        with open(filename, "w") as f:
            dump(obj=to_export, fp=f)
        return None
    else:
        # TODO - remove this once addict is stubbed
        export_dict: Dict = to_export.to_dict()
        return export_dict
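A hypothetical usage sketch: here data would normally come from SearchResults.retrieve_all() as in the earlier example; the variable selection, downsampling value and file name are illustrative.

data = search_results.retrieve_all()
to_dashboard(data=data, selected_vars=["ch4"], downsample_n=5, filename="dashboard.json")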
Example #14
def visualise_store() -> pyvis.network.Network:
    """View the object store using a pyvis force graph.

    This function should only be called from within a notebook

    Returns:
        pyvis.network.Network
    """
    from addict import Dict as aDict

    data = query_store()

    net = pyvis.network.Network("800px", "100%", notebook=True)
    net.force_atlas_2based()

    # Create the ObsSurface node
    net.add_node(0, label="Surface Observations", color="#4e79a7", value=5000)

    network_split = aDict()

    for key, value in data.items():
        # Iterate over Datasources to select the networks
        network = value["network"]
        site = value["site"]
        inlet = value["inlet"]
        network_split[network][site][inlet][key] = value

    for network, sites in network_split.items():
        network_name = network.upper()
        net.add_node(network, label=network_name, color="#59a14f", value=2500)
        net.add_edge(source=0, to=network)

        # Then we want a subnode for each site
        for site, site_data in sites.items():
            # Don't want to use a site here as a site might be in multiple networks
            site_name = site.upper()
            site_id = str(uuid4())
            net.add_node(site_id, label=site_name, color="#e15759", value=1000)
            net.add_edge(source=network, to=site_id)

            for inlet, inlet_data in site_data.items():
                inlet_name = str(inlet).lower()
                inlet_id = str(uuid4())
                net.add_node(n_id=inlet_id,
                             label=inlet_name,
                             color="#808080",
                             value=500)
                net.add_edge(source=site_id, to=inlet_id)

                # Now for each site create the datasource nodes
                for uid, datasource in inlet_data.items():
                    species = datasource["species"]
                    instrument = datasource["instrument"].upper()

                    label = f"{species.upper()} {instrument}"
                    title = "\n".join([
                        f"Site: {site.upper()}",
                        f"Species : {species.upper()}",
                        f"Instrument: {instrument}",
                    ])
                    net.add_node(n_id=uid,
                                 label=label,
                                 title=title,
                                 color="#f28e2b",
                                 value=100)
                    net.add_edge(source=inlet_id, to=uid)

    return net.show("openghg_objstore.html")
Example #15
def _split_species(
    data: DataFrame,
    site: str,
    instrument: str,
    species: List,
    metadata: Dict,
    units: Dict,
    scale: Dict,
    gc_params: Dict,
) -> Dict:
    """Splits the species into separate dataframe into sections to be stored within individual Datasources

    Args:
        data: DataFrame of raw data
        site: Name of site from which this data originates
        instrument: Name of instrument
        species: List of species contained in data
        metadata: Dictionary of metadata
        units: Dictionary of units for each species
        scale: Dictionary of scales for each species
        gc_params: GCWERKS parameter dictionary
    Returns:
        dict: Dictionary of gas data and metadata
    """
    from addict import Dict as aDict
    from fnmatch import fnmatch
    from openghg.util import load_json, clean_string

    # Load species translator so we can keep species names consistent
    attributes_data = load_json("attributes.json")
    species_translator = attributes_data["species_translation"]

    # Read inlets from the parameters
    expected_inlets = _get_inlets(site_code=site, gc_params=gc_params)

    try:
        data_inlets = data["Inlet"].unique().tolist()
    except KeyError:
        raise KeyError(
            "Unable to read inlets from data, please ensure this data is of the GC type expected by this retrieve module"
        )

    combined_data = aDict()

    for spec in species:
        # Skip this species if the data is all NaNs
        if data[spec].isnull().all():
            continue

        # Here inlet is the inlet in the data and inlet_label is the label we want to use as metadata
        for inlet, inlet_label in expected_inlets.items():
            # Create a copy of metadata for local modification
            spec_metadata = metadata.copy()
            spec_metadata["units"] = units[spec]
            spec_metadata["scale"] = scale[spec]

            # If we've only got a single inlet
            if inlet == "any" or inlet == "air":
                spec_data = data[[
                    spec,
                    spec + " repeatability",
                    spec + " status_flag",
                    spec + " integration_flag",
                    "Inlet",
                ]]
                spec_data = spec_data.dropna(axis="index", how="any")
                spec_metadata["inlet"] = inlet_label
            elif "date" in inlet:
                dates = inlet.split("_")[1:]
                data_sliced = data.loc[dates[0]:dates[1]]

                spec_data = data_sliced[[
                    spec,
                    spec + " repeatability",
                    spec + " status_flag",
                    spec + " integration_flag",
                    "Inlet",
                ]]
                spec_data = spec_data.dropna(axis="index", how="any")
                spec_metadata["inlet"] = inlet_label
            else:
                # Find the inlet
                matching_inlets = [i for i in data_inlets if fnmatch(i, inlet)]

                if not matching_inlets:
                    continue

                # Only set the label in metadata when we have the correct label
                spec_metadata["inlet"] = inlet_label
                # There should only be one matching label
                select_inlet = matching_inlets[0]
                # Take only data for this inlet from the dataframe
                inlet_data = data.loc[data["Inlet"] == select_inlet]

                spec_data = inlet_data[[
                    spec,
                    spec + " repeatability",
                    spec + " status_flag",
                    spec + " integration_flag",
                    "Inlet",
                ]]

                spec_data = spec_data.dropna(axis="index", how="any")

            # Now we drop the inlet column
            spec_data = spec_data.drop("Inlet", axis="columns")

            # Check that the Dataframe has something in it
            if spec_data.empty:
                continue

            attributes = _get_site_attributes(site=site,
                                              inlet=inlet_label,
                                              instrument=instrument,
                                              gc_params=gc_params)
            attributes = attributes.copy()

            # We want an xarray Dataset
            spec_data = spec_data.to_xarray()

            # Create a standardised / cleaned species label
            try:
                comp_species = species_translator[spec.upper()]["chem"]
            except KeyError:
                comp_species = clean_string(spec.lower())

            # Add the cleaned species name to the metadata and alternative name if present
            spec_metadata["species"] = comp_species
            if comp_species != spec.lower() and comp_species != spec.upper():
                spec_metadata["species_alt"] = spec

            # Rename variables so they have lowercase and alphanumeric names
            to_rename = {}
            for var in spec_data.variables:
                if spec in var:
                    new_name = var.replace(spec, comp_species)
                    to_rename[var] = new_name

            spec_data = spec_data.rename(to_rename)

            # As a single species may have measurements from multiple inlets we
            # use the species and inlet as a key
            data_key = f"{comp_species}_{inlet_label}"

            combined_data[data_key]["metadata"] = spec_metadata
            combined_data[data_key]["data"] = spec_data
            combined_data[data_key]["attributes"] = attributes

    to_return: Dict = combined_data.to_dict()

    return to_return
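A minimal sketch of the fnmatch-based inlet matching used above, assuming the inlets in gc_params are given as wildcard patterns; the pattern, label and inlet values are invented.

from fnmatch import fnmatch

expected_inlets = {"10?m": "100m"}      # pattern -> inlet label, illustrative
data_inlets = ["100m", "248m"]          # inlets found in the data

for inlet, inlet_label in expected_inlets.items():
    matching_inlets = [i for i in data_inlets if fnmatch(i, inlet)]
    print(inlet, "->", matching_inlets, "labelled as", inlet_label)
    # 10?m -> ['100m'] labelled as 100m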