def __init__(self) -> None:
    from openghg.util import timestamp_now
    from addict import Dict as aDict

    self._creation_datetime = timestamp_now()
    self._stored = False
    # Use an addict Dict here for easy nested data storage
    self._datasource_table = aDict()
    # Keyed by Datasource UUID
    self._datasource_uuids: Dict[str, str] = {}
    # Hashes of previously uploaded files
    self._file_hashes: Dict[str, str] = {}
    # Keyed by UUID
    self._rank_data = aDict()
def __init__(self) -> None:
    from openghg.util import timestamp_now
    from collections import defaultdict
    from uuid import uuid4

    self._uuid: str = str(uuid4())
    self._creation_datetime = timestamp_now()
    self._metadata: Dict[str, str] = {}
    # Dictionary keyed by daterange of data in each Dataset
    self._data: Dict[str, Dataset] = {}
    self._start_date = None
    self._end_date = None
    self._stored = False
    # This dictionary stores the keys for each version of data uploaded
    # data_key = d._data_keys["latest"]["keys"][date_key]
    self._data_keys: dataKeyType = defaultdict(dict)
    self._data_type: str = "timeseries"
    # Hold information regarding the versions of the data
    # Currently unused
    self._latest_version: str = "latest"
    self._versions: Dict[str, List] = {}
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to the
    function and these keywords will be used to search the metadata associated
    with each Datasource. Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
            If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
            If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
    Returns:
        dict: List of keys of Datasources matching the search parameters
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource

    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    skip_ranking = kwargs_copy.get("skip_ranking", False)

    try:
        del kwargs_copy["skip_ranking"]
    except KeyError:
        pass

    # As we might have kwargs that are None we want to get rid of those
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation
    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResult object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()

        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and instrument then just return the data
    # TODO - should instrument be added here
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()

        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()

        # Get the site inlet and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))
        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            (
                "No ranking data set for the given search parameters."
                " Please refine your search to include a specific site, species and inlet."
            )
        )

    # Now we have the highest ranked data the dateranges there are ranks for
    # we want to fill in the gaps with (currently) the highest inlet from that site

    # We just want some rank_metadata to go along with the final data scheme
    # Can key a key of date - inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if species_keys:
                data_keys[site][sp]["keys"] = species_keys
                data_keys[site][sp]["rank_metadata"] = species_rank_data
                data_keys[site][sp]["metadata"] = species_metadata
            else:
                continue

            # We now need to retrieve data for the dateranges for which we don't have ranking data
            # To do this find the gaps in the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
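# --- Usage sketch (illustrative, not part of the module) ---
# A minimal example of how search() might be called once an object store has
# been populated with surface observations. The site/species/inlet values are
# placeholders; the keyword names follow the docstring above.
def _example_search() -> None:
    results = search(site="bsd", species="ch4", inlet="100m",
                     start_date="2016-01-01", end_date="2016-12-31")
    # For timeseries data a SearchResults object is returned; for footprints,
    # emissions or eulerian_model data a plain dict keyed by Datasource UUID.
    print(results)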
def read_file(
    filepath: Union[str, Path],
    model: str,
    species: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    setup: Optional[str] = None,
    overwrite: bool = False,
) -> Dict:
    """Read Eulerian model output

    Args:
        filepath: Path of Eulerian model species output
        model: Eulerian model name
        species: Species name
        start_date: Start date (inclusive) associated with model run
        end_date: End date (exclusive) associated with model run
        setup: Additional setup details for run
        overwrite: Should this data overwrite currently stored data.
    Returns:
        dict: UUIDs of Datasources data has been assigned to
    """
    # TODO: As written, this currently includes some light assumptions that we're dealing with GEOSChem SpeciesConc format.
    # May need to split out into multiple modules (like with ObsSurface) or into separate retrieve functions as needed.
    from collections import defaultdict
    from openghg.util import (
        clean_string,
        hash_file,
        timestamp_now,
        timestamp_tzaware,
    )
    from openghg.store import assign_data
    from xarray import open_dataset
    from pandas import Timestamp as pd_Timestamp

    model = clean_string(model)
    species = clean_string(species)
    start_date = clean_string(start_date)
    end_date = clean_string(end_date)
    setup = clean_string(setup)

    filepath = Path(filepath)

    em_store = EulerianModel.load()

    file_hash = hash_file(filepath=filepath)
    if file_hash in em_store._file_hashes and not overwrite:
        raise ValueError(
            f"This file has been uploaded previously with the filename : {em_store._file_hashes[file_hash]}."
        )

    em_data = open_dataset(filepath)

    # Check necessary 4D coordinates are present and rename if necessary (for consistency)
    check_coords = {
        "time": ["time"],
        "lat": ["lat", "latitude"],
        "lon": ["lon", "longitude"],
        "lev": ["lev", "level", "layer", "sigma_level"],
    }
    for name, coord_options in check_coords.items():
        for coord in coord_options:
            if coord in em_data.coords:
                break
        else:
            raise ValueError(f"Input data must contain one of '{coord_options}' co-ordinate")
        if name != coord:
            print(f"Renaming co-ordinate '{coord}' to '{name}'")
            em_data = em_data.rename({coord: name})

    attrs = em_data.attrs

    # author_name = "OpenGHG Cloud"
    # em_data.attrs["author"] = author_name

    metadata = {}
    metadata.update(attrs)

    metadata["model"] = model
    metadata["species"] = species
    metadata["processed"] = str(timestamp_now())

    if start_date is None:
        if len(em_data["time"]) > 1:
            start_date = str(timestamp_tzaware(em_data.time[0].values))
        else:
            try:
                start_date = attrs["simulation_start_date_and_time"]
            except KeyError:
                raise Exception("Unable to derive start_date from data, please provide as an input.")
    else:
        start_date = timestamp_tzaware(start_date)
        start_date = str(start_date)

    if end_date is None:
        if len(em_data["time"]) > 1:
            end_date = str(timestamp_tzaware(em_data.time[-1].values))
        else:
            try:
                end_date = attrs["simulation_end_date_and_time"]
            except KeyError:
                raise Exception("Unable to derive `end_date` from data, please provide as an input.")
    else:
        end_date = timestamp_tzaware(end_date)
        end_date = str(end_date)

    date = str(pd_Timestamp(start_date).date())

    metadata["date"] = date
    metadata["start_date"] = start_date
    metadata["end_date"] = end_date

    metadata["max_longitude"] = round(float(em_data["lon"].max()), 5)
    metadata["min_longitude"] = round(float(em_data["lon"].min()), 5)
    metadata["max_latitude"] = round(float(em_data["lat"].max()), 5)
    metadata["min_latitude"] = round(float(em_data["lat"].min()), 5)

    history = metadata.get("history")
    if history is None:
        history = ""
    metadata["history"] = history + f" {str(timestamp_now())} Processed onto OpenGHG cloud"

    key = "_".join((model, species, date))

    model_data: DefaultDict[str, Dict[str, Union[Dict, Dataset]]] = defaultdict(dict)
    model_data[key]["data"] = em_data
    model_data[key]["metadata"] = metadata

    keyed_metadata = {key: metadata}

    lookup_results = em_store.datasource_lookup(metadata=keyed_metadata)

    data_type = "eulerian_model"
    datasource_uuids = assign_data(
        data_dict=model_data,
        lookup_results=lookup_results,
        overwrite=overwrite,
        data_type=data_type,
    )

    em_store.add_datasources(datasource_uuids=datasource_uuids, metadata=keyed_metadata)

    # Record the file hash in case we see this file again
    em_store._file_hashes[file_hash] = filepath.name

    em_store.save()

    return datasource_uuids
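# --- Usage sketch (illustrative, not part of the module) ---
# Assuming read_file() is exposed on the EulerianModel class (the body loads
# the store itself via EulerianModel.load()), a GEOSChem-style SpeciesConc
# file might be added like this. The file path and model/species names are
# placeholders.
def _example_read_eulerian() -> None:
    from openghg.store import EulerianModel

    uuids = EulerianModel.read_file(
        filepath="/path/to/GEOSChem.SpeciesConc.20150101.nc",
        model="GEOSChem",
        species="ch4",
    )
    print(uuids)  # mapping of data key -> Datasource UUID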
def read_file(
    filepath: Union[str, Path],
    species: str,
    source: str,
    domain: str,
    date: str,
    high_time_resolution: Optional[bool] = False,
    period: Optional[str] = None,
    overwrite: bool = False,
) -> Dict:
    """Read emissions file

    Args:
        filepath: Path of emissions file
        species: Species name
        domain: Emissions domain
        source: Emissions source
        date: Date (as a string) associated with the emissions data
        high_time_resolution: If this is a high resolution file
        period: Period of measurements, if not passed this is inferred from the time coords
        overwrite: Should this data overwrite currently stored data.
    Returns:
        dict: UUIDs of Datasources data has been assigned to
    """
    from collections import defaultdict
    from xarray import open_dataset
    from openghg.store import assign_data
    from openghg.util import (
        clean_string,
        hash_file,
        timestamp_tzaware,
        timestamp_now,
    )

    species = clean_string(species)
    source = clean_string(source)
    domain = clean_string(domain)
    date = clean_string(date)

    filepath = Path(filepath)

    em_store = Emissions.load()

    file_hash = hash_file(filepath=filepath)
    if file_hash in em_store._file_hashes and not overwrite:
        raise ValueError(
            f"This file has been uploaded previously with the filename : {em_store._file_hashes[file_hash]}."
        )

    em_data = open_dataset(filepath)

    # Some attributes are numpy types we can't serialise to JSON so convert them
    # to their native types here
    attrs = {}
    for key, value in em_data.attrs.items():
        try:
            attrs[key] = value.item()
        except AttributeError:
            attrs[key] = value

    author_name = "OpenGHG Cloud"
    em_data.attrs["author"] = author_name

    metadata = {}
    metadata.update(attrs)

    metadata["species"] = species
    metadata["domain"] = domain
    metadata["source"] = source
    metadata["date"] = date
    metadata["author"] = author_name
    metadata["processed"] = str(timestamp_now())

    metadata["start_date"] = str(timestamp_tzaware(em_data.time[0].values))
    metadata["end_date"] = str(timestamp_tzaware(em_data.time[-1].values))

    metadata["max_longitude"] = round(float(em_data["lon"].max()), 5)
    metadata["min_longitude"] = round(float(em_data["lon"].min()), 5)
    metadata["max_latitude"] = round(float(em_data["lat"].max()), 5)
    metadata["min_latitude"] = round(float(em_data["lat"].min()), 5)

    metadata["time_resolution"] = "high" if high_time_resolution else "standard"

    if period is not None:
        metadata["time_period"] = period

    key = "_".join((species, source, domain, date))

    emissions_data: DefaultDict[str, Dict[str, Union[Dict, Dataset]]] = defaultdict(dict)
    emissions_data[key]["data"] = em_data
    emissions_data[key]["metadata"] = metadata

    keyed_metadata = {key: metadata}

    lookup_results = em_store.datasource_lookup(metadata=keyed_metadata)

    data_type = "emissions"
    datasource_uuids = assign_data(
        data_dict=emissions_data,
        lookup_results=lookup_results,
        overwrite=overwrite,
        data_type=data_type,
    )

    em_store.add_datasources(datasource_uuids=datasource_uuids, metadata=keyed_metadata)

    # Record the file hash in case we see this file again
    em_store._file_hashes[file_hash] = filepath.name

    em_store.save()

    return datasource_uuids
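# --- Usage sketch (illustrative, not part of the module) ---
# Assuming read_file() is exposed on the Emissions class (the body loads the
# store via Emissions.load()), an emissions file might be added like this.
# The path, source and date values are placeholders.
def _example_read_emissions() -> None:
    from openghg.store import Emissions

    uuids = Emissions.read_file(
        filepath="/path/to/ch4_anthro_EUROPE_2012.nc",
        species="ch4",
        source="anthro",
        domain="EUROPE",
        date="2012",
    )
    print(uuids)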
def get_flux(
    species: str,
    sources: Union[str, List[str]],
    domain: str,
    start_date: Optional[Timestamp] = None,
    end_date: Optional[Timestamp] = None,
    time_resolution: Optional[str] = "standard",
) -> FluxData:
    """The flux function reads in all flux files for the domain and species as an xarray Dataset.
    Note that at present ALL flux data is read in per species per domain or by emissions name.
    To be consistent with the footprints, fluxes should be in mol/m2/s.

    Args:
        species: Species name
        sources: Source name
        domain: Domain e.g. EUROPE
        start_date: Start date
        end_date: End date
        time_resolution: One of ["standard", "high"]
    Returns:
        FluxData: FluxData object

    TODO: Update this to output to a FluxData class?
    TODO: Update inputs to just accept a string and extract one flux file at a time?
    As it stands, this only extracts one flux at a time but is set up to be extended
    to extract multiple. So if this is removed from this function the functionality
    itself would need to be wrapped up in another function call.
    """
    from openghg.retrieve import search
    from openghg.store import recombine_datasets
    from openghg.util import timestamp_epoch, timestamp_now

    if start_date is None:
        start_date = timestamp_epoch()
    if end_date is None:
        end_date = timestamp_now()

    results: Dict = search(
        species=species,
        source=sources,
        domain=domain,
        time_resolution=time_resolution,
        start_date=start_date,
        end_date=end_date,
        data_type="emissions",
    )  # type: ignore

    if not results:
        raise ValueError(f"Unable to find flux data for {species} from {sources}")

    # TODO - more than one emissions file (but see above)
    try:
        em_key = list(results.keys())[0]
    except IndexError:
        raise ValueError(f"Unable to find any flux data for {species} for {domain}.")

    data_keys = results[em_key]["keys"]
    metadata = results[em_key]["metadata"]

    em_ds = recombine_datasets(keys=data_keys, sort=False)

    # Check for level coordinate. If one level, assume surface and drop
    if "lev" in em_ds.coords:
        if len(em_ds.lev) > 1:
            raise ValueError("Error: More than one flux level")

        em_ds = em_ds.drop_vars(names="lev")

    if species is None:
        species = metadata.get("species", "NA")

    return FluxData(
        data=em_ds,
        metadata=metadata,
        flux={},
        bc={},
        species=species,
        scales="FIXME",
        units="FIXME",
    )
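# --- Usage sketch (illustrative, not part of the module) ---
# get_flux() wraps search() and recombine_datasets() to return a FluxData
# object for a single species/source/domain. The values below are
# placeholders and assume matching emissions data is already in the store.
def _example_get_flux() -> None:
    flux = get_flux(species="ch4", sources="anthro", domain="EUROPE")
    print(flux.metadata)
    print(flux.data)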
def read_file(
    filepath: Union[str, Path],
    site: str,
    height: str,
    domain: str,
    model: str,
    metmodel: Optional[str] = None,
    species: Optional[str] = None,
    network: Optional[str] = None,
    retrieve_met: bool = False,
    overwrite: bool = False,
    high_res: bool = False,
    # model_params: Optional[Dict] = None,
) -> Dict[str, str]:
    """Reads footprints data files and returns the UUIDs of the Datasources
    the processed data has been assigned to

    Args:
        filepath: Path of file to load
        site: Site name
        network: Network name
        height: Height above ground level in metres
        domain: Domain of footprints
        model_params: Model run parameters
        retrieve_met: Whether to also download meteorological data for this footprints area
        overwrite: Overwrite any currently stored data
    Returns:
        dict: UUIDs of Datasources data has been assigned to
    """
    from collections import defaultdict
    from xarray import open_dataset
    from openghg.util import (
        hash_file,
        timestamp_tzaware,
        timestamp_now,
        clean_string,
    )
    from openghg.store import assign_data

    filepath = Path(filepath)

    site = clean_string(site)
    network = clean_string(network)
    height = clean_string(height)
    domain = clean_string(domain)

    fp = Footprints.load()

    file_hash = hash_file(filepath=filepath)
    if file_hash in fp._file_hashes and not overwrite:
        raise ValueError(
            f"This file has been uploaded previously with the filename : {fp._file_hashes[file_hash]}."
        )

    fp_data = open_dataset(filepath)

    # Need to read the metadata from the footprints and then store it
    # Do we need to chunk the footprints / will a Datasource store it correctly?
    metadata: Dict[str, Union[str, float, List[float]]] = {}

    metadata["data_type"] = "footprints"
    metadata["site"] = site
    metadata["height"] = height
    metadata["domain"] = domain
    metadata["model"] = model

    if species is not None:
        metadata["species"] = clean_string(species)

    if network is not None:
        metadata["network"] = clean_string(network)

    if metmodel is not None:
        metadata["metmodel"] = clean_string(metmodel)

    metadata["start_date"] = str(timestamp_tzaware(fp_data.time[0].values))
    metadata["end_date"] = str(timestamp_tzaware(fp_data.time[-1].values))

    metadata["max_longitude"] = round(float(fp_data["lon"].max()), 5)
    metadata["min_longitude"] = round(float(fp_data["lon"].min()), 5)
    metadata["max_latitude"] = round(float(fp_data["lat"].max()), 5)
    metadata["min_latitude"] = round(float(fp_data["lat"].min()), 5)

    metadata["time_resolution"] = "standard_time_resolution"

    # If it's a high resolution footprints file we'll have two sets of lat/long values
    if high_res:
        try:
            metadata["max_longitude_high"] = round(float(fp_data["lon_high"].max()), 5)
            metadata["min_longitude_high"] = round(float(fp_data["lon_high"].min()), 5)
            metadata["max_latitude_high"] = round(float(fp_data["lat_high"].max()), 5)
            metadata["min_latitude_high"] = round(float(fp_data["lat_high"].min()), 5)
            metadata["time_resolution"] = "high_time_resolution"
        except KeyError:
            raise KeyError("Unable to find lat_high or lon_high data.")

    metadata["heights"] = [float(h) for h in fp_data.height.values]
    # Do we also need to save all the variables we have available in this footprints?
    metadata["variables"] = list(fp_data.keys())

    # if model_params is not None:
    #     metadata["model_parameters"] = model_params

    # Set the attributes of this Dataset
    fp_data.attrs = {"author": "OpenGHG Cloud", "processed": str(timestamp_now())}

    # This might seem longwinded now but will help when we want to read
    # more than one footprints at a time
    key = "_".join((site, domain, model, height))

    footprint_data: DefaultDict[str, Dict[str, Union[Dict, Dataset]]] = defaultdict(dict)
    footprint_data[key]["data"] = fp_data
    footprint_data[key]["metadata"] = metadata

    # This will be removed when we process multiple files
    keyed_metadata = {key: metadata}

    lookup_results = fp.datasource_lookup(metadata=keyed_metadata)

    data_type = "footprints"
    datasource_uuids: Dict[str, str] = assign_data(
        data_dict=footprint_data,
        lookup_results=lookup_results,
        overwrite=overwrite,
        data_type=data_type,
    )

    fp.add_datasources(datasource_uuids=datasource_uuids, metadata=keyed_metadata)

    # Record the file hash in case we see this file again
    fp._file_hashes[file_hash] = filepath.name

    fp.save()

    return datasource_uuids
def save(self, bucket: Optional[str] = None) -> None:
    """Save this Datasource object as JSON to the object store

    Args:
        bucket: Bucket to hold data
    Returns:
        None
    """
    import tempfile
    from copy import deepcopy
    from openghg.util import timestamp_now
    from openghg.objectstore import (
        get_bucket,
        set_object_from_file,
        set_object_from_json,
    )

    if bucket is None:
        bucket = get_bucket()

    if self._data:
        # Ensure we have the latest key
        if "latest" not in self._data_keys:
            self._data_keys["latest"] = {}

        # Backup the old data keys at "latest"
        version_str = f"v{str(len(self._data_keys))}"

        # Store the keys for the new data
        new_keys = {}

        # Iterate over the keys (daterange string) of the data dictionary
        for daterange in self._data:
            data_key = f"{Datasource._data_root}/uuid/{self._uuid}/{version_str}/{daterange}"

            new_keys[daterange] = data_key
            data = self._data[daterange]

            # TODO - for now just create a temporary directory - will have to update Acquire
            # or work on a PR for xarray to allow returning a NetCDF as bytes
            with tempfile.TemporaryDirectory() as tmpdir:
                filepath = f"{tmpdir}/temp.nc"
                data.to_netcdf(filepath)
                set_object_from_file(bucket=bucket, key=data_key, filename=filepath)

        # Copy the last version
        if "latest" in self._data_keys:
            self._data_keys[version_str] = deepcopy(self._data_keys["latest"])

        # Save the new keys and create a timestamp
        self._data_keys[version_str]["keys"] = new_keys
        self._data_keys[version_str]["timestamp"] = str(timestamp_now())  # type: ignore

        # Link latest to the newest version
        self._data_keys["latest"] = self._data_keys[version_str]
        self._latest_version = version_str

    self._stored = True
    datasource_key = f"{Datasource._datasource_root}/uuid/{self._uuid}"

    set_object_from_json(bucket=bucket, key=datasource_key, data=self.to_data())
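# --- Layout sketch (illustrative, not part of the module) ---
# Based on the save() logic above, _data_keys is expected to end up roughly
# like the structure below after the first save: a "v1" entry holding the new
# object store keys plus a "latest" alias pointing at the newest version.
# The data root, UUID, daterange and timestamp values are placeholders.
_example_data_keys = {
    "v1": {
        "keys": {
            "<daterange>": "<data_root>/uuid/<datasource-uuid>/v1/<daterange>",
        },
        "timestamp": "<timestamp of save>",
    },
    # "latest" holds the same content as the newest version
    "latest": {
        "keys": {
            "<daterange>": "<data_root>/uuid/<datasource-uuid>/v1/<daterange>",
        },
        "timestamp": "<timestamp of save>",
    },
}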
def get_attributes(
    ds: Dataset,
    species: str,
    site: str,
    network: str = None,
    global_attributes: Dict[str, str] = None,
    units: str = None,
    scale: str = None,
    sampling_period: str = None,
    date_range: List[str] = None,
) -> Dataset:
    """This function writes attributes to an xarray.Dataset so that they conform with
    the CF Convention v1.6

    Attributes of the xarray DataSet are modified, and variable names are changed

    If the species is a standard mole fraction then either:
        - the species name will be used in lower case in the file and variable names
            but with any hyphens taken out
        - the name will be changed according to the species_translator dictionary

    If the species is isotopic data or a non-standard variable (e.g. APO):
        - Isotopes species names should begin with a "D"
            (Annoyingly, the code currently picks up "Desflurane" too. I've fixed this for now,
            but if we get a lot of other "D" species, we should make this better)
        - I suggest naming for isotopologues should be d<species><isotope>, e.g.
            dCH4C13, or dCO2C14
        - Any non-standard variables should be listed in the species_translator dictionary

    Args:
        ds: Should contain variables such as "ch4", "ch4 repeatability".
            Must have a "time" dimension.
        species: Species name. e.g. "CH4", "HFC-134a", "dCH4C13"
        site: Three-letter site code
        network: Network site is associated with
        global_attributes: Dictionary containing any info you want to
            add to the file header (e.g. {"Contact": "Contact_Name"})
        units: This routine will try to guess the units
            unless this is specified. Options are in unit_interpret
        scale: Calibration scale for species.
        sampling_period: Number of seconds for which air
            sample is taken. Only for time variable attribute
        date_range: Start and end date for output
            If you only want an end date, just put a very early start date
            (e.g. ["1900-01-01", "2010-01-01"])
    """
    from pandas import Timestamp as pd_Timestamp
    from openghg.util import clean_string, load_json, timestamp_now

    # from numpy import unique as np_unique

    if not isinstance(ds, Dataset):
        raise TypeError("This function only accepts xarray Datasets")

    # Current CF Conventions (v1.7) demand that valid variable names
    # begin with a letter and be composed of letters, digits and underscores
    # Here variable names are also made lowercase to enable easier matching below

    # TODO - could I just cast ds.variables as a type for mypy instead of doing this?
    # variable_names = [str(v) for v in ds.variables]
    # Is this better?
    variable_names = cast(Dict[str, Any], ds.variables)
    to_underscores = {var: var.lower().replace(" ", "_") for var in variable_names}
    ds = ds.rename(to_underscores)  # type: ignore

    species_attrs = load_json(filename="species_attributes.json")
    attributes_data = load_json("attributes.json")

    species_translator = attributes_data["species_translation"]
    unit_species = attributes_data["unit_species"]
    unit_species_long = attributes_data["unit_species_long"]
    unit_interpret = attributes_data["unit_interpret"]

    species_upper = species.upper()
    species_lower = species.lower()

    variable_names = cast(Dict[str, Any], ds.variables)
    matched_keys = [var for var in variable_names if species_lower in var]

    # If we don't have any variables to rename, raise an error
    if not matched_keys:
        raise NameError(f"Cannot find species {species} in Dataset variables")

    species_rename = {}
    for var in matched_keys:
        try:
            species_label = species_translator[species_upper]["chem"]
        except KeyError:
            species_label = clean_string(species_lower)

        species_rename[var] = var.replace(species_lower, species_label)

    ds = ds.rename(species_rename)  # type: ignore

    # Global attributes
    global_attributes_default = {
        "conditions_of_use": "Ensure that you contact the data owner at the outset of your project.",
        "source": "In situ measurements of air",
        "Conventions": "CF-1.6",
    }

    if global_attributes is not None:
        # TODO - for some reason mypy doesn't see a Dict[str,str] as a valid Mapping[Hashable, Any] type
        global_attributes.update(global_attributes_default)  # type: ignore
    else:
        global_attributes = global_attributes_default

    global_attributes["file_created"] = str(timestamp_now())
    global_attributes["processed_by"] = "OpenGHG_Cloud"
    global_attributes["species"] = species_label

    if scale is None:
        global_attributes["calibration_scale"] = "unknown"
    else:
        global_attributes["calibration_scale"] = scale

    # Update the Dataset attributes
    ds.attrs.update(global_attributes)  # type: ignore

    # Add some site attributes
    site_attributes = _site_info_attributes(site.upper(), network)
    ds.attrs.update(site_attributes)

    # Species-specific attributes
    # Long name
    if (species_upper.startswith("D") and species_upper != "DESFLURANE") or species_upper == "APO":
        sp_long = species_translator[species_upper]["name"]
    elif species_upper == "RN":
        sp_long = "radioactivity_concentration_of_222Rn_in_air"
    elif species_upper in species_translator:
        name = species_translator[species_upper]["name"]
        sp_long = f"mole_fraction_of_{name}_in_air"
    else:
        sp_long = f"mole_fraction_of_{species_label}_in_air"

    ancillary_variables = []

    variable_names = cast(Dict[str, Any], ds.variables)
    matched_keys = [var for var in variable_names if species_lower in var.lower()]

    # Write units as attributes to variables containing any of these
    match_words = ["variability", "repeatability", "stdev", "count"]

    for key in variable_names:
        key = key.lower()

        if species_label.lower() in key:
            # Standard name attribute
            # ds[key].attrs["standard_name"]=key.replace(species_label, sp_long)
            ds[key].attrs["long_name"] = key.replace(species_label, sp_long)

            # If units are required for variable, add attribute
            if key == species_label or any(word in key for word in match_words):
                if units is not None:
                    if units in unit_interpret:
                        ds[key].attrs["units"] = unit_interpret[units]
                    else:
                        ds[key].attrs["units"] = unit_interpret["else"]
                else:
                    # TODO - merge these species attributes into a single simpler JSON
                    try:
                        ds[key].attrs["units"] = unit_species[species_upper]
                    except KeyError:
                        try:
                            ds[key].attrs["units"] = species_attrs[species_label.upper()]["units"]
                        except KeyError:
                            ds[key].attrs["units"] = "NA"

                # If units are non-standard, add explanation
                if species_upper in unit_species_long:
                    ds[key].attrs["units_description"] = unit_species_long[species_upper]

            # Add to list of ancilliary variables
            if key != species_label:
                ancillary_variables.append(key)

    # TODO - for the moment skip this step - check status of ancilliary variables in standard
    # Write ancilliary variable list
    # ds[species_label].attrs["ancilliary_variables"] = ", ".join(ancillary_variables)

    # Add quality flag attributes
    # NOTE - I've removed the whitespace before status_flag and integration_flag here
    variable_names = cast(Dict[str, Any], ds.variables)
    quality_flags = [key for key in variable_names if "status_flag" in key]

    # Not getting long_name for c2f6
    for key in quality_flags:
        ds[key] = ds[key].astype(int)

        try:
            long_name = ds[species_label].attrs["long_name"]
        except KeyError:
            raise KeyError(key, quality_flags)

        ds[key].attrs = {
            "flag_meaning": "0 = unflagged, 1 = flagged",
            "long_name": f"{long_name} status_flag",
        }

    variable_names = cast(Dict[str, Any], ds.variables)
    # Add integration flag attributes
    integration_flags = [key for key in variable_names if "integration_flag" in key]

    for key in integration_flags:
        ds[key] = ds[key].astype(int)
        long_name = ds[species_label].attrs["long_name"]

        ds[key].attrs = {
            "flag_meaning": "0 = area, 1 = height",
            "standard_name": f"{long_name} integration_flag",
            "comment": "GC peak integration method (by height or by area). Does not indicate data quality",
        }

    # Set time encoding
    # Check if there are duplicate time stamps

    # I feel there should be a more pandas way of doing this
    # but xarray doesn't currently have a duplicates method
    # See this https://github.com/pydata/xarray/issues/2108

    # if len(set(ds.time.values)) < len(ds.time.values):
    # if len(np_unique(ds.time.values)) < len(ds.time.values):
    #     print("WARNING. Duplicate time stamps")

    first_year = pd_Timestamp(str(ds.time[0].values)).year

    ds.time.encoding = {"units": f"seconds since {str(first_year)}-01-01 00:00:00"}

    time_attributes: Dict[str, str] = {}
    time_attributes["label"] = "left"
    time_attributes["standard_name"] = "time"
    time_attributes["comment"] = (
        "Time stamp corresponds to beginning of sampling period. "
        + "Time since midnight UTC of reference date. "
        + "Note that sampling periods are approximate."
    )

    if sampling_period is not None:
        time_attributes["sampling_period_seconds"] = sampling_period

    ds.time.attrs.update(time_attributes)

    # If a date range is specified, slice dataset
    if date_range:
        ds = ds.loc[dict(time=slice(*date_range))]

    return ds
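# --- Usage sketch (illustrative, not part of the module) ---
# A minimal Dataset with a "time" dimension and a species variable, passed
# through get_attributes(). The site/network values are placeholders and the
# call assumes the packaged attribute JSON files are available via load_json.
def _example_get_attributes() -> None:
    import numpy as np
    import pandas as pd
    from xarray import Dataset as xr_Dataset

    times = pd.date_range("2016-01-01", periods=3, freq="H")
    ds = xr_Dataset(
        {
            "ch4": ("time", np.array([1900.0, 1905.0, 1910.0])),
            "ch4 repeatability": ("time", np.array([0.2, 0.2, 0.3])),
        },
        coords={"time": times},
    )

    ds = get_attributes(ds=ds, species="ch4", site="BSD", network="DECC", units="ppb")
    print(ds.attrs["species"], ds.ch4.attrs.get("units"))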