def __init__(self):
    from HUGS.Util import load_hugs_json

    self._sampling_period = 0

    # Load site data
    data = load_hugs_json(filename="process_gcwerks_parameters.json")
    self._gc_params = data["GCWERKS"]

    # Site codes for inlet readings
    self._site_codes = load_hugs_json(filename="site_codes.json")
def __init__(self):
    from HUGS.Util import load_hugs_json

    # Holds parameters used for writing attributes to Datasets
    self._crds_params = {}
    # Sampling period of CRDS data in seconds
    self._sampling_period = 60

    data = load_hugs_json(filename="process_gcwerks_parameters.json")
    self._crds_params = data["CRDS"]
def __init__(self):
    from HUGS.Util import load_hugs_json

    self._eurocom_params = {}
    # Sampling period of EUROCOM data in seconds
    self._sampling_period = 60

    data = load_hugs_json(filename="attributes.json")
    self._eurocom_params = data["EUROCOM"]
def __init__(self):
    from HUGS.Util import load_hugs_json

    # Holds parameters used for writing attributes to Datasets
    self._tb_params = {}
    # Sampling period of data in seconds
    self._sampling_period = "NA"

    data = load_hugs_json(filename="attributes.json")
    self._tb_params = data["TMB"]
def synonyms(species: str) -> str:
    """ Check whether there is another name that we should be using for a
        particular input. For example, if CFC-11 or CFC11 is passed in, we use
        the canonical key used in acrg_species_info.json.

        Args:
            species (str): Input string that you're trying to match
        Returns:
            str: Matched species string
    """
    from HUGS.Util import load_hugs_json

    # Load in the species data
    species_data = load_hugs_json(filename="acrg_species_info.json")

    # First test whether the species matches a key (case insensitive)
    matched_strings = [k for k in species_data if k.upper() == species.upper()]

    # Used to access the alternative names in species_data
    alt_label = "alt"

    # If not found, search the synonyms
    if not matched_strings:
        for key in species_data:
            # Iterate over the alternative labels and check for a match
            matched_strings = [
                s for s in species_data[key][alt_label] if s.upper() == species.upper()
            ]

            if matched_strings:
                matched_strings = [key]
                break

    if matched_strings:
        updated_species = matched_strings[0]
        return updated_species
    else:
        raise ValueError(f"Unable to find synonym for species {species}")
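# Hedged usage sketch (not part of the original module), assuming the HUGS
# package and its bundled acrg_species_info.json are importable. The species
# strings below are illustrative inputs, not guaranteed entries in that file.
for label in ("CFC11", "cfc-11", "made-up-gas"):
    try:
        print(label, "->", synonyms(label))
    except ValueError as err:
        print(err)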
def get_site_attributes(self, site, inlet):
    """ Gets the site specific attributes for writing to Datasets

        Args:
            site (str): Site name
            inlet (str): Inlet height (example: 108m)
        Returns:
            dict: Dictionary of attributes
    """
    from HUGS.Util import load_hugs_json

    if not self._crds_params:
        data = load_hugs_json(filename="process_gcwerks_parameters.json")
        self._crds_params = data["CRDS"]

    try:
        attributes = self._crds_params[site.upper()]["global_attributes"]
    except KeyError:
        raise ValueError(f"Unable to read attributes for site: {site}")

    attributes["inlet_height_magl"] = inlet.split("_")[0]
    attributes["comment"] = self._crds_params["comment"]

    return attributes
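# Hedged usage sketch (not part of the original module): calling the method on an
# instance of the CRDS processing class (the class name "CRDS" is an assumption here),
# assuming "BSD" has a "global_attributes" block in process_gcwerks_parameters.json.
crds = CRDS()  # hypothetical instance of the class this method belongs to
site_attrs = crds.get_site_attributes(site="bsd", inlet="108m")
print(site_attrs["inlet_height_magl"])  # "108m" - the part before any underscore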
def read_data(self, data_filepath, site, network):
    """ Separates the gases stored in the dataframe into separate dataframes
        and returns a dictionary keyed by species, each entry holding the
        processed data, its metadata and the site attributes.

        Args:
            data_filepath (pathlib.Path): Path of datafile
            site (str): Site code
            network (str): Network name
        Returns:
            dict: Dictionary containing metadata, data and attributes keys
    """
    from datetime import datetime
    from pandas import RangeIndex, read_csv, NaT
    from HUGS.Util import load_hugs_json

    # At the moment we're using the filename as the source name
    # The file extension has already been removed by .stem
    source_name = data_filepath.stem
    # As we're not processing a list of datafiles here we'll only have one inlet
    inlet = source_name.split(".")[3]

    if "m" not in inlet.lower():
        raise ValueError(
            "No inlet found, we expect filenames such as: bsd.picarro.1minute.108m.dat"
        )

    # Function to parse the datetime format found in the datafile
    def parse_date(date):
        try:
            return datetime.strptime(date, "%y%m%d %H%M%S")
        except ValueError:
            return NaT

    data = read_csv(
        data_filepath,
        header=None,
        skiprows=1,
        sep=r"\s+",
        index_col=["0_1"],
        parse_dates=[[0, 1]],
        date_parser=parse_date,
    )
    data.index.name = "time"

    # Drop any rows with NaNs
    # This is now done before creating metadata
    data = data.dropna(axis="rows", how="any")

    # Get the number of gases in the dataframe and the number of columns of data present for each gas
    n_gases, n_cols = self.gas_info(data=data)

    header = data.head(2)
    skip_cols = sum([header[column][0] == "-" for column in header.columns])

    metadata = self.read_metadata(filepath=data_filepath, data=data)

    if network is not None:
        metadata["network"] = network

    # Read the calibration scales from JSON
    crds_data = load_hugs_json(filename="process_gcwerks_parameters.json")

    # This dictionary is used to store the gas data and its associated metadata
    combined_data = {}

    for n in range(n_gases):
        # Slice the columns belonging to this gas
        gas_data = data.iloc[:, skip_cols + n * n_cols: skip_cols + (n + 1) * n_cols]

        # Reset the column numbers
        gas_data.columns = RangeIndex(gas_data.columns.size)
        species = gas_data[0][0]
        species = species.lower()

        column_labels = [species, f"{species} stdev", f"{species} n_meas"]

        # Name columns
        gas_data = gas_data.set_axis(column_labels, axis="columns", inplace=False)

        header_rows = 2
        # Drop the first two rows now we have the column names
        gas_data = gas_data.drop(index=gas_data.head(header_rows).index, inplace=False)

        # Cast data to float64 / double
        gas_data = gas_data.astype("float64")

        # Here we can convert the Dataframe to a Dataset and then write the attributes
        gas_data = gas_data.to_xarray()

        site_attributes = self.get_site_attributes(site=site, inlet=inlet)

        scale = crds_data["CRDS"]["default_scales"].get(species.upper())

        # Create a copy of the metadata dict for this species
        species_metadata = metadata.copy()
        species_metadata["species"] = species
        species_metadata["inlet"] = inlet
        species_metadata["scale"] = scale

        combined_data[species] = {
            "metadata": species_metadata,
            "data": gas_data,
            "attributes": site_attributes,
        }

    return combined_data
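# Hedged sketch (not part of the original module): how the inlet is picked out of
# the expected filename convention, e.g. bsd.picarro.1minute.108m.dat. This only
# exercises the pathlib/str logic used at the top of read_data(); the filename is
# illustrative.
from pathlib import Path

example_path = Path("bsd.picarro.1minute.108m.dat")
source_name = example_path.stem          # "bsd.picarro.1minute.108m"
inlet = source_name.split(".")[3]        # "108m"
print(inlet)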
def get_single_site(
    site: str,
    species: str,
    network: Optional[str] = None,
    start_date: Optional[Union[str, Timestamp]] = None,
    end_date: Optional[Union[str, Timestamp]] = None,
    inlet: Optional[str] = None,
    average: Optional[str] = None,
    instrument: Optional[str] = None,
    keep_missing: Optional[bool] = False,
    calibration_scale: Optional[str] = None,
) -> list:
    """ Get measurements from one site as a list of xarray Datasets.

        If there are multiple instruments and inlets at a particular site, the
        acrg_obs_defaults.csv file may be referenced to determine which instrument
        and inlet to use for each time period. If an inlet or instrument changes
        at some point during the time period, multiple datasets will be returned,
        one for each inlet/instrument.

        Args:
            site: Site of interest, e.g. MHD for the Mace Head site.
            species: Species identifier, e.g. ch4 for methane.
            network: Network for the site/instrument.
            start_date: Output start date in a format that Pandas can interpret
            end_date: Output end date in a format that Pandas can interpret
            inlet: Inlet label. If you want to merge all inlets, use "all"
            average: Averaging period for each dataset. Each value should be a string
                of the form e.g. "2H", "30min" (should match pandas offset aliases format).
            instrument: Specific instrument for the site.
            keep_missing: Whether to keep missing data points or drop them.
            calibration_scale: Convert to this calibration scale (original scale and
                new scale must both be in acrg_obs_scale_convert.csv)
        Returns:
            list: List of xarray.Datasets
    """
    from pandas import Timestamp, Timedelta
    import numpy as np
    from xarray import concat as xr_concat

    from HUGS.LocalClient import Search
    from HUGS.Util import load_hugs_json

    site_info = load_hugs_json(filename="acrg_site_info.json")
    site = site.upper()

    if site not in site_info:
        raise ValueError(f"No site called {site}, please enter a valid site name.")

    # Ensure we have the Timestamps we expect
    if start_date is not None and not isinstance(start_date, Timestamp):
        start_date = Timestamp(start_date)
    if end_date is not None and not isinstance(end_date, Timestamp):
        end_date = Timestamp(end_date)

    # Find the correct synonym for the passed species
    species = synonyms(species)

    search = Search()

    results = search.search(
        species=species,
        locations=site,
        inlet=inlet,
        instrument=instrument,
        start_datetime=start_date,
        end_datetime=end_date,
    )

    # Retrieve all the data found
    selected_keys = [k for k in results]
    retrieved_data = search.retrieve(selected_keys=selected_keys)

    obs_files = []

    for key, dateranges in retrieved_data.items():
        for d in dateranges:
            split_dates = d.split("_")

            start_date = Timestamp(split_dates[0])
            end_date = Timestamp(split_dates[1])

            data = dateranges[d]

            if average is not None:
                if keep_missing is True:
                    # Create a dataset with one element and NaNs to prepend or append
                    ds_single_element = data[{"time": 0}]

                    for v in ds_single_element.variables:
                        if v != "time":
                            ds_single_element[v].values = np.nan

                    ds_concat = []

                    # Pad with an empty entry at the start date
                    if min(data.time) > Timestamp(start_date):
                        ds_single_element_start = ds_single_element.copy()
                        ds_single_element_start.time.values = Timestamp(start_date)
                        ds_concat.append(ds_single_element_start)

                    ds_concat.append(data)

                    # Pad with an empty entry at the end date
                    if max(data.time) < Timestamp(end_date):
                        ds_single_element_end = ds_single_element.copy()
                        ds_single_element_end.time.values = Timestamp(end_date) - Timedelta("1ns")
                        ds_concat.append(ds_single_element_end)

                    data = xr_concat(ds_concat, dim="time")

                    # Now sort to get everything in the right order
                    data = data.sortby("time")

                # First do a mean resample on all variables
                ds_resampled = data.resample(time=average, keep_attrs=True).mean(skipna=False)
                # keep_attrs doesn't seem to work for some reason, so manually copy
                ds_resampled.attrs = data.attrs.copy()

                # For some variables, need a different type of resampling
                for var in data.variables:
                    if "repeatability" in var:
                        ds_resampled[var] = (
                            np.sqrt((data[var] ** 2).resample(time=average).sum())
                            / data[var].resample(time=average).count()
                        )
                    elif "variability" in var:
                        # Calculate std of 1 min mf obs in av period as new vmf
                        ds_resampled[var] = (
                            data[var].resample(time=average, keep_attrs=True).std(skipna=False)
                        )

                    # Copy over some attributes
                    if "long_name" in data[var].attrs:
                        ds_resampled[var].attrs["long_name"] = data[var].attrs["long_name"]
                    if "units" in data[var].attrs:
                        ds_resampled[var].attrs["units"] = data[var].attrs["units"]

                data = ds_resampled.copy()

            # Rename variables
            rename = {}

            for var in data.variables:
                if var.lower() == species.lower():
                    rename[var] = "mf"
                if "repeatability" in var:
                    rename[var] = "mf_repeatability"
                if "variability" in var:
                    rename[var] = "mf_variability"
                if "number_of_observations" in var:
                    rename[var] = "mf_number_of_observations"
                if "status_flag" in var:
                    rename[var] = "status_flag"
                if "integration_flag" in var:
                    rename[var] = "integration_flag"

            data = data.rename_vars(rename)

            data.attrs["species"] = species

            if "Calibration_scale" in data.attrs:
                data.attrs["scale"] = data.attrs.pop("Calibration_scale")

            if calibration_scale is not None:
                data = scale_convert(data, species, calibration_scale)

            obs_files.append(data)

    # Now check that the units match for each of the observation Datasets
    units = set([f.mf.attrs["units"] for f in obs_files])
    if len(units) > 1:
        raise ValueError(
            f"Units do not match for these observation Datasets {[(f.mf.attrs['units'], f.attrs['filename']) for f in obs_files]}"
        )

    scales = set([f.attrs["scale"] for f in obs_files])
    if len(scales) > 1:
        print(
            f"Scales do not match for these observation Datasets {[(f.attrs['scale'], f.attrs['filename']) for f in obs_files]}"
        )
        print("Suggestion: set calibration_scale to convert scales")

    return obs_files
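# Hedged usage sketch (not part of the original module), assuming an object store
# already populated with Mace Head (MHD) methane data via the HUGS processing
# pipeline. The site, species, date range and averaging period are illustrative.
datasets = get_single_site(
    site="MHD",
    species="ch4",
    start_date="2014-01-01",
    end_date="2014-02-01",
    average="1H",
)
for ds in datasets:
    print(ds.attrs.get("scale"), ds["mf"].attrs.get("units"))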
def __init__(self):
    from HUGS.Util import load_hugs_json

    # Holds parameters used for writing attributes to Datasets
    data = load_hugs_json("attributes.json")
    self._noaa_params = data["NOAA"]