def test_key_date_compare():
    d = Datasource()

    keys = {
        "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00": "data/uuid/test-uid/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00": "data/uuid/test-uid/v1/2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00": "data/uuid/test-uid/v1/2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00",
        "2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00": "data/uuid/test-uid/v1/2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00",
        "2018-02-18-15:42:30+00:00_2018-12-18-15:42:30+00:00": "data/uuid/test-uid/v1/2018-02-18-15:42:30+00:00_2018-12-18-15:42:30+00:00",
        "2019-02-03-17:38:30+00:00_2019-12-09-10:47:30+00:00": "data/uuid/test-uid/v1/2019-02-03-17:38:30+00:00_2019-12-09-10:47:30+00:00",
        "2020-02-01-18:08:30+00:00_2020-12-01-22:31:30+00:00": "data/uuid/test-uid/v1/2020-02-01-18:08:30+00:00_2020-12-01-22:31:30+00:00",
    }

    start = timestamp_tzaware("2014-01-01")
    end = timestamp_tzaware("2018-01-01")

    in_date = d.key_date_compare(keys=keys, start_date=start, end_date=end)

    expected = [
        "data/uuid/test-uid/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "data/uuid/test-uid/v1/2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "data/uuid/test-uid/v1/2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00",
        "data/uuid/test-uid/v1/2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00",
    ]

    assert in_date == expected

    start = timestamp_tzaware("2053-01-01")
    end = timestamp_tzaware("2092-01-01")

    in_date = d.key_date_compare(keys=keys, start_date=start, end_date=end)

    assert not in_date

    error_key = {
        "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00_2014-11-30-11:23:30+00:00": "broken"
    }

    with pytest.raises(ValueError):
        in_date = d.key_date_compare(keys=error_key, start_date=start, end_date=end)
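
# The test above relies on the key format used by the Datasource keys: each key is two
# timestamps joined by an underscore, and anything else is rejected with a ValueError.
# Below is a minimal standalone sketch of that parsing - split_daterange_key is a
# hypothetical helper for illustration, not the actual key_date_compare implementation.
from typing import Tuple
from pandas import Timestamp


def split_daterange_key(key: str) -> Tuple[Timestamp, Timestamp]:
    """Split a 'start_end' daterange key into two timezone-aware Timestamps."""
    parts = key.split("_")
    if len(parts) != 2:
        raise ValueError(f"Invalid daterange key: {key}")

    def _parse(ts: str) -> Timestamp:
        # Keys look like 2014-01-30-11:12:30+00:00: the third hyphen separates
        # the date from the time, so swap it for a space before parsing
        year, month, day, time_part = ts.split("-", 3)
        return Timestamp(f"{year}-{month}-{day} {time_part}")

    return _parse(parts[0]), _parse(parts[1])


start, end = split_daterange_key("2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00")
assert start < end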
def from_data(cls: Type[T], bucket: str, data: Dict, shallow: bool) -> T:
    """Construct a Datasource from JSON

    Args:
        bucket: Bucket containing data
        data: JSON data
        shallow: Load only the JSON data, do not retrieve data from the object store
    Returns:
        Datasource: Datasource created from JSON
    """
    from openghg.util import timestamp_tzaware

    d = cls()
    d._uuid = data["UUID"]
    d._creation_datetime = timestamp_tzaware(data["creation_datetime"])
    d._metadata = data["metadata"]
    d._stored = data["stored"]
    d._data_keys = data["data_keys"]
    d._data = {}
    d._data_type = data["data_type"]
    d._latest_version = data["latest_version"]

    if d._stored and not shallow:
        for date_key in d._data_keys["latest"]["keys"]:
            data_key = d._data_keys["latest"]["keys"][date_key]
            d._data[date_key] = Datasource.load_dataset(bucket=bucket, key=data_key)

    d._stored = False
    d.update_daterange()

    return d
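
# A standalone sketch of the shallow/deep decision that from_data makes: only when the
# stored flag is set and the caller did not ask for a shallow load are datasets pulled
# from the object store. The data dict shape mirrors the JSON read above; the helper
# below is illustrative only, not the library implementation.
from typing import Any, Dict


def keys_to_fetch(data: Dict[str, Any], shallow: bool) -> Dict[str, str]:
    """Return the object store keys a non-shallow load would retrieve."""
    if not data["stored"] or shallow:
        return {}
    # Mirror the loop in from_data: iterate the latest version's keys
    return dict(data["data_keys"]["latest"]["keys"])


example = {
    "stored": True,
    "data_keys": {
        "latest": {
            "keys": {
                "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00": "data/uuid/test-uid/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00"
            }
        }
    },
}

assert keys_to_fetch(example, shallow=True) == {}
assert len(keys_to_fetch(example, shallow=False)) == 1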
def in_daterange(self, start_date: Union[str, Timestamp], end_date: Union[str, Timestamp]) -> bool:
    """Check if the data contained within this Datasource overlaps with the dates given.

    Args:
        start_date: Start datetime
        end_date: End datetime
    Returns:
        bool: True if overlap
    """
    from openghg.util import timestamp_tzaware

    # if self._start_date is None or self._end_date is None:
    #     self.update_daterange()

    start_date = timestamp_tzaware(start_date)
    end_date = timestamp_tzaware(end_date)

    return bool((start_date <= self._end_date) and (end_date >= self._start_date))
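
# The check above is the standard closed-interval overlap test: two ranges overlap when
# each starts before (or at) the point where the other ends. A self-contained sketch
# using the same dateranges as the test data above:
from pandas import Timestamp


def ranges_overlap(start_a: Timestamp, end_a: Timestamp, start_b: Timestamp, end_b: Timestamp) -> bool:
    """True if the closed intervals [start_a, end_a] and [start_b, end_b] overlap."""
    return bool(start_a <= end_b and start_b <= end_a)


data_start = Timestamp("2014-01-30 11:12:30", tz="UTC")
data_end = Timestamp("2014-11-30 11:23:30", tz="UTC")

assert ranges_overlap(Timestamp("2014-06-01", tz="UTC"), Timestamp("2015-01-01", tz="UTC"), data_start, data_end)
assert not ranges_overlap(Timestamp("2053-01-01", tz="UTC"), Timestamp("2092-01-01", tz="UTC"), data_start, data_end)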
def get_dataframe_daterange(self, dataframe: DataFrame) -> Tuple[Timestamp, Timestamp]:
    """Returns the daterange for the passed DataFrame

    Args:
        dataframe: DataFrame to parse
    Returns:
        tuple (Timestamp, Timestamp): Start and end Timestamps for data
    """
    from pandas import DatetimeIndex
    from openghg.util import timestamp_tzaware

    if not isinstance(dataframe.index, DatetimeIndex):
        raise TypeError("Only DataFrames with a DatetimeIndex can be passed")

    # Here we want to make the pandas Timestamps timezone aware
    start = timestamp_tzaware(dataframe.first_valid_index())
    end = timestamp_tzaware(dataframe.last_valid_index())

    return start, end
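
# The same bounds lookup can be checked against plain pandas. This sketch builds a small
# DataFrame with a DatetimeIndex and localises the first/last valid index entries to UTC,
# which is assumed to match what timestamp_tzaware does for timezone-naive input.
import pandas as pd

index = pd.to_datetime(["2014-01-30 11:12:30", "2014-06-15 09:00:00", "2014-11-30 11:23:30"])
df = pd.DataFrame({"mf": [1.0, 2.0, 3.0]}, index=index)

start = pd.Timestamp(df.first_valid_index()).tz_localize("UTC")
end = pd.Timestamp(df.last_valid_index()).tz_localize("UTC")

assert start < end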
def retrieve_met(
    site: str,
    network: str,
    years: Union[str, List[str]],
    variables: Optional[List[str]] = None,
) -> METData:
    """Retrieve METData data. Note that this function will only download a full year of data,
    which may take some time.

    This function currently only retrieves data from the "reanalysis-era5-pressure-levels"
    dataset but may be modified for other datasets in the future.

    Args:
        site: Three letter site code
        network: Network
        years: Year(s) of data required
        variables: ERA5 variables to retrieve, defaults to the u and v wind components
    Returns:
        METData: METData object holding data and metadata
    """
    import cdsapi
    from openghg.dataobjects import METData
    from openghg.util import timestamp_tzaware

    if variables is None:
        variables = ["u_component_of_wind", "v_component_of_wind"]

    latitude, longitude, site_height, inlet_heights = _get_site_data(site=site, network=network)

    # Get the area to retrieve data for
    ecmwf_area = _get_ecmwf_area(site_lat=latitude, site_long=longitude)

    # Calculate the pressure at measurement height(s)
    measure_pressure = _get_site_pressure(inlet_heights=inlet_heights, site_height=site_height)

    # Calculate the ERA5 pressure levels required
    ecmwf_pressure_levels = _altitude_to_ecmwf_pressure(measure_pressure=measure_pressure)

    if not isinstance(years, list):
        years = [years]
    else:
        years = sorted(years)

    # TODO - we might need to customise this further in the future to
    # request other types of weather data
    request = {
        "product_type": "reanalysis",
        "format": "netcdf",
        "variable": variables,
        "pressure_level": ecmwf_pressure_levels,
        "year": [str(x) for x in years],
        "month": [str(x).zfill(2) for x in range(1, 13)],
        "day": [str(x).zfill(2) for x in range(1, 32)],
        "time": [f"{str(x).zfill(2)}:00" for x in range(0, 24)],
        "area": ecmwf_area,
    }

    cds_client = cdsapi.Client()
    dataset_name = "reanalysis-era5-pressure-levels"

    # Retrieve metadata from Copernicus about the dataset, this includes
    # the location of the data netCDF file.
    result = cds_client.retrieve(name=dataset_name, request=request)

    # Download the data itself
    dataset = _download_data(url=result.location)

    # We replace the date data with a start and end date here
    start_date = str(timestamp_tzaware(f"{years[0]}-1-1"))
    end_date = str(timestamp_tzaware(f"{years[-1]}-12-31"))

    metadata = {
        "product_type": request["product_type"],
        "format": request["format"],
        "variable": request["variable"],
        "pressure_level": request["pressure_level"],
        "area": request["area"],
        "site": site,
        "network": network,
        "start_date": start_date,
        "end_date": end_date,
    }

    return METData(data=dataset, metadata=metadata)
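
# Most of the CDS request is just exhaustive month/day/hour lists for the chosen years.
# This standalone sketch reproduces that part of the dictionary so the shape sent to the
# CDS API is easy to inspect; the variable, pressure level and area values here are
# illustrative, not derived from a real site.
example_years = ["2019", "2020"]

example_request = {
    "product_type": "reanalysis",
    "format": "netcdf",
    "variable": ["u_component_of_wind", "v_component_of_wind"],
    "pressure_level": ["975", "1000"],  # illustrative pressure levels
    "year": [str(x) for x in example_years],
    "month": [str(x).zfill(2) for x in range(1, 13)],
    "day": [str(x).zfill(2) for x in range(1, 32)],
    "time": [f"{str(x).zfill(2)}:00" for x in range(0, 24)],
    "area": [54.0, -11.0, 53.0, -9.0],  # illustrative bounding box
}

assert example_request["month"][0] == "01"
assert example_request["day"][-1] == "31"
assert example_request["time"][-1] == "23:00"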
def single_site_footprint(
    site: str,
    height: str,
    network: str,
    domain: str,
    species: str,
    start_date: Union[str, Timestamp],
    end_date: Union[str, Timestamp],
    resample_to: str = "coarsest",
    site_modifier: Optional[str] = None,
    platform: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dataset:
    """Creates a Dataset for a single site's measurement data and footprints

    Args:
        site: Site name
        height: Height of inlet in metres
        network: Network name
        domain: Footprint domain
        species: Species type
        start_date: Start date
        end_date: End date
        resample_to: Resample the data to a given time dataset.
            Valid options are ["obs", "footprints", "coarsest"].
            - "obs" resamples the footprints to the observation time series data
            - "footprints" resamples the observations to the footprints time series
            - "coarsest" resamples to the data with the coarsest time resolution
        site_modifier: The name of the site given in the footprints. This is useful for example
            if the same site footprints are run with a different met and they are named slightly
            differently from the obs file. E.g. site="DJI", site_modifier="DJI-SAM" - station
            called DJI, footprints site called DJI-SAM
        platform: Observation platform used to decide whether to resample
        instrument: Instrument name
    Returns:
        xarray.Dataset
    """
    from openghg.retrieve import get_obs_surface, get_footprint
    from openghg.util import timestamp_tzaware

    start_date = timestamp_tzaware(start_date)
    end_date = timestamp_tzaware(end_date)

    resample_to = resample_to.lower()
    resample_choices = ("obs", "footprints", "coarsest")
    if resample_to not in resample_choices:
        raise ValueError(
            f"Invalid resample choice {resample_to} passed, please select from one of {resample_choices}"
        )

    # As we're not retrieving any satellite data yet, just set the tolerance and platform to None
    tolerance = None
    platform = None

    # Here we want to use get_obs_surface
    obs_results = get_obs_surface(
        site=site,
        inlet=height,
        start_date=start_date,
        end_date=end_date,
        species=species,
        instrument=instrument,
    )

    obs_data = obs_results.data

    # Save the observation data units
    try:
        units: Union[float, None] = float(obs_data.mf.attrs["units"])
    except KeyError:
        units = None
    except AttributeError:
        raise AttributeError("Unable to read mf attribute from observation data.")

    # If the site for the footprints has a different name, pass that in
    if site_modifier:
        footprint_site = site_modifier
    else:
        footprint_site = site

    # Try to find an appropriate footprints file, first with and then without the species name
    try:
        footprint = get_footprint(
            site=footprint_site,
            domain=domain,
            height=height,
            start_date=start_date,
            end_date=end_date,
            species=species,
        )
    except ValueError:
        footprint = get_footprint(
            site=footprint_site,
            domain=domain,
            height=height,
            start_date=start_date,
            end_date=end_date,
        )

    # TODO: Add checks for particular species e.g. co2 and short-lived species
    # which should have a specific footprints available rather than the generic one

    # Extract dataset
    footprint_data = footprint.data

    # Align the two Datasets
    aligned_obs, aligned_footprint = align_datasets(
        obs_data=obs_data,
        footprint_data=footprint_data,
        platform=platform,
        resample_to=resample_to,
    )

    combined_dataset = combine_datasets(
        dataset_A=aligned_obs, dataset_B=aligned_footprint, tolerance=tolerance
    )

    # Transpose to keep time in the last dimension position in case it has been moved in resample
    combined_dataset = combined_dataset.transpose(..., "time")

    if units is not None:
        combined_dataset["fp"].values = combined_dataset["fp"].values / units
        # if HiTRes:
        #     combined_dataset.update({"fp_HiTRes": (combined_dataset.fp_HiTRes.dims, (combined_dataset.fp_HiTRes / units))})

    return combined_dataset
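
# The final unit handling simply rescales the footprint sensitivities by the mole fraction
# units read from the observation attributes. A self-contained sketch of that step; the toy
# data and the 1e-9 units value are illustrative.
import numpy as np
import xarray as xr

toy = xr.Dataset({"fp": (("lat", "lon", "time"), np.ones((2, 2, 3)))})
toy_units = 1e-9  # e.g. mole fractions reported in parts per billion

# Mirror the scaling in single_site_footprint: divide sensitivities by the units factor
toy["fp"].values = toy["fp"].values / toy_units

assert np.isclose(float(toy["fp"].max()), 1e9)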