def get_rank(self: T, uuid: str, start_date: Timestamp, end_date: Timestamp) -> Dict:
    """Look up the rank(s) a Datasource holds over a date range

    Args:
        uuid: UUID of Datasource
        start_date: Start of the period to search
        end_date: End of the period to search
    Returns:
        dict: Maps each rank to the list of stored dateranges carrying that
        rank which overlap the search period. An empty dict is returned if
        no ranking data is stored for this UUID.
    """
    from collections import defaultdict
    from openghg.util import create_daterange_str, daterange_overlap

    # Membership test rather than indexing so that the lookup itself can
    # never insert a key into the ranking store
    if uuid not in self._rank_data:
        return {}

    query_range = create_daterange_str(start=start_date, end=end_date)

    matches = defaultdict(list)
    for stored_range, stored_rank in self._rank_data[uuid].items():
        # Only dateranges that intersect the search period are of interest
        if not daterange_overlap(daterange_a=query_range, daterange_b=stored_range):
            continue
        matches[stored_rank].append(stored_range)

    return matches
def test_daterange_overlap():
    """daterange_overlap is True for intersecting ranges and False for disjoint ones"""
    first_half = create_daterange_str(start="2001-01-01", end="2001-06-30")

    # Starts inside the first range, so the two intersect
    intersecting = create_daterange_str(start="2001-02-01", end="2001-09-01")
    assert daterange_overlap(daterange_a=first_half, daterange_b=intersecting) is True

    # Starts the day after the first range ends, so there is no intersection
    disjoint = create_daterange_str(start="2001-07-01", end="2001-11-01")
    assert daterange_overlap(daterange_a=first_half, daterange_b=disjoint) is False
def add_field_data(self, data: Dataset, data_type: str) -> None:
    """Store a field Dataset (e.g. footprints) on this Datasource

    The Dataset is stored whole, keyed by the daterange string it covers.
    Any segment already held whose daterange overlaps the new one is dropped
    from the current data before the new Dataset is added.

    TODO - unsure if add_field_data is the best name for this function
    Could add a more general function that allows toggle of chunking

    Args:
        data: Footprint data in an xarray.Dataset
        data_type: Type of data (footprints, flux, met)
    Returns:
        None
    """
    from openghg.util import daterange_overlap

    # For the moment footprints aren't chunked: a single entry, keyed by the
    # daterange string covering the whole Dataset
    incoming_range = self.get_dataset_daterange_str(dataset=data)
    incoming = {incoming_range: data}

    if self._data:
        # We don't want the same data twice, so only carry over the segments
        # that do not overlap the incoming daterange
        merged = {
            stored_range: self._data[stored_range]
            for stored_range in self._data
            if not daterange_overlap(daterange_a=stored_range, daterange_b=incoming_range)
        }
        merged.update(incoming)
        self._data = merged
    else:
        self._data = incoming

    self._data_type = data_type
    self.add_metadata_key(key="data_type", value=data_type)
    self.update_daterange()
def add_timeseries_data(self, data: Dataset) -> None:
    """Add timeseries data to this Datasource

    The data is grouped by calendar year and every non-empty yearly group is
    stored under the daterange string it covers. Segments already held whose
    daterange overlaps ANY of the incoming dateranges are dropped from the
    current data; all other existing segments are kept.

    Args:
        data: An xarray.Dataset
    Returns:
        None
    """
    from openghg.util import daterange_overlap

    # Group by year, skipping any group that evaluates as falsy
    yearly_chunks = [chunk for _, chunk in data.groupby("time.year") if chunk]

    # Use a dictionary keyed with the daterange covered by each segment of data
    additional_data = {self.get_dataset_daterange_str(dataset=chunk): chunk for chunk in yearly_chunks}

    if self._data:
        # We don't want the same data twice. An existing segment is kept only
        # if it overlaps none of the new dateranges. Testing each
        # (existing, new) pair independently would retain a segment that
        # overlaps one yearly chunk but not another, and would discard all
        # existing data when there is nothing new to add.
        updated_data = {
            current_daterange: self._data[current_daterange]
            for current_daterange in self._data
            if not any(
                daterange_overlap(daterange_a=current_daterange, daterange_b=new_daterange)
                for new_daterange in additional_data
            )
        }
        # Add in the additional new data
        updated_data.update(additional_data)
        self._data = updated_data
    else:
        self._data = additional_data

    data_type = "timeseries"
    self._data_type = data_type
    self.add_metadata_key(key="data_type", value=data_type)
    self.update_daterange()
def set_rank(
    self: T,
    uuid: str,
    rank: Union[int, str],
    date_range: Union[str, List[str]],
    overwrite: Optional[bool] = False,
) -> None:
    """Set the rank of a Datasource associated with this object.

    This function performs checks to ensure multiple ranks aren't set for
    overlapping dateranges.

    Passing a daterange and rank to this function will overwrite any current
    daterange stored for that rank.

    Args:
        uuid: UUID of Datasource
        rank: Rank of data, converted with int(). Must lie between 1 and 10
        date_range: Daterange(s)
        overwrite: Overwrite current ranking data. When False, a daterange
            that overlaps an existing daterange of a different rank is an error
    Returns:
        None
    Raises:
        TypeError: If rank is outside 1 to 10 (exception type kept for
            backwards compatibility with existing callers)
        ValueError: If a daterange overlaps an existing daterange that has a
            different rank and overwrite is not set
    """
    from copy import deepcopy
    from openghg.util import (
        combine_dateranges,
        daterange_overlap,
        trim_daterange,
        daterange_contains,
        split_encompassed_daterange,
        sanitise_daterange,
    )

    rank = int(rank)

    if not 1 <= rank <= 10:
        raise TypeError("Rank can only take values 1 to 10 (for unranked). Where 1 is the highest rank.")

    if not isinstance(date_range, list):
        date_range = [date_range]

    # Make sure the dateranges passed are correct and are tz-aware
    date_range = [sanitise_daterange(d) for d in date_range]
    # Combine in case we have overlapping dateranges
    date_range = combine_dateranges(date_range)

    # (existing, new) pairs for stored dateranges that must be rewritten to
    # ensure no daterange overlap
    to_update = []
    # Non-overlapping dateranges that can be stored directly
    to_add = []

    if uuid in self._rank_data:
        rank_data = self._rank_data[uuid]
        # Check this source isn't ranked differently for the same dates
        for new_daterange in date_range:
            overlap = False
            # Check for overlapping dateranges and add
            for existing_daterange, existing_rank in rank_data.items():
                if daterange_overlap(daterange_a=new_daterange, daterange_b=existing_daterange):
                    overlap = True

                    if rank != existing_rank and overwrite:
                        # Save the daterange we need to update
                        to_update.append((existing_daterange, new_daterange))
                    # If the ranks are the same we just want to combine the dateranges
                    elif rank == existing_rank:
                        to_combine = [new_daterange, existing_daterange]
                        combined = combine_dateranges(dateranges=to_combine)[0]
                        to_update.append((existing_daterange, combined))
                    else:
                        # Adjacent f-strings are joined by the parser, so no
                        # source indentation ends up inside the message
                        raise ValueError(
                            f"This datasource has rank {existing_rank} for dates that overlap the ones given. "
                            f"Overlapping dateranges are {new_daterange} and {existing_daterange}"
                        )

            # Otherwise we just want to add the new daterange to the dict
            if not overlap:
                to_add.append(new_daterange)

        # If we've got dateranges to update and ranks to overwrite we need to trim the
        # previous ranking daterange down so we don't have overlapping dateranges
        if to_update:
            # Here we first take a backup of the old ranking data, update
            # it and then write it back
            ranking_backup = deepcopy(rank_data)

            for existing, new in to_update:
                # Remove the existing daterange key
                # Here we pass if it doesn't exist as if we have multiple new dateranges
                # that overlap the existing daterange it might have been deleted during
                # a previous iteration
                try:
                    del ranking_backup[existing]
                except KeyError:
                    pass

                # Rank previously stored for the existing daterange, read from
                # the untouched original mapping
                rank_copy = rank_data[existing]

                # If we want to overwrite an existing rank we need to trim that daterange and
                # rewrite it back to the dictionary
                if overwrite:
                    if existing == new:
                        # Identical daterange: the requested rank replaces the
                        # old one. When the ranks already match this is a
                        # no-op; when they differ, storing the old rank here
                        # would silently ignore the overwrite.
                        ranking_backup[new] = rank
                    # If the existing daterange contains the new daterange
                    # we need to split it into parts and save those
                    elif daterange_contains(container=existing, contained=new):
                        result = split_encompassed_daterange(container=existing, contained=new)

                        existing_start = result["container_start"]
                        ranking_backup[existing_start] = rank_copy

                        updated_new = result["contained"]
                        ranking_backup[updated_new] = rank

                        # We might only end up with two dateranges
                        try:
                            existing_end = result["container_end"]
                            ranking_backup[existing_end] = rank_copy
                        except KeyError:
                            pass
                    # If the new daterange contains the existing we can just overwrite it
                    elif daterange_contains(container=new, contained=existing):
                        ranking_backup[new] = rank
                    else:
                        # Partial overlap: the old rank keeps the trimmed
                        # daterange and the new rank takes the new daterange
                        trimmed = trim_daterange(to_trim=existing, overlapping=new)
                        ranking_backup[trimmed] = rank_copy
                        ranking_backup[new] = rank
                elif rank_copy == rank:
                    # If we're not overwriting we just need to update to use the new combined
                    ranking_backup[new] = rank_copy

            self._rank_data[uuid] = ranking_backup

        # Finally, store the dateranges that didn't overlap
        for d in to_add:
            self._rank_data[uuid][d] = rank
    else:
        # No ranking data stored yet for this UUID
        for d in date_range:
            self._rank_data[uuid][d] = rank

    self.save()