def trim_daterange(to_trim: str, overlapping: str) -> str:
    """Removes overlapping dates from to_trim

    Args:
        to_trim: Daterange to trim down. Dates that overlap
        with overlapping will be removed from to_trim
        overlapping: Daterange containing dates we want to trim
        from to_trim
    Returns:
        str: Trimmed daterange
    """
    from pandas import Timedelta

    if not daterange_overlap(daterange_a=to_trim, daterange_b=overlapping):
        raise ValueError(f"Dateranges {to_trim} and {overlapping} do not overlap")

    # We need to work out which way round they overlap
    start_trim, end_trim = split_daterange_str(to_trim)
    start_overlap, end_overlap = split_daterange_str(overlapping)

    delta_gap = Timedelta("1s")

    # Work out if to_trim is before or after the overlapping daterange
    if start_trim < start_overlap and end_trim < end_overlap:
        # to_trim starts first: keep the section before the overlap begins
        new_end_trim = start_overlap - delta_gap
        return create_daterange_str(start=start_trim, end=new_end_trim)
    else:
        # to_trim ends last: keep the section after the overlap ends
        new_start_trim = end_overlap + delta_gap
        return create_daterange_str(start=new_start_trim, end=end_trim)
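
# A minimal usage sketch for trim_daterange (not from the original source). The
# daterange string format follows test_split_daterange_str below; the exact
# timezone formatting of the returned string is left to create_daterange_str.
def test_trim_daterange():
    from pandas import Timestamp

    to_trim = "2021-01-01-00:00:00_2021-06-01-00:00:00"
    overlapping = "2021-03-01-00:00:00_2021-09-01-00:00:00"

    # to_trim starts first, so everything from the start of overlapping
    # onwards is removed, leaving a range ending 1s before the overlap
    trimmed = trim_daterange(to_trim=to_trim, overlapping=overlapping)
    start, end = split_daterange_str(trimmed)

    assert start == Timestamp("2021-01-01-00:00:00", tz="UTC")
    assert end == Timestamp("2021-02-28-23:59:59", tz="UTC")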
def daterange_contains(container: str, contained: str) -> bool:
    """Check if the daterange container contains the daterange contained

    Args:
        container: Daterange
        contained: Daterange
    Returns:
        bool: True if container contains contained
    """
    start_a, end_a = split_daterange_str(container)
    start_b, end_b = split_daterange_str(contained)

    return bool(start_a <= start_b and end_b <= end_a)
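
# A short sketch of daterange_contains (added for illustration), using the
# same daterange string format as the tests in this module.
def test_daterange_contains():
    container = "2020-01-01-00:00:00_2020-12-01-00:00:00"
    inside = "2020-03-01-00:00:00_2020-06-01-00:00:00"
    straddling = "2019-06-01-00:00:00_2020-06-01-00:00:00"

    assert daterange_contains(container=container, contained=inside)
    # A range that starts before the container is not contained
    assert not daterange_contains(container=container, contained=straddling)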
def closest_daterange(to_compare: str, dateranges: Union[str, List[str]]) -> str:
    """Finds the closest daterange in a list of dateranges

    Args:
        to_compare: Daterange (as a string) to compare
        dateranges: List of dateranges
    Returns:
        str: Daterange from dateranges that's the closest in time to to_compare
    """
    from openghg.util import split_daterange_str
    from pandas import Timedelta

    min_start = Timedelta("3650days")
    min_end = Timedelta("3650days")

    if not isinstance(dateranges, list):
        dateranges = [dateranges]

    dateranges = sorted(dateranges)

    start_comp, end_comp = split_daterange_str(daterange_str=to_compare)

    # We want to iterate over the dateranges and first check if they overlap;
    # if they do, raise an error, otherwise check how far apart the start / end
    # of each daterange is from to_compare
    for daterange in dateranges:
        # If they're close to overlap the start and end will be close
        start, end = split_daterange_str(daterange_str=daterange)

        # Check for an overlap
        if start <= end_comp and end >= start_comp:
            raise ValueError("Overlapping daterange.")

        # Find the min between all the starts and all the ends
        diff_start_end = abs(start_comp - end)
        if diff_start_end < min_start:
            min_start = diff_start_end
            closest_daterange_start = daterange

        diff_end_start = abs(end_comp - start)
        if diff_end_start < min_end:
            min_end = diff_end_start
            closest_daterange_end = daterange

    if min_start < min_end:
        return closest_daterange_start
    else:
        return closest_daterange_end
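
# An illustrative sketch of closest_daterange: of two non-overlapping
# candidates, the one nearest in time to to_compare is returned.
def test_closest_daterange():
    to_compare = "2019-01-01-00:00:00_2019-06-01-00:00:00"
    dateranges = [
        "2014-01-01-00:00:00_2014-06-01-00:00:00",
        "2018-01-01-00:00:00_2018-06-01-00:00:00",
    ]

    closest = closest_daterange(to_compare=to_compare, dateranges=dateranges)

    # The 2018 range ends closest to the start of to_compare
    assert closest == "2018-01-01-00:00:00_2018-06-01-00:00:00"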
def test_split_daterange_str():
    from pandas import Timestamp

    start_true = Timestamp("2001-01-01-00:00:00", tz="UTC")
    end_true = Timestamp("2001-03-01-00:00:00", tz="UTC")

    daterange_1 = "2001-01-01-00:00:00_2001-03-01-00:00:00"

    start, end = split_daterange_str(daterange_str=daterange_1)

    assert start_true == start
    assert end_true == end
def update_daterange(self) -> None:
    """Update the dates stored by this Datasource

    Returns:
        None
    """
    from openghg.util import split_daterange_str

    # If we've only shallow loaded this Datasource (without the data)
    # we use the latest data keys
    if not self._data:
        date_keys = sorted(self._data_keys["latest"]["keys"])
    else:
        date_keys = sorted(self._data.keys())

    start, _ = split_daterange_str(daterange_str=date_keys[0])
    _, end = split_daterange_str(daterange_str=date_keys[-1])

    self._start_date = start
    self._end_date = end
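
# A rough sketch of update_daterange. Datasource internals are more involved
# than shown here; a SimpleNamespace stands in for an instance with data
# loaded, purely to illustrate how the start / end dates are derived from the
# first and last sorted data keys.
def test_update_daterange():
    from types import SimpleNamespace
    from pandas import Timestamp

    fake_datasource = SimpleNamespace(
        _data={
            "2014-01-01-00:00:00_2014-06-01-00:00:00": None,
            "2015-01-01-00:00:00_2015-06-01-00:00:00": None,
        },
        _data_keys={},
        _start_date=None,
        _end_date=None,
    )

    # Called as a plain function here; on a real Datasource it's a method
    update_daterange(fake_datasource)

    assert fake_datasource._start_date == Timestamp("2014-01-01", tz="UTC")
    assert fake_datasource._end_date == Timestamp("2015-06-01", tz="UTC")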
def sanitise_daterange(daterange: str) -> str:
    """Make sure the daterange is correct and return tzaware daterange.

    Args:
        daterange: Daterange str
    Returns:
        str: Timezone aware daterange str
    """
    start, end = split_daterange_str(daterange)

    if start >= end:
        raise ValueError("Invalid daterange, start after end date")

    return create_daterange_str(start=start, end=end)
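
# A brief sketch of sanitise_daterange. The exact output format is assumed to
# match create_daterange_str, which (per the find_daterange_gaps docstring
# below) appends the UTC offset to each timestamp.
def test_sanitise_daterange():
    import pytest

    sanitised = sanitise_daterange("2001-01-01-00:00:00_2001-03-01-00:00:00")
    assert sanitised == "2001-01-01-00:00:00+00:00_2001-03-01-00:00:00+00:00"

    # A backwards daterange is rejected
    with pytest.raises(ValueError):
        sanitise_daterange("2001-03-01-00:00:00_2001-01-01-00:00:00")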
def valid_daterange(daterange: str) -> bool:
    """Check if the passed daterange is valid

    Args:
        daterange: Daterange string
    Returns:
        bool: True if valid
    """
    from openghg.util import split_daterange_str

    start, end = split_daterange_str(daterange)

    if start >= end:
        return False

    return True
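
# valid_daterange in action: a quick ordering sanity check.
def test_valid_daterange():
    assert valid_daterange("2001-01-01-00:00:00_2001-03-01-00:00:00")
    assert not valid_daterange("2001-03-01-00:00:00_2001-01-01-00:00:00")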
def combine_dateranges(dateranges: List[str]) -> List[str]:
    """Combine dateranges

    Args:
        dateranges: Daterange strings
    Returns:
        list: List of combined dateranges

    Modified from
    https://codereview.stackexchange.com/a/69249
    """
    if len(dateranges) == 1:
        return dateranges

    def sort_key(tup: Tuple) -> Timestamp:
        return tup[0]

    intervals = [split_daterange_str(x) for x in dateranges]
    sorted_by_lower_bound = sorted(intervals, key=sort_key)
    combined: List[Tuple[Timestamp, Timestamp]] = []

    for higher in sorted_by_lower_bound:
        if not combined:
            combined.append(higher)
        else:
            lower = combined[-1]
            # Test for intersection between lower and higher:
            # we know via sorting that lower[0] <= higher[0]
            if higher[0] <= lower[1]:
                upper_bound = max(lower[1], higher[1])
                # Replace with the combined interval
                combined[-1] = (lower[0], upper_bound)
            else:
                combined.append(higher)

    combined_strings = [create_daterange_str(start=a, end=b) for a, b in combined]

    return combined_strings
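
# A small sketch of combine_dateranges: two overlapping ranges merge into one,
# a disjoint range is left alone. The tz-aware output format is assumed to
# follow create_daterange_str as elsewhere in this module.
def test_combine_dateranges():
    dateranges = [
        "2001-01-01-00:00:00_2001-06-01-00:00:00",
        "2001-03-01-00:00:00_2001-09-01-00:00:00",
        "2004-01-01-00:00:00_2004-06-01-00:00:00",
    ]

    combined = combine_dateranges(dateranges)

    assert combined == [
        "2001-01-01-00:00:00+00:00_2001-09-01-00:00:00+00:00",
        "2004-01-01-00:00:00+00:00_2004-06-01-00:00:00+00:00",
    ]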
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to
    the function and these keywords will be used to search the metadata
    associated with each Datasource.

    Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
        If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
        If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
    Returns:
        SearchResults or dict: Keys of Datasources matching the search parameters
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource

    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    skip_ranking = kwargs_copy.get("skip_ranking", False)

    try:
        del kwargs_copy["skip_ranking"]
    except KeyError:
        pass

    # As we might have kwargs that are None we want to get rid of those
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation
    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResult object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()

        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and instrument then just return the data
    # TODO - should instrument be added here
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()

        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()

        # Get the site inlet and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))

        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            "No ranking data set for the given search parameters."
            " Please refine your search to include a specific site, species and inlet."
        )

    # Now we have the highest ranked data and the dateranges there are ranks for,
    # we want to fill in the gaps with (currently) the highest inlet from that site

    # We just want some rank_metadata to go along with the final data scheme,
    # keyed by daterange -> inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if species_keys:
                data_keys[site][sp]["keys"] = species_keys
                data_keys[site][sp]["rank_metadata"] = species_rank_data
                data_keys[site][sp]["metadata"] = species_metadata
            else:
                continue

            # We now need to retrieve data for the dateranges for which we don't have ranking data.
            # To do this find the gaps between the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
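
# A hedged usage sketch for search(). The keyword names follow the docstring
# above; the site, species and inlet values are purely illustrative, and the
# results.keys(...) accessor mirrors its use in the gap-filling code above.
def example_search():
    results = search(
        site="tac",
        species="co2",
        inlet="100m",
        start_date="2016-01-01",
        end_date="2017-01-01",
    )

    # For timeseries data a SearchResults object is returned; footprints,
    # emissions and eulerian_model searches return a plain dict
    if results:
        return results.keys(site="tac", species="co2", inlet="100m")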
def split_encompassed_daterange(container: str, contained: str) -> Dict:
    """Checks if one of the passed dateranges contains the other, if so, then
    split the larger daterange into three sections.

           <---contained--->
    <-----------container----------->

    Here container is split into three and we end up with:

    <-dr1-><---contained---><-dr2->

    Args:
        container: Daterange expected to contain the other
        contained: Daterange expected to lie within container
    Returns:
        dict: Dictionary of results
    """
    from pandas import Timedelta

    container_start, container_end = split_daterange_str(daterange_str=container)
    contained_start, contained_end = split_daterange_str(daterange_str=contained)

    # First check one contains the other
    if not (container_start <= contained_start and contained_end <= container_end):
        raise ValueError(f"Range {container} does not contain {contained}")

    # Gap to add between dateranges so they don't overlap
    delta_gap = Timedelta("1s")
    # If the difference is less than this we'll assume they're the same timestamp
    tolerance = Timedelta("2h")

    results = {}

    # If one of them starts at the same point we just want to split the range in two
    if abs(contained_start - container_start) < tolerance:
        new_contained = create_daterange_str(start=contained_start, end=contained_end)
        dr1_start = contained_end + delta_gap
        dr1 = create_daterange_str(start=dr1_start, end=container_end)

        results["container_start"] = dr1
        results["contained"] = new_contained

        return results

    if abs(contained_end - container_end) < tolerance:
        new_contained = create_daterange_str(start=contained_start, end=contained_end)
        dr1_end = contained_start - delta_gap
        dr1 = create_daterange_str(start=container_start, end=dr1_end)

        results["container_start"] = dr1
        results["contained"] = new_contained

        return results

    dr1_start = container_start
    dr1_end = contained_start - delta_gap
    dr1 = create_daterange_str(start=dr1_start, end=dr1_end)

    dr3_start = contained_end + delta_gap
    dr3_end = container_end
    dr3 = create_daterange_str(start=dr3_start, end=dr3_end)

    # Trim a gap off the end of contained
    new_contained_end = contained_end - delta_gap
    new_contained = create_daterange_str(start=contained_start, end=new_contained_end)

    results["container_start"] = dr1
    results["contained"] = new_contained
    results["container_end"] = dr3

    return results
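
# A sketch of split_encompassed_daterange on a fully encompassed range: the
# container is split around the contained range with a 1s gap either side.
def test_split_encompassed_daterange():
    from pandas import Timestamp

    container = "2020-01-01-00:00:00_2020-12-01-00:00:00"
    contained = "2020-04-01-00:00:00_2020-08-01-00:00:00"

    results = split_encompassed_daterange(container=container, contained=contained)

    _, end_dr1 = split_daterange_str(results["container_start"])
    start_dr3, _ = split_daterange_str(results["container_end"])

    # dr1 ends 1s before contained starts, dr3 starts 1s after contained ends
    assert end_dr1 == Timestamp("2020-03-31-23:59:59", tz="UTC")
    assert start_dr3 == Timestamp("2020-08-01-00:00:01", tz="UTC")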
def find_daterange_gaps(start_search: Timestamp, end_search: Timestamp, dateranges: List) -> List[str]:
    """Given a start and end date and a list of dateranges find the gaps.

    For example given a list of dateranges

    example = ['2014-09-02_2014-11-01', '2016-09-02_2018-11-01']

    start = timestamp_tzaware("2012-01-01")
    end = timestamp_tzaware("2019-09-01")

    gaps = find_daterange_gaps(start, end, example)

    gaps == ['2012-01-01-00:00:00+00:00_2014-09-01-00:00:00+00:00',
             '2014-11-02-00:00:00+00:00_2016-09-01-00:00:00+00:00',
             '2018-11-02-00:00:00+00:00_2019-09-01-00:00:00+00:00']

    Args:
        start_search: Start timestamp
        end_search: End timestamp
        dateranges: List of daterange strings
    Returns:
        list: List of dateranges
    """
    from pandas import Timedelta
    from openghg.util import pairwise

    sorted_dateranges = sorted(dateranges)

    # The gap to leave between the end of one daterange and the start of the next
    range_gap = "1day"

    # First find the gap between the search start and the first daterange
    start_first, end_first = split_daterange_str(sorted_dateranges[0])

    gaps = []
    if start_search < start_first:
        gap_start = start_search
        gap_end = start_first - Timedelta(range_gap)

        gap = create_daterange_str(start=gap_start, end=gap_end)
        gaps.append(gap)

    # Then find the gap between the last daterange and the search end
    start_last, end_last = split_daterange_str(sorted_dateranges[-1])

    if end_search > end_last:
        gap_end = end_search
        gap_start = end_last + Timedelta(range_gap)

        gap = create_daterange_str(start=gap_start, end=gap_end)
        gaps.append(gap)

    for a, b in pairwise(sorted_dateranges):
        start_a, end_a = split_daterange_str(a)
        start_b, end_b = split_daterange_str(b)

        # Ignore any that are outside our search window
        if end_a < start_search or start_a > end_search:
            continue

        diff = start_b - end_a

        if diff > Timedelta(range_gap) and diff.value > 0:
            gap_start = end_a + Timedelta(range_gap)
            gap_end = start_b - Timedelta(range_gap)

            gap = create_daterange_str(start=gap_start, end=gap_end)
            gaps.append(gap)

    gaps.sort()

    return gaps
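
# The docstring example above, made executable as a test; timestamp_tzaware is
# imported from openghg.util as in search() above.
def test_find_daterange_gaps():
    from openghg.util import timestamp_tzaware

    example = ["2014-09-02_2014-11-01", "2016-09-02_2018-11-01"]

    start = timestamp_tzaware("2012-01-01")
    end = timestamp_tzaware("2019-09-01")

    gaps = find_daterange_gaps(start_search=start, end_search=end, dateranges=example)

    assert gaps == [
        "2012-01-01-00:00:00+00:00_2014-09-01-00:00:00+00:00",
        "2014-11-02-00:00:00+00:00_2016-09-01-00:00:00+00:00",
        "2018-11-02-00:00:00+00:00_2019-09-01-00:00:00+00:00",
    ]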