def load(bucket=None, uuid=None, key=None, shallow=False):
    """ Load a Datasource from the object store either by UUID or by
        object store key

        Either uuid or key must be passed to the function

        Args:
            bucket (dict, default=None): Bucket holding the object,
            defaults to the standard bucket if not given
            uuid (str, default=None): UID of Datasource
            key (str, default=None): Object store key of the Datasource;
            if None it is built from uuid
            shallow (bool, default=False): Only load JSON data, do not read
            Datasets from object store. This will speed up creation of the
            Datasource object.
        Returns:
            Datasource: Datasource object created from JSON
        Raises:
            ValueError: If both uuid and key are None
    """
    from HUGS.ObjectStore import get_bucket, get_object_from_json

    if uuid is None and key is None:
        raise ValueError("Both uuid and key cannot be None")

    if bucket is None:
        bucket = get_bucket()

    # Build the key from the UUID if no explicit key was given
    if key is None:
        key = f"{Datasource._datasource_root}/uuid/{uuid}"

    data = get_object_from_json(bucket=bucket, key=key)

    return Datasource.from_data(bucket=bucket, data=data, shallow=shallow)
def from_data(cls, data, bucket=None):
    """ Reconstruct an instance of this class from its stored JSON data

        Args:
            data (dict): JSON data previously produced for this class
            bucket (dict, default=None): Bucket for data storage
        Returns:
            cls: Instance of cls populated from data
        Raises:
            ValueError: If data is empty
    """
    from collections import defaultdict
    from Acquire.ObjectStore import string_to_datetime
    from HUGS.ObjectStore import get_bucket

    if not data:
        raise ValueError("Unable to create object with empty dictionary")

    if bucket is None:
        bucket = get_bucket()

    obj = cls()
    obj._creation_datetime = string_to_datetime(data["creation_datetime"])
    obj._datasource_uuids = data["datasource_uuids"]
    obj._datasource_names = data["datasource_names"]
    obj._file_hashes = data["file_hashes"]
    # Older stored objects may predate rank data, so fall back to empty
    obj._rank_data = defaultdict(dict, data.get("rank_data", {}))
    obj._stored = False

    return obj
def save(self, bucket=None):
    """ Save this Datasource object as JSON to the object store

        If the Datasource holds data, each Dataset is written out as a
        NetCDF object under a new version key, the previous "latest"
        keys are preserved as that version's backup, and "latest" is
        re-pointed at the new version. Finally the Datasource's own JSON
        record is written.

        Args:
            bucket (str, default=None): Bucket to hold data, defaults to
            the standard bucket if not given
        Returns:
            None
    """
    import tempfile
    from copy import deepcopy
    from Acquire.ObjectStore import get_datetime_now_to_string
    from HUGS.ObjectStore import get_bucket, set_object_from_file, set_object_from_json

    if bucket is None:
        bucket = get_bucket()

    if self._data:
        # Ensure we have the latest key
        if "latest" not in self._data_keys:
            self._data_keys["latest"] = {}

        # NOTE(review): the new version label is derived from the current
        # number of entries in _data_keys (which includes "latest"), so
        # ordering of the statements in this method matters
        version_str = f"v{str(len(self._data_keys))}"

        # Store the keys for the new data
        new_keys = {}

        # Iterate over the keys (daterange string) of the data dictionary
        for daterange in self._data:
            data_key = f"{Datasource._data_root}/uuid/{self._uuid}/{version_str}/{daterange}"

            new_keys[daterange] = data_key
            data = self._data[daterange]

            # TODO - for now just create a temporary directory - will have to update Acquire
            # or work on a PR for xarray to allow returning a NetCDF as bytes
            with tempfile.TemporaryDirectory() as tmpdir:
                filepath = f"{tmpdir}/temp.nc"
                data.to_netcdf(filepath)
                set_object_from_file(bucket=bucket, key=data_key, filename=filepath)

        # Copy the last version so the old keys are preserved under the
        # new version label before it is overwritten below
        if "latest" in self._data_keys:
            self._data_keys[version_str] = deepcopy(self._data_keys["latest"])

        # Save the new keys and create a timestamp
        self._data_keys[version_str]["keys"] = new_keys
        self._data_keys[version_str]["timestamp"] = get_datetime_now_to_string()

        # Link latest to the newest version
        self._data_keys["latest"] = self._data_keys[version_str]
        self._latest_version = version_str

    self._stored = True

    datasource_key = f"{Datasource._datasource_root}/uuid/{self._uuid}"

    set_object_from_json(bucket=bucket, key=datasource_key, data=self.to_data())
def listobjects(args):
    """ List the names of objects in the object store, optionally
        filtered by a key prefix

        Args:
            args (dict): Request arguments; may contain a "prefix" key
            used to filter the returned object names
        Returns:
            dict: Dictionary with a "results" key holding the object names
    """
    # dict.get replaces the try/except KeyError dance for an optional key
    prefix = args.get("prefix")

    bucket = get_bucket()
    results = get_object_names(bucket=bucket, prefix=prefix)

    return {"results": results}
def remove_objects(args):
    """ Delete the objects stored at the passed keys from the object store

        Args:
            args (dict): Request arguments; must contain a non-empty
            "keys" list of object store keys to delete
        Returns:
            None
        Raises:
            KeyError: If no "keys" entry is present in args
            ValueError: If the "keys" list is empty
    """
    from HUGS.ObjectStore import get_bucket, delete_object

    # Guard clauses: require a non-empty list of keys
    if "keys" not in args:
        raise KeyError("No keys found")

    keys = args["keys"]
    if not keys:
        raise ValueError("No keys in list")

    bucket = get_bucket()

    for key in keys:
        delete_object(bucket=bucket, key=key)
def exists(datasource_id, bucket=None):
    """ Check whether a Datasource with this ID is already stored in the
        object store

        Args:
            datasource_id (str): ID of datasource created from data
            bucket (dict, default=None): Bucket to check, defaults to the
            standard bucket if not given
        Returns:
            bool: True if Datasource exists
    """
    from HUGS.ObjectStore import exists as object_exists, get_bucket

    if bucket is None:
        bucket = get_bucket()

    datasource_key = f"{Datasource._datasource_root}/uuid/{datasource_id}"

    return object_exists(bucket=bucket, key=datasource_key)
def exists(cls, bucket=None):
    """ Check whether an instance of this class is already saved in the
        object store

        Args:
            bucket (dict, default=None): Bucket for data storage,
            defaults to the standard bucket if not given
        Returns:
            bool: True if object exists
    """
    from HUGS.ObjectStore import exists as object_exists, get_bucket

    if bucket is None:
        bucket = get_bucket()

    object_key = f"{cls._root}/uuid/{cls._uuid}"

    return object_exists(bucket=bucket, key=object_key)
def save(self, bucket=None):
    """ Write this object to the object store as JSON

        Args:
            bucket (dict, default=None): Bucket for data, defaults to
            the standard bucket if not given
        Returns:
            None
    """
    from HUGS.ObjectStore import get_bucket, set_object_from_json

    bucket = get_bucket() if bucket is None else bucket

    object_key = f"{TEMPLATE._root}/uuid/{TEMPLATE._uuid}"

    self._stored = True
    set_object_from_json(bucket=bucket, key=object_key, data=self.to_data())
def exists(bucket=None):
    """ Check if a Footprint object is already saved in the object store

        Args:
            bucket (dict, default=None): Bucket for data storage,
            defaults to the standard bucket if not given
        Returns:
            bool: True if object exists
    """
    from HUGS.ObjectStore import exists, get_bucket

    if bucket is None:
        bucket = get_bucket()

    # f-string for key construction, consistent with the rest of the module
    key = f"{Footprint._footprint_root}/uuid/{Footprint._footprint_uuid}"

    return exists(bucket=bucket, key=key)
def save(self, bucket=None):
    """ Store this Footprint object in the object store as JSON

        Does nothing if the object is null.

        Args:
            bucket (dict, default=None): Bucket for data storage,
            defaults to the standard bucket if not given
        Returns:
            None
    """
    from HUGS.ObjectStore import get_bucket, set_object_from_json

    # Nothing to save for a null object
    if self.is_null():
        return

    bucket = get_bucket() if bucket is None else bucket

    footprint_key = f"{Footprint._footprint_root}/uuid/{Footprint._footprint_uuid}"

    self._stored = True
    set_object_from_json(bucket=bucket, key=footprint_key, data=self.to_data())
def load(bucket=None):
    """ Load a Footprint object from the object store

        If no Footprint has been stored yet a freshly created one is
        returned instead.

        Args:
            bucket (dict, default=None): Bucket holding the object,
            defaults to the standard bucket if not given
        Returns:
            Footprint: Footprint created from stored JSON, or a new
            Footprint if none exists in the object store
    """
    from HUGS.ObjectStore import get_bucket, get_object_from_json

    # Fall back to a fresh object if nothing has been stored yet
    if not Footprint.exists():
        return Footprint.create()

    bucket = get_bucket() if bucket is None else bucket

    footprint_key = f"{Footprint._footprint_root}/uuid/{Footprint._footprint_uuid}"
    stored_data = get_object_from_json(bucket=bucket, key=footprint_key)

    return Footprint.from_data(data=stored_data, bucket=bucket)
def load(cls, bucket=None):
    """ Load an instance of this class from the object store

        If no instance has been stored yet a freshly constructed one is
        returned instead.

        Args:
            bucket (dict, default=None): Bucket holding the object,
            defaults to the standard bucket if not given
        Returns:
            cls: Instance of cls created from stored JSON, or a new
            instance if none exists in the object store
    """
    from HUGS.ObjectStore import get_bucket, get_object_from_json

    # Fall back to a fresh object if nothing has been stored yet
    if not cls.exists():
        return cls()

    bucket = get_bucket() if bucket is None else bucket

    object_key = f"{cls._root}/uuid/{cls._uuid}"
    stored_data = get_object_from_json(bucket=bucket, key=object_key)

    return cls.from_data(data=stored_data, bucket=bucket)
def recombine_sections(data_keys):
    """ Combine the Datasets stored at the passed object store keys into
        a single Dataset sorted along the time dimension

        Args:
            data_keys (list): Object store keys of the Datasets to combine
        Returns:
            xarray.Dataset: Datasets concatenated along the time
            dimension and sorted by time
    """
    from xarray import concat as xr_concat
    from HUGS.Modules import Datasource
    from HUGS.ObjectStore import get_bucket

    bucket = get_bucket()

    datasets = [Datasource.load_dataset(bucket=bucket, key=key) for key in data_keys]

    combined = xr_concat(datasets, dim="time")

    # NOTE(review): duplicate timestamps are not dropped here — confirm
    # whether deduplication should happen upstream
    return combined.sortby("time")
def delete(self, uuid):
    """ Delete the Datasource with the given UUID

        Removes both the versioned data objects belonging to the
        Datasource and the Datasource's own record from the object
        store, then drops it from the internal lookup dictionaries.

        Args:
            uuid (str): UUID of Datasource
        Returns:
            None
    """
    from HUGS.Modules import Datasource
    from HUGS.ObjectStore import delete_object, get_bucket

    bucket = get_bucket()

    # Load the Datasource and delete every data object it references
    datasource = Datasource.load(uuid=uuid)
    all_keys = datasource.data_keys(return_all=True)

    for version_data in all_keys.values():
        for data_key in version_data["keys"].values():
            delete_object(bucket=bucket, key=data_key)

    # Delete the Datasource record itself
    datasource_key = f"{Datasource._datasource_root}/uuid/{uuid}"
    delete_object(bucket=bucket, key=datasource_key)

    # Drop the Datasource from our lookup dictionaries
    name = self._datasource_uuids[uuid]
    del self._datasource_names[name]
    del self._datasource_uuids[uuid]