def test_combining_overlapping_dateranges():
    """Overlapping dateranges are merged into single covering ranges."""
    datasource = Datasource()

    # Two ranges that overlap collapse into a single covering range
    overlapping = [
        "2001-01-01-00:00:00_2001-03-01-00:00:00",
        "2001-02-01-00:00:00_2001-06-01-00:00:00",
    ]
    combined = datasource.combine_dateranges(dateranges=overlapping)
    assert combined == ["2001-01-01-00:00:00+00:00_2001-06-01-00:00:00+00:00"]

    # A mix of overlapping and disjoint ranges yields one range per cluster
    mixed = [
        "2001-01-01-00:00:00_2001-03-01-00:00:00",
        "2001-02-01-00:00:00_2001-06-01-00:00:00",
        "2001-05-01-00:00:00_2001-08-01-00:00:00",
        "2004-05-01-00:00:00_2004-08-01-00:00:00",
        "2004-04-01-00:00:00_2004-09-01-00:00:00",
        "2007-04-01-00:00:00_2007-09-01-00:00:00",
    ]
    combined = datasource.combine_dateranges(dateranges=mixed)
    assert combined == [
        "2001-01-01-00:00:00+00:00_2001-08-01-00:00:00+00:00",
        "2004-04-01-00:00:00+00:00_2004-09-01-00:00:00+00:00",
        "2007-04-01-00:00:00+00:00_2007-09-01-00:00:00+00:00",
    ]
def test_set_incorrect_rank_raises():
    """A rank outside the valid range raises TypeError."""
    datasource = Datasource()
    daterange = "2027-08-01-00:00:00_2027-12-01-00:00:00"

    with pytest.raises(TypeError):
        datasource.set_rank(rank=42, daterange=daterange)
def test_add_data(data):
    """add_data stores the data under a daterange key and records metadata."""
    datasource = Datasource(name="test")

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    # Sanity-check the fixture before adding it
    assert ch4_data["ch4"][0] == pytest.approx(1960.24)
    assert ch4_data["ch4 stdev"][0] == pytest.approx(0.236)
    assert ch4_data["ch4 n_meas"][0] == pytest.approx(26.0)

    datasource.add_data(metadata=metadata, data=ch4_data)

    # The data is stored keyed by its covering daterange
    date_key = "2014-01-30-10:52:30+00:00_2014-01-30-14:20:30+00:00"
    stored = datasource._data[date_key]
    for column in ("ch4", "ch4 stdev", "ch4 n_meas"):
        assert stored[column].equals(ch4_data[column])

    expected_metadata = {
        "data_type": "timeseries",
        "inlet": "248m",
        "instrument": "picarro",
        "port": "8",
        "site": "bsd",
        "species": "ch4",
    }
    datasource_metadata = datasource.metadata()
    for key, value in expected_metadata.items():
        assert datasource_metadata[key] == value
def test_incorrect_datatype_raises(data):
    """Passing an unrecognised data_type to add_data raises TypeError."""
    datasource = Datasource(name="testing_123")

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    with pytest.raises(TypeError):
        datasource.add_data(metadata=metadata, data=ch4_data, data_type="CRDS")
def test_set_rank():
    """set_rank records the daterange under the given rank."""
    datasource = Datasource()

    daterange = "2027-08-01-00:00:00_2027-12-01-00:00:00"
    datasource.set_rank(rank=1, daterange=daterange)

    assert datasource._rank[1] == ["2027-08-01-00:00:00_2027-12-01-00:00:00"]
def test_combining_single_dateranges_returns():
    """Combining a single daterange returns it unchanged."""
    datasource = Datasource()

    daterange = "2027-08-01-00:00:00_2027-12-01-00:00:00"
    combined = datasource.combine_dateranges(dateranges=[daterange])

    assert combined[0] == daterange
def test_combining_no_overlap():
    """Disjoint dateranges are kept separate after combining."""
    datasource = Datasource()

    disjoint = [
        "2001-01-01-00:00:00_2001-03-01-00:00:00",
        "2011-02-01-00:00:00_2011-06-01-00:00:00",
    ]
    combined = datasource.combine_dateranges(dateranges=disjoint)

    assert combined == [
        "2001-01-01-00:00:00+00:00_2001-03-01-00:00:00+00:00",
        "2011-02-01-00:00:00+00:00_2011-06-01-00:00:00+00:00",
    ]
def test_search_metadata_find_all():
    """With find_all=True, every search term must match the metadata."""
    datasource = Datasource(name="test_search")
    datasource._metadata = {"inlet": "100m", "instrument": "violin", "car": "toyota"}

    # All terms present -> True
    all_present = ["100m", "violin", "toyota"]
    assert datasource.search_metadata(search_terms=all_present, find_all=True) is True

    # One term missing -> False
    one_missing = ["100m", "violin", "toyota", "swallow"]
    assert datasource.search_metadata(search_terms=one_missing, find_all=True) is False
def test_split_daterange_str():
    """Splitting a daterange string yields tz-aware start/end Timestamps."""
    datasource = Datasource()

    expected_start = pd.Timestamp("2001-01-01-00:00:00", tz="UTC")
    expected_end = pd.Timestamp("2001-03-01-00:00:00", tz="UTC")

    # NOTE(review): the method name "split_datrange_str" looks like a typo of
    # "split_daterange_str" in the Datasource API - confirm upstream before renaming
    daterange = "2001-01-01-00:00:00_2001-03-01-00:00:00"
    start, end = datasource.split_datrange_str(daterange_str=daterange)

    assert start == expected_start
    assert end == expected_end
def assign_data(gas_data, lookup_results, overwrite):
    """ Create or get an existing Datasource for each gas in the file

        Args:
            gas_data (dict): Dictionary containing data and metadata for species
            lookup_results (dict): Datasource name / UUID lookup results keyed by species
            overwrite (bool): Should existing data in each Datasource be overwritten
        Returns:
            dict: Dictionary of UUIDs of Datasources data has been assigned to keyed by species name
    """
    from HUGS.Modules import Datasource

    uuids = {}
    # Add in copying of attributes, or add attributes to the metadata at an earlier state.
    for species, species_record in gas_data.items():
        name = lookup_results[species]["name"]
        uuid = lookup_results[species]["uuid"]

        # Reload the existing Datasource from the object store when we already
        # hold a UUID for it, otherwise create a fresh one
        datasource = Datasource.load(uuid=uuid) if uuid else Datasource(name=name)

        # Add the dataframe to the datasource, then persist it
        datasource.add_data(
            metadata=species_record["metadata"],
            data=species_record["data"],
            overwrite=overwrite,
        )
        datasource.save()

        uuids[name] = datasource.uuid()

    return uuids
def test_get_dataframe_daterange():
    """get_dataframe_daterange returns the first/last timestamps of the index."""
    n_days = 100
    epoch = datetime.datetime(1970, 1, 1, 1, 1)

    # Daily-indexed frame of random integers spanning n_days from the epoch
    frame = pd.DataFrame(
        data=np.random.randint(0, 100, size=(100, 4)),
        index=pd.date_range(epoch, epoch + datetime.timedelta(n_days - 1), freq="D"),
        columns=list("ABCD"),
    )

    datasource = Datasource(name="test")
    start, end = datasource.get_dataframe_daterange(frame)

    assert start == pd.Timestamp("1970-01-01 01:01:00+0000")
    assert end == pd.Timestamp("1970-04-10 01:01:00+0000")
def test_read_CRDS():
    """Reading a CRDS file assigns one Datasource per species with correct data."""
    get_local_bucket(empty=True)

    filepath = get_datapath(filename="bsd.picarro.1minute.248m.dat", data_type="CRDS")
    results = ObsSurface.read_file(filepath=filepath, data_type="CRDS")

    file_results = results["bsd.picarro.1minute.248m.dat"]
    expected_keys = sorted([
        "bsd.picarro.1minute.248m_ch4",
        "bsd.picarro.1minute.248m_co",
        "bsd.picarro.1minute.248m_co2",
    ])
    assert sorted(file_results.keys()) == expected_keys

    # Load up the assigned Datasources and check they contain the correct data
    ch4_datasource = Datasource.load(uuid=file_results["bsd.picarro.1minute.248m_ch4"])
    ch4_data = ch4_datasource.data()["2014-01-30-10:52:30+00:00_2014-01-30-14:20:30+00:00"]

    assert ch4_data.time[0] == Timestamp("2014-01-30T10:52:30")
    assert ch4_data["ch4"][0] == 1960.24
    assert ch4_data["ch4"][-1] == 1952.24
    assert ch4_data["ch4_stdev"][-1] == 0.674
    assert ch4_data["ch4_n_meas"][-1] == 25.0

    obs = ObsSurface.load()
    assert sorted(obs._datasource_names.keys()) == expected_keys
def test_read_noaa():
    """Reading a NOAA flask file splits the data into per-daterange chunks."""
    get_local_bucket(empty=True)

    data_filepath = get_datapath(
        filename="co_pocn25_surface-flask_1_ccgg_event.txt", data_type="NOAA"
    )
    results = ObsSurface.read_file(filepath=data_filepath, data_type="NOAA")

    uuid = results["co_pocn25_surface-flask_1_ccgg_event.txt"][
        "co_pocn25_surface-flask_1_ccgg_event_co"
    ]
    co_data = Datasource.load(uuid=uuid, shallow=False).data()
    assert len(co_data.keys()) == 95

    # A single-measurement chunk has identical first and last values
    single = co_data["1990-12-02-12:23:00+00:00_1990-12-02-12:23:00+00:00"]
    assert single.time[0] == Timestamp("1990-12-02T12:23:00")
    assert single.time[-1] == Timestamp("1990-12-02T12:23:00")
    assert single["co"][0] == 141.61
    assert single["co"][-1] == 141.61
    assert single["co_repeatability"][0] == -999.99
    assert single["co_repeatability"][-1] == -999.99
    assert single["co_selection_flag"][0] == 0
    assert single["co_selection_flag"][-1] == 0

    obs = ObsSurface.load()
    assert list(obs._datasource_names.keys())[0] == "co_pocn25_surface-flask_1_ccgg_event_co"
def test_read_thames_barrier():
    """Reading a Thames Barrier CSV assigns CH4/CO2/CO Datasources."""
    get_local_bucket(empty=True)

    data_filepath = get_datapath(filename="thames_test_20190707.csv", data_type="THAMESBARRIER")
    results = ObsSurface.read_file(filepath=data_filepath, data_type="THAMESBARRIER")

    expected_keys = sorted([
        "thames_test_20190707_CH4",
        "thames_test_20190707_CO2",
        "thames_test_20190707_CO",
    ])
    assert sorted(list(results["thames_test_20190707.csv"].keys())) == expected_keys

    # Check the CO2 Datasource holds the expected values
    uuid = results["thames_test_20190707.csv"]["thames_test_20190707_CO2"]
    co2_data = Datasource.load(uuid=uuid, shallow=False).data()
    co2_data = co2_data["2019-07-01-00:39:55+00:00_2019-08-01-00:10:30+00:00"]

    assert co2_data.time[0] == Timestamp("2019-07-01T00:39:55")
    assert co2_data.time[-1] == Timestamp("2019-08-01T00:10:30")
    assert co2_data["co2"][0] == pytest.approx(417.97344761)
    assert co2_data["co2"][-1] == pytest.approx(417.80000653)
    assert co2_data["co2_variability"][0] == 0
    assert co2_data["co2_variability"][-1] == 0

    obs = ObsSurface.load()
    assert sorted(obs._datasource_names.keys()) == expected_keys
def test_delete_Datasource():
    """Deleting a Datasource removes its UUID and its stored data keys."""
    bucket = get_local_bucket(empty=True)

    data_filepath = get_datapath(filename="tta.co2.1minute.222m.min.dat", data_type="ICOS")
    ObsSurface.read_file(filepath=data_filepath, data_type="ICOS")

    obs = ObsSurface.load()
    uuid = obs.datasources()[0]

    datasource = Datasource.load(uuid=uuid)
    data = datasource.data()["2011-12-07-01:38:00+00:00_2011-12-31-19:57:00+00:00"]
    assert data["co2"][0] == pytest.approx(397.334)
    assert data.time[0] == Timestamp("2011-12-07T01:38:00")

    # The underlying object must exist before deletion and be gone after it
    key = datasource.data_keys()[0]
    assert exists(bucket=bucket, key=key)

    obs.delete(uuid=uuid)

    assert uuid not in obs.datasources()
    assert not exists(bucket=bucket, key=key)
def query_store():
    """ Create a dictionary that can be used to visualise the object store

        Returns:
            dict: Dictionary of Datasource metadata keyed by Datasource UUID
    """
    from collections import defaultdict
    from HUGS.Modules import Datasource, ObsSurface

    obs = ObsSurface.load()
    datasource_uuids = obs.datasources()

    # Shallow-load so only each Datasource's JSON metadata is read
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    data = defaultdict(dict)
    for datasource in datasources:
        metadata = datasource.metadata()
        data[datasource.uuid()] = {
            "site": metadata["site"],
            "species": metadata["species"],
            "instrument": metadata.get("instrument", "Unknown"),
            "network": metadata.get("network"),
        }

    return data


# def visualise_store():
#     """ Visualise the output of the
#     """
def test_save_footprint():
    """A footprint Dataset can be saved and reloaded from the object store."""
    bucket = get_local_bucket(empty=True)

    metadata = {"test": "testing123"}
    dir_path = os.path.dirname(__file__)
    filepath = os.path.join(dir_path, "../data/emissions", "WAO-20magl_EUROPE_201306_downsampled.nc")
    data = xarray.open_dataset(filepath)

    datasource = Datasource(name="test_name")
    datasource.add_data(metadata=metadata, data=data, data_type="footprint")
    datasource.save()

    # Reload via the stored object name and verify the dataset round-trips
    prefix = f"{Datasource._datasource_root}/uuid/{datasource._uuid}"
    objs = get_object_names(bucket, prefix)
    reloaded = Datasource.load(bucket=bucket, key=objs[0])

    stored = reloaded._data["2013-06-02-00:00:00+00:00_2013-06-30-00:00:00+00:00"]
    assert float(stored.pressure[0].values) == pytest.approx(1023.971)
    assert float(stored.pressure[2].values) == pytest.approx(1009.940)
    assert float(stored.pressure[-1].values) == pytest.approx(1021.303)
def test_load_dataset():
    """load_dataset round-trips an xarray Dataset through the object store."""
    dir_path = os.path.dirname(__file__)
    filepath = os.path.join(dir_path, "../data/emissions", "WAO-20magl_EUROPE_201306_small.nc")
    ds = xarray.load_dataset(filepath)

    d = Datasource("dataset_test")
    d.add_data(metadata={"some": "metadata"}, data=ds, data_type="footprint")
    d.save()

    # Fetch the stored key of the latest version and reload the dataset
    key = list(d._data_keys["latest"]["keys"].values())[0]
    bucket = get_local_bucket()
    loaded_ds = Datasource.load_dataset(bucket=bucket, key=key)

    assert loaded_ds.equals(ds)
def test_search_metadata():
    """search_metadata matches stored values case-insensitively and rejects absent terms.

    Fixes: removed a duplicated "eggs" assertion and replaced the
    non-idiomatic `== True` / `== False` comparisons with `is` checks.
    """
    d = Datasource(name="test_search")
    d._metadata = {"unladen": "swallow", "spam": "eggs"}

    # Present values match, regardless of case
    assert d.search_metadata("swallow") is True
    assert d.search_metadata("eggs") is True
    assert d.search_metadata("Swallow") is True

    # Absent values do not match
    assert d.search_metadata("beans") is False
    assert d.search_metadata("flamingo") is False
def test_in_daterange(data):
    """in_daterange returns only the keys whose dateranges overlap the query."""
    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    d = Datasource()
    d.add_data(metadata=metadata, data=ch4_data)
    d.save()

    query = create_daterange_str(start=pd.Timestamp("2014-1-1"), end=pd.Timestamp("2014-2-1"))

    # Inject extra versioned keys; only the first overlaps the query range
    injected = [
        "2014-01-30-10:52:30+00:00_2014-01-30-14:20:30+00:00",
        "2015-01-30-10:52:30+00:00_2016-01-30-14:20:30+00:00",
        "2016-01-31-10:52:30+00:00_2017-01-30-14:20:30+00:00",
    ]
    for date_key in injected:
        d._data_keys["latest"][date_key] = [
            f"data/uuid/ace2bb89-7618-4104-9404-a329c2bcd318/v1/{date_key}"
        ]

    keys = d.in_daterange(daterange=query)
    assert keys[0].split("/")[-1] == "2014-01-30-10:52:30+00:00_2014-01-30-14:20:30+00:00"
def test_exists():
    """Datasource.exists reports True for a saved Datasource.

    Fixes: the local variable `exists` shadowed the classmethod under test,
    and the result was compared with `== True` instead of `is True`.
    """
    d = Datasource(name="testing")
    d.save()

    assert Datasource.exists(datasource_id=d.uuid()) is True
def set_rank(self, uuid, rank, daterange):
    """ Set the rank of a Datasource associated with this object.

        This function performs checks to ensure multiple ranks aren't set for
        overlapping dateranges.

        Passing a daterange and rank to this function will overwrite any current
        daterange stored for that rank.

        Args:
            uuid (str): UUID of Datasource
            rank (int): Rank of data
            daterange (str, list): Daterange(s)
        Returns:
            None
    """
    from HUGS.Modules import Datasource
    from HUGS.Util import daterange_from_str

    # NOTE(review): raises TypeError (not ValueError) for an out-of-range rank;
    # tests elsewhere depend on TypeError here
    if not 0 <= int(rank) <= 10:
        raise TypeError("Rank can only take values 0 (for unranked) to 10. Where 1 is the highest rank.")

    # Normalise a single daterange string to a one-element list
    if not isinstance(daterange, list):
        daterange = [daterange]

    try:
        rank_data = self._rank_data[uuid]
        # Check this source isn't ranked differently for the same dates
        for d in daterange:
            # Check we don't have any overlapping dateranges for other ranks
            daterange_obj = daterange_from_str(d)
            # Check the other dateranges for overlapping dates and raise error
            for existing_rank, existing_daterange in rank_data.items():
                for e in existing_daterange:
                    e = daterange_from_str(e)
                    intersection = daterange_obj.intersection(e)
                    # Only a different rank on overlapping dates is an error;
                    # overlapping dates with the same rank are allowed
                    if len(intersection) > 0 and int(existing_rank) != int(rank):
                        raise ValueError(f"This datasource has already got the rank {existing_rank} for dates that overlap the ones given. \
                                            Overlapping dates are {intersection}")
    except KeyError:
        # No rank data stored for this UUID yet - nothing to check against
        pass

    # Store the rank within the Datasource
    datasource = Datasource.load(uuid=uuid, shallow=True)
    datasource.set_rank(rank=rank, daterange=daterange)
    datasource.save()

    # Record the rank locally as well, appending to any existing dateranges
    try:
        self._rank_data[uuid][rank].extend(daterange)
    except KeyError:
        self._rank_data[uuid][rank] = daterange
def get_data(key_list):
    """ Gets data from the Datasources found by the search function

        Bypass loading the Datasource? Get both then we have metadata?
    """
    from HUGS.Modules import Datasource

    # We only want the data in the correct daterange
    data = []
    for key in key_list:
        data.append(Datasource.load(key=key)._data)

    return data
def test_to_data(data):
    """to_data serialises name, metadata and data_type without data keys."""
    d = Datasource(name="testing_123")

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    # Sanity-check the fixture before adding it
    assert ch4_data["ch4"][0] == pytest.approx(1960.24)
    assert ch4_data["ch4 stdev"][0] == pytest.approx(0.236)
    assert ch4_data["ch4 n_meas"][0] == pytest.approx(26.0)

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")

    obj_data = d.to_data()
    serialised_metadata = obj_data["metadata"]

    assert obj_data["name"] == "testing_123"
    assert obj_data["data_type"] == "timeseries"
    # No data keys have been stored at this point
    assert len(obj_data["data_keys"]) == 0
    assert serialised_metadata["site"] == "bsd"
    assert serialised_metadata["instrument"] == "picarro"
    assert serialised_metadata["time_resolution"] == "1_minute"
    assert serialised_metadata["inlet"] == "248m"
def test_save(mock_uuid2):
    """Saving a Datasource writes an object named after its (mocked) UUID."""
    bucket = get_local_bucket()

    datasource = Datasource(name="test_name")
    datasource.add_metadata(key="data_type", value="timeseries")
    datasource.save(bucket)

    # The stored object's name should end with the mocked UUID
    prefix = f"{Datasource._datasource_root}/uuid/{datasource._uuid}"
    objs = get_object_names(bucket, prefix)
    assert objs[0].split("/")[-1] == mocked_uuid2
def get_sources(args):
    """ Get the Datasources associated with the specified species at a specified site

        Args:
            args (dict): Dictionary containing site and species keys
        Returns:
            dict: Dictionary of source metadata (rank, data range, UUID) keyed by source name
    """
    try:
        site = args["site"]
    except KeyError:
        # TODO - created a SiteError error type to raise here
        raise KeyError("Site must be specified")

    try:
        species = args["species"]
    except KeyError:
        raise KeyError("Species must be specified")

    obs = ObsSurface.load()
    datasource_uuids = obs.datasources()

    # Shallow load the Datasources (only get their JSON metadata)
    datasources = [Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids]

    matching_sources = [
        s for s in datasources
        if s.search_metadata(search_terms=[site, species], find_all=True)
    ]

    def name_str(d):
        return "_".join([d.species(), d.site(), d.inlet(), d.instrument()])

    unranked = {}
    for source in matching_sources:
        unranked[name_str(source)] = {
            "rank": source.rank(),
            "data_range": source.daterange_str(),
            "uuid": source.uuid(),
        }

    return unranked
def get_sources(self, site, species, data_type):
    """ Get the datasources for this site and species to allow a ranking to be set

        Args:
            site (str): Three letter site code
            species (str): Species name
            data_type (str): Must be valid datatype i.e. CRDS, GC
            See all valid datasources in the DataTypes class
        Returns:
            dict: Dictionary of datasource metadata
    """
    if len(site) != 3 or not valid_site(site):
        # raise InvalidSiteError(f"{site} is not a valid site code")
        raise ValueError(f"{site} is not a valid site code")

    obs = ObsSurface.load()

    # Shallow load the Datasources (only get their JSON metadata)
    datasources = [Datasource.load(uuid=uuid, shallow=True) for uuid in obs.datasources()]

    matching_sources = [
        s for s in datasources
        if s.search_metadata(search_terms=[site, species], find_all=True)
    ]

    def name_str(d):
        return "_".join([d.species(), d.site(), d.inlet(), d.instrument()])

    rank_info = {}
    for source in matching_sources:
        rank_info[name_str(source)] = {
            "rank": source.rank(),
            "data_range": source.daterange_str(),
            "uuid": source.uuid(),
        }

    # Snapshot the pre-ranking state and remember the name -> UUID mapping
    self._before_ranking = copy.deepcopy(rank_info)
    self._key_uuids = {key: info["uuid"] for key, info in rank_info.items()}

    return rank_info
def test_setting_overlapping_dateranges():
    """Setting an overlapping daterange for the same rank merges the ranges."""
    d = Datasource()

    d.set_rank(rank=1, daterange="2027-08-01-00:00:00_2027-12-01-00:00:00")
    assert d._rank[1] == ["2027-08-01-00:00:00_2027-12-01-00:00:00"]

    # Overlaps the tail of the first range, so the two are combined
    d.set_rank(rank=1, daterange="2027-11-01-00:00:00_2028-06-01-00:00:00")
    assert d._rank[1] == ["2027-08-01-00:00:00+00:00_2028-06-01-00:00:00+00:00"]
def assign_data(self, lookup_results, source_name, data, metadata, overwrite=False):
    """ Assign data to a new or existing Datasource

        Args:
            lookup_results (dict): Results of Datasource lookup
            source_name (str): Name of data source
            data (xarray.Dataset): Data
            metadata (dict): Dictionary of metadata
            overwrite (bool, default=False): Should existing data be overwritten
        Returns:
            dict: Dictionary of Datasource UUIDs keyed by name
    """
    from HUGS.Modules import Datasource

    uuids = {}
    for key in lookup_results:
        uuid = lookup_results[key]["uuid"]
        name = metadata["name"]

        # Reuse the stored Datasource when a UUID was found, otherwise create one
        datasource = Datasource.load(uuid=uuid) if uuid else Datasource(name=name)

        datasource.add_data(metadata=metadata, data=data, data_type="footprint")
        datasource.save()

        uuids[name] = datasource.uuid()

    return uuids
def test_update_daterange_replacement(data):
    """Overwriting with a shorter dataset updates the stored end datetime."""
    metadata = {"foo": "bar"}

    d = Datasource(name="foo")
    ch4_data = data["ch4"]["data"]
    d.add_data(metadata=metadata, data=ch4_data)

    assert d._start_datetime == pd.Timestamp("2014-01-30 10:52:30+00:00")
    assert d._end_datetime == pd.Timestamp("2014-01-30 14:20:30+00:00")

    # Replace with only the first 40 rows; the end datetime should shrink
    truncated = ch4_data.head(40)
    d._data = None
    d.add_data(metadata=metadata, data=truncated, overwrite=True)

    assert d._start_datetime == pd.Timestamp("2014-01-30 10:52:30+00:00")
    assert d._end_datetime == pd.Timestamp("2014-01-30 13:22:30+00:00")