class IndexGEETestCase(unittest.TestCase):

    def setUp(self):
        self.datacube = Datacube(config=DATACUBE_CONFIG)
        IndexerTestCase().test_product_generation()
        product = self.datacube.index.products.get_by_name('ls8_test')
        if product is None:
            self.skipTest('No product available to index')
        datasets = self.datacube.find_datasets(product='ls8_test')
        if datasets:
            self.skipTest('Indexed datasets already exist in database')

    def test_index_gee(self):
        product = 'ls8_test'
        latitude = (-4.15, -3.90)
        longitude = (39.50, 39.75)
        time = '2020-01'
        cmd = [
            "index_gee",
            "--product", product,
            "--latitude", str(latitude),
            "--longitude", str(longitude),
            "--time", time,
            "--config", DATACUBE_CONFIG,
            "--no_confirm",
            "-u",
        ]
        subprocess.check_output(cmd)
        datasets = self.datacube.find_datasets(product=product)
        self.assertGreater(len(datasets), 0, 'Expected to find datasets in index')
def test_query_dataset_multi_product(index: Index, ls5_dataset_w_children: Dataset):
    # We have one ls5 level1 and its child nbar
    dc = Datacube(index)

    # Can we query a single product name?
    datasets = dc.find_datasets(product='ls5_nbar_scene')
    assert len(datasets) == 1

    # Can we query multiple products?
    datasets = dc.find_datasets(product=['ls5_nbar_scene', 'ls5_level1_scene'])
    assert len(datasets) == 2

    # Can we query multiple products in a tuple?
    datasets = dc.find_datasets(product=('ls5_nbar_scene', 'ls5_level1_scene'))
    assert len(datasets) == 2
def check_data_with_api(index, time_slices):
    """Check retrieved data for specific values.

    We scale down by 100 and check for predefined values in the corners.
    """
    from datacube import Datacube
    dc = Datacube(index=index)

    # Make the retrieved data 100 times less granular
    shape_x = int(GEOTIFF['shape']['x'] / 100.0)
    shape_y = int(GEOTIFF['shape']['y'] / 100.0)
    pixel_x = int(GEOTIFF['pixel_size']['x'] * 100)
    pixel_y = int(GEOTIFF['pixel_size']['y'] * 100)

    input_type_name = 'ls5_nbar_albers'
    input_type = dc.index.products.get_by_name(input_type_name)
    geobox = geometry.GeoBox(shape_x + 1, shape_y + 1,
                             Affine(pixel_x, 0.0, GEOTIFF['ul']['x'],
                                    0.0, pixel_y, GEOTIFF['ul']['y']),
                             geometry.CRS(GEOTIFF['crs']))

    observations = dc.find_datasets(product='ls5_nbar_albers', geopolygon=geobox.extent)
    group_by = query_group_by('time')
    sources = dc.group_datasets(observations, group_by)
    data = dc.load_data(sources, geobox, input_type.measurements.values())
    assert hashlib.md5(data.green.data).hexdigest() == '7f5ace486e88d33edf3512e8de6b6996'
    assert hashlib.md5(data.blue.data).hexdigest() == 'b58204f1e10dd678b292df188c242c7e'

    for time_slice in range(time_slices):
        assert data.blue.values[time_slice][-1, -1] == -999
def __call__(self, index, product, time, group_by) -> Tile:
    # Do for a specific poly whose boundary is known
    output_crs = CRS(self.storage['crs'])
    filtered_items = ['geopolygon', 'lon', 'lat', 'longitude', 'latitude', 'x', 'y']
    filtered_dict = {k: v for k, v in self.input_region.items() if k in filtered_items}
    if self.feature is not None:
        filtered_dict['geopolygon'] = self.feature.geopolygon
        geopoly = filtered_dict['geopolygon']
    else:
        geopoly = query_geopolygon(**self.input_region)

    dc = Datacube(index=index)
    datasets = dc.find_datasets(product=product, time=time, group_by=group_by,
                                **filtered_dict)
    group_by = query_group_by(group_by=group_by)
    sources = dc.group_datasets(datasets, group_by)
    output_resolution = [self.storage['resolution'][dim] for dim in output_crs.dimensions]
    geopoly = geopoly.to_crs(output_crs)
    geobox = GeoBox.from_geopolygon(geopoly, resolution=output_resolution)

    return Tile(sources, geobox)
def __get_mask_datasets(self) -> List[ODCDataset]:
    """ Finds mask datasets based on config """
    dc = Datacube(app="mosaic_creator")
    time_range = (str(self.__start_date), str(self.__end_date))
    datasets = dc.find_datasets(product=self.__product_name, time=time_range)
    if not datasets:
        LOGGER.warning("No mask datasets found for "
                       f"product={self.__product_name}, time={time_range}")
        raise ValueError("No datasets found")  # TODO: custom exception
    return datasets
def ordered_dss(dc: Datacube, freq: str = 'm', **query):
    """Emulate an "order by time" streaming interface for datacube queries.

    The basic idea is to perform a lot of smaller queries (shorter time
    periods), sort the results, then yield them to the calling code.
    """
    qq = Query(**query)

    for q in chop_query_by_time(qq, freq=freq):
        dss = dc.find_datasets(**q.search_terms)
        dss.sort(key=lambda ds: ds.center_time)
        yield from dss
def check_open_with_api(driver_manager, time_slices):
    from datacube import Datacube
    dc = Datacube(driver_manager=driver_manager)

    input_type_name = 'ls5_nbar_albers'
    input_type = dc.index.products.get_by_name(input_type_name)
    geobox = geometry.GeoBox(200, 200,
                             Affine(25, 0.0, 638000, 0.0, -25, 6276000),
                             geometry.CRS('EPSG:28355'))

    observations = dc.find_datasets(product='ls5_nbar_albers', geopolygon=geobox.extent)
    group_by = query_group_by('time')
    sources = dc.group_datasets(observations, group_by)
    data = dc.load_data(sources, geobox, input_type.measurements.values(),
                        driver_manager=driver_manager)
    assert data.blue.shape == (time_slices, 200, 200)
def check_open_with_api(index):
    from datacube import Datacube
    dc = Datacube(index=index)

    input_type_name = 'ls5_nbar_albers'
    input_type = dc.index.products.get_by_name(input_type_name)
    geobox = GeoBox(200, 200,
                    Affine(25, 0.0, 1500000, 0.0, -25, -3900000),
                    CRS('EPSG:3577'))

    observations = dc.find_datasets(product='ls5_nbar_albers', geopolygon=geobox.extent)
    group_by = query_group_by('time')
    sources = dc.group_datasets(observations, group_by)
    data = dc.load_data(sources, geobox, input_type.measurements.values())
    assert data.blue.shape == (1, 200, 200)
class ArbitraryTileMaker(object):
    """
    Create a :class:`Tile` which can be used by :class:`GridWorkflow` to later load the required data.

    :param input_region: dictionary of spatial limits for searching for datasets,
                         e.g. geopolygon or lat/lon boundaries
    """
    def __init__(self, index, input_region, storage):
        self.dc = Datacube(index=index)
        self.input_region = input_region
        self.storage = storage

    def __call__(self, product, time, group_by) -> Tile:
        # Do for a specific poly whose boundary is known
        output_crs = CRS(self.storage['crs'])
        filtered_items = ['geopolygon', 'lon', 'lat', 'longitude', 'latitude', 'x', 'y']
        filtered_dict = {k: v for k, v in self.input_region.items() if k in filtered_items}
        if 'feature_id' in self.input_region:
            filtered_dict['geopolygon'] = Geometry(self.input_region['geom_feat'],
                                                   CRS(self.input_region['crs_txt']))
            geopoly = filtered_dict['geopolygon']
        else:
            geopoly = query_geopolygon(**self.input_region)

        datasets = self.dc.find_datasets(product=product, time=time, group_by=group_by,
                                         **filtered_dict)
        group_by = query_group_by(group_by=group_by)
        sources = self.dc.group_datasets(datasets, group_by)
        output_resolution = [self.storage['resolution'][dim] for dim in output_crs.dimensions]
        geopoly = geopoly.to_crs(output_crs)
        geobox = GeoBox.from_geopolygon(geopoly, resolution=output_resolution)

        return Tile(sources, geobox)
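# --- Usage sketch for ArbitraryTileMaker (not part of the original source).
# Assumes an ODC index with an 'ls5_nbar_albers' product; the spatial bounds,
# CRS, and resolution values below are illustrative placeholders only.
from datacube import Datacube

index = Datacube().index
input_region = {'longitude': (146.0, 146.5), 'latitude': (-35.5, -35.0)}
storage = {'crs': 'EPSG:3577', 'resolution': {'x': 25, 'y': -25}}

make_tile = ArbitraryTileMaker(index, input_region, storage)
tile = make_tile(product='ls5_nbar_albers',
                 time=('2008-01-01', '2008-12-31'),
                 group_by='time')
# The returned Tile can then be loaded lazily, e.g. via
# GridWorkflow.load(tile, measurements=['red', 'green', 'blue'])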
def check_open_with_api(index, time_slices):
    with rasterio.Env():
        from datacube import Datacube
        dc = Datacube(index=index)

        input_type_name = 'ls5_nbar_albers'
        input_type = dc.index.products.get_by_name(input_type_name)
        geobox = geometry.GeoBox(200, 200,
                                 Affine(25, 0.0, 638000, 0.0, -25, 6276000),
                                 geometry.CRS('EPSG:28355'))

        observations = dc.find_datasets(product='ls5_nbar_albers', geopolygon=geobox.extent)
        group_by = query_group_by('time')
        sources = dc.group_datasets(observations, group_by)
        data = dc.load_data(sources, geobox, input_type.measurements.values())
        assert data.blue.shape == (time_slices, 200, 200)

        chunk_profile = {'time': 1, 'x': 100, 'y': 100}
        lazy_data = dc.load_data(sources, geobox, input_type.measurements.values(),
                                 dask_chunks=chunk_profile)
        assert lazy_data.blue.shape == (time_slices, 200, 200)
        assert (lazy_data.blue.load() == data.blue).all()
def check_data_with_api(index, time_slices):
    """Check retrieved data for specific values.

    We scale down by 100 and check for predefined values in the corners.
    """
    from datacube import Datacube
    dc = Datacube(index=index)

    # TODO: this test needs to change. It tests that results are exactly the
    # same as at some earlier time, but with the current zoom-out factor it is
    # hard to verify that results are as expected, even with human judgement.
    # What it should test is that reading native from the ingested product
    # gives exactly the same results as reading into the same GeoBox from the
    # original product. Separate from that, there should be a read test that
    # confirms that what you read from the native product while changing
    # projection is of expected value.

    # Make the retrieved data lower res
    ss = 100
    shape_x = int(GEOTIFF['shape']['x'] / ss)
    shape_y = int(GEOTIFF['shape']['y'] / ss)
    pixel_x = int(GEOTIFF['pixel_size']['x'] * ss)
    pixel_y = int(GEOTIFF['pixel_size']['y'] * ss)

    input_type_name = 'ls5_nbar_albers'
    input_type = dc.index.products.get_by_name(input_type_name)
    geobox = geometry.GeoBox(shape_x + 2, shape_y + 2,
                             Affine(pixel_x, 0.0, GEOTIFF['ul']['x'],
                                    0.0, pixel_y, GEOTIFF['ul']['y']),
                             geometry.CRS(GEOTIFF['crs']))

    observations = dc.find_datasets(product='ls5_nbar_albers', geopolygon=geobox.extent)
    group_by = query_group_by('time')
    sources = dc.group_datasets(observations, group_by)
    data = dc.load_data(sources, geobox, input_type.measurements.values())
    assert hashlib.md5(data.green.data).hexdigest() == '0f64647bad54db4389fb065b2128025e'
    assert hashlib.md5(data.blue.data).hexdigest() == '41a7b50dfe5c4c1a1befbc378225beeb'

    for time_slice in range(time_slices):
        assert data.blue.values[time_slice][-1, -1] == -999
def ordered_dss(dc: Datacube, freq: str = "m", key=None, **query): """Emulate "order by time" streaming interface for datacube queries. Basic idea is to perform a lot of smaller queries (shorter time periods), sort results then yield them to the calling code. :param dc: Datacube instance :param freq: 'm' month sized chunks, 'w' week sized chunks, 'd' day :param key: Optional sorting function Dataset -> Comparable, for example ``lambda ds: (ds.center_time, ds.metadata.region_code)`` """ qq = Query(**query) if key is None: key = lambda ds: ds.center_time for q in chop_query_by_time(qq, freq=freq): dss = dc.find_datasets(**q.search_terms) dss.sort(key=key) yield from dss
def query(self, dc: Datacube, **search_terms: Dict[str, Any]) -> VirtualDatasetBag:
    product = dc.index.products.get_by_name(self._product)
    if product is None:
        raise VirtualProductException("could not find product {}".format(self._product))

    merged_terms = merge_search_terms(reject_keys(self, self._NON_QUERY_KEYS),
                                      reject_keys(search_terms, self._NON_QUERY_KEYS))

    query = Query(dc.index, **reject_keys(merged_terms, self._ADDITIONAL_SEARCH_KEYS))
    self._assert(query.product == self._product,
                 "query for {} returned another product {}".format(self._product,
                                                                   query.product))

    return VirtualDatasetBag(dc.find_datasets(**merged_terms),
                             query.geopolygon,
                             {product.name: product})
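# --- Usage sketch (not part of the original source): how a query() like the
# one above is typically reached through the datacube.virtual API. The recipe
# and time range are assumptions; only the query()/VirtualDatasetBag behaviour
# is taken from the method itself.
from datacube import Datacube
from datacube.virtual import construct_from_yaml

recipe = construct_from_yaml("""
    product: ls8_nbar_scene
""")

dc = Datacube()
# Dispatches to query(), which wraps the dc.find_datasets() results in a
# VirtualDatasetBag together with the query geopolygon and a product map.
bag = recipe.query(dc, time=('2019-01', '2019-02'))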
class DatacubeReplicator(object):
    def __init__(self, config):
        self.remote_host = config['remote_host']
        self.remote_user = config['remote_user']
        self.db_password = config['db_password']
        self.remote_dir = config['remote_dir']
        self.local_dir = config['local_dir']
        self.replication_defns = config['replicated_data']

        self.client = None
        self.sftp = None
        self.tunnel = None
        self.remote_dc_config = None
        self.remote_dc = None
        self.local_index = index_connect()

    def run(self):
        self.connect()
        self.read_remote_config()
        self.connect_to_db()
        self.replicate_all()
        self.disconnect()

    def connect(self):
        client = SSHClient()
        client.load_system_host_keys()
        client.set_missing_host_key_policy(WarningPolicy())
        client.connect(hostname=self.remote_host, username=self.remote_user)
        LOG.debug(client)
        self.client = client
        self.sftp = client.open_sftp()

    def disconnect(self):
        self.client.close()
        self.tunnel.stop()

    def read_remote_config(self):
        remote_config = ConfigParser()
        remote_config.read_string(_DEFAULT_CONF)
        with self.sftp.open('.datacube.conf') as fin:
            remote_config.read_file(fin)
        self.remote_dc_config = LocalConfig(remote_config)

    def connect_to_db(self):
        self.tunnel = SSHTunnelForwarder(
            self.remote_host,
            ssh_username=self.remote_user,
            remote_bind_address=(self.remote_dc_config.db_hostname,
                                 int(self.remote_dc_config.db_port)))
        self.tunnel.start()

        # Rewrite the remote config to point at the local end of the tunnel.
        # pylint: disable=protected-access
        self.remote_dc_config._config['datacube']['db_hostname'] = '127.0.0.1'
        self.remote_dc_config._config['datacube']['db_port'] = str(self.tunnel.local_bind_port)
        self.remote_dc_config._config['datacube']['db_username'] = self.remote_user
        self.remote_dc_config._config['datacube']['db_password'] = self.db_password

        # This requires the password from somewhere. Parsing it out of .pgpass
        # sounds error-prone and fragile, so let's put it in the configuration for now.
        LOG.debug('Remote configuration loaded %s', self.remote_dc_config)

        self.remote_dc = Datacube(config=self.remote_dc_config)

    def replicate_all(self):
        for defn in tqdm(self.replication_defns, 'Replicating products'):
            self.replicate(defn)

    def replicate_all_products(self):
        products = self.remote_dc.index.products.get_all()
        for product in products:
            self.local_index.products.add(product)

    def replicate(self, defn):
        datasets = list(self.remote_dc.find_datasets(**defn))

        if not datasets:
            LOG.info('No remote datasets found matching %s', defn)
            return

        # TODO: use a generator, not a list
        product = datasets[0].type

        LOG.info('Ensuring remote product is in local index. %s', product)
        self.local_index.products.add(product)

        for dataset in tqdm(datasets, 'Datasets'):
            # dataset = remote_dc.index.datasets.get(dataset.id, include_sources=True)
            # We would need to pull the parent products down too
            # TODO: Include parent source datasets + product definitions
            dataset.sources = {}

            LOG.debug('Replicating dataset %s', dataset)
            remote_path = uri_to_path(dataset.local_uri)
            local_path = self.remote_to_local(uri_to_path(dataset.local_uri))

            # Ensure local path exists
            Path(local_path).parent.mkdir(parents=True, exist_ok=True)

            # Download file
            self.sftp.get(remote_path, local_path)

            # Add to local index
            dataset.local_uri = 'file://' + local_path
            self.local_index.datasets.add(dataset)
            LOG.debug('Downloaded to %s', local_path)

    def remote_to_local(self, remote):
        return remote.replace(self.remote_dir, self.local_dir)
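# --- Usage sketch for DatacubeReplicator (not part of the original source).
# The config keys mirror those read in __init__ above; the host, credentials,
# paths, and product definition are placeholders only. Each entry of
# 'replicated_data' is passed straight to find_datasets() as keyword args.
replicator = DatacubeReplicator({
    'remote_host': 'dea.example.org',        # hypothetical remote host
    'remote_user': 'replication',
    'db_password': 'secret',                 # hypothetical credentials
    'remote_dir': '/g/data/',
    'local_dir': '/home/user/data/',
    'replicated_data': [
        {'product': 'ls5_nbar_albers', 'time': ('2008-01-01', '2008-02-01')},
    ],
})
replicator.run()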
def get_data_opensource(prod_info, input_lon, input_lat, acq_min, acq_max,
                        window_size, no_partial_scenes):
    datacube_config = prod_info[0]
    source_prod = prod_info[1]
    source_band_list = prod_info[2]
    mask_band = prod_info[3]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        if datacube_config != 'default':
            remotedc = Datacube(config=datacube_config)
        else:
            remotedc = Datacube()

        return_data = {}
        data = xr.Dataset()

        if source_prod != '':
            # find a dataset to get metadata from
            fd_query = {
                'time': (acq_min, acq_max),
                'x': (input_lon, input_lon + window_size / 100000),
                'y': (input_lat, input_lat + window_size / 100000),
            }
            sample_fd_ds = remotedc.find_datasets(product=source_prod,
                                                  group_by='solar_day',
                                                  **fd_query)

            if len(sample_fd_ds) > 0:
                # decide pixel size for output data
                pixel_x, pixel_y = get_pixel_size(sample_fd_ds[0], source_band_list)
                log.info('Output pixel size for product {}: x={}, y={}'.format(
                    source_prod, pixel_x, pixel_y))

                # get target epsg from metadata
                target_epsg = get_epsg(sample_fd_ds[0])
                log.info('CRS for product {}: {}'.format(source_prod, target_epsg))

                x1, y1, x2, y2 = setQueryExtent(target_epsg, input_lon, input_lat,
                                                window_size)

                query = {
                    'time': (acq_min, acq_max),
                    'x': (x1, x2),
                    'y': (y1, y2),
                    'crs': target_epsg,
                    'output_crs': target_epsg,
                    'resolution': (-pixel_y, pixel_x),
                    'measurements': source_band_list
                }

                if 's2' in source_prod:
                    data = remotedc.load(product=source_prod,
                                         group_by='solar_day',
                                         **query)
                else:
                    data = remotedc.load(product=source_prod,
                                         align=(pixel_x / 2.0, pixel_y / 2.0),
                                         group_by='solar_day',
                                         **query)
                # remove cloud and nodata
                data = remove_cloud_nodata(source_prod, data, mask_band)

                if no_partial_scenes:
                    # calculate valid data percentage
                    data = only_return_whole_scene(data)

            return_data = {
                source_prod: {
                    'data': data,
                    'mask_band': mask_band,
                    'find_list': sample_fd_ds
                }
            }

    return return_data
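# --- Usage sketch for get_data_opensource (not part of the original source).
# prod_info packs (datacube config, product name, band list, mask band), as
# unpacked at the top of the function; every value below is a placeholder.
prod_info = ('default',                 # datacube config ('default' = local datacube)
             'ls8_nbar_scene',          # hypothetical source product
             ['red', 'green', 'blue'],  # bands to load
             'pixel_quality')           # hypothetical mask band

result = get_data_opensource(prod_info,
                             input_lon=146.5, input_lat=-35.2,
                             acq_min='2019-01-01', acq_max='2019-03-31',
                             window_size=1000,  # converted to degrees internally
                             no_partial_scenes=False)
data = result['ls8_nbar_scene']['data']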
def get_data_opensource_shapefile(prod_info, acq_min, acq_max, shapefile,
                                  no_partial_scenes):
    datacube_config = prod_info[0]
    source_prod = prod_info[1]
    source_band_list = prod_info[2]
    mask_band = prod_info[3]

    if datacube_config != 'default':
        remotedc = Datacube(config=datacube_config)
    else:
        remotedc = Datacube()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with fiona.open(shapefile) as shapes:
            crs = geometry.CRS(shapes.crs_wkt)
            first_geometry = next(iter(shapes))['geometry']
            geom = geometry.Geometry(first_geometry, crs=crs)

            return_data = {}
            data = xr.Dataset()

            if source_prod != '':
                # get a sample dataset to decide the target epsg
                fd_query = {'time': (acq_min, acq_max), 'geopolygon': geom}
                sample_fd_ds = remotedc.find_datasets(product=source_prod,
                                                      group_by='solar_day',
                                                      **fd_query)

                if len(sample_fd_ds) > 0:
                    # decide pixel size for output data
                    pixel_x, pixel_y = get_pixel_size(sample_fd_ds[0], source_band_list)
                    log.info('Output pixel size for product {}: x={}, y={}'.format(
                        source_prod, pixel_x, pixel_y))

                    # get target epsg from metadata
                    target_epsg = get_epsg(sample_fd_ds[0])
                    log.info('CRS for product {}: {}'.format(source_prod, target_epsg))

                    query = {
                        'time': (acq_min, acq_max),
                        'geopolygon': geom,
                        'output_crs': target_epsg,
                        'resolution': (-pixel_y, pixel_x),
                        'measurements': source_band_list
                    }

                    if 's2' in source_prod:
                        data = remotedc.load(product=source_prod,
                                             group_by='solar_day',
                                             **query)
                    else:
                        data = remotedc.load(product=source_prod,
                                             align=(pixel_x / 2.0, pixel_y / 2.0),
                                             group_by='solar_day',
                                             **query)
                    # remove cloud and nodata
                    data = remove_cloud_nodata(source_prod, data, mask_band)

                    # mask out pixels outside the polygon
                    if data.data_vars:
                        mask = geometry_mask([geom], data.geobox, invert=True)
                        data = data.where(mask)

                    if no_partial_scenes:
                        # calculate valid data percentage
                        data = only_return_whole_scene(data)

                return_data = {
                    source_prod: {
                        'data': data,
                        'mask_band': mask_band,
                        'find_list': sample_fd_ds
                    }
                }

    return return_data
#!/usr/bin/env python
from datetime import date

from datacube import Datacube

dc = Datacube()

# Example metadata path for one granule:
# s3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/2020-10-19/S2B_OPER_MSI_ARD_TL_VGS1_20201019T060322_A018905_T50LNP_N02.09/ARD-METADATA.yaml
#
# Products covered:
#   's2a_ard_granule',
#   's2b_ard_granule',


def ds_to_s3_url(ds):
    # An earlier version built the date from ds.key_time:
    # return f"s3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/{ds.key_time.strftime('%Y-%m-%d')}/{ds.metadata_doc['tile_id'].replace('L1C', 'ARD')}/ARD-METADATA.yaml"
    # Don't trust the datetimes that are exposed! Use the metadata document instead.
    return f"s3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/{ds.metadata_doc['extent']['center_dt'][:10]}/{ds.metadata_doc['tile_id'].replace('L1C', 'ARD')}/ARD-METADATA.yaml"


for product in ('s2a_ard_granule', 's2b_ard_granule'):
    for year in range(2017, date.today().year + 1):
        for month in range(1, 13):
            for ds in dc.find_datasets(product=product, time=f'{year}-{month:02}'):
                print(ds_to_s3_url(ds))
def get_l1c_datasets(self) -> List[ODCDataset]:
    """ Gets all L1C datasets from ODC Index """
    dc = Datacube(app="cloud_mask_generator")
    l1c_datasets = dc.find_datasets(product=self.l1c_product_name)
    return l1c_datasets