def __init__(self, name: str = "ODCIndexer"):
    """ Sets up the indexer """
    self.dc: Datacube = Datacube(app=name)
    self.session: Session = Session(
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
        region_name='eu-central-1')
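# A minimal usage sketch, assuming this __init__ belongs to an ODCIndexer
# class (per the default app name) and that both AWS credential variables
# are exported; the constructor reads them directly and raises KeyError
# otherwise. The values below are placeholders.
import os

os.environ.setdefault("AWS_ACCESS_KEY_ID", "example-key-id")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "example-secret")

indexer = ODCIndexer(name="my-indexer")  # opens an ODC connection and an AWS session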
def cli(
    skip_lineage: bool,
    fail_on_missing_lineage: bool,
    verify_lineage: bool,
    uri: str,
    product: str,
):
    skips = [".*NBAR.*", ".*SUPPLEMENTARY.*", ".*NBART.*", ".*/QA/.*"]
    select = [".*ARD-METADATA.yaml"]
    candidate_products = product.split()

    print(f"Crawling {uri} on Thredds")
    print(f"Matching to {candidate_products}")
    yaml_urls = thredds_find_glob(uri, skips, select)
    print(f"Found {len(yaml_urls)} datasets")

    yaml_contents = download_yamls(yaml_urls)

    # Consume the generator and fetch the YAMLs
    dc = Datacube()
    added, failed = dump_list_to_odc(
        yaml_contents,
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
    )
    print(f"Added {added} Datasets, Failed {failed} Datasets")
def worker():
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    host = os.getenv("REDIS_SERVICE_HOST", "redis-master")
    q = rediswq.RedisWQ(name="jobProduct", host=host)
    logging.info("Worker with sessionID %s.", q.sessionID())
    logging.info("Initial queue state empty=%s.", q.empty())

    host = os.getenv("DASK_SCHEDULER_HOST", "dask-scheduler.dask.svc.cluster.local")
    dask_client = Client(f"{host}:8786")

    dc = Datacube()
    s3_client = S3Client()
    lease_secs = int(os.getenv("JOB_LEASE_PERIOD", "3600"))

    while not q.empty():
        item = q.lease(lease_secs=lease_secs, block=True, timeout=600)
        if item is not None:
            itemstr = item.decode("utf-8")
            logging.info("Working on %s.", itemstr)
            process_job(dc, dask_client, s3_client, itemstr, lease_secs)
            q.complete(item)
        else:
            logging.info("Waiting for work.")
    logging.info("Queue empty, exiting.")
def __call__(self, index, product, time, group_by) -> Tile:
    # Load for a specific polygon whose boundary is known
    output_crs = CRS(self.storage['crs'])
    filtered_items = [
        'geopolygon', 'lon', 'lat', 'longitude', 'latitude', 'x', 'y'
    ]
    filtered_dict = {
        k: v for k, v in self.input_region.items() if k in filtered_items
    }

    if self.feature is not None:
        filtered_dict['geopolygon'] = self.feature.geopolygon
        geopoly = filtered_dict['geopolygon']
    else:
        geopoly = query_geopolygon(**self.input_region)

    dc = Datacube(index=index)
    datasets = dc.find_datasets(product=product, time=time,
                                group_by=group_by, **filtered_dict)
    group_by = query_group_by(group_by=group_by)
    sources = dc.group_datasets(datasets, group_by)

    output_resolution = [
        self.storage['resolution'][dim] for dim in output_crs.dimensions
    ]
    geopoly = geopoly.to_crs(output_crs)
    geobox = GeoBox.from_geopolygon(geopoly, resolution=output_resolution)

    return Tile(sources, geobox)
def cli(limit, update_if_exists, bbox, product, add_product, workers):
    """
    Index the Copernicus DEM automatically.
    """
    if product not in PRODUCTS:
        raise ValueError(
            f"Unknown product {product}, must be one of {' '.join(PRODUCTS.keys())}"
        )

    dc = Datacube()

    if add_product:
        add_cop_dem_product(dc, product)

    print(f"Indexing Copernicus DEM for {product} with bounding box of {bbox}")

    added, failed = cop_dem_to_dc(dc, product, bbox, limit, update_if_exists,
                                  n_workers=workers)
    print(f"Added {added} Datasets, failed {failed} Datasets")

    if failed > 0:
        sys.exit(failed)
def expected_bands(product_name):
    dc = Datacube(app='cog-worklist query')
    prod = dc.index.products.get_by_name(product_name)
    available_measurements = set(prod.measurements.keys())
    # TODO: Implement band black/white listing, or remove the remaining
    # references to it, since it does not appear to be used anywhere.
    return available_measurements
def check_data_with_api(index, time_slices):
    """Check retrieved data for specific values.

    We scale down by 100 and check for predefined values in the corners.
    """
    from datacube import Datacube
    dc = Datacube(index=index)

    # Make the retrieved data 100 times less granular
    shape_x = int(GEOTIFF['shape']['x'] / 100.0)
    shape_y = int(GEOTIFF['shape']['y'] / 100.0)
    pixel_x = int(GEOTIFF['pixel_size']['x'] * 100)
    pixel_y = int(GEOTIFF['pixel_size']['y'] * 100)

    input_type_name = 'ls5_nbar_albers'
    input_type = dc.index.products.get_by_name(input_type_name)
    geobox = geometry.GeoBox(
        shape_x + 1, shape_y + 1,
        Affine(pixel_x, 0.0, GEOTIFF['ul']['x'],
               0.0, pixel_y, GEOTIFF['ul']['y']),
        geometry.CRS(GEOTIFF['crs']))

    observations = dc.find_datasets(product='ls5_nbar_albers',
                                    geopolygon=geobox.extent)
    group_by = query_group_by('time')
    sources = dc.group_datasets(observations, group_by)
    data = dc.load_data(sources, geobox, input_type.measurements.values())

    assert hashlib.md5(
        data.green.data).hexdigest() == '7f5ace486e88d33edf3512e8de6b6996'
    assert hashlib.md5(
        data.blue.data).hexdigest() == 'b58204f1e10dd678b292df188c242c7e'
    for time_slice in range(time_slices):
        assert data.blue.values[time_slice][-1, -1] == -999
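# For context, the md5 asserts above hash the raw bytes of each band's
# backing array. A minimal self-contained illustration of that idiom
# (the array and digest here are illustrative, not the test's values):
import hashlib
import numpy as np

arr = np.arange(4, dtype=np.int16)
digest = hashlib.md5(arr.data).hexdigest()  # hashes the underlying buffer
print(digest)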
def cli(
    skip_lineage: bool,
    fail_on_missing_lineage: bool,
    verify_lineage: bool,
    account_url: str,
    container_name: str,
    credential: str,
    product_names: List[str],
    prefix: str,
    suffix: str,
):
    print(f"Opening AZ Container {container_name} on {account_url}")
    print(f"Searching on prefix '{prefix}' for files matching suffix '{suffix}'")
    yaml_urls = find_blobs(account_url, container_name, credential, prefix, suffix)
    print(f"Found {len(yaml_urls)} datasets")

    yaml_contents = download_yamls(yaml_urls)
    print(f"Matching to {product_names} products")

    # Consume the generator and fetch the YAMLs
    dc = Datacube()
    added, failed = dump_list_to_odc(
        account_url,
        container_name,
        yaml_contents,
        dc,
        product_names,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage)
    print(f"Added {added} Datasets, Failed to add {failed} Datasets")
def run_one(config_file, input_dataset, environment=None):
    """
    Run with CONFIG_FILE on a single INPUT_DATASET

    INPUT_DATASET may be either a URL or a Dataset ID
    """
    alchemist = Alchemist(config_file=config_file, dc_env=environment)

    dc = Datacube(env=environment)
    try:
        ds = dc.index.datasets.get(input_dataset)
    except ValueError as e:
        _LOG.info("Couldn't find dataset with ID=%s (%s), trying by URL",
                  input_dataset, e)
        # Couldn't find a dataset by ID, so try by URL instead
        if '://' in input_dataset:
            # Smells like a URL
            input_url = input_dataset
        else:
            # Treat the input as a local file path
            input_url = Path(input_dataset).as_uri()
        ds = dc.index.datasets.get_datasets_for_location(input_url)

    # Currently this doesn't work by URL... TODO: fixme!
    task = alchemist.generate_task(ds)
    execute_task(task)
def get_mapped_crses(*product_names: str, index: Index = None) -> Iterable[Dict]:
    with Datacube(index=index) as dc:
        index = dc.index
        for product_name in product_names:
            product = index.products.get_by_name(product_name)

            # SQLAlchemy queries require "column == None", not "column is None",
            # due to operator overloading:
            # pylint: disable=singleton-comparison
            res = (
                alchemy_engine(index)
                .execute(
                    select(
                        [
                            literal(product.name).label("product"),
                            get_dataset_srid_alchemy_expression(
                                product.metadata_type
                            ).label("crs"),
                        ]
                    )
                    .where(DATASET.c.dataset_type_ref == product.id)
                    .where(DATASET.c.archived == None)
                    .limit(1)
                )
                .fetchone()
            )
            if res:
                yield dict(res)
def test_wofs_filtered():
    cfg = Config('../configs/template_client.yaml')
    grid_spec = GridSpec(crs=CRS('EPSG:3577'), tile_size=(100000, 100000),
                         resolution=(-25, 25))
    cell_index = (17, -39)
    wf = WofsFiltered(cfg, grid_spec, cell_index)
    confidence = wf.compute_confidence(cell_index)
    filtered = wf.compute_confidence_filtered()

    # Display images: to be removed later
    with Datacube(app='wofs_summary', env='dev') as dc:
        gwf = GridWorkflow(dc.index, grid_spec)
        indexed_tile = gwf.list_cells(cell_index,
                                      product='wofs_statistical_summary')
        # Load the data of the tile
        dataset = gwf.load(tile=indexed_tile[cell_index],
                           measurements=['frequency'])
        frequency = dataset.data_vars['frequency'].data.ravel().reshape(
            grid_spec.tile_resolution)

    # Check with previous run
    with rasterio.open('confidenceFilteredWOfS_17_-39_epsilon=10.tiff') as f:
        data = f.read(1)

    plt.subplot(221)
    plt.imshow(frequency)
    plt.subplot(222)
    plt.imshow(data)
    plt.subplot(223)
    plt.imshow(confidence)
    plt.subplot(224)
    plt.imshow(filtered)
    plt.show()

    wf.compute_and_write()
def test_init_null(null_config):
    from datacube.drivers.indexes import index_cache
    idxs = index_cache()
    assert "default" in idxs._drivers
    assert "null" in idxs._drivers

    with Datacube(config=null_config, validate_connection=True) as dc:
        assert dc.index.url == "null"
def load_tile_data(self, factors):
    """
    Load and return factor data for confidence band prediction.

    :param factors: List of factor info as given by Config
    """
    model_data = []
    for fac in factors:
        factor = self.cfg.get_factor_info(fac)
        with Datacube(app='confidence_layer', env=factor['env']) as dc:
            gwf = GridWorkflow(dc.index, self.grid_spec)
            indexed_tiles = gwf.list_cells(self.tile_index,
                                           product=factor['product'])
            # Load the data of the tile
            dataset = gwf.load(tile=indexed_tiles[self.tile_index],
                               measurements=[factor['band']])
            data = dataset.data_vars[factor['band']].data

        # Rescale where needed. Note: these adjustments exist because the
        # scaling used during model training differs from what is stored
        # in the datacube.
        if factor['name'].startswith('phat'):
            data = data * 100.0
            data[data < 0.0] = 0.0
        if factor['name'].startswith('mrvbf'):
            data[data > 10] = 10
        if factor['name'].startswith('modis'):
            data[data > 100] = 100

        model_data.append(data.ravel())
        del data
    return np.column_stack(model_data)
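# For context, a small self-contained sketch of what np.column_stack(model_data)
# produces above: each factor contributes one flattened column, yielding an
# (n_pixels x n_factors) matrix suitable for per-pixel model prediction.
# The arrays here are illustrative only.
import numpy as np

phat = np.array([[0.1, 0.2], [0.3, 0.4]]) * 100.0  # rescaled like a 'phat' factor
mrvbf = np.array([[3, 12], [7, 15]]).clip(max=10)  # clamped like the 'mrvbf' factor

X = np.column_stack([phat.ravel(), mrvbf.ravel()])
print(X.shape)  # (4, 2): one row per pixel, one column per factor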
def compute_confidence_filtered(self):
    """
    Return the WOfS summary frequency band, masked wherever the
    confidence band is at or below the threshold (default 0.10).
    """
    con_layer = self.compute_confidence()
    env = self.cfg.get_env_of_product('wofs_summary')

    with Datacube(app='wofs_summary', env=env) as dc:
        gwf = GridWorkflow(dc.index, self.grid_spec)
        indexed_tile = gwf.list_cells(self.tile_index, product='wofs_summary')
        # Load the data of the tile
        dataset = gwf.load(tile=indexed_tile[self.tile_index],
                           measurements=['frequency'])
        data = dataset.data_vars['frequency'].data.ravel().reshape(
            self.grid_spec.tile_resolution)

    con_filtering = self.cfg.cfg.get('confidence_filtering')
    threshold = None
    if con_filtering:
        threshold = con_filtering.get('threshold')

    if threshold:
        data[con_layer <= threshold] = DEFAULT_FLOAT_NODATA
    else:
        data[con_layer <= 0.10] = DEFAULT_FLOAT_NODATA

    return data
def calculate_index_task(params):
    item = params.get('item')
    index = params.get('index', 'rgb')

    dc = Datacube(config="datacube.conf")

    product = "ls8_level1_usgs"
    x = (item["bbox"][0], item["bbox"][2])
    y = (item["bbox"][1], item["bbox"][3])
    time = item["properties"]["datetime"].split("T")[0]

    query = {
        'x': x,
        'y': y,
        'time': time,
        'measurements': ['nbart_red', 'nbart_green', 'nbart_blue'],
        'output_crs': 'EPSG:4326',
        'resolution': (-0.001, 0.001),
    }

    ds = dc.load(product=product, **query)
    print(ds)

    rgb_da = ds.to_array()
    suffix = 'rgb'
    filename = f'{item["id"]}_{suffix}.tif'
    path = config.STATIC_DIR / filename
    # Write the COG to the same path that is returned as the URL
    write_cog(geo_im=rgb_da, fname=str(path), overwrite=True)

    return {"success": True, "url": str(path)}
def xadataset_from_odcdataset(datasets: Union[List[ODCDataset], ODCDataset] = None,
                              ids: Union[List[UUID], UUID] = None,
                              measurements: List[str] = None) -> xa.Dataset:
    """
    Loads an xarray Dataset from ODCDatasets or ODCDataset ids

    :param datasets: ODCDataset(s), optional
    :param ids: ODCDataset id(s), optional
    :param measurements: list of measurements/bands to load, optional
    :return: xa.Dataset containing given ODCDatasets or IDs
    """
    dc = Datacube(app="dataset_from_ODCDataset")

    if not datasets:
        if not isinstance(ids, list):
            ids = [ids]
        datasets = [dc.index.datasets.get(id_) for id_ in ids]

    if not isinstance(datasets, list):
        datasets = [datasets]

    product_name = datasets[0].metadata_doc["product"]["name"]
    crs = datasets[0].crs
    res = (10, -10)  # TODO: handle other resolutions

    ds = dc.load(product=product_name,
                 dask_chunks={},
                 measurements=measurements,
                 output_crs=str(crs),
                 resolution=res,
                 datasets=datasets)
    return ds
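# A hedged usage sketch for xadataset_from_odcdataset, assuming an indexed
# dataset id is at hand; the UUID and band names below are placeholders.
from uuid import UUID

ds_id = UUID("00000000-0000-0000-0000-000000000000")  # replace with a real id
xds = xadataset_from_odcdataset(ids=ds_id, measurements=["red", "green"])
print(xds.data_vars)  # dask-backed measurement arrays (dask_chunks={})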
def main():
    config_yaml = """
    sources:
      - product: ls8_nbar_albers
        measurements: [red, green, blue]
        group_by: solar_day

    date_ranges:
      start_date: 2014-06-01
      end_date: 2014-07-01

    storage:
      # this driver enables in-memory computation
      driver: xarray
      crs: EPSG:3577
      tile_size:
        x: 40000.0
        y: 40000.0
      resolution:
        x: 25
        y: -25
      chunking:
        x: 200
        y: 200
        time: 1
      dimension_order: [time, y, x]

    computation:
      chunking:
        x: 800
        y: 800

    input_region:
      tile: [15, -41]

    output_products:
      - name: nbar_mean
        statistic: simple
        statistic_args:
          reduction_function: mean
    """
    # or manually creating a config dictionary works too
    config = yaml.safe_load(config_yaml)
    print(yaml.dump(config, indent=4))

    dc = Datacube()
    app = StatsApp(config, dc.index)

    print('generating tasks')
    tasks = app.generate_tasks()

    print('running tasks')
    for task in tasks:
        # this method is only available for the xarray output driver
        output = app.execute_task(task)
        print('result for {}'.format(task.tile_index))
        print(output.result['nbar_mean'])
def parse_path(path, parse_only, folders, styles, input_file, output_file):
    try:
        raw_cfg = read_config(path)
        cfg = OWSConfig(refresh=True, cfg=raw_cfg)
        if not parse_only:
            with Datacube() as dc:
                cfg.make_ready(dc)
    except ConfigException as e:
        print(f"Config exception for path {path}: {e}")
        return False

    print("Configuration parsed OK")

    if folders:
        print()
        print("Folder/Layer Hierarchy")
        print("======================")
        print_layers(cfg.layers, styles, depth=0)
        print()
    elif styles:
        print()
        print("Layers and Styles")
        print("=================")
        for lyr in cfg.product_index.values():
            print(lyr.name, f"[{','.join(lyr.product_names)}]")
            print_styles(lyr)
        print()

    if input_file or output_file:
        layers_report(cfg.product_index, input_file, output_file)

    return True
def get_dataset_values(product_name, product_config, time_range=None):
    """
    Extract the file list corresponding to a product for the given time
    range using the datacube API.
    """
    try:
        query = {**dict(product=product_name), **time_range}
    except TypeError:
        # Time range is None
        query = {**dict(product=product_name)}

    dc = Datacube(app='cog-worklist query')

    field_names = get_field_names(product_config)

    LOG.info(f"Perform a datacube dataset search, returning only the specified fields, {field_names}.")
    ds_records = dc.index.datasets.search_returning(
        field_names=tuple(field_names), **query)

    search_results = False
    for ds_rec in ds_records:
        search_results = True
        yield check_prefix_from_query_result(ds_rec, product_config)

    if not search_results:
        LOG.warning(f"Datacube product query is empty for {product_name} product with time-range, {time_range}")
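# Usage sketch: get_dataset_values is a generator, so a caller must iterate
# it for the search (and the empty-result warning) to actually run. The
# product name, config contents, and time range below are assumptions.
example_config = {"prefix": "s3://example-bucket/ls8"}  # hypothetical structure

for record in get_dataset_values("ls8_nbar_albers",
                                 example_config,
                                 time_range={"time": ("2020-01", "2020-02")}):
    print(record)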
def cli(skip_lineage, fail_on_missing_lineage, verify_lineage, uri, product):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher()  # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri, False)

    # Extract URLs from the output of the iterator before passing to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)
    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAMLs
    dc = Datacube()
    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        product,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
def test_multiple_environment_config(tmpdir):
    config_path = tmpdir.join('second.conf')
    config_path.write("""
[DEFAULT]
db_username: test_user
index_driver: default

[default]
db_hostname: db.opendatacube.test

[test_alt]
db_hostname: alt-db.opendatacube.test
""")
    config_path = str(config_path)

    config = LocalConfig.find([config_path])
    assert config['db_hostname'] == 'db.opendatacube.test'
    alt_config = LocalConfig.find([config_path], env='test_alt')
    assert alt_config['db_hostname'] == 'alt-db.opendatacube.test'

    # Make sure the correct config is passed through the API
    # Parsed config:
    db_url = 'postgresql://{user}@db.opendatacube.test:5432/datacube'.format(
        user=config['db_username'])
    alt_db_url = 'postgresql://{user}@alt-db.opendatacube.test:5432/datacube'.format(
        user=config['db_username'])

    with Datacube(config=config, validate_connection=False) as dc:
        assert str(dc.index.url) == db_url

    # When none specified, default environment is loaded
    with Datacube(config=str(config_path), validate_connection=False) as dc:
        assert str(dc.index.url) == db_url

    # When specific environment is loaded
    with Datacube(config=config_path, env='test_alt',
                  validate_connection=False) as dc:
        assert str(dc.index.url) == alt_db_url

    # An environment that isn't in any config files
    with pytest.raises(ValueError):
        with Datacube(config=config_path, env='undefined-env',
                      validate_connection=False):
            pass
def main(products, output_file, start_date, end_date, time_divs):
    """ Entry point. """
    datacube = Datacube(app='find-those-gaps')
    summary = find_gaps(datacube, products,
                        time_query(start_date, end_date), time_divs)
    yaml.dump(summary, output_file, default_flow_style=False)
def test_null_dataset_resource(null_config):
    with Datacube(config=null_config, validate_connection=True) as dc:
        assert dc.index.datasets.get(test_uuid) is None
        assert dc.index.datasets.bulk_get([test_uuid, "foo"]) == []
        assert dc.index.datasets.get_derived(test_uuid) == []
        assert not dc.index.datasets.has(test_uuid)
        assert dc.index.datasets.bulk_has([test_uuid, "foo"]) == [False, False]
        with pytest.raises(NotImplementedError):
            dc.index.datasets.add(MagicMock())
        with pytest.raises(NotImplementedError):
            dc.index.datasets.can_update(MagicMock())
        with pytest.raises(NotImplementedError):
            dc.index.datasets.update(MagicMock())
        with pytest.raises(NotImplementedError):
            dc.index.datasets.archive([test_uuid, "foo"])
        with pytest.raises(NotImplementedError):
            dc.index.datasets.restore([test_uuid, "foo"])
        with pytest.raises(NotImplementedError):
            dc.index.datasets.purge([test_uuid, "foo"])
        assert dc.index.datasets.get_all_dataset_ids(True) == []
        assert dc.index.datasets.get_field_names() == []
        assert dc.index.datasets.get_locations(test_uuid) == []
        assert dc.index.datasets.get_archived_locations(test_uuid) == []
        assert dc.index.datasets.get_archived_location_times(test_uuid) == []
        assert dc.index.datasets.get_datasets_for_location("http://a.uri/test") == []
        with pytest.raises(NotImplementedError):
            dc.index.datasets.add_location(test_uuid, "http://a.uri/test")
        with pytest.raises(NotImplementedError):
            dc.index.datasets.remove_location(test_uuid, "http://a.uri/test")
        with pytest.raises(NotImplementedError):
            dc.index.datasets.archive_location(test_uuid, "http://a.uri/test")
        with pytest.raises(NotImplementedError):
            dc.index.datasets.restore_location(test_uuid, "http://a.uri/test")
        with pytest.raises(NotImplementedError):
            dc.index.datasets.get_product_time_bounds("product1")
        assert dc.index.datasets.search_product_duplicates(MagicMock()) == []
        assert dc.index.datasets.search_by_metadata({}) == []
        assert dc.index.datasets.search(foo="bar", baz=12) == []
        assert dc.index.datasets.search_by_product(foo="bar", baz=12) == []
        assert dc.index.datasets.search_returning(["foo", "bar"],
                                                  foo="bar", baz=12) == []
        assert dc.index.datasets.count(foo="bar", baz=12) == 0
        assert dc.index.datasets.count_by_product(foo="bar", baz=12) == []
        assert dc.index.datasets.count_by_product_through_time(
            "1 month", foo="bar", baz=12) == []
        assert dc.index.datasets.count_product_through_time(
            "1 month", foo="bar", baz=12) == []
        assert dc.index.datasets.search_summaries(foo="bar", baz=12) == []
        assert dc.index.datasets.search_eager(foo="bar", baz=12) == []
        assert dc.index.datasets.search_returning_datasets_light(
            ("foo", "baz"), foo="bar", baz=12) == []
def test_null_user_resource(null_config):
    with Datacube(config=null_config, validate_connection=True) as dc:
        assert dc.index.users.list_users() == []
        with pytest.raises(NotImplementedError):
            dc.index.users.create_user("user1", "password2", "role1")
        with pytest.raises(NotImplementedError):
            dc.index.users.delete_user("user1", "user2")
        with pytest.raises(NotImplementedError):
            dc.index.users.grant_role("role1", "user1", "user2")
def collect_uris(prod_index, products, expressions):
    """
    Collect all URIs of datasets from products matching search expressions.
    """
    dc = Datacube(index=prod_index)
    for prod in products:
        for dataset in dc.find_datasets_lazy(product=prod, **expressions):
            yield normalize_uri(dataset.local_uri)
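# Usage sketch for collect_uris, assuming an open index and a search
# expression accepted by find_datasets_lazy; the product name and time
# range are placeholders.
from datacube import Datacube

dc = Datacube(app="uri-collector")
for uri in collect_uris(dc.index, ["ls8_nbar_albers"],
                        {"time": ("2020-01", "2020-03")}):
    print(uri)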
def test_multiple_environment_config(tmpdir):
    config_path = tmpdir.join('second.conf')
    config_path.write("""
[user]
default_environment: test_default

[test_default]
db_hostname: db.opendatacube.test

[test_alt]
db_hostname: alt-db.opendatacube.test
""")
    config_path = str(config_path)

    config = LocalConfig.find([config_path])
    assert config.db_hostname == 'db.opendatacube.test'
    alt_config = LocalConfig.find([config_path], env='test_alt')
    assert alt_config.db_hostname == 'alt-db.opendatacube.test'

    # Lazily connect: they shouldn't try to connect during this test as we're not using the API
    args = dict(validate_connection=False)

    # Make sure the correct config is passed through the API
    # Parsed config:
    db_url = 'postgresql://{user}@db.opendatacube.test:5432/datacube'.format(
        user=config.db_username)
    alt_db_url = 'postgresql://{user}@alt-db.opendatacube.test:5432/datacube'.format(
        user=config.db_username)

    with Datacube(config=config, **args) as dc:
        assert str(dc.index.url) == db_url

    # When none specified, default environment is loaded
    with Datacube(config=str(config_path), **args) as dc:
        assert str(dc.index.url) == db_url

    # When specific environment is loaded
    with Datacube(config=config_path, env='test_alt', **args) as dc:
        assert str(dc.index.url) == alt_db_url

    # An environment that isn't in any config files
    with pytest.raises(ValueError):
        with Datacube(config=config_path, env='undefined-env', **args):
            pass
def __get_mask_datasets(self) -> List[ODCDataset]:
    """ Finds mask datasets based on config """
    dc = Datacube(app="mosaic_creator")
    time_range = (str(self.__start_date), str(self.__end_date))
    datasets = dc.find_datasets(product=self.__product_name, time=time_range)
    if not datasets:
        LOGGER.warning("No mask datasets found for "
                       f"product={self.__product_name}, time={time_range}")
        raise ValueError("No datasets found")  # TODO: custom exception
    return datasets
def get_sample_dataset(*product_names: str, index: Index = None) -> Iterable[Dict]:
    with Datacube(index=index) as dc:
        index = dc.index
        for product_name in product_names:
            product = index.products.get_by_name(product_name)
            res = (alchemy_engine(index)
                   .execute(_select_dataset_extent_query(product).limit(1))
                   .fetchone())
            if res:
                yield dict(res)
def setUp(self):
    self.datacube = Datacube(config=DATACUBE_CONFIG)
    IndexerTestCase().test_product_generation()
    product = self.datacube.index.products.get_by_name('ls8_test')
    if product is None:
        self.skipTest('No product available to index')
    datasets = self.datacube.find_datasets(product='ls8_test')
    if datasets:
        self.skipTest('Indexed datasets already exist in database')
def _get_factor_datasets(self):
    dts = []
    for fac in self.confidence_model.factors:
        factor = self.cfg.get_factor_info(fac)
        with Datacube(app='confidence_layer', env=factor['env']) as dc:
            gwf = GridWorkflow(dc.index, self.grid_spec)
            obs = gwf.cell_observations(cell_index=self.tile_index,
                                        product=factor['product'])
            for ds in obs[self.tile_index]['datasets']:
                dts.append(ds)
    return dts