def populate_index(dataset_loader, module_dea_index):
    """
    Index populated with example datasets. Assumes our tests won't modify the data!

    It's module-scoped as it's expensive to populate.
    """
    path, s2_product_doc = list(
        read_documents(TEST_DATA_DIR / "esa_s2_l2a.product.yaml"))[0]
    dataset_count = 0
    product_ = module_dea_index.products.from_doc(s2_product_doc)
    module_dea_index.products.add(product_)
    create_dataset = Doc2Dataset(module_dea_index)
    for _, s2_dataset_doc in read_documents(TEST_DATA_DIR / "s2_l2a-sample.yaml"):
        try:
            dataset, err = create_dataset(s2_dataset_doc, "file://example.com/test_dataset/")
            assert dataset is not None, err
            created = module_dea_index.datasets.add(dataset)
            assert created.type.name == "s2_l2a"
            dataset_count += 1
        except AttributeError as ae:
            assert dataset_count == 5
            print(ae)
    assert dataset_count == 5
    return module_dea_index

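# A hedged sketch of a test that might consume the populate_index fixture above,
# assuming it is registered as a module-scoped pytest fixture; the product name
# matches the five s2_l2a datasets the fixture indexes.
def test_s2_l2a_datasets_indexed(populate_index):
    index = populate_index
    datasets = list(index.datasets.search(product="s2_l2a"))
    assert len(datasets) == 5
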
def add_dataset(doc, uri, index, sources_policy):
    logging.info("Indexing dataset: {} with URI: {}".format(doc['id'], uri))

    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    existing_dataset = index.datasets.get(doc['id'])

    if not existing_dataset:
        logging.info("Trying to index")
        if err is not None:
            logging.error("%s", err)
        else:
            try:
                index.datasets.add(
                    dataset, with_lineage=False
                )  # Source policy to be checked in sentinel 2 dataset types
            except Exception as e:
                logging.error("Unhandled exception %s", e)
    else:
        logging.info("Updating dataset instead.")
        try:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            logging.error("Unhandled exception %s", e)

    return dataset, err

def from_metadata_stream(metadata_stream, index, **kwargs):
    """
    Given a stream of (uri, metadata) tuples convert them into Datasets, using
    supplied index and options for Doc2Dataset.

    **kwargs**:
    - skip_lineage
    - verify_lineage
    - fail_on_missing_lineage
    - products
    - exclude_products

    returns a sequence of tuples where each tuple is either
        (Dataset, None) or (None, error_message)
    """
    doc2ds = Doc2Dataset(index, **kwargs)

    for uri, metadata in metadata_stream:
        if metadata is None:
            yield (None, "Error: empty doc %s" % (uri))
        else:
            ds, err = doc2ds(metadata, uri)
            if ds is not None:
                yield (ds, None)
            else:
                yield (None, "Error: %s, %s" % (uri, err))

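# A minimal consumption sketch for from_metadata_stream() above, assuming a
# configured datacube connection; `docs` is a hypothetical, already-prepared
# list of (uri, metadata-dict) pairs.
import datacube


def index_metadata_docs(docs):
    dc = datacube.Datacube()
    added, failed = 0, 0
    for ds, err in from_metadata_stream(docs, dc.index, skip_lineage=True):
        if err is not None:
            print(err)
            failed += 1
        else:
            dc.index.datasets.add(ds)
            added += 1
    return added, failed
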
def add_dataset(doc, product_name, uri, index):
    doc_id = doc['id']
    logging.info("Indexing dataset: {} with URI: {}".format(doc_id, uri))
    print(f'type(doc_id): {type(doc_id)}')
    print(f'type(uri): {type(uri)}')

    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    print(f'dataset: {dataset}')
    print(f'err: {err}')
    existing_dataset = index.datasets.get(doc_id)

    if not existing_dataset:
        logging.info("Trying to index")
        if err is not None:
            logging.error("%s", err)
        else:
            try:
                index.datasets.add(
                    dataset, with_lineage=False
                )  # Source policy to be checked in sentinel 2 dataset types
            except Exception as e:
                logging.error("Unhandled exception %s", e)
    else:
        logging.info("Updating dataset instead.")
        try:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            logging.error("Unhandled exception %s", e)

    return dataset, err

def add_dataset(doc, uri, index, sources_policy=None, update=None, **kwargs):
    '''Add a dataset document to the index database.

    Args:
        doc: The dataset document.
        uri: Some URI to point to the document (this doesn't have to actually point anywhere).
        index: An instance of a datacube index.
        sources_policy (optional): The source policy to be checked.
        update: Update datasets if they already exist.

    Returns: The dataset to be indexed and any errors encountered.
    '''
    from datacube.index.hl import Doc2Dataset
    from datacube.utils import changes

    resolver = Doc2Dataset(index, **kwargs)
    dataset, err = resolver(sanitize_inf(doc), uri)
    buff = io.StringIO()
    if err is None:
        with redirect_stderr(buff):
            if update and index.datasets.get(dataset.id):
                index.datasets.update(dataset, {tuple(): changes.allow_any})
            else:
                index.datasets.add(dataset, sources_policy=sources_policy)
        val = buff.getvalue()
        if val.count('is already in the database'):
            def warning_without_trace(message, *args, **kwargs):
                return f'{message}'
            warnings.formatwarning = warning_without_trace
            warnings.warn(val)
    else:
        raise ValueError(err)
    return dataset

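# A hedged usage sketch for the add_dataset() helper above, assuming a configured
# datacube connection and a dataset document on disk; the YAML path is an
# illustrative placeholder.
import datacube
import yaml


def index_single_document(yaml_path):
    dc = datacube.Datacube()
    with open(yaml_path) as f:
        doc = yaml.safe_load(f)
    return add_dataset(doc, f"file://{yaml_path}", dc.index, update=True)
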
def add_dataset(doc, uri, index, sources_policy):
    """Add dataset documentation to datacube

    :param doc: Dict of parameters to index
    :type doc: dict
    :param uri: URI to metadata file for the tile
    :type uri: str
    :param index: Datacube index
    :type index: datacube.index
    :param sources_policy: Source policy
    :type sources_policy: str
    :return: dataset or error
    :rtype: dataset or error
    """
    logging.info("Adding %s to index", uri)
    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    if err is not None:
        logging.error("%s", err)
    else:
        try:
            index.datasets.add(dataset, sources_policy=sources_policy)
        except changes.DocumentMismatchError as e:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            err = e
            logging.error("Unhandled exception %s", e)

    return dataset, err

def add_dataset(doc):
    dc = datacube.Datacube(config=config.DATACUBE_CONF)
    index = dc.index
    resolver = Doc2Dataset(index)
    dataset, error = resolver(doc, 'file:///tmp/test-dataset.json')
    print('add dataset', dataset)
    index.datasets.add(dataset)

def item2dataset_cli(stac_collection, dc_product, url, outdir, max_items, engine_file,
                     datacube_config, verbose, access_token, advanced_filter):
    _filter = {"collections": [stac_collection]}
    if advanced_filter:
        _filter = {**_filter, **prepare_advanced_filter(advanced_filter)}

    stac_service = stac.STAC(url, False, access_token=access_token)
    dc_index = datacube_index(datacube_config)

    features = create_feature_collection_from_stac_elements(
        stac_service, int(max_items), _filter)
    odc_datasets = stac2odc.item.item2dataset(engine_file, dc_product, features,
                                              dc_index, verbose=verbose)
    odc_datasets_definition_files = write_odc_element_in_yaml_file(odc_datasets, outdir)

    # add dataset definitions to the datacube index
    # code adapted from: https://github.com/opendatacube/datacube-core/blob/develop/datacube/scripts/dataset.py
    ds_resolve = Doc2Dataset(dc_index, [dc_product])
    doc_stream = remap_uri_from_doc(
        ui_path_doc_stream(odc_datasets_definition_files, uri=True))
    datasets_on_stream = dataset_stream(doc_stream, ds_resolve)

    logger_message("Adding datasets", logger.info, True)
    for dataset in datasets_on_stream:
        try:
            dc_index.datasets.add(dataset, with_lineage=True)
        except (ValueError, MissingRecordError):
            logger_message(f"Error adding dataset ({dataset.local_uri})",
                           logger.warning, True)

def dump_to_odc(
    document_stream,
    dc: Datacube,
    products: list,
    transform=None,
    update=False,
    update_if_exists=False,
    allow_unsafe=False,
    **kwargs,
) -> Tuple[int, int]:
    doc2ds = Doc2Dataset(dc.index, products=products, **kwargs)

    ds_added = 0
    ds_failed = 0
    uris_docs = parse_doc_stream(stream_docs(document_stream),
                                 on_error=doc_error, transform=transform)

    for uri, metadata in uris_docs:
        try:
            index_update_dataset(metadata, uri, dc, doc2ds, update,
                                 update_if_exists, allow_unsafe)
            ds_added += 1
        except IndexingException as e:
            logging.exception(f"Failed to index dataset {uri} with error {e}")
            ds_failed += 1

    return ds_added, ds_failed

def show(index, path):
    file_paths = find_lpdaac_file_paths(Path(path))
    print(file_paths)
    _ = Doc2Dataset(index)
    for file_path in file_paths:
        doc = generate_lpdaac_doc(file_path)
        print_dict(doc)

def archive_document(doc, uri, index, sources_policy, require_lineage):
    def get_ids(dataset):
        ds = index.datasets.get(dataset.id, include_sources=True)
        for source in ds.sources.values():
            yield source.id
        yield dataset.id

    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    index.datasets.archive(get_ids(dataset))
    logging.info("Archiving %s and all sources of %s", dataset.id, dataset.id)

def stac_api_to_odc(
    dc: Datacube,
    update_if_exists: bool,
    config: dict,
    catalog_href: str,
    allow_unsafe: bool = True,
    rewrite: Optional[Tuple[str, str]] = None,
) -> Tuple[int, int]:
    doc2ds = Doc2Dataset(dc.index)
    client = Client.open(catalog_href)

    search = client.search(**config)
    n_items = search.matched()
    if n_items is not None:
        logging.info("Found {} items to index".format(n_items))
        if n_items == 0:
            logging.warning("Didn't find any items, finishing.")
            return 0, 0
    else:
        logging.warning("API did not return the number of items.")

    # Do the indexing of all the things
    success = 0
    failure = 0
    sys.stdout.write("\rIndexing from STAC API...\n")
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        future_to_item = {
            executor.submit(
                process_item,
                item,
                dc,
                doc2ds,
                update_if_exists=update_if_exists,
                allow_unsafe=allow_unsafe,
                rewrite=rewrite,
            ): item.id
            for item in search.get_all_items()
        }
        for future in concurrent.futures.as_completed(future_to_item):
            item = future_to_item[future]
            try:
                _ = future.result()
                success += 1
                if success % 10 == 0:
                    sys.stdout.write(f"\rAdded {success} datasets...")
            except Exception as e:
                logging.exception(f"Failed to handle item {item} with exception {e}")
                failure += 1
    sys.stdout.write("\r")

    return success, failure

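# A hedged sketch of driving stac_api_to_odc() above, assuming a configured
# datacube connection; the catalog URL, collection name, bbox and date range
# are illustrative values for a pystac-client search.
import datacube


def index_sentinel2_january():
    dc = datacube.Datacube()
    search_config = {
        "collections": ["sentinel-2-l2a"],
        "bbox": [146.0, -35.0, 147.0, -34.0],
        "datetime": "2021-01-01/2021-01-31",
    }
    return stac_api_to_odc(
        dc,
        update_if_exists=False,
        config=search_config,
        catalog_href="https://earth-search.aws.element84.com/v1",
    )
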
def index_cmd(index, product_names, exclude_product_names, auto_match,
              auto_add_lineage, verify_lineage, dry_run, ignore_lineage,
              confirm_ignore_lineage, dataset_paths):
    if confirm_ignore_lineage is False and ignore_lineage is True:
        if sys.stdin.isatty():
            confirmed = click.confirm(
                "Requested to skip lineage information, Are you sure?",
                default=False)
            if not confirmed:
                click.echo('OK aborting', err=True)
                sys.exit(1)
        else:
            click.echo(
                "Use --confirm-ignore-lineage from non-interactive scripts. Aborting."
            )
            sys.exit(1)

        confirm_ignore_lineage = True

    if auto_match is True:
        _LOG.warning(
            "--auto-match option is deprecated, update your scripts, behaviour is the same without it"
        )

    try:
        ds_resolve = Doc2Dataset(index,
                                 product_names,
                                 exclude_products=exclude_product_names,
                                 skip_lineage=confirm_ignore_lineage,
                                 fail_on_missing_lineage=not auto_add_lineage,
                                 verify_lineage=verify_lineage)
    except ValueError as e:
        _LOG.error(e)
        sys.exit(2)

    def run_it(dataset_paths):
        doc_stream = ui_path_doc_stream(dataset_paths, logger=_LOG, uri=True)
        dss = dataset_stream(doc_stream, ds_resolve)
        index_datasets(dss,
                       index,
                       auto_add_lineage=auto_add_lineage,
                       dry_run=dry_run)

    # If outputting directly to terminal, show a progress bar.
    if sys.stdout.isatty():
        with click.progressbar(dataset_paths, label='Indexing datasets') as pp:
            run_it(pp)
    else:
        run_it(dataset_paths)

def cop_dem_to_dc(
    dc: Datacube,
    product: str,
    bounding_box,
    limit: int,
    update: bool,
    n_workers: int = 100,
) -> Tuple[int, int]:
    doc2ds = Doc2Dataset(dc.index)

    # Get the list of tile URIs
    uris_tiles = list(get_dem_tile_uris(bounding_box, product))
    if limit:
        uris_tiles = uris_tiles[0:limit]

    # Do the indexing of all the things
    success = 0
    failure = 0

    sys.stdout.write(f"Starting Cop DEM indexing with {n_workers} workers...\n")
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        future_to_uri = {
            executor.submit(process_uri_tile, uri_tile, product, dc, doc2ds,
                            update_if_exists=update): uri_tile[0]
            for uri_tile in uris_tiles
        }
        for future in concurrent.futures.as_completed(future_to_uri):
            uri = future_to_uri[future]
            try:
                _ = future.result()
                success += 1
                if success % 10 == 0:
                    sys.stdout.write(f"\rAdded {success} datasets...")
            except rasterio.errors.RasterioIOError:
                logging.info(f"Couldn't find file for {uri}")
            except Exception as e:
                logging.exception(f"Failed to handle uri {uri} with exception {e}")
                failure += 1
    sys.stdout.write("\r")

    return success, failure

def add_dataset(doc, uri, index, sources_policy):
    logging.info("Indexing %s", uri)
    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    if err is not None:
        logging.error("%s", err)
    try:
        # Source policy to be checked in sentinel 2 dataset types
        index.datasets.add(dataset, sources_policy=sources_policy)
    except changes.DocumentMismatchError as e:
        index.datasets.update(dataset, {tuple(): changes.allow_any})
    except Exception as e:
        logging.error("Unhandled exception %s", e)

    return uri

def index_data(index, path):
    path = Path(path)
    datasets = find_datasets(path)
    resolver = Doc2Dataset(index)
    for name, dataset in datasets.items():
        doc = generate_dataset_doc(name, dataset)
        print_dict(doc)
        dataset, err = resolver(doc, path.as_uri())
        if err is not None:
            logging.error("%s", err)
        try:
            index.datasets.add(dataset)
        except Exception as e:
            logging.error("Couldn't index %s%s", path, name)
            logging.exception("Exception: %s", e)

def index_data(index, path):
    file_paths = find_lpdaac_file_paths(Path(path))
    print(file_paths)
    resolver = Doc2Dataset(index)
    for file_path in file_paths:
        doc = generate_lpdaac_doc(file_path)
        print_dict(doc)
        dataset, err = resolver(doc, file_path.as_uri())
        if err is not None:
            logging.error("%s", err)
        try:
            index.datasets.add(dataset)
        except Exception as e:
            logging.error("Couldn't index %s", file_path)
            logging.exception("Exception: %s", e)

def index_dataset(dataset_dict, s3_path):
    dc = datacube.Datacube()
    index = dc.index
    resolver = Doc2Dataset(index)
    dataset, err = resolver(dataset_dict, s3_path)
    if err is not None:
        logging.error("%s", err)
    else:
        try:
            index.datasets.add(dataset)
        except changes.DocumentMismatchError as e:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            err = e
            logging.error("Unhandled exception {}".format(e))

    return dataset, err

def item_to_dataset(
    dc_index: index.Index, product_name: str, item: dict
) -> model.Dataset:
    doc2ds = Doc2Dataset(index=dc_index, products=[product_name])
    uri, relative = guess_location(item)
    if relative:
        metadata = stac_transform(item)
    else:
        metadata = stac_transform_absolute(item)
    ds, err = doc2ds(metadata, uri)
    if ds is not None:
        return ds

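# A hedged sketch of indexing a single STAC item with item_to_dataset() above,
# assuming a configured datacube connection; the default product name is an
# illustrative placeholder and `stac_item` is a STAC item already fetched as a dict.
import datacube


def index_stac_item(stac_item, product_name="s2_l2a"):
    dc = datacube.Datacube()
    dataset = item_to_dataset(dc.index, product_name, stac_item)
    if dataset is not None:
        dc.index.datasets.add(dataset)
    return dataset
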
def add_dataset(doc, uri, index, sources_policy, require_lineage):
    logging.info("Indexing %s", uri)
    skip_lineage = not require_lineage
    resolver = Doc2Dataset(index, skip_lineage=skip_lineage)
    dataset, err = resolver(doc, uri)
    if err is not None:
        logging.error("%s", err)
    else:
        try:
            # TODO: Sources policy to be checked in sentinel 2 dataset types
            index.datasets.add(dataset, sources_policy=sources_policy,
                               with_lineage=require_lineage)
        except changes.DocumentMismatchError as e:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            err = e
            logging.error("Unhandled exception %s", e)

    return dataset, err

def ingest_sentinel1_grd_50m_beta0(path, uuid):
    dc = datacube.Datacube()
    resolver = Doc2Dataset(dc.index)

    projection, extent, (t0, t1) = get_geometry(path)
    images = {v: {'path': path, 'layer': i + 1} for i, v in enumerate(bands)}

    p = Path(path)
    scene_name = p.stem[:-11]

    result = {
        # 'id': str(uuid.uuid4()),  # Generate random uuid
        'id': str(uuid),
        'processing_level': "Level-1",
        'product_type': "sentinel_1_grd_50m_beta0",
        'creation_dt': t0,
        'platform': {'code': 'SENTINEL_1A'},
        'instrument': {'name': 'SAR'},
        'extent': {
            'coord': extent,
            'from_dt': str(t0),
            'to_dt': str(t1),
            'center_dt': str(t0 + (t1 - t0) / 2)
        },
        'format': {'name': 'GeoTIFF'},  # ENVI or BEAM-DIMAP ?
        'grid_spatial': {'projection': projection},
        'image': {'bands': images},
        'lineage': {'source_datasets': {}, 'ga_label': scene_name}
    }
    print(result)

    dataset, _ = resolver(result, '')
    dc.index.datasets.add(dataset)

    return True

def _populate_from_dump(expected_type: str, dump_path: Path):
    ls8_nbar_scene = module_dea_index.products.get_by_name(expected_type)
    dataset_count = 0

    create_dataset = Doc2Dataset(module_dea_index)
    for _, doc in read_documents(dump_path):
        label = doc['ga_label'] if ('ga_label' in doc) else doc['id']
        dataset, err = create_dataset(
            doc, f"file://example.com/test_dataset/{label}")  # type: Tuple[Dataset, str]
        assert dataset is not None, err
        assert dataset.type.name == expected_type

        created = module_dea_index.datasets.add(dataset)

        assert created.type.name == ls8_nbar_scene.name

        dataset_count += 1

    print(f"Populated {dataset_count} of {expected_type}")
    return dataset_count

def _populate_from_dump(session_dea_index, expected_type: str, dump_path: Path):
    ls8_nbar_scene = session_dea_index.products.get_by_name(expected_type)
    dataset_count = 0

    create_dataset = Doc2Dataset(session_dea_index)
    for _, doc in read_documents(dump_path):
        label = doc["ga_label"] if ("ga_label" in doc) else doc["id"]
        dataset, err = create_dataset(
            doc, f"file://example.com/test_dataset/{label}")
        assert dataset is not None, err

        created = session_dea_index.datasets.add(dataset)

        assert created.type.name == ls8_nbar_scene.name

        dataset_count += 1

    print(f"Populated {dataset_count} of {expected_type}")
    return dataset_count

def archive_document(doc, uri, index, sources_policy):
    """Archive dataset

    :param doc: Dict of parameters that reference the dataset
    :type doc: dict
    :param uri: URI to metadata file for the tile
    :type uri: str
    :param index: Datacube index
    :type index: datacube.index
    :param sources_policy: Source policy
    :type sources_policy: str
    """
    def get_ids(dataset):
        ds = index.datasets.get(dataset.id, include_sources=True)
        for source in ds.sources.values():
            yield source.id
        yield dataset.id

    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    index.datasets.archive(get_ids(dataset))
    logging.info("Archiving %s and all sources of %s", dataset.id, dataset.id)

def add_dataset(self, eo3_doc: Dict, uri: str = "",
                **kwargs) -> (ODCDataset, Union[Exception, None]):
    """ Adds dataset to dcIndex """
    if not uri:
        uri = eo3_doc["uri"]
    LOGGER.debug(f"Indexing {uri}")
    index = self.dc.index
    resolver = Doc2Dataset(index, **kwargs)
    dataset, err = resolver(eo3_doc, uri)
    if err is not None:
        LOGGER.error(f"Error indexing {uri}: {err}")
        return dataset, err
    try:
        index.datasets.add(dataset)
    except DocumentMismatchError:
        index.datasets.update(dataset, {tuple(): changes.allow_any})
    except Exception as err:
        LOGGER.error(f"Unhandled exception {err}")

    return dataset, err

def add_dataset(index: Index, dataset_id: uuid.UUID, uri: str):
    """
    Index a dataset from a file uri.

    A better api should be pushed upstream to core: it currently only has a
    "scripts" implementation intended for cli use.
    """
    yaml_path = uri_to_local_path(uri)

    def load_datasets(path, ds_resolve):
        for uri, ds in ui_path_doc_stream(path):
            dataset, err = ds_resolve(ds, uri)

            if dataset is None:
                _LOG.error('dataset is empty', error=str(err))
                continue

            is_consistent, reason = check_dataset_consistent(dataset)
            if not is_consistent:
                _LOG.error("dataset inconsistency",
                           dataset=dataset.id, reason=str(reason))
                continue

            yield dataset

    ds_resolve = Doc2Dataset(index)
    for d in load_datasets([yaml_path], ds_resolve):
        if d.id == dataset_id:
            try:
                index.datasets.add(d)
                _LOG.info("dataset indexing successful", dataset_id=dataset_id)
                break
            except ValueError as err:
                _LOG.error('failed to index dataset',
                           dataset_id=dataset_id, error=err)
    else:
        raise RuntimeError('dataset not found at path: %s, %s' % (dataset_id, uri))

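# A hedged sketch of calling the add_dataset() helper above, assuming a configured
# datacube connection; the dataset id and file URI are illustrative placeholders.
import uuid

import datacube


def index_known_dataset():
    dc = datacube.Datacube()
    add_dataset(
        dc.index,
        uuid.UUID("10000000-1000-4000-8000-100000000000"),
        "file:///data/scene1/metadata.yaml",
    )
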
def stac_api_to_odc(
    dc: Datacube,
    products: list,
    limit: int,
    update: bool,
    allow_unsafe: bool,
    config: dict,
    **kwargs,
) -> Tuple[int, int]:
    # QA the BBOX
    if config["bbox"]:
        assert (
            len(config["bbox"]) == 4
        ), "Bounding box must be of the form lon-min,lat-min,lon-max,lat-max"

    # QA the search
    srch = Search().search(**config)
    n_items = srch.found()
    logging.info("Found {} items to index".format(n_items))
    if n_items > 10000:
        logging.warning(
            "More than 10,000 items were returned by your query, which is greater than the API limit"
        )

    if n_items == 0:
        logging.warning("Didn't find any items, finishing.")
        return 0, 0

    # Get a generator of (stac, uri, relative_uri) tuples
    potential_items = get_items(srch, limit)

    # Get a generator of (dataset, uri)
    doc2ds = Doc2Dataset(dc.index, **kwargs)
    datasets = transform_items(doc2ds, potential_items)

    # Do the indexing of all the things
    return index_update_datasets(dc, datasets, update, allow_unsafe)

def from_json_lines(lines, index, **kwargs):
    doc2ds = Doc2Dataset(index, **kwargs)

    for lineno, l in enumerate(lines):
        try:
            doc = json.loads(l)
        except json.JSONDecodeError as e:
            print('Error[%d]: %s' % (lineno, str(e)))
            continue

        uri = toolz.get_in(['uris', 0], doc)
        if uri is None:
            print('Error[%d]: missing uri' % lineno)
            continue

        metadata = doc.get('metadata')
        if metadata is None:
            print('Error[%d]: missing metadata' % lineno)
            continue

        ds, err = doc2ds(metadata, uri)
        if ds is not None:
            yield ds
        else:
            print('Error[%d]: %s' % (lineno, err))

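# A hedged usage sketch for from_json_lines() above, assuming a configured datacube
# connection and a newline-delimited JSON file whose records carry "uris" and
# "metadata" fields; the file name is an illustrative placeholder.
import datacube


def index_json_lines_file(path="datasets.jsonl"):
    dc = datacube.Datacube()
    with open(path) as lines:
        for ds in from_json_lines(lines, dc.index, skip_lineage=True):
            dc.index.datasets.add(ds)
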
def cli(input_directory, update_if_exists, allow_unsafe, stac, glob):
    dc = datacube.Datacube()
    doc2ds = Doc2Dataset(dc.index)

    if glob is None:
        glob = "**/*.json" if stac else "**/*.yaml"

    files_to_process = _find_files(input_directory, glob, stac=stac)

    added, failed = 0, 0

    for in_file in files_to_process:
        with in_file.open() as f:
            try:
                if in_file.suffix in (".yml", ".yaml"):
                    metadata = yaml.safe_load(f)
                else:
                    metadata = json.load(f)
                # Do the STAC Transform if it's flagged
                if stac:
                    metadata = stac_transform(metadata)
                index_update_dataset(
                    metadata,
                    in_file.absolute().as_uri(),
                    dc=dc,
                    doc2ds=doc2ds,
                    update_if_exists=update_if_exists,
                    allow_unsafe=allow_unsafe,
                )
                added += 1
            except Exception as e:
                logging.exception(f"Failed to add dataset {in_file} with error {e}")
                failed += 1

    logging.info(f"Added {added} and failed {failed} datasets.")

def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    p = dataset_add_configs
    index = index_empty
    r = clirunner(['dataset', 'add', p.datasets], expect_success=False)
    assert r.exit_code != 0
    assert 'Found no products' in r.output

    clirunner(['metadata', 'add', p.metadata])
    clirunner(['product', 'add', p.products])
    clirunner(['dataset', 'add', p.datasets])
    clirunner(['dataset', 'add', p.datasets_bad1])
    clirunner(['dataset', 'add', p.datasets_eo3])

    ds = load_dataset_definition(p.datasets)
    ds_bad1 = load_dataset_definition(p.datasets_bad1)

    # Check .hl.Doc2Dataset
    doc2ds = Doc2Dataset(index)
    _ds, _err = doc2ds(ds.doc, 'file:///something')
    assert _err is None
    assert str(_ds.id) == ds.id
    assert _ds.metadata_doc == ds.doc

    # Check dataset search
    r = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in r.output
    assert ds_bad1.id not in r.output
    assert ds.sources['ab'].id in r.output
    assert ds.sources['ac'].sources['cd'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ds.sources['ae'].id in r.output

    r = clirunner([
        'dataset', 'info', '-f', 'yaml', '--show-derived', ds.sources['ae'].id
    ])
    assert ds.id in r.output

    ds_ = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert ds_.id == ds.id

    x = index.datasets.get(ds.id, include_sources=True)
    assert str(x.sources['ab'].id) == ds.sources['ab'].id
    assert str(x.sources['ac'].sources['cd'].id) == ds.sources['ac'].sources['cd'].id

    check_skip_lineage_test(clirunner, index)
    check_no_product_match(clirunner, index)
    check_with_existing_lineage(clirunner, index)
    check_inconsistent_lineage(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, p.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    r = clirunner(['dataset', 'add', '--product', 'nosuchproduct', p.datasets],
                  expect_success=False)

    assert "ERROR Supplied product name" in r.output
    assert r.exit_code != 0

    # Check that deprecated option is accepted
    r = clirunner(['dataset', 'add', '--auto-match', p.datasets])
    assert 'WARNING --auto-match option is deprecated' in r.output

    # test dataset add eo3
    r = clirunner(['dataset', 'add', p.datasets_eo3])
    assert r.exit_code == 0

    ds_eo3 = load_dataset_definition(p.datasets_eo3)
    assert ds_eo3.location is not None

    _ds = index.datasets.get(ds_eo3.id, include_sources=True)
    assert sorted(_ds.sources) == ['a', 'bc1', 'bc2']
    assert _ds.crs == 'EPSG:3857'
    assert _ds.extent is not None
    assert _ds.extent.crs == _ds.crs
    assert _ds.uris == [ds_eo3.location]
    assert 'location' not in _ds.metadata_doc