def populate_index(dataset_loader, module_dea_index):
    """
    Index populated with example datasets. Assumes our tests won't modify the data!

    It's module-scoped as it's expensive to populate.
    """
    path, s2_product_doc = list(
        read_documents(TEST_DATA_DIR / "esa_s2_l2a.product.yaml"))[0]
    dataset_count = 0
    product_ = module_dea_index.products.from_doc(s2_product_doc)
    module_dea_index.products.add(product_)
    create_dataset = Doc2Dataset(module_dea_index)
    for _, s2_dataset_doc in read_documents(TEST_DATA_DIR /
                                            "s2_l2a-sample.yaml"):
        try:
            dataset, err = create_dataset(s2_dataset_doc,
                                          "file://example.com/test_dataset/")
            assert dataset is not None, err
            created = module_dea_index.datasets.add(dataset)
            assert created.type.name == "s2_l2a"
            dataset_count += 1
        except AttributeError as ae:
            assert dataset_count == 5
            print(ae)
    assert dataset_count == 5
    return module_dea_index
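A minimal sketch of a test that could build on this fixture; the test name and the search call are illustrative only, with the s2_l2a product and the count of five taken from the fixture above.
def test_s2_l2a_datasets_indexed(populate_index):
    # Hypothetical test; assumes pytest injects the module-scoped fixture above.
    index = populate_index
    found = list(index.datasets.search(product="s2_l2a"))
    assert len(found) == 5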
Example #2
def add_dataset(doc, uri, index, sources_policy):
    logging.info("Indexing dataset: {} with URI:  {}".format(doc['id'], uri))

    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    existing_dataset = index.datasets.get(doc['id'])

    if not existing_dataset:
        logging.info("Trying to index")
        if err is not None:
            logging.error("%s", err)
        else:
            try:
                index.datasets.add(
                    dataset, with_lineage=False
                )  # Source policy to be checked in sentinel 2 dataset types
            except Exception as e:
                logging.error("Unhandled exception %s", e)
    else:
        logging.info("Updating dataset instead.")
        try:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            logging.error("Unhandled exception %s", e)

    return dataset, err
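A sketch of how this helper might be driven; the metadata path is a placeholder and the helper is assumed to live in a module that already imports Doc2Dataset, changes and logging.
from pathlib import Path

import datacube
from datacube.utils import read_documents

dc = datacube.Datacube()
doc_path = Path("/data/example.odc-metadata.yaml")  # placeholder path
for _, doc in read_documents(doc_path):
    dataset, err = add_dataset(doc, doc_path.as_uri(), dc.index, sources_policy=None)
    if err is not None:
        print("Indexing failed:", err)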
Example #3
def from_metadata_stream(metadata_stream, index, **kwargs):
    """
    Given a stream of (uri, metadata) tuples convert them into Datasets, using
    supplied index and options for Doc2Dataset.


    **kwargs**:
    - skip_lineage
    - verify_lineage
    - fail_on_missing_lineage
    - products
    - exclude_products

    returns a sequence of tuples where each tuple is either

        (Dataset, None) or (None, error_message)
    """
    doc2ds = Doc2Dataset(index, **kwargs)

    for uri, metadata in metadata_stream:
        if metadata is None:
            yield (None, "Error: empty doc %s" % (uri))
        else:
            ds, err = doc2ds(metadata, uri)
            if ds is not None:
                yield (ds, None)
            else:
                yield (None, "Error: %s, %s" % (uri, err))
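A sketch of consuming the generator above; the (uri, metadata) pairs are placeholders and index is assumed to be an open datacube index.
docs = [
    ("file:///data/scene1.odc-metadata.yaml", {"id": "not-a-real-doc"}),  # placeholder doc
    ("file:///data/scene2.odc-metadata.yaml", None),  # exercises the "empty doc" branch
]
for ds, err in from_metadata_stream(docs, index, skip_lineage=True):
    if err is not None:
        print(err)
    else:
        index.datasets.add(ds)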
Example #4
def add_dataset(doc, product_name, uri, index):
    doc_id = doc['id']
    logging.info("Indexing dataset: {} with URI: {}".format(doc_id, uri))
    print(f'type(doc_id): {type(doc_id)}')
    print(f'type(uri): {type(uri)}')

    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    print(f'dataset: {dataset}')
    print(f'err: {err}')
    existing_dataset = index.datasets.get(doc_id)

    if not existing_dataset:
        logging.info("Trying to index")
        if err is not None:
            logging.error("%s", err)
        else:
            try:
                index.datasets.add(
                    dataset, with_lineage=False
                )  # Source policy to be checked in sentinel 2 dataset types
            except Exception as e:
                logging.error("Unhandled exception %s", e)
    else:
        logging.info("Updating dataset instead.")
        try:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            logging.error("Unhandled exception %s", e)

    return dataset, err
Example #5
def add_dataset(doc, uri, index, sources_policy=None, update=None, **kwargs):
    ''' Add a dataset document to the index database.

    Args:
        doc: The dataset document.
        uri: Some URI to point to the document (this doesn't have to actually point anywhere).
        index: An instance of a datacube index.
        sources_policy (optional): The source policy to be checked.
        update: Update datasets if they already exist.
    Returns: The dataset to be indexed and any errors encountered.
    '''
    from datacube.index.hl import Doc2Dataset
    from datacube.utils import changes

    resolver = Doc2Dataset(index, **kwargs)
    dataset, err = resolver(sanitize_inf(doc), uri)
    buff = io.StringIO()
    if err is None:
        with redirect_stderr(buff):
            if update and index.datasets.get(dataset.id):
                index.datasets.update(dataset, {tuple(): changes.allow_any})
            else:
                index.datasets.add(dataset, sources_policy=sources_policy)
        val = buff.getvalue()
        if val.count('is already in the database'):

            def warning_without_trace(message, *args, **kwargs):
                return f'{message}'

            warnings.formatwarning = warning_without_trace
            warnings.warn(val)
    else:
        raise ValueError(err)
    return dataset
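A sketch of exercising the update branch of the helper above; the file name is a placeholder and sanitize_inf is assumed to be defined alongside the helper.
from pathlib import Path

import datacube
from datacube.utils import read_documents

dc = datacube.Datacube()
path = Path("/data/scene.odc-metadata.yaml")  # placeholder path
_, doc = next(read_documents(path))
ds = add_dataset(doc, path.as_uri(), dc.index, update=True)  # update if already indexed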
Example #6
def add_dataset(doc, uri, index, sources_policy):
    """Add dataset documentation to datacube
    
    :param doc: Dict of parameters to index
    :type doc: dict
    :param uri: URI to metadata file for the tile
    :type uri: str
    :param index: Datacube index
    :type index: datacube.index
    :param sources_policy: Source policy  
    :type sources_policy: str
    :return: dataset or error
    :rtype: dataset or error
    """

    logging.info("Adding %s to index", uri)

    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)

    if err is not None:
        logging.error("%s", err)
    else:
        try:
            index.datasets.add(dataset, sources_policy=sources_policy)
        except changes.DocumentMismatchError as e:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            err = e
            logging.error("Unhandled exception %s", e)

    return dataset, err
Example #7
def add_dataset(doc):
    dc = datacube.Datacube(config=config.DATACUBE_CONF)
    index = dc.index
    resolver = Doc2Dataset(index)
    dataset, error = resolver(doc, 'file:///tmp/test-dataset.json')
    print('add dataset', dataset)
    index.datasets.add(dataset)
Example #8
def item2dataset_cli(stac_collection, dc_product, url, outdir, max_items,
                     engine_file, datacube_config, verbose, access_token,
                     advanced_filter):
    _filter = {"collections": [stac_collection]}
    if advanced_filter:
        _filter = {**_filter, **prepare_advanced_filter(advanced_filter)}

    stac_service = stac.STAC(url, False, access_token=access_token)
    dc_index = datacube_index(datacube_config)

    features = create_feature_collection_from_stac_elements(
        stac_service, int(max_items), _filter)
    odc_datasets = stac2odc.item.item2dataset(engine_file,
                                              dc_product,
                                              features,
                                              dc_index,
                                              verbose=verbose)
    odc_datasets_definition_files = write_odc_element_in_yaml_file(
        odc_datasets, outdir)

    # add datasets definitions on datacube index
    # code adapted from: https://github.com/opendatacube/datacube-core/blob/develop/datacube/scripts/dataset.py
    ds_resolve = Doc2Dataset(dc_index, [dc_product])
    doc_stream = remap_uri_from_doc(
        ui_path_doc_stream(odc_datasets_definition_files, uri=True))
    datasets_on_stream = dataset_stream(doc_stream, ds_resolve)

    logger_message(f"Adding datasets", logger.info, True)
    for dataset in datasets_on_stream:
        try:
            dc_index.datasets.add(dataset, with_lineage=True)
        except (ValueError, MissingRecordError):
            logger_message(f"Error to add dataset ({dataset.local_uri})",
                           logger.warning, True)
Example #9
def dump_to_odc(
    document_stream,
    dc: Datacube,
    products: list,
    transform=None,
    update=False,
    update_if_exists=False,
    allow_unsafe=False,
    **kwargs,
) -> Tuple[int, int]:
    doc2ds = Doc2Dataset(dc.index, products=products, **kwargs)

    ds_added = 0
    ds_failed = 0
    uris_docs = parse_doc_stream(stream_docs(document_stream),
                                 on_error=doc_error,
                                 transform=transform)

    for uri, metadata in uris_docs:
        try:
            index_update_dataset(metadata, uri, dc, doc2ds, update,
                                 update_if_exists, allow_unsafe)
            ds_added += 1
        except (IndexingException) as e:
            logging.exception(f"Failed to index dataset {uri} with error {e}")
            ds_failed += 1

    return ds_added, ds_failed
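A sketch of driving dump_to_odc() above; the dump file and product name are placeholders, and extra keyword arguments are passed through to Doc2Dataset.
import datacube

dc = datacube.Datacube()
with open("/data/datasets-dump.jsonl") as stream:  # placeholder dump file
    added, failed = dump_to_odc(stream, dc, products=["s2_l2a"],
                                update_if_exists=True, skip_lineage=True)
print(f"Added {added}, failed {failed}")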
Example #10
def show(index, path):

    file_paths = find_lpdaac_file_paths(Path(path))
    print(file_paths)

    _ = Doc2Dataset(index)
    for file_path in file_paths:
        doc = generate_lpdaac_doc(file_path)
        print_dict(doc)
Example #11
def archive_document(doc, uri, index, sources_policy, require_lineage):
    def get_ids(dataset):
        ds = index.datasets.get(dataset.id, include_sources=True)
        for source in ds.sources.values():
            yield source.id
        yield dataset.id

    resolver = Doc2Dataset(index)
    dataset, err  = resolver(doc, uri)
    index.datasets.archive(get_ids(dataset))
    logging.info("Archiving %s and all sources of %s", dataset.id, dataset.id)
Example #12
def stac_api_to_odc(
    dc: Datacube,
    update_if_exists: bool,
    config: dict,
    catalog_href: str,
    allow_unsafe: bool = True,
    rewrite: Optional[Tuple[str, str]] = None,
) -> Tuple[int, int]:
    doc2ds = Doc2Dataset(dc.index)
    client = Client.open(catalog_href)

    search = client.search(**config)
    n_items = search.matched()
    if n_items is not None:
        logging.info("Found {} items to index".format(n_items))
        if n_items == 0:
            logging.warning("Didn't find any items, finishing.")
            return 0, 0
    else:
        logging.warning("API did not return the number of items.")

    # Do the indexing of all the things
    success = 0
    failure = 0

    sys.stdout.write("\rIndexing from STAC API...\n")
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        future_to_item = {
            executor.submit(
                process_item,
                item,
                dc,
                doc2ds,
                update_if_exists=update_if_exists,
                allow_unsafe=allow_unsafe,
                rewrite=rewrite,
            ): item.id
            for item in search.get_all_items()
        }
        for future in concurrent.futures.as_completed(future_to_item):
            item = future_to_item[future]
            try:
                _ = future.result()
                success += 1
                if success % 10 == 0:
                    sys.stdout.write(f"\rAdded {success} datasets...")
            except Exception as e:
                logging.exception(f"Failed to handle item {item} with exception {e}")
                failure += 1
    sys.stdout.write("\r")

    return success, failure
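A sketch of invoking stac_api_to_odc() above; the catalog URL, collection and bounding box are placeholders, and the config keys mirror the pystac-client search keywords.
import datacube

dc = datacube.Datacube()
search_config = {
    "collections": ["sentinel-s2-l2a-cogs"],  # placeholder collection id
    "bbox": [146.0, -35.0, 147.0, -34.0],
    "datetime": "2021-01-01/2021-01-31",
}
success, failure = stac_api_to_odc(
    dc,
    update_if_exists=True,
    config=search_config,
    catalog_href="https://earth-search.aws.element84.com/v0/",  # placeholder API endpoint
)
print(f"Indexed {success}, failed {failure}")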
Example #13
def index_cmd(index, product_names, exclude_product_names, auto_match,
              auto_add_lineage, verify_lineage, dry_run, ignore_lineage,
              confirm_ignore_lineage, dataset_paths):

    if confirm_ignore_lineage is False and ignore_lineage is True:
        if sys.stdin.isatty():
            confirmed = click.confirm(
                "Requested to skip lineage information, Are you sure?",
                default=False)
            if not confirmed:
                click.echo('OK aborting', err=True)
                sys.exit(1)
        else:
            click.echo(
                "Use --confirm-ignore-lineage from non-interactive scripts. Aborting."
            )
            sys.exit(1)

        confirm_ignore_lineage = True

    if auto_match is True:
        _LOG.warning(
            "--auto-match option is deprecated, update your scripts, behaviour is the same without it"
        )

    try:
        ds_resolve = Doc2Dataset(index,
                                 product_names,
                                 exclude_products=exclude_product_names,
                                 skip_lineage=confirm_ignore_lineage,
                                 fail_on_missing_lineage=not auto_add_lineage,
                                 verify_lineage=verify_lineage)
    except ValueError as e:
        _LOG.error(e)
        sys.exit(2)

    def run_it(dataset_paths):
        doc_stream = ui_path_doc_stream(dataset_paths, logger=_LOG, uri=True)
        dss = dataset_stream(doc_stream, ds_resolve)
        index_datasets(dss,
                       index,
                       auto_add_lineage=auto_add_lineage,
                       dry_run=dry_run)

    # If outputting directly to terminal, show a progress bar.
    if sys.stdout.isatty():
        with click.progressbar(dataset_paths, label='Indexing datasets') as pp:
            run_it(pp)
    else:
        run_it(dataset_paths)
Example #14
def cop_dem_to_dc(
    dc: Datacube,
    product: str,
    bounding_box,
    limit: int,
    update: bool,
    n_workers: int = 100,
) -> Tuple[int, int]:
    doc2ds = Doc2Dataset(dc.index)

    # Get a generator of (uris)
    uris_tiles = list(get_dem_tile_uris(bounding_box, product))
    if limit:
        uris_tiles = uris_tiles[0:limit]

    # Do the indexing of all the things
    success = 0
    failure = 0

    sys.stdout.write(
        f"Starting Cop DEM indexing with {n_workers} workers...\n")

    with concurrent.futures.ThreadPoolExecutor(
            max_workers=n_workers) as executor:
        future_to_uri = {
            executor.submit(process_uri_tile,
                            uri_tile,
                            product,
                            dc,
                            doc2ds,
                            update_if_exists=update): uri_tile[0]
            for uri_tile in uris_tiles
        }
        for future in concurrent.futures.as_completed(future_to_uri):
            uri = future_to_uri[future]
            try:
                _ = future.result()
                success += 1
                if success % 10 == 0:
                    sys.stdout.write(f"\rAdded {success} datasets...")
            except rasterio.errors.RasterioIOError:
                logging.info(f"Couldn't find file for {uri}")
            except Exception as e:
                logging.exception(
                    f"Failed to handle uri {uri} with exception {e}")
                failure += 1
    sys.stdout.write("\r")

    return success, failure
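A sketch of calling cop_dem_to_dc() above; the product name and bounding box are placeholders.
import datacube

dc = datacube.Datacube()
success, failure = cop_dem_to_dc(
    dc,
    product="dem_cop_30",                       # assumed product name
    bounding_box=[140.0, -36.0, 142.0, -34.0],  # lon-min, lat-min, lon-max, lat-max
    limit=10,
    update=False,
    n_workers=4,
)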
Example #15
def add_dataset(doc, uri, index, sources_policy):
    logging.info("Indexing %s", uri)
    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    if err is not None:
        logging.error("%s", err)
    try:
        index.datasets.add(dataset,
                           sources_policy=sources_policy)  # Source policy to be checked in sentinel 2 dataset types
    except changes.DocumentMismatchError as e:
        index.datasets.update(dataset, {tuple(): changes.allow_any})
    except Exception as e:
        logging.error("Unhandled exception %s", e)

    return uri
Example #16
def index_data(index, path):
    path = Path(path)
    datasets = find_datasets(path)

    resolver = Doc2Dataset(index)
    for name, dataset in datasets.items():
        doc = generate_dataset_doc(name, dataset)
        print_dict(doc)
        dataset, err = resolver(doc, path.as_uri())

        if err is not None:
            logging.error("%s", err)
        try:
            index.datasets.add(dataset)
        except Exception as e:
            logging.error("Couldn't index %s%s", path, name)
            logging.exception("Exception", e)
Example #17
def index_data(index, path):
    file_paths = find_lpdaac_file_paths(Path(path))
    print(file_paths)

    resolver = Doc2Dataset(index)
    for file_path in file_paths:
        doc = generate_lpdaac_doc(file_path)
        print_dict(doc)
        dataset, err = resolver(doc, file_path.as_uri())

        if err is not None:
            logging.error("%s", err)
        try:
            index.datasets.add(dataset)
        except Exception as e:
            logging.error("Couldn't index %s", file_path)
            logging.exception("Exception", e)
Example #18
def index_dataset(dataset_dict, s3_path):
    dc = datacube.Datacube()
    index = dc.index
    resolver = Doc2Dataset(index)
    dataset, err = resolver(dataset_dict, s3_path)
    if err is not None:
        logging.error("%s", err)
    else:
        try:
            index.datasets.add(dataset)
        except changes.DocumentMismatchError as e:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            err = e
            logging.error("Unhandled exception {}".format(e))

    return dataset, err
Example #19
def item_to_dataset(
        dc_index: index.Index,
        product_name: str,
        item: dict
) -> model.Dataset:

    doc2ds = Doc2Dataset(index=dc_index, products=[product_name])
    uri, relative = guess_location(item)

    if relative:
        metadata = stac_transform(item)
    else:
        metadata = stac_transform_absolute(item)

    ds, err = doc2ds(metadata, uri)

    if ds is not None:
        return ds
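A sketch of feeding item_to_dataset() above with a STAC item fetched as plain JSON; the item URL and product name are placeholders.
import json
import urllib.request

import datacube

dc = datacube.Datacube()
item_url = "https://example.com/stac/collections/s2_l2a/items/some-item"  # placeholder URL
with urllib.request.urlopen(item_url) as resp:
    item = json.load(resp)
ds = item_to_dataset(dc.index, "s2_l2a", item)
if ds is not None:
    dc.index.datasets.add(ds)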
Example #20
def add_dataset(doc, uri, index, sources_policy, require_lineage):
    logging.info("Indexing %s", uri)
    skip_lineage = not require_lineage
    resolver = Doc2Dataset(index, skip_lineage=skip_lineage)
    dataset, err  = resolver(doc, uri)
    if err is not None:
        logging.error("%s", err)
    else:
        try:
            # TODO: Sources policy to be checked in sentinel 2 dataset types
            index.datasets.add(dataset, sources_policy=sources_policy, with_lineage=require_lineage)
        except changes.DocumentMismatchError as e:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as e:
            err = e
            logging.error("Unhandled exception %s", e)

    return dataset, err
Example #21
def ingest_sentinel1_grd_50m_beta0(path, uuid):
    dc = datacube.Datacube()
    resolver = Doc2Dataset(dc.index)
    projection, extent, (t0, t1) = get_geometry(path)
    images = {v: {'path': path, 'layer': i + 1} for i, v in enumerate(bands)}
    p = Path(path)
    scene_name = p.stem[:-11]

    result = {
        # 'id': str(uuid.uuid4()), # Generate random uuid
        'id': str(uuid),
        'processing_level': "Level-1",
        'product_type': "sentinel_1_grd_50m_beta0",
        'creation_dt': t0,
        'platform': {
            'code': 'SENTINEL_1A'
        },
        'instrument': {
            'name': 'SAR'
        },
        'extent': {
            'coord': extent,
            'from_dt': str(t0),
            'to_dt': str(t1),
            'center_dt': str(t0 + (t1 - t0) / 2)
        },
        'format': {
            'name': 'GeoTIFF'
        },  # ENVI or BEAM-DIMAP ?
        'grid_spatial': {
            'projection': projection
        },
        'image': {
            'bands': images
        },
        'lineage': {
            'source_datasets': {},
            'ga_label': scene_name
        }
    }
    print(result)
    dataset, _ = resolver(result, '')
    dc.index.datasets.add(dataset)
    return True
Example #22
    def _populate_from_dump(expected_type: str, dump_path: Path):
        ls8_nbar_scene = module_dea_index.products.get_by_name(expected_type)
        dataset_count = 0

        create_dataset = Doc2Dataset(module_dea_index)

        for _, doc in read_documents(dump_path):
            label = doc['ga_label'] if ('ga_label' in doc) else doc['id']
            # type: Tuple[Dataset, str]
            dataset, err = create_dataset(doc, f"file://example.com/test_dataset/{label}")
            assert dataset is not None, err
            assert dataset.type.name == expected_type
            created = module_dea_index.datasets.add(dataset)

            assert created.type.name == ls8_nbar_scene.name
            dataset_count += 1

        print(f"Populated {dataset_count} of {expected_type}")
        return dataset_count
Example #23
def _populate_from_dump(session_dea_index, expected_type: str,
                        dump_path: Path):
    ls8_nbar_scene = session_dea_index.products.get_by_name(expected_type)
    dataset_count = 0

    create_dataset = Doc2Dataset(session_dea_index)

    for _, doc in read_documents(dump_path):
        label = doc["ga_label"] if ("ga_label" in doc) else doc["id"]
        dataset, err = create_dataset(
            doc, f"file://example.com/test_dataset/{label}")
        assert dataset is not None, err
        created = session_dea_index.datasets.add(dataset)

        assert created.type.name == ls8_nbar_scene.name
        dataset_count += 1

    print(f"Populated {dataset_count} of {expected_type}")
    return dataset_count
Example #24
def archive_document(doc, uri, index, sources_policy):
    """Archive dataset
    
    :param doc: Dict of parameters that reference the dataset
    :type doc: dict
    :param uri: URI to metadata file for the tile
    :type uri: str
    :param index: Datacube index
    :type index: datacube.index
    :param sources_policy: Source policy  
    :type sources_policy: str
    """
    def get_ids(dataset):
        ds = index.datasets.get(dataset.id, include_sources=True)
        for source in ds.sources.values():
            yield source.id
        yield dataset.id

    resolver = Doc2Dataset(index)
    dataset, err = resolver(doc, uri)
    index.datasets.archive(get_ids(dataset))
    logging.info("Archiving %s and all sources of %s", dataset.id, dataset.id)
Example #25
    def add_dataset(self,
                    eo3_doc: Dict,
                    uri: str = "",
                    **kwargs) -> (ODCDataset, Union[Exception, None]):
        """ Adds dataset to dcIndex """
        if not uri:
            uri = eo3_doc["uri"]
        LOGGER.debug(f"Indexing {uri}")
        index = self.dc.index
        resolver = Doc2Dataset(index, **kwargs)
        dataset, err = resolver(eo3_doc, uri)
        if err is not None:
            LOGGER.error(f"Error indexing {uri}: {err}")
            return dataset, err
        try:
            index.datasets.add(dataset)
        except DocumentMismatchError:
            index.datasets.update(dataset, {tuple(): changes.allow_any})
        except Exception as err:
            LOGGER.error(f"Unhandled exception {err}")
            pass

        return dataset, err
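A sketch of calling the method above; indexer stands in for an instance of the (unnamed) wrapping class, and the EO3 document is a truncated placeholder.
eo3_doc = {
    "id": "00000000-0000-0000-0000-000000000001",   # placeholder dataset id
    "uri": "file:///data/scene.odc-metadata.yaml",  # used when the uri argument is omitted
    # remaining EO3 fields elided
}
dataset, err = indexer.add_dataset(eo3_doc, skip_lineage=True)
if err is not None:
    print(f"Indexing failed: {err}")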
Example #26
def add_dataset(index: Index, dataset_id: uuid.UUID, uri: str):
    """
    Index a dataset from a file uri.

    A better api should be pushed upstream to core: it currently only has a "scripts" implementation
    intended for cli use.
    """
    yaml_path = uri_to_local_path(uri)

    def load_datasets(path, ds_resolve):
        for uri, ds in ui_path_doc_stream(path):

            dataset, err = ds_resolve(ds, uri)

            if dataset is None:
                _LOG.error('dataset is empty', error=str(err))
                continue

            is_consistent, reason = check_dataset_consistent(dataset)
            if not is_consistent:
                _LOG.error("dataset inconsistency", dataset=dataset.id, reason=str(reason))
                continue

            yield dataset

    ds_resolve = Doc2Dataset(index)

    for d in load_datasets([yaml_path], ds_resolve):
        if d.id == dataset_id:
            try:
                index.datasets.add(d)
                _LOG.info("dataset indexing successful", dataset_id=dataset_id)
                break
            except ValueError as err:
                _LOG.error('failed to index dataset', dataset_id=dataset_id, error=err)
    else:
        raise RuntimeError('dataset not found at path: %s, %s' % (dataset_id, uri))
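A sketch of calling add_dataset() above; the dataset id and the file URI are placeholders.
import uuid

import datacube

dc = datacube.Datacube()
add_dataset(dc.index,
            uuid.UUID("10000000-2000-3000-4000-500000000000"),  # placeholder id
            "file:///data/scene/odc-metadata.yaml")             # placeholder uri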
Example #27
def stac_api_to_odc(
    dc: Datacube,
    products: list,
    limit: int,
    update: bool,
    allow_unsafe: bool,
    config: dict,
    **kwargs,
) -> Tuple[int, int]:
    # QA the BBOX
    if config["bbox"]:
        assert (
            len(config["bbox"]) == 4
        ), "Bounding box must be of the form lon-min,lat-min,lon-max,lat-max"

    # QA the search
    srch = Search().search(**config)
    n_items = srch.found()
    logging.info("Found {} items to index".format(n_items))
    if n_items > 10000:
        logging.warning(
            "More than 10,000 items were returned by your query, which is greater than the API limit"
        )

    if n_items == 0:
        logging.warning("Didn't find any items, finishing.")
        return 0, 0

    # Get a generator of (stac, uri, relative_uri) tuples
    potential_items = get_items(srch, limit)

    # Get a generator of (dataset, uri)
    doc2ds = Doc2Dataset(dc.index, **kwargs)
    datasets = transform_items(doc2ds, potential_items)

    # Do the indexing of all the things
    return index_update_datasets(dc, datasets, update, allow_unsafe)
Example #28
def from_json_lines(lines, index, **kwargs):
    doc2ds = Doc2Dataset(index, **kwargs)

    for lineno, l in enumerate(lines):
        try:
            doc = json.loads(l)
        except json.JSONDecodeError as e:
            print('Error[%d]: %s' % (lineno, str(e)))
            continue

        uri = toolz.get_in(['uris', 0], doc)
        if uri is None:
            print('Error[%d]: missing uri' % lineno)
            continue

        metadata = doc.get('metadata')
        if metadata is None:
            print('Error[%d]: missing metadata' % lineno)
            continue

        ds, err = doc2ds(metadata, uri)
        if ds is not None:
            yield ds
        else:
            print('Error[%d]: %s' % (lineno, err))
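A sketch of feeding a JSON-lines file through from_json_lines() above; the file path is a placeholder.
import datacube

dc = datacube.Datacube()
with open("/data/datasets.jsonl") as lines:  # placeholder JSON-lines file
    for ds in from_json_lines(lines, dc.index, skip_lineage=True):
        dc.index.datasets.add(ds)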
Example #29
def cli(input_directory, update_if_exists, allow_unsafe, stac, glob):

    dc = datacube.Datacube()
    doc2ds = Doc2Dataset(dc.index)

    if glob is None:
        glob = "**/*.json" if stac else "**/*.yaml"

    files_to_process = _find_files(input_directory, glob, stac=stac)

    added, failed = 0, 0

    for in_file in files_to_process:
        with in_file.open() as f:
            try:
                if in_file.endswith(".yml") or in_file.endswith(".yaml"):
                    metadata = yaml.safe_load(f, Loader=Loader)
                else:
                    metadata = json.load(f)
                # Do the STAC Transform if it's flagged
                if stac:
                    metadata = stac_transform(metadata)
                index_update_dataset(
                    metadata,
                    in_file.absolute().as_uri(),
                    dc=dc,
                    doc2ds=doc2ds,
                    update_if_exists=update_if_exists,
                    allow_unsafe=allow_unsafe,
                )
                added += 1
            except Exception as e:
                logging.exception(f"Failed to add dataset {in_file} with error {e}")
                failed += 1

    logging.info(f"Added {added} and failed {failed} datasets.")
Example #30
def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    p = dataset_add_configs
    index = index_empty
    r = clirunner(['dataset', 'add', p.datasets], expect_success=False)
    assert r.exit_code != 0
    assert 'Found no products' in r.output

    clirunner(['metadata', 'add', p.metadata])
    clirunner(['product', 'add', p.products])
    clirunner(['dataset', 'add', p.datasets])
    clirunner(['dataset', 'add', p.datasets_bad1])
    clirunner(['dataset', 'add', p.datasets_eo3])

    ds = load_dataset_definition(p.datasets)
    ds_bad1 = load_dataset_definition(p.datasets_bad1)

    # Check .hl.Doc2Dataset
    doc2ds = Doc2Dataset(index)
    _ds, _err = doc2ds(ds.doc, 'file:///something')
    assert _err is None
    assert str(_ds.id) == ds.id
    assert _ds.metadata_doc == ds.doc

    # Check dataset search

    r = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in r.output
    assert ds_bad1.id not in r.output
    assert ds.sources['ab'].id in r.output
    assert ds.sources['ac'].sources['cd'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ds.sources['ae'].id in r.output

    r = clirunner([
        'dataset', 'info', '-f', 'yaml', '--show-derived', ds.sources['ae'].id
    ])
    assert ds.id in r.output

    ds_ = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert ds_.id == ds.id

    x = index.datasets.get(ds.id, include_sources=True)
    assert str(x.sources['ab'].id) == ds.sources['ab'].id
    assert str(
        x.sources['ac'].sources['cd'].id) == ds.sources['ac'].sources['cd'].id

    check_skip_lineage_test(clirunner, index)
    check_no_product_match(clirunner, index)
    check_with_existing_lineage(clirunner, index)
    check_inconsistent_lineage(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, p.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    r = clirunner(['dataset', 'add', '--product', 'nosuchproduct', p.datasets],
                  expect_success=False)

    assert "ERROR Supplied product name" in r.output
    assert r.exit_code != 0

    # Check that deprecated option is accepted
    r = clirunner(['dataset', 'add', '--auto-match', p.datasets])
    assert 'WARNING --auto-match option is deprecated' in r.output

    # test dataset add eo3
    r = clirunner(['dataset', 'add', p.datasets_eo3])
    assert r.exit_code == 0

    ds_eo3 = load_dataset_definition(p.datasets_eo3)
    assert ds_eo3.location is not None

    _ds = index.datasets.get(ds_eo3.id, include_sources=True)
    assert sorted(_ds.sources) == ['a', 'bc1', 'bc2']
    assert _ds.crs == 'EPSG:3857'
    assert _ds.extent is not None
    assert _ds.extent.crs == _ds.crs
    assert _ds.uris == [ds_eo3.location]
    assert 'location' not in _ds.metadata_doc