Example #1
def _find_uri_mismatches(index_url: str, uri: str, validate_data=True) -> Iterable[Mismatch]:
    """
    Compare the index and filesystem contents for the given uri,
    yielding Mismatches of any differences.
    """

    # pylint: disable=protected-access
    index = Index(PostgresDb(PostgresDb._create_engine(index_url)))

    def ids(datasets):
        return [d.id for d in datasets]

    path = uri_to_local_path(uri)
    log = _LOG.bind(path=path)
    log.debug("index.get_dataset_ids_for_uri")
    indexed_datasets = set(get_datasets_for_uri(index, uri))

    datasets_in_file = set()  # type: Set[DatasetLite]
    if path.exists():
        try:
            datasets_in_file = set(map(DatasetLite, paths.get_path_dataset_ids(path)))
        except InvalidDocException as e:
            # Should we do something with indexed_datasets here? If there's none, we're more willing to trash.
            log.info("invalid_path", error_args=e.args)
            yield UnreadableDataset(None, uri)
            return

        log.info("dataset_ids",
                 indexed_dataset_ids=ids(indexed_datasets),
                 file_ids=ids(datasets_in_file))

        if validate_data:
            validation_success = validate.validate_dataset(path, log=log)
            if not validation_success:
                yield InvalidDataset(None, uri)
                return

    for indexed_dataset in indexed_datasets:
        # Does the dataset exist in the file?
        if indexed_dataset in datasets_in_file:
            if indexed_dataset.is_archived:
                yield ArchivedDatasetOnDisk(indexed_dataset, uri)
        else:
            yield LocationMissingOnDisk(indexed_dataset, uri)

    # For all file ids not in the index.
    file_ds_not_in_index = datasets_in_file.difference(indexed_datasets)

    if not file_ds_not_in_index:
        log.info("no mismatch found (dataset already indexed)")

    for dataset in file_ds_not_in_index:
        # If it's already indexed, we just need to add the location.
        indexed_dataset = index.datasets.get(dataset.id)
        if indexed_dataset:
            log.info("location_not_indexed", indexed_dataset=indexed_dataset)
            yield LocationNotIndexed(DatasetLite.from_agdc(indexed_dataset), uri)
        else:
            log.info("dataset_not_index", dataset=dataset, uri=uri)
            yield DatasetNotIndexed(dataset, uri)
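
A minimal usage sketch (not part of the original listing): the generator above can be driven directly for a single URI and each reported mismatch printed. The connection string and dataset location below are illustrative placeholders, not values from the snippet.

# Hypothetical driver: report every mismatch found for one URI.
# index_url and uri are assumed placeholder values.
index_url = "postgresql:///datacube"
uri = "file:///data/example-dataset/ga-metadata.yaml"

for mismatch in _find_uri_mismatches(index_url, uri, validate_data=False):
    # Each yielded object is a Mismatch subclass (e.g. LocationMissingOnDisk,
    # DatasetNotIndexed); its class name says what kind of difference was found.
    print(type(mismatch).__name__, mismatch.uri)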
Example #2
def _trash_missing_dataset(mismatch: DatasetNotIndexed, index: Index):
    # If any (other) indexed datasets exist at the same location, we can't trash it.
    datasets_at_location = list(get_datasets_for_uri(index, mismatch.uri))
    if datasets_at_location:
        _LOG.warning("do_trash_missing.indexed_siblings_exist",
                     uri=mismatch.uri)
        return

    trash_uri(mismatch.uri)
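
A hedged sketch of wiring the two snippets together, reusing the index_url and uri placeholders from the previous sketch and constructing the Index the same way Example #1 does: datasets the scan reports as never indexed are handed to _trash_missing_dataset.

# Illustrative only: trash file datasets that the scan says were never indexed.
# Index construction mirrors Example #1; connection details are assumed.
# pylint: disable=protected-access
index = Index(PostgresDb(PostgresDb._create_engine(index_url)))

for mismatch in _find_uri_mismatches(index_url, uri):
    if isinstance(mismatch, DatasetNotIndexed):
        _trash_missing_dataset(mismatch, index)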