def _find_uri_mismatches(index_url: str, uri: str, validate_data=True) -> Iterable[Mismatch]:
    """
    Compare the index and filesystem contents for the given uri,
    yielding a Mismatch for each difference.
    """
    # pylint: disable=protected-access
    index = Index(PostgresDb(PostgresDb._create_engine(index_url)))

    def ids(datasets):
        return [d.id for d in datasets]

    path = uri_to_local_path(uri)
    log = _LOG.bind(path=path)
    log.debug("index.get_dataset_ids_for_uri")
    indexed_datasets = set(get_datasets_for_uri(index, uri))

    datasets_in_file = set()  # type: Set[DatasetLite]
    if path.exists():
        try:
            datasets_in_file = set(map(DatasetLite, paths.get_path_dataset_ids(path)))
        except InvalidDocException as e:
            # Should we do something with indexed_datasets here? If there's none, we're more willing to trash.
            log.info("invalid_path", error_args=e.args)
            yield UnreadableDataset(None, uri)
            return

    log.info("dataset_ids",
             indexed_dataset_ids=ids(indexed_datasets),
             file_ids=ids(datasets_in_file))

    if validate_data:
        validation_success = validate.validate_dataset(path, log=log)
        if not validation_success:
            yield InvalidDataset(None, uri)
            return

    for indexed_dataset in indexed_datasets:
        # Does the dataset exist in the file?
        if indexed_dataset in datasets_in_file:
            # Indexed and on disk, but marked archived in the index.
            if indexed_dataset.is_archived:
                yield ArchivedDatasetOnDisk(indexed_dataset, uri)
        else:
            # The index records this uri for the dataset, but the file doesn't contain it.
            yield LocationMissingOnDisk(indexed_dataset, uri)

    # For all file ids not in the index.
    file_ds_not_in_index = datasets_in_file.difference(indexed_datasets)

    if not file_ds_not_in_index:
        log.info("no mismatch found (dataset already indexed)")

    for dataset in file_ds_not_in_index:
        # If it's already indexed, we just need to add the location.
        indexed_dataset = index.datasets.get(dataset.id)
        if indexed_dataset:
            log.info("location_not_indexed", indexed_dataset=indexed_dataset)
            yield LocationNotIndexed(DatasetLite.from_agdc(indexed_dataset), uri)
        else:
            log.info("dataset_not_indexed", dataset=dataset, uri=uri)
            yield DatasetNotIndexed(dataset, uri)
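

# A minimal usage sketch (hypothetical; not part of the original module): report
# mismatches for a batch of uris. `index_url` is assumed to be a reachable
# datacube PostgreSQL connection url, and each uri a local file uri.
def _example_report_mismatches(index_url: str, uris: Iterable[str]) -> None:
    for uri in uris:
        for mismatch in _find_uri_mismatches(index_url, uri):
            print("{}: {}".format(type(mismatch).__name__, uri))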
def get_unknown_dataset_ids(index, uri):
    """Get ids of datasets in the file that have never been indexed"""
    on_disk_dataset_ids = set(
        paths.get_path_dataset_ids(uri_to_local_path(uri)))
    unknown_ids = set()
    for dataset_id in on_disk_dataset_ids:
        if not index.datasets.has(dataset_id):
            unknown_ids.add(dataset_id)
    return unknown_ids
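

# Hypothetical sketch of how get_unknown_dataset_ids might feed a trash/keep
# decision: a file whose datasets are all unknown to the index is a candidate
# for removal, while a partially-indexed file needs a closer look. The function
# name and labels are illustrative, not part of the original module.
def _example_classify_file(index, uri: str) -> str:
    all_ids = set(paths.get_path_dataset_ids(uri_to_local_path(uri)))
    unknown_ids = get_unknown_dataset_ids(index, uri)
    if not unknown_ids:
        return "fully-indexed"
    if unknown_ids == all_ids:
        return "never-indexed"
    return "partially-indexed"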
def _should_restore(index, trashed_nc):
    """
    Return the original path if the trashed file should be restored, else None.
    """
    dataset_ids = paths.get_path_dataset_ids(trashed_nc)
    original_path = paths.get_original_path(trashed_nc)

    for dataset_id in dataset_ids:
        dataset = index.datasets.get(dataset_id)

        # The file may contain datasets the index has never seen.
        if dataset is None:
            _LOG.debug("dataset.skip.unindexed", dataset_id=dataset_id)
            continue
        if dataset.is_archived:
            _LOG.debug("dataset.skip.archived", dataset_id=dataset.id)
            continue
        if original_path.as_uri() not in dataset.uris:
            _LOG.debug("dataset.skip.unknown_location", dataset_id=dataset.id)
            continue
        # There's something else in the location?
        if original_path.exists():
            _LOG.debug("dataset.skip.original_exists", dataset_id=dataset.id)
            continue

        # We've found an indexed, active dataset in the file, so restore.
        return original_path

    return None
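

# Hypothetical restore loop built on _should_restore: walk a trash directory
# (the `trash_root` pathlib.Path argument is an assumption, as is the *.nc
# layout of the trash) and move any file whose datasets are still active in
# the index back to its original location.
def _example_restore_from_trash(index, trash_root: Path) -> None:
    for trashed_nc in sorted(trash_root.rglob("*.nc")):
        original_path = _should_restore(index, trashed_nc)
        if original_path is None:
            continue
        original_path.parent.mkdir(parents=True, exist_ok=True)
        trashed_nc.rename(original_path)
        _LOG.info("restored", path=original_path)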