def _find_uri_mismatches(index_url: str, uri: str, validate_data=True) -> Iterable[Mismatch]:
    """
    Compare the index and filesystem contents for the given uri,
    yielding Mismatches of any differences.
    """
    # pylint: disable=protected-access
    index = Index(PostgresDb(PostgresDb._create_engine(index_url)))

    def ids(datasets):
        return [d.id for d in datasets]

    path = uri_to_local_path(uri)
    log = _LOG.bind(path=path)
    log.debug("index.get_dataset_ids_for_uri")
    indexed_datasets = set(get_datasets_for_uri(index, uri))

    datasets_in_file = set()  # type: Set[DatasetLite]
    if path.exists():
        try:
            datasets_in_file = set(map(DatasetLite, paths.get_path_dataset_ids(path)))
        except InvalidDocException as e:
            # Should we do something with indexed_datasets here? If there's none, we're more willing to trash.
            log.info("invalid_path", error_args=e.args)
            yield UnreadableDataset(None, uri)
            return

    log.info("dataset_ids",
             indexed_dataset_ids=ids(indexed_datasets),
             file_ids=ids(datasets_in_file))

    if validate_data:
        validation_success = validate.validate_dataset(path, log=log)
        if not validation_success:
            yield InvalidDataset(None, uri)
            return

    for indexed_dataset in indexed_datasets:
        # Does the dataset exist in the file?
        if indexed_dataset in datasets_in_file:
            if indexed_dataset.is_archived:
                yield ArchivedDatasetOnDisk(indexed_dataset, uri)
        else:
            yield LocationMissingOnDisk(indexed_dataset, uri)

    # For all file ids not in the index.
    file_ds_not_in_index = datasets_in_file.difference(indexed_datasets)

    if not file_ds_not_in_index:
        log.info("no mismatch found (dataset already indexed)")

    for dataset in file_ds_not_in_index:
        # If it's already indexed, we just need to add the location.
        indexed_dataset = index.datasets.get(dataset.id)
        if indexed_dataset:
            log.info("location_not_indexed", indexed_dataset=indexed_dataset)
            yield LocationNotIndexed(DatasetLite.from_agdc(indexed_dataset), uri)
        else:
            log.info("dataset_not_indexed", dataset=dataset, uri=uri)
            yield DatasetNotIndexed(dataset, uri)
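# Illustrative only: a minimal sketch of driving the generator above from a
# script. The index URL and dataset URI below are made-up values, and the
# handling is just a print; a real caller would dispatch on each Mismatch
# subclass (ArchivedDatasetOnDisk, LocationMissingOnDisk, etc.).
def _report_mismatches_example():
    index_url = 'postgresql:///datacube'              # hypothetical index URL
    uri = 'file:///data/ls8/scene1/ga-metadata.yaml'  # hypothetical dataset URI
    for mismatch in _find_uri_mismatches(index_url, uri, validate_data=False):
        # Name each difference as it is yielded.
        print(type(mismatch).__name__, mismatch)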
def test_index_duplicate_dataset(index: Index,
                                 initialised_postgres_db: PostgresDb,
                                 local_config,
                                 default_metadata_type) -> None:
    dataset_type = index.products.add_document(_pseudo_telemetry_dataset_type)
    assert not index.datasets.has(_telemetry_uuid)

    with initialised_postgres_db.begin() as transaction:
        was_inserted = transaction.insert_dataset(
            _telemetry_dataset,
            _telemetry_uuid,
            dataset_type.id
        )
    assert was_inserted
    assert index.datasets.has(_telemetry_uuid)

    # Insert again.
    with initialised_postgres_db.connect() as connection:
        was_inserted = connection.insert_dataset(
            _telemetry_dataset,
            _telemetry_uuid,
            dataset_type.id
        )
    assert was_inserted is False
    assert index.datasets.has(_telemetry_uuid)
def pseudo_ls8_dataset4(index: Index,
                        initialised_postgres_db: PostgresDb,
                        pseudo_ls8_type: DatasetType,
                        pseudo_ls8_dataset2: Dataset) -> Dataset:
    # Same as 2, but a different path/row
    id_ = str(uuid.uuid4())
    dataset_doc = copy.deepcopy(pseudo_ls8_dataset2.metadata_doc)
    dataset_doc['id'] = id_
    dataset_doc['image'] = {
        'satellite_ref_point_start': {
            'x': 116,
            'y': 85
        },
        'satellite_ref_point_end': {
            'x': 116,
            'y': 87
        },
    }

    with initialised_postgres_db.connect() as connection:
        was_inserted = connection.insert_dataset(dataset_doc, id_, pseudo_ls8_type.id)
    assert was_inserted

    d = index.datasets.get(id_)
    # The dataset should have been matched to the pseudo_ls8 type.
    assert d.type.id == pseudo_ls8_type.id
    return d
def from_config(cls, config, application_name=None, validate_connection=True):
    db = PostgresDb.from_config(config,
                                application_name=application_name,
                                validate_connection=validate_connection)
    return cls(db)
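# A minimal usage sketch, assuming the classmethod above belongs to the Index
# wrapper class (as `return cls(db)` suggests) and that datacube's
# LocalConfig.find() is used to load the standard config files; the
# application name is made up.
from datacube.config import LocalConfig

def _open_index_example():
    config = LocalConfig.find()  # reads the usual datacube config file(s)
    return Index.from_config(config, application_name='example-app')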
def telemetry_dataset(index: Index,
                      initialised_postgres_db: PostgresDb,
                      default_metadata_type) -> Dataset:
    dataset_type = index.products.add_document(_pseudo_telemetry_dataset_type)
    assert not index.datasets.has(_telemetry_uuid)

    with initialised_postgres_db.begin() as transaction:
        was_inserted = transaction.insert_dataset(_telemetry_dataset,
                                                  _telemetry_uuid,
                                                  dataset_type.id)
    assert was_inserted

    return index.datasets.get(_telemetry_uuid)
def db_fixture_instance(request):
    local_config: LocalConfig = request.getfixturevalue(config_fixture_name)
    db = PostgresDb.from_config(local_config,
                                application_name='dea-test-run',
                                validate_connection=False)

    # Drop and recreate tables so our tests have a clean db.
    with db.connect() as connection:
        _core.drop_db(connection._connection)
    remove_dynamic_indexes()

    # Disable informational messages since we're doing this on every test run.
    with _increase_logging(_core._LOG) as _:
        _core.ensure_db(db._engine)

    # We don't need informational create/drop messages for every config change.
    _dynamic._LOG.setLevel(logging.WARN)

    yield db

    db.close()
def test_transactions(index: Index,
                      initialised_postgres_db: PostgresDb,
                      local_config,
                      default_metadata_type) -> None:
    assert not index.datasets.has(_telemetry_uuid)

    dataset_type = index.products.add_document(_pseudo_telemetry_dataset_type)

    with initialised_postgres_db.begin() as transaction:
        was_inserted = transaction.insert_dataset(_telemetry_dataset,
                                                  _telemetry_uuid,
                                                  dataset_type.id)
        assert was_inserted
        assert transaction.contains_dataset(_telemetry_uuid)
        # Normal DB uses a separate connection: no dataset visible yet.
        assert not index.datasets.has(_telemetry_uuid)

        transaction.rollback()

    # Should have been rolled back.
    assert not index.datasets.has(_telemetry_uuid)
def uninitialised_postgres_db(local_config, request):
    """
    Return a connection to an empty PostgreSQL database
    """
    timezone = request.param
    db = PostgresDb.from_config(local_config,
                                application_name='test-run',
                                validate_connection=False)

    # Drop tables so our tests have a clean db.
    # with db.begin() as c:  # Creates a new PostgresDbAPI, by passing a new connection to it
    _core.drop_db(db._engine)
    db._engine.execute('alter database %s set timezone = %r'
                       % (local_config['db_database'], timezone))

    # We need to run this as well, I think because SQLAlchemy grabs them into its MetaData,
    # and attempts to recreate them. WTF TODO FIX
    remove_dynamic_indexes()

    yield db

    # with db.begin() as c:  # Drop SCHEMA
    _core.drop_db(db._engine)
    db.close()
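import pytest

# A sketch (an assumption, not this project's actual conftest) of the
# parametrization the fixture above depends on: request.param must supply a
# timezone name, which pytest provides when a fixture is declared with
# params=[...]. The timezone values here are illustrative.
@pytest.fixture(params=['UTC', 'America/Los_Angeles'])
def timezone_param_example(request):
    # Dependent tests run once per value in `params`.
    return request.param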
def get_dataset_fields(cls, doc):
    return PostgresDb.get_dataset_fields(doc)