Code example #1
def test_new_and_old_on_disk(test_dataset: DatasetForTests,
                             integration_test_data: Path,
                             other_dataset: DatasetForTests):

    old_indexed = DatasetLite(uuid.UUID('5294efa6-348d-11e7-a079-185e0f80a5c0'))

    # An indexed file not on disk, and disk file not in index.

    missing_dataset = other_dataset

    missing_dataset.add_to_index()

    # Make it missing
    shutil.rmtree(str(missing_dataset.copyable_path))

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            missing_dataset.uri,
            test_dataset.uri
        ],
        expected_mismatches=[
            mm.LocationMissingOnDisk(old_indexed, missing_dataset.uri),
            mm.DatasetNotIndexed(test_dataset.dataset, test_dataset.uri)
        ],
        expected_index_result={
            test_dataset.dataset: (test_dataset.uri,),
            old_indexed: (),
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )
Code example #2
File: scan.py  Project: NMoghaddam/digitalearthau
def _find_uri_mismatches(index_url: str, uri: str, validate_data=True) -> Iterable[Mismatch]:
    """
    Compare the index and filesystem contents for the given URI,
    yielding a Mismatch for each difference found.
    """

    # pylint: disable=protected-access
    index = Index(PostgresDb(PostgresDb._create_engine(index_url)))

    def ids(datasets):
        return [d.id for d in datasets]

    path = uri_to_local_path(uri)
    log = _LOG.bind(path=path)
    log.debug("index.get_dataset_ids_for_uri")
    indexed_datasets = set(get_datasets_for_uri(index, uri))

    datasets_in_file = set()  # type: Set[DatasetLite]
    if path.exists():
        try:
            datasets_in_file = set(map(DatasetLite, paths.get_path_dataset_ids(path)))
        except InvalidDocException as e:
            # Should we do something with indexed_datasets here? If there's none, we're more willing to trash.
            log.info("invalid_path", error_args=e.args)
            yield UnreadableDataset(None, uri)
            return

        log.info("dataset_ids",
                 indexed_dataset_ids=ids(indexed_datasets),
                 file_ids=ids(datasets_in_file))

        if validate_data:
            validation_success = validate.validate_dataset(path, log=log)
            if not validation_success:
                yield InvalidDataset(None, uri)
                return

    for indexed_dataset in indexed_datasets:
        # Does the dataset exist in the file?
        if indexed_dataset in datasets_in_file:
            if indexed_dataset.is_archived:
                yield ArchivedDatasetOnDisk(indexed_dataset, uri)
        else:
            yield LocationMissingOnDisk(indexed_dataset, uri)

    # For all file ids not in the index.
    file_ds_not_in_index = datasets_in_file.difference(indexed_datasets)

    if not file_ds_not_in_index:
        log.info("no mismatch found (dataset already indexed)")

    for dataset in file_ds_not_in_index:
        # If it's already indexed, we just need to add the location.
        indexed_dataset = index.datasets.get(dataset.id)
        if indexed_dataset:
            log.info("location_not_indexed", indexed_dataset=indexed_dataset)
            yield LocationNotIndexed(DatasetLite.from_agdc(indexed_dataset), uri)
        else:
            log.info("dataset_not_index", dataset=dataset, uri=uri)
            yield DatasetNotIndexed(dataset, uri)
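Below is a minimal usage sketch (not from the project) for the generator above: it drains the generator for one URI and prints each mismatch in the serialised row format shown in code example #7. The import path, index URL, and file URI are assumptions/placeholders.

# Usage sketch only. Assumptions: the import path mirrors scan.py's location in the
# project, and the index URL / file URI below are placeholders, not real data.
from digitalearthau.sync.scan import _find_uri_mismatches  # module path assumed


def print_mismatches_for_uri(index_url: str, uri: str) -> None:
    """Print one serialised row per mismatch found for a single URI."""
    for mismatch in _find_uri_mismatches(index_url, uri, validate_data=False):
        # Mismatch.to_dict() returns {'name', 'dataset_id', 'uri'} (see code example #7).
        print(mismatch.to_dict())


print_mismatches_for_uri('postgresql:///datacube', 'file:///tmp/example/ga-metadata.yaml')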
Code example #3
def test_archived_on_disk(test_dataset: DatasetForTests,
                          integration_test_data: Path):
    """
    An already-archived dataset on disk. Should report it, but not touch the file (trash_archived is false).
    """
    test_dataset.add_to_index()
    test_dataset.archive_in_index()
    archived_time = test_dataset.get_index_record().archived_time

    assert uri_to_local_path(test_dataset.uri).exists(), "On-disk location should exist before test begins."
    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            test_dataset.uri
        ],
        expected_mismatches=[
            mm.ArchivedDatasetOnDisk(DatasetLite(test_dataset.dataset.id, archived_time), test_dataset.uri),
        ],
        expected_index_result={
            # Not active in index, as it's archived.
            # on_disk: (on_disk_uri,),
            # But the parent dataset still is:
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )
    assert uri_to_local_path(test_dataset.uri).exists(), "On-disk location shouldn't be touched"
Code example #4
    def from_dict(row: dict):
        """Deserialise a Mismatch (of the subclass named in the row) from a dict produced by to_dict()."""
        mismatch_class = getattr(sys.modules[__name__],
                                 strutils.under2camel(row['name']))
        dataset_id = row['dataset_id'].strip()

        dataset = None
        if dataset_id and dataset_id != 'None':
            dataset = DatasetLite(UUID(dataset_id))

        return mismatch_class(dataset, row['uri'].strip())
Code example #5
def freeze_index(index: Index) -> Mapping[DatasetLite, Iterable[str]]:
    """
    A snapshot of the index contents, as a {dataset: (uri, ...)} mapping, for checking test results.
    """
    return dict(
        (
            DatasetLite(dataset.id, archived_time=dataset.archived_time),
            tuple(dataset.uris)
        )
        for dataset in index.datasets.search()
    )
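A short sketch (placeholder data only) of how freeze_index might back a test assertion: snapshot the index and compare it with an expected {DatasetLite: (uri, ...)} mapping. The UUIDs are reused from elsewhere on this page purely as examples, and `index` is assumed to be an already-connected Index.

import uuid

# Illustrative only: the dataset ids and URI below are placeholders, and `index`
# is assumed to be an open datacube Index, as passed to freeze_index() above.
expected = {
    # A dataset with one active location...
    DatasetLite(uuid.UUID('10c4a9fe-2890-11e6-8ec8-a0000100fe80')): (
        'file:///tmp/example/ga-metadata.yaml',
    ),
    # ...and one (e.g. a provenance parent) with no locations.
    DatasetLite(uuid.UUID('5294efa6-348d-11e7-a079-185e0f80a5c0')): (),
}
assert freeze_index(index) == expected, "Index contents differ from what the test expected"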
Code example #6
def test_is_trashed(test_dataset: DatasetForTests, integration_test_data: Path,
                    archived_dt, expect_to_be_trashed):
    root = integration_test_data

    # Same scenario, but with trash_archived=True, so the archived file should be moved to the trash path.
    register_base_directory(root)
    test_dataset.add_to_index()
    test_dataset.archive_in_index(archived_dt=archived_dt)

    archived_on_disk = DatasetLite(test_dataset.dataset.id,
                                   archived_time=archived_dt)

    trashed_path = test_dataset.base_path.joinpath(*_TRASH_PREFIX,
                                                   *test_dataset.path_offset)

    # Before the test, file is in place and nothing trashed.
    assert test_dataset.path.exists(
    ), "On-disk location should exist before test begins."
    assert not trashed_path.exists(), "Trashed file shouldn't exist."
    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[test_dataset.uri],
        expected_mismatches=[
            mm.ArchivedDatasetOnDisk(archived_on_disk, test_dataset.uri),
        ],
        expected_index_result={
            # Archived: shouldn't be active in index.
            # on_disk: (on_disk_uri,),
            # Prov parent should still exist as it wasn't archived.
            test_dataset.parent: (),
        },
        cache_path=root,
        fix_settings=dict(index_missing=True,
                          update_locations=True,
                          trash_archived=True))

    # Show output structure for debugging
    # print("Output structure")
    # for p in paths.list_file_paths(root):
    #    print(f"\t{p}")

    if expect_to_be_trashed:
        assert trashed_path.exists(), "File isn't in trash."
        assert not test_dataset.path.exists(
        ), "On-disk location still exists (should have been moved to trash)."
    else:
        assert not trashed_path.exists(), "File shouldn't have been trashed."
        assert test_dataset.path.exists(
        ), "On-disk location should still be in place."
Code example #7
def test_load_dump_mismatch():
    mismatch = DatasetNotIndexed(
        DatasetLite(UUID("c98c3f2e-add7-4b34-9c9f-2cb8c7f806d2")),
        uri='file:///g/data/fk4/datacube/002/LS5_TM_FC/-17_-31/LS5_TM_FC_3577_-17_-31_19920722013931500000.nc'
    )
    row = mismatch.to_dict()
    assert row == {
        'name': 'dataset_not_indexed',
        'dataset_id': 'c98c3f2e-add7-4b34-9c9f-2cb8c7f806d2',
        'uri': 'file:///g/data/fk4/datacube/002/LS5_TM_FC/-17_-31/LS5_TM_FC_3577_-17_-31_19920722013931500000.nc',
    }

    deserialised_mismatch = Mismatch.from_dict(row)
    assert deserialised_mismatch == mismatch
    assert deserialised_mismatch.__dict__ == mismatch.__dict__
Code example #8
def test_load_from_file():
    root = write_files({
        'outputs.jsonl':
        """
{"name":"archived_dataset_on_disk","dataset_id":"582e9a74-d343-42d2-9105-a248b4b04f4a",\
"uri":"file:///g/data/fk4/datacube/002/LS5_TM_FC/-10_-39/LS5_TM_FC_3577_-10_-39_19990918011811500000.nc"}
{"name":"unreadable_dataset", "dataset_id":"None","uri":\
"file:///g/data/fk4/datacube/002/LS5_TM_FC/0_-30/LS5_TM_FC_3577_0_-30_20080331005819500000.nc"}
        """
    })

    mismatches = list(mismatches_from_file(root.joinpath('outputs.jsonl')))

    assert mismatches == [
        ArchivedDatasetOnDisk(
            DatasetLite(UUID('582e9a74-d343-42d2-9105-a248b4b04f4a')),
            'file:///g/data/fk4/datacube/002/LS5_TM_FC/-10_-39/LS5_TM_FC_3577_-10_-39_19990918011811500000.nc'
        ),
        UnreadableDataset(
            None,
            'file:///g/data/fk4/datacube/002/LS5_TM_FC/0_-30/LS5_TM_FC_3577_0_-30_20080331005819500000.nc'
        )
    ]
Code example #9
File: conftest.py  Project: jeremyh/digitalearthau
    def parent(self) -> Optional[DatasetLite]:
        """Source datasets that will be indexed if on_disk1 is indexed"""
        return DatasetLite(self.parent_id) if self.parent_id else None
Code example #10
File: conftest.py  Project: jeremyh/digitalearthau
    def dataset(self):
        return DatasetLite(self.id_)
Code example #11
File: conftest.py  Project: jeremyh/digitalearthau
@pytest.fixture  # assumed: decorator dropped in this excerpt, as with integration_test_data below
def work_path(tmpdir):
    paths.NCI_WORK_ROOT = Path(tmpdir) / 'work'
    paths.NCI_WORK_ROOT.mkdir()
    # The default use of timestamp will collide when run quickly, as in unit tests.
    paths._JOB_WORK_OFFSET = '{output_product}-{task_type}-{request_uuid}'
    return paths.NCI_WORK_ROOT


@pytest.fixture
def integration_test_data(tmpdir):
    temp_data_dir = Path(tmpdir) / 'integration_data'
    shutil.copytree(INTEGRATION_TEST_DATA, temp_data_dir)
    return temp_data_dir


ON_DISK2_ID = DatasetLite(uuid.UUID('10c4a9fe-2890-11e6-8ec8-a0000100fe80'))

ON_DISK2_OFFSET = ('LS8_OLITIRS_OTH_P51_GALPGS01-032_114_080_20150924',
                   'ga-metadata.yaml')


class DatasetForTests(NamedTuple):
    """
    A test dataset, including the file location and collection it should belong to.

    When your test starts the dataset will be on disk but not yet indexed. Call add_to_index() and others as needed.

    All properties are recorded here separately so tests can verify them independently.
    """
    # The test collection this should belong to
    collection: Collection