Example #1
0
def test_zarr_checksum_file_updater_delete_checksum_file(
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    """Deleting the checksum file removes it from storage and reads back as None."""
    # Point the blob field at the storage fixture so uploads land there
    ZarrUploadFile.blob.field.storage = storage
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    parent_dir = str(Path(upload.path).parent)

    file_updater = ZarrChecksumFileUpdater(zarr_archive, parent_dir)
    file_updater.delete_checksum_file()

    # Reading after deletion yields nothing, and the object is gone from storage
    assert file_updater.read_checksum_file() is None
    assert not upload.blob.storage.exists(file_updater.checksum_file_path)
Example #2
0
def test_zarr_checksum_file_updater_read_checksum_file(
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    """A written checksum listing round-trips through read_checksum_file."""
    # Point the blob field at the storage fixture so uploads land there
    ZarrUploadFile.blob.field.storage = storage
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    parent_dir = str(Path(upload.path).parent)
    expected = ZarrChecksumListing(
        checksums=ZarrChecksums(files=[upload.to_checksum()]),
        md5='b',
    )

    file_updater = ZarrChecksumFileUpdater(zarr_archive, parent_dir)
    file_updater.write_checksum_file(expected)

    # What comes back must equal what was written
    assert file_updater.read_checksum_file() == expected
Example #3
0
 def get_checksum(self, path: str | Path = ''):
     """Return the md5 for *path*, preferring the stored checksum listing.

     Falls back to fetching the file's ETag from object storage when no
     checksum listing exists for the path.
     """
     listing = ZarrChecksumFileUpdater(self, path).read_checksum_file()
     if listing is None:
         # No precomputed listing; look the file itself up in storage.
         zarr_file = self.upload_file_class.objects.create_zarr_upload_file(
             zarr_archive=self, path=path)
         # This will throw a 404 if the file doesn't exist
         return zarr_file.actual_etag()
     return listing.md5
Example #4
0
def test_zarr_rest_upload_complete(
    authenticated_api_client,
    user,
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    """Completing an upload via the REST API writes correct checksum files."""
    assign_perm('owner', user, zarr_archive.dandiset)
    # Point the blob field at the storage fixture so uploads land there
    ZarrUploadFile.blob.field.storage = storage

    # An outstanding ZarrUploadFile marks the zarr as having an upload in progress
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    assert zarr_archive.upload_in_progress
    assert zarr_archive.checksum == EMPTY_CHECKSUM

    resp = authenticated_api_client.post(
        f'/api/zarr/{zarr_archive.zarr_id}/upload/complete/')
    assert resp.status_code == 201

    # Once complete, no upload should be pending
    assert not zarr_archive.upload_in_progress

    # zarr_upload_file_factory always generates paths in the form foo/bar.nwb
    parent_path = Path(upload.path).parent
    root_path = parent_path.parent

    serializer = ZarrJSONChecksumSerializer()

    # The parent directory's checksum file lists the uploaded file
    expected_parent_listing = serializer.generate_listing(files=[upload.to_checksum()])
    parent_listing = ZarrChecksumFileUpdater(zarr_archive, parent_path).read_checksum_file()
    assert parent_listing == expected_parent_listing

    # The root checksum file lists the parent directory with its md5
    expected_root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path=str(parent_path), md5=expected_parent_listing.md5)]
    )
    root_listing = ZarrChecksumFileUpdater(zarr_archive, root_path).read_checksum_file()
    assert root_listing == expected_root_listing
    assert zarr_archive.checksum == expected_root_listing.md5
def test_ingest_zarr_archive_empty(zarr_archive_factory):
    """Ingesting a zarr with no files writes no checksum file at the root."""
    zarr: ZarrArchive = zarr_archive_factory()

    # Run ingestion on the empty archive
    ingest_zarr_archive(str(zarr.zarr_id))

    # No root checksum file should exist for an empty archive
    assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() is None
    # An empty listing still has a well-defined md5
    empty_listing = ZarrChecksumListing(
        checksums=ZarrChecksums(directories=[], files=[]),
        md5='481a2f77ab786a0f45aafd5db0971caa',
    )
    assert ZarrJSONChecksumSerializer().generate_listing() == empty_listing
Example #6
0
def test_zarr_checksum_file_updater_checksum_file_path(
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    """checksum_file_path mirrors the zarr path under the checksum prefix."""
    # Point the blob field at the storage fixture so uploads land there
    ZarrUploadFile.blob.field.storage = storage
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    parent_dir = str(Path(upload.path).parent)

    # A non-root path is embedded between the prefix and the .checksum name
    actual = ZarrChecksumFileUpdater(zarr_archive, parent_dir).checksum_file_path
    assert actual == (
        f'test-prefix/test-zarr-checksums/{zarr_archive.zarr_id}/{parent_dir}/.checksum'
    )

    # The root path is represented as Path('.'), however it is spelled
    for root in (Path('.'), Path('foo').parent):
        assert (
            ZarrChecksumFileUpdater(zarr_archive, root).checksum_file_path
            == f'test-prefix/test-zarr-checksums/{zarr_archive.zarr_id}/.checksum'
        )
Example #7
0
def test_zarr_explore_directory(
    api_client,
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
    path,
    directories,
    files,
):
    """The directory-listing endpoint reports children and their checksums."""
    # Point the blob field at the storage fixture so uploads land there
    ZarrUploadFile.blob.field.storage = storage
    uploads = [
        zarr_upload_file_factory(zarr_archive=zarr_archive, path=file_path)
        for file_path in ('foo/a', 'foo/b', 'foo/bar/c')
    ]

    # Write the checksum files for all uploads at once
    ZarrChecksumUpdater(zarr_archive).update_file_checksums(
        [upload.to_checksum() for upload in uploads]
    )
    listing = ZarrChecksumFileUpdater(zarr_archive, path).read_checksum_file()

    resp = api_client.get(f'/api/zarr/{zarr_archive.zarr_id}.zarr/{path}')
    assert resp.status_code == 200

    # Checksums are keyed by entry basename, directories and files merged
    expected_checksums = {}
    for entry in listing.checksums.directories:
        expected_checksums[Path(entry.path).name] = entry.md5
    for entry in listing.checksums.files:
        expected_checksums[Path(entry.path).name] = entry.md5

    assert resp.json() == {
        'directories': [
            f'http://localhost:8000/api/zarr/{zarr_archive.zarr_id}.zarr/{dirpath}'
            for dirpath in directories
        ],
        'files': [
            f'http://localhost:8000/api/zarr/{zarr_archive.zarr_id}.zarr/{filepath}'
            for filepath in files
        ],
        'checksums': expected_checksums,
        'checksum': listing.md5,
    }
Example #8
0
def test_zarr_checksum_updater(storage, zarr_archive: ZarrArchive, zarr_upload_file_factory):
    """update_file_checksums and remove_checksums maintain the checksum tree."""
    # Point the blob field at the storage fixture so uploads land there
    ZarrUploadFile.blob.field.storage = storage
    a: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive, path='foo/a')
    b: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive, path='foo/b')
    c: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive, path='foo/bar/c')

    # Adding all three files should produce checksum files at every level
    ZarrChecksumUpdater(zarr_archive).update_file_checksums(
        [a.to_checksum(), b.to_checksum(), c.to_checksum()],
    )

    serializer = ZarrJSONChecksumSerializer()
    # foo/bar holds only c
    foo_bar_listing = serializer.generate_listing(files=[c.to_checksum()])
    # foo holds a, b, and the bar directory
    foo_listing = serializer.generate_listing(
        files=[a.to_checksum(), b.to_checksum()],
        directories=[ZarrChecksum(path='foo/bar', md5=foo_bar_listing.md5)],
    )
    # The root holds only foo
    root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=foo_listing.md5)],
    )

    # Three checksum files exist: foo/bar, foo, and the top level
    for prefix, expected in (
        ('foo/bar', foo_bar_listing),
        ('foo', foo_listing),
        ('', root_listing),
    ):
        assert ZarrChecksumFileUpdater(zarr_archive, prefix).read_checksum_file() == expected

    # Removing the deepest file, c, should prune foo/bar entirely
    ZarrChecksumUpdater(zarr_archive).remove_checksums(['foo/bar/c'])
    # foo no longer references bar
    foo_listing = serializer.generate_listing(files=[a.to_checksum(), b.to_checksum()])
    # The root must also change, since foo's md5 changed
    root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=foo_listing.md5)],
    )

    assert not c.blob.storage.exists(
        ZarrChecksumFileUpdater(zarr_archive, 'foo/bar').checksum_file_path
    )
    assert ZarrChecksumFileUpdater(zarr_archive, 'foo').read_checksum_file() == foo_listing
    assert ZarrChecksumFileUpdater(zarr_archive, '').read_checksum_file() == root_listing
def test_ingest_zarr_archive_existing(zarr_upload_file_factory,
                                      zarr_archive_factory):
    """Ingestion overwrites stale checksum files with freshly computed ones."""
    zarr: ZarrArchive = zarr_archive_factory()

    # Seed the archive with a few files
    a = zarr_upload_file_factory(zarr_archive=zarr, path='foo/a')
    b = zarr_upload_file_factory(zarr_archive=zarr, path='foo/b')
    c = zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/c')

    # Deliberately record bogus md5s so ingestion has something to fix
    ZarrChecksumUpdater(zarr).update_file_checksums([
        ZarrChecksum(path='foo/a', md5='a'),
        ZarrChecksum(path='foo/b', md5='b'),
        ZarrChecksum(path='foo/bar/c', md5='c'),
    ])

    # Build the listings ingestion should produce
    serializer = ZarrJSONChecksumSerializer()
    foo_bar_listing = serializer.generate_listing(files=[c.to_checksum()])
    foo_listing = serializer.generate_listing(
        files=[a.to_checksum(), b.to_checksum()],
        directories=[ZarrChecksum(path='foo/bar', md5=foo_bar_listing.md5)],
    )
    # The root contains an entry for foo
    root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=foo_listing.md5)],
    )

    # The (stale) checksum files exist before ingestion runs
    for prefix in ('foo/bar', 'foo', ''):
        assert ZarrChecksumFileUpdater(zarr, prefix).read_checksum_file() is not None

    # Recompute all checksums
    ingest_zarr_archive(str(zarr.zarr_id))

    # Every level now holds the correct listing
    for prefix, expected in (
        ('foo/bar', foo_bar_listing),
        ('foo', foo_listing),
        ('', root_listing),
    ):
        assert ZarrChecksumFileUpdater(zarr, prefix).read_checksum_file() == expected
def test_ingest_dandiset_zarrs(dandiset_factory, zarr_archive_factory,
                               zarr_upload_file_factory):
    """Ingesting a dandiset updates every one of its zarr archives."""
    dandiset: Dandiset = dandiset_factory()
    for _ in range(10):
        zarr_upload_file_factory(
            path='foo/a',
            zarr_archive=zarr_archive_factory(dandiset=dandiset),
        )

    # Ingest every zarr belonging to the dandiset
    ingest_dandiset_zarrs(str(dandiset.identifier))

    # Each archive should now have a size, a file count, and a root checksum
    dandiset.refresh_from_db()
    for zarr in dandiset.zarr_archives.all():
        assert zarr.size != 0
        assert zarr.file_count != 0
        assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() is not None
def test_ingest_zarr_archive_assets(zarr_upload_file_factory,
                                    zarr_archive_factory, asset_factory):
    """Ingestion propagates size and digest metadata onto the zarr's asset."""
    # Create a zarr with one file and an asset pointing at it
    zarr: ZarrArchive = zarr_archive_factory()
    zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/a')
    asset = asset_factory(zarr=zarr, blob=None, embargoed_blob=None)

    # Before ingestion the asset reflects an empty zarr
    assert asset.size == 0
    assert asset.metadata['contentSize'] == 0
    assert asset.metadata['digest']['dandi:dandi-zarr-checksum'] == EMPTY_CHECKSUM

    # Compute checksums
    ingest_zarr_archive(str(zarr.zarr_id))

    # Afterwards the asset carries the real size and the root listing's md5
    asset.refresh_from_db()
    assert asset.size == 100
    assert asset.metadata['contentSize'] == 100
    root_md5 = ZarrChecksumFileUpdater(asset.zarr, '').read_checksum_file().md5
    assert asset.metadata['digest']['dandi:dandi-zarr-checksum'] == root_md5
Example #12
0
def test_zarr_checksum_file_updater_write_checksum_file(
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    """write_checksum_file persists a listing and can overwrite it later."""
    # Point the blob field at the storage fixture so uploads land there
    ZarrUploadFile.blob.field.storage = storage
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    parent_dir = str(Path(upload.path).parent)
    listing = ZarrChecksumListing(
        checksums=ZarrChecksums(files=[upload.to_checksum()]),
        md5='b',
    )

    file_updater = ZarrChecksumFileUpdater(zarr_archive, parent_dir)
    file_updater.write_checksum_file(listing)

    blob_storage = upload.blob.storage

    def stored_listing():
        # Deserialize whatever is currently stored at the checksum path
        with blob_storage.open(file_updater.checksum_file_path) as f:
            return ZarrJSONChecksumSerializer().deserialize(f.read())

    assert blob_storage.exists(file_updater.checksum_file_path)
    assert stored_listing() == listing

    # Overwriting with an extended listing should also round-trip
    second_upload = zarr_upload_file_factory(zarr_archive=zarr_archive, path=f'{upload.path}2')
    listing.checksums.files.append(second_upload.to_checksum())

    file_updater.write_checksum_file(listing)

    assert blob_storage.exists(file_updater.checksum_file_path)
    assert stored_listing() == listing
def test_ingest_zarr_archive(zarr_upload_file_factory, zarr_archive_factory,
                             faker):
    """Ingestion computes checksums, size, and file count for a large zarr."""
    zarr: ZarrArchive = zarr_archive_factory()

    # Use > 1000 files so ingestion must paginate (S3 pages are 1000 items)
    foo_bar_files: List[ZarrUploadFile] = [
        zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/a'),
        zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/b'),
    ]
    foo_baz_files: List[ZarrUploadFile] = [
        zarr_upload_file_factory(zarr_archive=zarr, path=f'foo/baz/{faker.pystr()}')
        for _ in range(1005)
    ]
    all_files = foo_bar_files + foo_baz_files

    # Expected size and file count across both directories
    total_size = sum(f.blob.size for f in all_files)
    total_file_count = len(all_files)

    # Expected listings at each level of the tree
    serializer = ZarrJSONChecksumSerializer()
    foo_bar_listing = serializer.generate_listing(
        files=[f.to_checksum() for f in foo_bar_files])
    foo_baz_listing = serializer.generate_listing(
        files=[f.to_checksum() for f in foo_baz_files])
    foo_listing = serializer.generate_listing(directories=[
        ZarrChecksum(path='foo/bar', md5=foo_bar_listing.md5),
        ZarrChecksum(path='foo/baz', md5=foo_baz_listing.md5),
    ])
    root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=foo_listing.md5)])

    # No checksum files exist before ingestion
    for prefix in ('foo/bar', 'foo/baz', 'foo', ''):
        assert ZarrChecksumFileUpdater(zarr, prefix).read_checksum_file() is None

    # Nothing has been tallied yet
    assert zarr.size == 0
    assert zarr.file_count == 0

    # Run ingestion
    ingest_zarr_archive(str(zarr.zarr_id))

    # Every level now holds the expected listing
    for prefix, expected in (
        ('foo/bar', foo_bar_listing),
        ('foo/baz', foo_baz_listing),
        ('foo', foo_listing),
        ('', root_listing),
    ):
        assert ZarrChecksumFileUpdater(zarr, prefix).read_checksum_file() == expected

    # Size and file count were recorded on the archive
    zarr.refresh_from_db()
    assert zarr.size == total_size
    assert zarr.file_count == total_file_count
Example #14
0
def test_zarr_checksum_file_updater_context_manager(
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    """The context manager batches edits in memory and flushes them on exit."""
    # Point the blob field at the storage fixture so uploads land there
    ZarrUploadFile.blob.field.storage = storage
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    parent_dir = str(Path(upload.path).parent)
    serializer = ZarrJSONChecksumSerializer()
    checksums = [upload.to_checksum()]
    listing = serializer.generate_listing(files=checksums)

    # Seed the checksum file before entering the context
    ZarrChecksumFileUpdater(zarr_archive, parent_dir).write_checksum_file(listing)

    with ZarrChecksumFileUpdater(zarr_archive, parent_dir) as updater:
        # Entering the context loads the existing checksum file
        assert updater.checksum_listing == listing

        # Stage two new checksums (with duplicates, which must be dropped)
        a = ZarrChecksum(path='foo/bar', md5='a')
        b = ZarrChecksum(path='foo/baz', md5='b')
        updater.add_file_checksums(sorted([upload.to_checksum(), a, a, b, b]))
        # In-memory state reflects the additions immediately
        assert updater.checksum_listing == serializer.generate_listing(
            files=sorted(checksums + [a, b])
        )

        # Removal is keyed by path only; the md5 need not match
        updater.remove_checksums([a.path])
        assert updater.checksum_listing == serializer.generate_listing(
            files=sorted(checksums + [b])
        )

        # Nothing has been written back to storage yet
        assert updater.read_checksum_file() == serializer.generate_listing(files=sorted(checksums))

    # Leaving the context flushes the staged changes to storage
    assert updater.read_checksum_file() == serializer.generate_listing(
        files=sorted(checksums + [b])
    )