def test_zarr_checksum_updater(storage, zarr_archive: ZarrArchive, zarr_upload_file_factory):
    """Check the checksum files ZarrChecksumUpdater leaves behind after an update and a removal.

    Upload files are created at foo/a, foo/b and foo/bar/c. After
    update_file_checksums, the stored listings for foo/bar, foo and the root
    must equal listings built by hand with ZarrJSONChecksumSerializer. After
    removing foo/bar/c, the foo/bar checksum file must no longer exist in
    storage, and the foo and root listings must reflect the removal.
    """
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage

    file_a, file_b, file_c = (
        zarr_upload_file_factory(zarr_archive=zarr_archive, path=path)
        for path in ('foo/a', 'foo/b', 'foo/bar/c')
    )

    def stored_listing(directory):
        # Read back the checksum listing currently stored for `directory`.
        return ZarrChecksumFileUpdater(zarr_archive, directory).read_checksum_file()

    # --- update_file_checksums ---
    ZarrChecksumUpdater(zarr_archive).update_file_checksums(
        [upload.to_checksum() for upload in (file_a, file_b, file_c)]
    )

    serializer = ZarrJSONChecksumSerializer()
    # Three checksum files are expected: foo/bar, foo, and the top level.
    # Build them bottom-up, since each parent embeds its child's md5.
    expected_foo_bar = serializer.generate_listing(files=[file_c.to_checksum()])
    foo_bar_entry = ZarrChecksum(path='foo/bar', md5=expected_foo_bar.md5)
    expected_foo = serializer.generate_listing(
        files=[file_a.to_checksum(), file_b.to_checksum()],
        directories=[foo_bar_entry],
    )
    expected_root = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=expected_foo.md5)],
    )

    assert stored_listing('foo/bar') == expected_foo_bar
    assert stored_listing('foo') == expected_foo
    assert stored_listing('') == expected_root

    # --- remove_checksums: drop the deepest file, c ---
    ZarrChecksumUpdater(zarr_archive).remove_checksums(['foo/bar/c'])

    # Only foo and the top level should remain; foo no longer lists bar,
    # and the root changes because foo's md5 changed.
    expected_foo = serializer.generate_listing(
        files=[file_a.to_checksum(), file_b.to_checksum()],
    )
    expected_root = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=expected_foo.md5)],
    )

    foo_bar_checksum_path = ZarrChecksumFileUpdater(zarr_archive, 'foo/bar').checksum_file_path
    assert not file_c.blob.storage.exists(foo_bar_checksum_path)
    assert stored_listing('foo') == expected_foo
    assert stored_listing('') == expected_root
def test_ingest_zarr_archive_existing(zarr_upload_file_factory,
                                      zarr_archive_factory):
    """Check that ingest_zarr_archive replaces pre-existing, incorrect checksum files.

    Checksum files with bogus md5 values are written first; after running
    ingest_zarr_archive the stored listings for foo/bar, foo and the root must
    equal the listings computed from the real upload files.
    """
    archive: ZarrArchive = zarr_archive_factory()

    def stored_listing(directory):
        # Read back the checksum listing currently stored for `directory`.
        return ZarrChecksumFileUpdater(archive, directory).read_checksum_file()

    # Initial files
    file_a = zarr_upload_file_factory(zarr_archive=archive, path='foo/a')
    file_b = zarr_upload_file_factory(zarr_archive=archive, path='foo/b')
    file_c = zarr_upload_file_factory(zarr_archive=archive, path='foo/bar/c')

    # Deliberately persist checksum files whose md5 values are wrong
    bogus_checksums = [
        ZarrChecksum(path=path, md5=md5)
        for path, md5 in (('foo/a', 'a'), ('foo/b', 'b'), ('foo/bar/c', 'c'))
    ]
    ZarrChecksumUpdater(archive).update_file_checksums(bogus_checksums)

    # Correct listings, built bottom-up
    serializer = ZarrJSONChecksumSerializer()
    expected_foo_bar = serializer.generate_listing(files=[file_c.to_checksum()])
    foo_bar_entry = ZarrChecksum(path='foo/bar', md5=expected_foo_bar.md5)
    expected_foo = serializer.generate_listing(
        files=[file_a.to_checksum(), file_b.to_checksum()],
        directories=[foo_bar_entry],
    )
    # The root contains an entry for foo
    foo_entry = ZarrChecksum(path='foo', md5=expected_foo.md5)
    expected_root = serializer.generate_listing(directories=[foo_entry])

    # The (invalid) checksum files must already be present before ingestion
    assert stored_listing('foo/bar') is not None
    assert stored_listing('foo') is not None
    assert stored_listing('') is not None

    # Recompute checksums
    ingest_zarr_archive(str(archive.zarr_id))

    # The stored listings must now match the correct ones
    assert stored_listing('foo/bar') == expected_foo_bar
    assert stored_listing('foo') == expected_foo
    assert stored_listing('') == expected_root
def test_zarr_checksum_serializer_generate_listing():
    """Check that generate_listing wraps the given checksums with the expected md5."""
    file_entry = ZarrChecksum(path='foo/bar', md5='a')
    directory_entry = ZarrChecksum(path='foo/baz', md5='b')
    checksums = ZarrChecksums(files=[file_entry], directories=[directory_entry])

    actual_listing = ZarrJSONChecksumSerializer().generate_listing(checksums)
    expected_listing = ZarrChecksumListing(
        checksums=checksums,
        md5='23076057c0da63f8ab50d0a108db332c',
    )

    assert actual_listing == expected_listing
def test_zarr_checksum_file_updater_context_manager(
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    """Check ZarrChecksumFileUpdater when used as a context manager.

    Entering the context must load the existing checksum file. Additions and
    removals change only the updater's in-memory listing while inside the
    context, and the file is expected to be rewritten once the context exits.
    """
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage

    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    directory = str(Path(upload.path).parent)
    serializer = ZarrJSONChecksumSerializer()
    checksums = [upload.to_checksum()]
    initial_listing = serializer.generate_listing(files=checksums)

    # Seed the checksum file with a listing that contains only the upload
    seeding_updater = ZarrChecksumFileUpdater(zarr_archive, directory)
    seeding_updater.write_checksum_file(initial_listing)

    with ZarrChecksumFileUpdater(zarr_archive, directory) as updater:
        # Entering the context loads the checksum file
        assert updater.checksum_listing == initial_listing

        checksum_a = ZarrChecksum(path='foo/bar', md5='a')
        checksum_b = ZarrChecksum(path='foo/baz', md5='b')

        # Add with repeats on purpose: duplicate checksums should be removed
        updater.add_file_checksums(
            sorted([upload.to_checksum(), checksum_a, checksum_a, checksum_b, checksum_b])
        )
        listing_with_a_and_b = serializer.generate_listing(
            files=sorted([*checksums, checksum_a, checksum_b])
        )
        # The updater's internal state should be updated
        assert updater.checksum_listing == listing_with_a_and_b

        # Removal is by path only; the md5 should not need to match
        updater.remove_checksums([checksum_a.path])
        listing_with_b = serializer.generate_listing(files=sorted([*checksums, checksum_b]))
        assert updater.checksum_listing == listing_with_b

        # The file should not yet be updated with our changes
        listing_on_disk = serializer.generate_listing(files=sorted(checksums))
        assert updater.read_checksum_file() == listing_on_disk

    # Exiting the context should write the checksum file
    final_listing = serializer.generate_listing(files=sorted([*checksums, checksum_b]))
    assert updater.read_checksum_file() == final_listing
# Example #5
# 0
def test_zarr_rest_upload_complete(
    authenticated_api_client,
    user,
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    """Check that POSTing to the upload/complete endpoint finalizes the upload.

    After a 201 response the archive must no longer report an upload in
    progress, the checksum files for the upload's directory and its parent
    must match hand-built listings, and the archive checksum must equal the
    root listing's md5.
    """
    assign_perm('owner', user, zarr_archive.dandiset)
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage

    # Creating a zarr upload file means that the zarr has an upload in progress
    pending_upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    assert zarr_archive.upload_in_progress
    assert zarr_archive.checksum == EMPTY_CHECKSUM

    complete_url = f'/api/zarr/{zarr_archive.zarr_id}/upload/complete/'
    response = authenticated_api_client.post(complete_url)
    assert response.status_code == 201

    # Completing the upload means that it is no longer in progress
    assert not zarr_archive.upload_in_progress

    # zarr_upload_file_factory always generates paths in the form foo/bar.nwb,
    # so the file's directory sits directly below the root.
    upload_dir = Path(pending_upload.path).parent
    root_dir = upload_dir.parent

    serializer = ZarrJSONChecksumSerializer()

    # The upload's directory lists exactly the uploaded file
    expected_dir_listing = serializer.generate_listing(files=[pending_upload.to_checksum()])
    stored_dir_listing = ZarrChecksumFileUpdater(zarr_archive, upload_dir).read_checksum_file()
    assert stored_dir_listing == expected_dir_listing

    # The root lists exactly that directory
    dir_entry = ZarrChecksum(path=str(upload_dir), md5=expected_dir_listing.md5)
    expected_root_listing = serializer.generate_listing(directories=[dir_entry])
    stored_root_listing = ZarrChecksumFileUpdater(zarr_archive, root_dir).read_checksum_file()
    assert stored_root_listing == expected_root_listing

    assert zarr_archive.checksum == expected_root_listing.md5
# Example #6
# 0
def test_zarr_rest_upload_complete_asset_metadata(
    authenticated_api_client,
    user,
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
    asset_factory,
):
    """Check that completing an upload refreshes the metadata of an asset backed by the zarr.

    Before completion the asset's zarr digest is EMPTY_CHECKSUM and its
    contentSize is 0. After a 201 from the upload/complete endpoint and a
    refresh from the database, the digest must equal the expected root
    listing md5 and contentSize must be 100.
    """
    digest_key = 'dandi:dandi-zarr-checksum'

    assign_perm('owner', user, zarr_archive.dandiset)
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage

    # Creating a zarr upload file means that the zarr has an upload in progress
    pending_upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    assert zarr_archive.upload_in_progress
    assert zarr_archive.checksum == EMPTY_CHECKSUM

    asset = asset_factory(zarr=zarr_archive, blob=None)
    assert asset.metadata['digest'][digest_key] == EMPTY_CHECKSUM
    assert asset.metadata['contentSize'] == 0

    complete_url = f'/api/zarr/{zarr_archive.zarr_id}/upload/complete/'
    response = authenticated_api_client.post(complete_url)
    assert response.status_code == 201

    # Work out the checksum the archive should now have
    upload_dir = Path(pending_upload.path).parent
    serializer = ZarrJSONChecksumSerializer()
    expected_dir_listing = serializer.generate_listing(files=[pending_upload.to_checksum()])
    dir_entry = ZarrChecksum(path=str(upload_dir), md5=expected_dir_listing.md5)
    expected_root_listing = serializer.generate_listing(directories=[dir_entry])

    # Verify that the asset metadata was updated when the upload completed
    asset.refresh_from_db()
    assert asset.metadata['digest'][digest_key] == expected_root_listing.md5
    assert asset.metadata['contentSize'] == 100
def test_ingest_zarr_archive(zarr_upload_file_factory, zarr_archive_factory,
                             faker):
    """Check that ingest_zarr_archive builds every checksum file and records size and count.

    The archive starts with no checksum files and zero size/file_count. After
    ingestion the listings for foo/bar, foo/baz, foo and the root must equal
    hand-built ones, and the refreshed archive must report the total blob size
    and the number of upload files.
    """
    archive: ZarrArchive = zarr_archive_factory()

    def upload_at(path):
        # Create one upload file in the archive under test at `path`.
        return zarr_upload_file_factory(zarr_archive=archive, path=path)

    def stored_listing(directory):
        # Read back the checksum listing currently stored for `directory`.
        return ZarrChecksumFileUpdater(archive, directory).read_checksum_file()

    # Generate > 1000 files, since the page size from S3 is 1000 items
    bar_uploads = [upload_at('foo/bar/a'), upload_at('foo/bar/b')]
    baz_uploads = []
    for _ in range(1005):
        baz_uploads.append(upload_at(f'foo/baz/{faker.pystr()}'))

    # Expected totals
    every_upload = bar_uploads + baz_uploads
    expected_size = sum(upload.blob.size for upload in every_upload)
    expected_file_count = len(bar_uploads) + len(baz_uploads)

    # Expected listings, built bottom-up
    serializer = ZarrJSONChecksumSerializer()
    expected_foo_bar = serializer.generate_listing(
        files=[upload.to_checksum() for upload in bar_uploads],
    )
    expected_foo_baz = serializer.generate_listing(
        files=[upload.to_checksum() for upload in baz_uploads],
    )
    expected_foo = serializer.generate_listing(
        directories=[
            ZarrChecksum(path='foo/bar', md5=expected_foo_bar.md5),
            ZarrChecksum(path='foo/baz', md5=expected_foo_baz.md5),
        ],
    )
    expected_root = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=expected_foo.md5)],
    )

    # No checksum files may exist before ingestion
    assert stored_listing('foo/bar') is None
    assert stored_listing('foo/baz') is None
    assert stored_listing('foo') is None
    assert stored_listing('') is None

    # Size and file count start at zero
    assert archive.size == 0
    assert archive.file_count == 0

    # Compute checksums
    ingest_zarr_archive(str(archive.zarr_id))

    # Every level must now hold the expected listing
    assert stored_listing('foo/bar') == expected_foo_bar
    assert stored_listing('foo/baz') == expected_foo_baz
    assert stored_listing('foo') == expected_foo
    assert stored_listing('') == expected_root

    # Size and file count must match the uploads
    archive.refresh_from_db()
    assert archive.size == expected_size
    assert archive.file_count == expected_file_count