def test_zarr_checksum_serializer_generate_listing():
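    # generate_listing aggregates the given checksums into a listing; the md5
    # below is the known-good aggregate digest for these fixed inputs.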
    serializer = ZarrJSONChecksumSerializer()
    checksums = ZarrChecksums(
        files=[ZarrChecksum(path='foo/bar', md5='a')],
        directories=[ZarrChecksum(path='foo/baz', md5='b')],
    )
    assert serializer.generate_listing(checksums) == ZarrChecksumListing(
        checksums=checksums, md5='23076057c0da63f8ab50d0a108db332c'
    )


def test_zarr_deserialize():
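    # deserialize() should parse the serialized JSON (note the keys appear in
    # sorted order) back into the equivalent ZarrChecksumListing model.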
    serializer = ZarrJSONChecksumSerializer()
    assert serializer.deserialize(
        '{"checksums":{"directories":[{"md5":"b","path":"bar/foo"}],"files":[{"md5":"a","path":"foo/bar"}]},"md5":"c"}'  # noqa: E501
    ) == ZarrChecksumListing(
        checksums=ZarrChecksums(
            files=[ZarrChecksum(path='foo/bar', md5='a')],
            directories=[ZarrChecksum(path='bar/foo', md5='b')],
        ),
        md5='c',
    )


def test_zarr_checksum_updater(storage, zarr_archive: ZarrArchive, zarr_upload_file_factory):
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage
    a: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive, path='foo/a')
    b: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive, path='foo/b')
    c: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive, path='foo/bar/c')

    # Test update_checksums
    ZarrChecksumUpdater(zarr_archive).update_file_checksums(
        [a.to_checksum(), b.to_checksum(), c.to_checksum()],
    )

    serializer = ZarrJSONChecksumSerializer()
    # There should be 3 checksum files generated: foo/bar, foo, and the top level
    # foo/bar contains an entry for c
    foo_bar_listing = serializer.generate_listing(files=[c.to_checksum()])
    # foo contains an entry for a, b, and bar
    foo_listing = serializer.generate_listing(
        files=[
            a.to_checksum(),
            b.to_checksum(),
        ],
        directories=[
            ZarrChecksum(path='foo/bar', md5=foo_bar_listing.md5),
        ],
    )
    # The root contains an entry for foo
    root_listing = serializer.generate_listing(
        directories=[
            ZarrChecksum(path='foo', md5=foo_listing.md5),
        ]
    )

    assert ZarrChecksumFileUpdater(zarr_archive, 'foo/bar').read_checksum_file() == foo_bar_listing
    assert ZarrChecksumFileUpdater(zarr_archive, 'foo').read_checksum_file() == foo_listing
    assert ZarrChecksumFileUpdater(zarr_archive, '').read_checksum_file() == root_listing

    # Test remove_checksums by removing the deepest file, c
    ZarrChecksumUpdater(zarr_archive).remove_checksums(['foo/bar/c'])
    # There should now only be two checksum files: foo and the top level
    # foo no longer contains bar
    foo_listing = serializer.generate_listing(files=[a.to_checksum(), b.to_checksum()])
    # The root needs to be updated, since foo's checksum has changed
    root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=foo_listing.md5)]
    )

    assert not c.blob.storage.exists(
        ZarrChecksumFileUpdater(zarr_archive, 'foo/bar').checksum_file_path
    )
    assert ZarrChecksumFileUpdater(zarr_archive, 'foo').read_checksum_file() == foo_listing
    assert ZarrChecksumFileUpdater(zarr_archive, '').read_checksum_file() == root_listing


def test_ingest_zarr_archive_existing(zarr_upload_file_factory, zarr_archive_factory):
    zarr: ZarrArchive = zarr_archive_factory()

    # Add initial files
    a = zarr_upload_file_factory(zarr_archive=zarr, path='foo/a')
    b = zarr_upload_file_factory(zarr_archive=zarr, path='foo/b')
    c = zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/c')

    # Intentionally generate and save invalid checksum files
    ZarrChecksumUpdater(zarr).update_file_checksums(
        [
            ZarrChecksum(path='foo/a', md5='a'),
            ZarrChecksum(path='foo/b', md5='b'),
            ZarrChecksum(path='foo/bar/c', md5='c'),
        ]
    )

    # Generate correct listings
    serializer = ZarrJSONChecksumSerializer()
    foo_bar_listing = serializer.generate_listing(files=[c.to_checksum()])
    foo_listing = serializer.generate_listing(
        files=[
            a.to_checksum(),
            b.to_checksum(),
        ],
        directories=[
            ZarrChecksum(path='foo/bar', md5=foo_bar_listing.md5),
        ],
    )

    # The root contains an entry for foo
    root_listing = serializer.generate_listing(
        directories=[
            ZarrChecksum(path='foo', md5=foo_listing.md5),
        ]
    )

    # Assert checksum files exist
    assert ZarrChecksumFileUpdater(zarr, 'foo/bar').read_checksum_file() is not None
    assert ZarrChecksumFileUpdater(zarr, 'foo').read_checksum_file() is not None
    assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() is not None

    # Compute checksum
    ingest_zarr_archive(str(zarr.zarr_id))

    # Assert files computed correctly
    assert ZarrChecksumFileUpdater(zarr, 'foo/bar').read_checksum_file() == foo_bar_listing
    assert ZarrChecksumFileUpdater(zarr, 'foo').read_checksum_file() == foo_listing
    assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() == root_listing


def test_zarr_checksum_file_updater_context_manager(
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    upload_parent_path = str(Path(upload.path).parent)
    serializer = ZarrJSONChecksumSerializer()
    checksums = [upload.to_checksum()]
    listing = serializer.generate_listing(files=checksums)

    updater = ZarrChecksumFileUpdater(zarr_archive, upload_parent_path)
    updater.write_checksum_file(listing)

    with ZarrChecksumFileUpdater(zarr_archive, upload_parent_path) as updater:
        # Initializing the context manager loads the checksum file
        assert updater.checksum_listing == listing

        # Add two new checksums to the updater
        a = ZarrChecksum(path='foo/bar', md5='a')
        b = ZarrChecksum(path='foo/baz', md5='b')
        # Duplicate checksums should be removed
        updater.add_file_checksums(sorted([upload.to_checksum(), a, a, b, b]))
        # The updater's internal state should be updated
        assert updater.checksum_listing == serializer.generate_listing(
            files=sorted(checksums + [a, b])
        )

        # Remove the A checksum from the updater
        # The md5 should not need to match
        updater.remove_checksums([a.path])
        assert updater.checksum_listing == serializer.generate_listing(
            files=sorted(checksums + [b])
        )

        # The file should not yet be updated with our changes
        assert updater.read_checksum_file() == serializer.generate_listing(files=sorted(checksums))

    # Exiting the context should write the checksum file
    assert updater.read_checksum_file() == serializer.generate_listing(
        files=sorted(checksums + [b])
    )


def test_zarr_rest_upload_complete(
    authenticated_api_client,
    user,
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    assign_perm('owner', user, zarr_archive.dandiset)
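    # Owner permission on the dandiset is granted so the upload/complete
    # request below is authorized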
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage
    # Creating a zarr upload file means that the zarr has an upload in progress
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    assert zarr_archive.upload_in_progress
    assert zarr_archive.checksum == EMPTY_CHECKSUM

    resp = authenticated_api_client.post(f'/api/zarr/{zarr_archive.zarr_id}/upload/complete/')
    assert resp.status_code == 201

    # Completing the upload means that it is no longer in progress
    assert not zarr_archive.upload_in_progress

    # zarr_upload_file_factory always generates paths in the form foo/bar.nwb
    parent_path = Path(upload.path).parent
    root_path = parent_path.parent

    # Verify the parent directory checksum file is correct
    serializer = ZarrJSONChecksumSerializer()
    expected_parent_listing = serializer.generate_listing(files=[upload.to_checksum()])
    assert (
        ZarrChecksumFileUpdater(zarr_archive, parent_path).read_checksum_file()
        == expected_parent_listing
    )
    # Verify that the root directory checksum file is correct
    expected_root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path=str(parent_path), md5=expected_parent_listing.md5)]
    )
    assert (
        ZarrChecksumFileUpdater(zarr_archive, root_path).read_checksum_file()
        == expected_root_listing
    )
    assert zarr_archive.checksum == expected_root_listing.md5


def ingest_zarr_archive(
    zarr_id: str, no_checksum: bool = False, no_size: bool = False, no_count: bool = False
):
    client = get_client()
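    # get_client() is assumed to return the object-store (S3) client that
    # yield_files uses below to page through the zarr's stored objects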
    zarr: ZarrArchive = ZarrArchive.objects.select_for_update().get(zarr_id=zarr_id)

    # Reset before compute
    if not no_size:
        zarr.size = 0
    if not no_count:
        zarr.file_count = 0

    # Instantiate updater and add files as they come in
    updater = SessionZarrChecksumUpdater(zarr_archive=zarr)
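    # yield_files is assumed to yield batches of object records (one S3 list
    # page of up to 1000 keys at a time), so totals accumulate incrementally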
    for files in yield_files(client, zarr):
        # Update size and file count
        if not no_size:
            zarr.size += sum(file['Size'] for file in files)
        if not no_count:
            zarr.file_count += len(files)

        # Update checksums
        if not no_checksum:
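            # An S3 ETag is the object's md5 (for non-multipart uploads) wrapped
            # in quotes; strip the quotes and drop the bucket prefix from the key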
            updater.update_file_checksums(
                [
                    ZarrChecksum(
                        md5=file['ETag'].strip('"'),
                        path=file['Key'].replace(zarr.s3_path(''), ''),
                    )
                    for file in files
                ]
            )

    # Save zarr after completion
    zarr.save()

    # Save all assets that reference this zarr, so their metadata is updated
    for asset in zarr.assets.all():
        asset.save()


def test_zarr_rest_upload_complete_asset_metadata(
    authenticated_api_client,
    user,
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
    asset_factory,
):
    assign_perm('owner', user, zarr_archive.dandiset)
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage
    # Creating a zarr upload file means that the zarr has an upload in progress
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    assert zarr_archive.upload_in_progress
    assert zarr_archive.checksum == EMPTY_CHECKSUM
    asset = asset_factory(zarr=zarr_archive, blob=None)
    assert asset.metadata['digest']['dandi:dandi-zarr-checksum'] == EMPTY_CHECKSUM
    assert asset.metadata['contentSize'] == 0

    resp = authenticated_api_client.post(f'/api/zarr/{zarr_archive.zarr_id}/upload/complete/')
    assert resp.status_code == 201

    # Calculate the new checksum
    parent_path = Path(upload.path).parent
    serializer = ZarrJSONChecksumSerializer()
    expected_parent_listing = serializer.generate_listing(files=[upload.to_checksum()])
    expected_root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path=str(parent_path), md5=expected_parent_listing.md5)]
    )
    # Verify that the asset metadata was updated when the upload completed
    asset.refresh_from_db()
    assert asset.metadata['digest']['dandi:dandi-zarr-checksum'] == expected_root_listing.md5
    assert asset.metadata['contentSize'] == 100


def test_ingest_zarr_archive(zarr_upload_file_factory, zarr_archive_factory, faker):
    zarr: ZarrArchive = zarr_archive_factory()

    # Generate > 1000 files, since the page size from S3 is 1000 items
    foo_bar_files: List[ZarrUploadFile] = [
        zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/a'),
        zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/b'),
    ]
    foo_baz_files: List[ZarrUploadFile] = [
        zarr_upload_file_factory(zarr_archive=zarr, path=f'foo/baz/{faker.pystr()}')
        for _ in range(1005)
    ]

    # Calculate size and file count
    total_size = sum(f.blob.size for f in foo_bar_files + foo_baz_files)
    total_file_count = len(foo_bar_files) + len(foo_baz_files)

    # Generate correct listings
    serializer = ZarrJSONChecksumSerializer()
    foo_bar_listing = serializer.generate_listing(files=[f.to_checksum() for f in foo_bar_files])
    foo_baz_listing = serializer.generate_listing(files=[f.to_checksum() for f in foo_baz_files])
    foo_listing = serializer.generate_listing(
        directories=[
            ZarrChecksum(path='foo/bar', md5=foo_bar_listing.md5),
            ZarrChecksum(path='foo/baz', md5=foo_baz_listing.md5),
        ]
    )
    root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=foo_listing.md5)]
    )

    # Assert checksum files don't already exist
    assert ZarrChecksumFileUpdater(zarr, 'foo/bar').read_checksum_file() is None
    assert ZarrChecksumFileUpdater(zarr, 'foo/baz').read_checksum_file() is None
    assert ZarrChecksumFileUpdater(zarr, 'foo').read_checksum_file() is None
    assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() is None

    # Assert zarr size and file count is zero
    assert zarr.size == 0
    assert zarr.file_count == 0

    # Compute checksum
    ingest_zarr_archive(str(zarr.zarr_id))

    # Assert files computed correctly
    assert ZarrChecksumFileUpdater(zarr, 'foo/bar').read_checksum_file() == foo_bar_listing
    assert ZarrChecksumFileUpdater(zarr, 'foo/baz').read_checksum_file() == foo_baz_listing
    assert ZarrChecksumFileUpdater(zarr, 'foo').read_checksum_file() == foo_listing
    assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() == root_listing

    # Assert size and file count matches
    zarr.refresh_from_db()
    assert zarr.size == total_size
    assert zarr.file_count == total_file_count


def test_zarr_checksum_sort_order():
    # The a < b in the path should take precedence over z > y in the md5
    a = ZarrChecksum(path='1/2/3/a/z', md5='z')
    b = ZarrChecksum(path='1/2/3/b/z', md5='y')
    assert sorted([b, a]) == [a, b]


# ZarrJSONChecksumSerializer tests
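# Each parametrized case pairs file/directory checksum inputs with the expected
# aggregate md5 of the resulting listing.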


@pytest.mark.parametrize(
    'file_checksums,directory_checksums,checksum',
    [
        ([], [], '481a2f77ab786a0f45aafd5db0971caa'),
        ([ZarrChecksum(path='foo/bar', md5='a')], [], 'cdcfdfca3622e20df03219273872549e'),
        ([], [ZarrChecksum(path='foo/bar', md5='a')], '243aca82c6872222747183dd738b6fcb'),
        (
            [ZarrChecksum(path='foo/bar', md5='a'), ZarrChecksum(path='foo/baz', md5='b')],
            [],
            '785295076ae9156b363e442ef6d485e0',
        ),
        (
            [],
            [ZarrChecksum(path='foo/bar', md5='a'), ZarrChecksum(path='foo/baz', md5='b')],
            'ebca8bb8e716237e0f71657d1045930f',
        ),
        (
            [ZarrChecksum(path='foo/baz', md5='a')],
            [ZarrChecksum(path='foo/bar', md5='b')],
            '9c34644ba03b7e9f58ebd1caef4215ad',
        ),
    ],
)
def test_zarr_checksum_serializer_generate_listing_parametrized(
    file_checksums, directory_checksums, checksum
):
    # Assumed test body: mirrors the generate_listing test above for each
    # parametrized combination of file and directory checksums
    serializer = ZarrJSONChecksumSerializer()
    checksums = ZarrChecksums(files=file_checksums, directories=directory_checksums)
    assert serializer.generate_listing(checksums) == ZarrChecksumListing(
        checksums=checksums, md5=checksum
    )
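

# to_checksum is presumably a ZarrUploadFile method: it converts the upload
# into the ZarrChecksum entry consumed by the listing serializers above.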
    def to_checksum(self) -> ZarrChecksum:
        return ZarrChecksum(path=self.path, md5=self.etag)