def test_zarr_checksum_serializer_generate_listing():
    serializer = ZarrJSONChecksumSerializer()
    checksums = ZarrChecksums(
        files=[ZarrChecksum(path='foo/bar', md5='a')],
        directories=[ZarrChecksum(path='foo/baz', md5='b')],
    )
    assert serializer.generate_listing(checksums) == ZarrChecksumListing(
        checksums=checksums, md5='23076057c0da63f8ab50d0a108db332c'
    )
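# A minimal sketch of how a listing digest like the one above can be derived:
# md5 over a canonical JSON serialization of the listing's checksums (sorted
# keys, no extra whitespace). The canonicalization here is an assumption for
# illustration; ZarrJSONChecksumSerializer's exact scheme may differ.
import hashlib
import json


def _listing_md5_sketch(payload: dict) -> str:
    # Canonical form: deterministic key order, compact separators
    canonical = json.dumps(payload, sort_keys=True, separators=(',', ':'))
    return hashlib.md5(canonical.encode()).hexdigest()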
def test_zarr_deserialize():
    serializer = ZarrJSONChecksumSerializer()
    assert serializer.deserialize(
        '{"checksums":{"directories":[{"md5":"b","path":"bar/foo"}],"files":[{"md5":"a","path":"foo/bar"}]},"md5":"c"}'  # noqa: E501
    ) == ZarrChecksumListing(
        checksums=ZarrChecksums(
            files=[ZarrChecksum(path='foo/bar', md5='a')],
            directories=[ZarrChecksum(path='bar/foo', md5='b')],
        ),
        md5='c',
    )
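# If the serializer also exposes the inverse operation (a `serialize` method
# is assumed here; verify against the actual API), deserialization should be
# a lossless round trip:
#
#   listing = serializer.deserialize(payload)
#   assert serializer.deserialize(serializer.serialize(listing)) == listing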
def test_zarr_checksum_updater(storage, zarr_archive: ZarrArchive, zarr_upload_file_factory):
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage
    a: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive, path='foo/a')
    b: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive, path='foo/b')
    c: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive, path='foo/bar/c')

    # Test update_checksums
    ZarrChecksumUpdater(zarr_archive).update_file_checksums(
        [a.to_checksum(), b.to_checksum(), c.to_checksum()],
    )

    serializer = ZarrJSONChecksumSerializer()
    # There should be 3 checksum files generated: foo/bar, foo, and the top level
    # foo/bar contains an entry for c
    foo_bar_listing = serializer.generate_listing(files=[c.to_checksum()])
    # foo contains an entry for a, b, and bar
    foo_listing = serializer.generate_listing(
        files=[
            a.to_checksum(),
            b.to_checksum(),
        ],
        directories=[
            ZarrChecksum(path='foo/bar', md5=foo_bar_listing.md5),
        ],
    )
    # The root contains an entry for foo
    root_listing = serializer.generate_listing(
        directories=[
            ZarrChecksum(path='foo', md5=foo_listing.md5),
        ]
    )

    assert ZarrChecksumFileUpdater(zarr_archive, 'foo/bar').read_checksum_file() == foo_bar_listing
    assert ZarrChecksumFileUpdater(zarr_archive, 'foo').read_checksum_file() == foo_listing
    assert ZarrChecksumFileUpdater(zarr_archive, '').read_checksum_file() == root_listing

    # Test remove_checksums by removing the deepest file, c
    ZarrChecksumUpdater(zarr_archive).remove_checksums(['foo/bar/c'])

    # There should now only be two checksum files: foo and the top level
    # foo no longer contains bar
    foo_listing = serializer.generate_listing(files=[a.to_checksum(), b.to_checksum()])
    # The root needs to be updated, since foo's checksum has changed
    root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=foo_listing.md5)]
    )

    assert not c.blob.storage.exists(
        ZarrChecksumFileUpdater(zarr_archive, 'foo/bar').checksum_file_path
    )
    assert ZarrChecksumFileUpdater(zarr_archive, 'foo').read_checksum_file() == foo_listing
    assert ZarrChecksumFileUpdater(zarr_archive, '').read_checksum_file() == root_listing
def test_ingest_zarr_archive_existing(zarr_upload_file_factory, zarr_archive_factory):
    zarr: ZarrArchive = zarr_archive_factory()

    # Add initial files
    a = zarr_upload_file_factory(zarr_archive=zarr, path='foo/a')
    b = zarr_upload_file_factory(zarr_archive=zarr, path='foo/b')
    c = zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/c')

    # Intentionally generate and save invalid checksum files
    ZarrChecksumUpdater(zarr).update_file_checksums(
        [
            ZarrChecksum(path='foo/a', md5='a'),
            ZarrChecksum(path='foo/b', md5='b'),
            ZarrChecksum(path='foo/bar/c', md5='c'),
        ],
    )

    # Generate correct listings
    serializer = ZarrJSONChecksumSerializer()
    foo_bar_listing = serializer.generate_listing(files=[c.to_checksum()])
    foo_listing = serializer.generate_listing(
        files=[
            a.to_checksum(),
            b.to_checksum(),
        ],
        directories=[
            ZarrChecksum(path='foo/bar', md5=foo_bar_listing.md5),
        ],
    )
    # The root contains an entry for foo
    root_listing = serializer.generate_listing(
        directories=[
            ZarrChecksum(path='foo', md5=foo_listing.md5),
        ]
    )

    # Assert checksum files exist
    assert ZarrChecksumFileUpdater(zarr, 'foo/bar').read_checksum_file() is not None
    assert ZarrChecksumFileUpdater(zarr, 'foo').read_checksum_file() is not None
    assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() is not None

    # Compute checksum
    ingest_zarr_archive(str(zarr.zarr_id))

    # Assert files computed correctly
    assert ZarrChecksumFileUpdater(zarr, 'foo/bar').read_checksum_file() == foo_bar_listing
    assert ZarrChecksumFileUpdater(zarr, 'foo').read_checksum_file() == foo_listing
    assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() == root_listing
def test_zarr_checksum_file_updater_context_manager(
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    upload_parent_path = str(Path(upload.path).parent)
    serializer = ZarrJSONChecksumSerializer()
    checksums = [upload.to_checksum()]
    listing = serializer.generate_listing(files=checksums)

    updater = ZarrChecksumFileUpdater(zarr_archive, upload_parent_path)
    updater.write_checksum_file(listing)

    with ZarrChecksumFileUpdater(zarr_archive, upload_parent_path) as updater:
        # Initializing the context manager loads the checksum file
        assert updater.checksum_listing == listing

        # Add two new checksums to the updater
        a = ZarrChecksum(path='foo/bar', md5='a')
        b = ZarrChecksum(path='foo/baz', md5='b')
        # Duplicate checksums should be removed
        updater.add_file_checksums(sorted([upload.to_checksum(), a, a, b, b]))

        # The updater's internal state should be updated
        assert updater.checksum_listing == serializer.generate_listing(
            files=sorted(checksums + [a, b])
        )

        # Remove the a checksum from the updater
        # The md5 should not need to match
        updater.remove_checksums([a.path])
        assert updater.checksum_listing == serializer.generate_listing(
            files=sorted(checksums + [b])
        )

        # The file should not yet be updated with our changes
        assert updater.read_checksum_file() == serializer.generate_listing(
            files=sorted(checksums)
        )

    # Exiting the context should write the checksum file
    assert updater.read_checksum_file() == serializer.generate_listing(
        files=sorted(checksums + [b])
    )
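# Hypothetical simplification of the context-manager protocol the test above
# relies on: load state on __enter__, persist it on a clean __exit__. Names
# here are illustrative, not the project's actual implementation.
class _UpdaterProtocolSketch:
    def __init__(self):
        self._stored: list = []  # stands in for the checksum file on storage
        self.listing: list = []  # stands in for checksum_listing in memory

    def __enter__(self):
        self.listing = list(self._stored)  # read the checksum file on entry
        return self

    def __exit__(self, exc_type, exc, tb):
        if exc_type is None:
            self._stored = list(self.listing)  # write changes back on clean exit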
def test_zarr_rest_upload_complete(
    authenticated_api_client,
    user,
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
):
    assign_perm('owner', user, zarr_archive.dandiset)
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage

    # Creating a zarr upload file means that the zarr has an upload in progress
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    assert zarr_archive.upload_in_progress
    assert zarr_archive.checksum == EMPTY_CHECKSUM

    resp = authenticated_api_client.post(f'/api/zarr/{zarr_archive.zarr_id}/upload/complete/')
    assert resp.status_code == 201

    # Completing the upload means that it is no longer in progress
    assert not zarr_archive.upload_in_progress

    # zarr_upload_file_factory always generates paths in the form foo/bar.nwb
    parent_path = Path(upload.path).parent
    root_path = parent_path.parent

    # Verify the parent directory checksum file is correct
    serializer = ZarrJSONChecksumSerializer()
    expected_parent_listing = serializer.generate_listing(files=[upload.to_checksum()])
    assert (
        ZarrChecksumFileUpdater(zarr_archive, parent_path).read_checksum_file()
        == expected_parent_listing
    )
    # Verify that the root directory checksum file is correct
    expected_root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path=str(parent_path), md5=expected_parent_listing.md5)]
    )
    assert (
        ZarrChecksumFileUpdater(zarr_archive, root_path).read_checksum_file()
        == expected_root_listing
    )

    assert zarr_archive.checksum == expected_root_listing.md5
def ingest_zarr_archive(
    zarr_id: str, no_checksum: bool = False, no_size: bool = False, no_count: bool = False
):
    client = get_client()
    zarr: ZarrArchive = ZarrArchive.objects.select_for_update().get(zarr_id=zarr_id)

    # Reset before compute
    if not no_size:
        zarr.size = 0
    if not no_count:
        zarr.file_count = 0

    # Instantiate updater and add files as they come in
    updater = SessionZarrChecksumUpdater(zarr_archive=zarr)
    for files in yield_files(client, zarr):
        # Update size and file count
        if not no_size:
            zarr.size += sum(file['Size'] for file in files)
        if not no_count:
            zarr.file_count += len(files)

        # Update checksums
        if not no_checksum:
            updater.update_file_checksums(
                [
                    ZarrChecksum(
                        md5=file['ETag'].strip('"'),
                        path=file['Key'].replace(zarr.s3_path(''), ''),
                    )
                    for file in files
                ]
            )

    # Save zarr after completion
    zarr.save()

    # Save all assets that reference this zarr, so their metadata is updated
    for asset in zarr.assets.all():
        asset.save()
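# `yield_files` is not shown here. A minimal sketch of what it is assumed to
# do, using the standard boto3 paginator: each yielded page holds up to 1000
# objects, which is why size, count, and checksums above are accumulated per
# page. The bucket/prefix parameters are hypothetical; the real helper may
# resolve them differently.
def _yield_files_sketch(client, bucket: str, prefix: str):
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # 'Contents' is absent on empty pages, so default to an empty list
        yield page.get('Contents', [])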
def test_zarr_rest_upload_complete_asset_metadata(
    authenticated_api_client,
    user,
    storage,
    zarr_archive: ZarrArchive,
    zarr_upload_file_factory,
    asset_factory,
):
    assign_perm('owner', user, zarr_archive.dandiset)
    # Pretend like ZarrUploadFile was defined with the given storage
    ZarrUploadFile.blob.field.storage = storage

    # Creating a zarr upload file means that the zarr has an upload in progress
    upload: ZarrUploadFile = zarr_upload_file_factory(zarr_archive=zarr_archive)
    assert zarr_archive.upload_in_progress
    assert zarr_archive.checksum == EMPTY_CHECKSUM

    asset = asset_factory(zarr=zarr_archive, blob=None)
    assert asset.metadata['digest']['dandi:dandi-zarr-checksum'] == EMPTY_CHECKSUM
    assert asset.metadata['contentSize'] == 0

    resp = authenticated_api_client.post(f'/api/zarr/{zarr_archive.zarr_id}/upload/complete/')
    assert resp.status_code == 201

    # Calculate the new checksum
    parent_path = Path(upload.path).parent
    serializer = ZarrJSONChecksumSerializer()
    expected_parent_listing = serializer.generate_listing(files=[upload.to_checksum()])
    expected_root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path=str(parent_path), md5=expected_parent_listing.md5)]
    )

    # Verify that the asset metadata was updated when the upload completed
    asset.refresh_from_db()
    assert asset.metadata['digest']['dandi:dandi-zarr-checksum'] == expected_root_listing.md5
    assert asset.metadata['contentSize'] == 100
def test_ingest_zarr_archive(zarr_upload_file_factory, zarr_archive_factory, faker):
    zarr: ZarrArchive = zarr_archive_factory()

    # Generate > 1000 files, since the page size from S3 is 1000 items
    foo_bar_files: List[ZarrUploadFile] = [
        zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/a'),
        zarr_upload_file_factory(zarr_archive=zarr, path='foo/bar/b'),
    ]
    foo_baz_files: List[ZarrUploadFile] = [
        zarr_upload_file_factory(zarr_archive=zarr, path=f'foo/baz/{faker.pystr()}')
        for _ in range(1005)
    ]

    # Calculate size and file count
    total_size = sum(f.blob.size for f in foo_bar_files + foo_baz_files)
    total_file_count = len(foo_bar_files) + len(foo_baz_files)

    # Generate correct listings
    serializer = ZarrJSONChecksumSerializer()
    foo_bar_listing = serializer.generate_listing(files=[f.to_checksum() for f in foo_bar_files])
    foo_baz_listing = serializer.generate_listing(files=[f.to_checksum() for f in foo_baz_files])
    foo_listing = serializer.generate_listing(
        directories=[
            ZarrChecksum(path='foo/bar', md5=foo_bar_listing.md5),
            ZarrChecksum(path='foo/baz', md5=foo_baz_listing.md5),
        ]
    )
    root_listing = serializer.generate_listing(
        directories=[ZarrChecksum(path='foo', md5=foo_listing.md5)]
    )

    # Assert checksum files don't already exist
    assert ZarrChecksumFileUpdater(zarr, 'foo/bar').read_checksum_file() is None
    assert ZarrChecksumFileUpdater(zarr, 'foo/baz').read_checksum_file() is None
    assert ZarrChecksumFileUpdater(zarr, 'foo').read_checksum_file() is None
    assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() is None

    # Assert zarr size and file count are zero
    assert zarr.size == 0
    assert zarr.file_count == 0

    # Compute checksum
    ingest_zarr_archive(str(zarr.zarr_id))

    # Assert files computed correctly
    assert ZarrChecksumFileUpdater(zarr, 'foo/bar').read_checksum_file() == foo_bar_listing
    assert ZarrChecksumFileUpdater(zarr, 'foo/baz').read_checksum_file() == foo_baz_listing
    assert ZarrChecksumFileUpdater(zarr, 'foo').read_checksum_file() == foo_listing
    assert ZarrChecksumFileUpdater(zarr, '').read_checksum_file() == root_listing

    # Assert size and file count match
    zarr.refresh_from_db()
    assert zarr.size == total_size
    assert zarr.file_count == total_file_count
def test_zarr_checksum_sort_order():
    # The a < b in the path should take precedence over z > y in the md5
    a = ZarrChecksum(path='1/2/3/a/z', md5='z')
    b = ZarrChecksum(path='1/2/3/b/z', md5='y')
    assert sorted([b, a]) == [a, b]
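# The ordering asserted above is what field-order comparison gives you, e.g.
# a dataclass declared with order=True and `path` before `md5`. This is an
# illustration only, not the project's actual ZarrChecksum definition.
from dataclasses import dataclass


@dataclass(order=True)
class _ChecksumLike:
    path: str
    md5: str


assert sorted([_ChecksumLike('b/z', 'y'), _ChecksumLike('a/z', 'z')])[0].path == 'a/z'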
# ZarrJSONChecksumSerializer tests


@pytest.mark.parametrize(
    'file_checksums,directory_checksums,checksum',
    [
        ([], [], '481a2f77ab786a0f45aafd5db0971caa'),
        ([ZarrChecksum(path='foo/bar', md5='a')], [], 'cdcfdfca3622e20df03219273872549e'),
        ([], [ZarrChecksum(path='foo/bar', md5='a')], '243aca82c6872222747183dd738b6fcb'),
        (
            [ZarrChecksum(path='foo/bar', md5='a'), ZarrChecksum(path='foo/baz', md5='b')],
            [],
            '785295076ae9156b363e442ef6d485e0',
        ),
        (
            [],
            [ZarrChecksum(path='foo/bar', md5='a'), ZarrChecksum(path='foo/baz', md5='b')],
            'ebca8bb8e716237e0f71657d1045930f',
        ),
        (
            [ZarrChecksum(path='foo/baz', md5='a')],
            [ZarrChecksum(path='foo/bar', md5='b')],
            '9c34644ba03b7e9f58ebd1caef4215ad',
        ),
    ],
)
def test_zarr_checksum_serializer_generate_listing_parametrized(
    file_checksums, directory_checksums, checksum
):
    # NOTE: the decorated test body was truncated in the source; this is a
    # minimal reconstruction (hypothetical name and body, inferred from the
    # parametrize signature) that checks the aggregated listing digest.
    serializer = ZarrJSONChecksumSerializer()
    listing = serializer.generate_listing(
        files=file_checksums, directories=directory_checksums
    )
    assert listing.md5 == checksum
def to_checksum(self) -> ZarrChecksum:
    return ZarrChecksum(path=self.path, md5=self.etag)
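# Caveat on `to_checksum` above: an S3 ETag equals the object's md5 only for
# single-part uploads; multipart ETags take the form '<md5-of-part-md5s>-<n>'
# and are not plain md5 digests. This is a general observation about S3, not
# a claim about how this codebase handles multipart uploads.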