def setUpClass(cls):
    super().setUpClass()
    pkg = Package()
    pkg._set_commit_message(cls.parent_commit_message)
    pkg._workflow = {
        'config': f's3://{cls.parent_bucket}/.quilt/workflows/config.yml?versionId=configVersion',
        'id': 'gamma',
        'schemas': {
            'top-secret': f's3://{cls.parent_bucket}/top-secret.schema.json?versionId=schemaVersion'
        },
    }
    pkg.set_meta({'meta': 'old meta'})

    cls.entries = cls.get_pkg_entries()
    for lk, entry in cls.entries.items():
        pkg.set(lk, entry)

    manifest_buf = io.BytesIO()
    pkg._dump(manifest_buf)
    cls.parent_manifest = manifest_buf.getvalue()
    cls.parent_top_hash = pkg.top_hash

    cls.src_params = {
        'parent': {
            'registry': cls.src_registry,
            'name': cls.parent_pkg_name,
            'top_hash': cls.parent_top_hash,
        },
    }
def prepare_pkg(self, *, copy_data):
    expected_pkg = Package()
    pkg_entries = self.entries.items()
    if copy_data:
        pkg_entries = [
            (
                lk,
                e.with_physical_key(
                    PhysicalKey(
                        self.dst_bucket,
                        f'{self.dst_pkg_name}/{lk}',
                        'dst_' + e.physical_key.version_id,
                    )
                ),
            )
            for lk, e in pkg_entries
        ]
    for lk, entry in pkg_entries:
        expected_pkg.set(lk, entry)
    expected_pkg._set_commit_message(None)
    return expected_pkg
class HashCalculationTest(unittest.TestCase):
    def setUp(self):
        self.pkg = Package()
        self.entry_with_hash = PackageEntry(
            PhysicalKey('test-bucket', 'with-hash', 'with-hash'),
            42,
            {'type': 'SHA256', 'value': '0' * 64},
            {},
        )
        self.entry_without_hash = PackageEntry(
            PhysicalKey('test-bucket', 'without-hash', 'without-hash'),
            42,
            None,
            {},
        )
        self.pkg.set('with-hash', self.entry_with_hash)
        self.pkg.set('without-hash', self.entry_without_hash)

    def test_calculate_pkg_hashes(self):
        boto_session = mock.MagicMock()
        with mock.patch.object(
            t4_lambda_pkgpush, 'calculate_pkg_entry_hash'
        ) as calculate_pkg_entry_hash_mock:
            t4_lambda_pkgpush.calculate_pkg_hashes(boto_session, self.pkg)

        calculate_pkg_entry_hash_mock.assert_called_once_with(mock.ANY, self.entry_without_hash)

    @mock.patch.object(t4_lambda_pkgpush, 'S3_HASH_LAMBDA_MAX_FILE_SIZE_BYTES', 1)
    def test_calculate_pkg_hashes_too_large_file_error(self):
        s3_client = mock.MagicMock()
        with pytest.raises(t4_lambda_pkgpush.FileTooLargeForHashing):
            t4_lambda_pkgpush.calculate_pkg_hashes(s3_client, self.pkg)

    def test_calculate_pkg_entry_hash(self):
        get_s3_client_mock = mock.MagicMock()
        s3_client_mock = get_s3_client_mock.return_value
        s3_client_mock.generate_presigned_url.return_value = 'https://example.com'

        with mock.patch(
            "t4_lambda_pkgpush.invoke_hash_lambda",
            return_value='0' * 64,
        ) as invoke_hash_lambda_mock:
            t4_lambda_pkgpush.calculate_pkg_entry_hash(get_s3_client_mock, self.entry_without_hash)

        get_s3_client_mock.assert_called_once_with(self.entry_without_hash.physical_key.bucket)
        invoke_hash_lambda_mock.assert_called_once_with(
            s3_client_mock.generate_presigned_url.return_value
        )
        s3_client_mock.generate_presigned_url.assert_called_once_with(
            ClientMethod='get_object',
            ExpiresIn=t4_lambda_pkgpush.S3_HASH_LAMBDA_SIGNED_URL_EXPIRES_IN_SECONDS,
            Params={
                'Bucket': self.entry_without_hash.physical_key.bucket,
                'Key': self.entry_without_hash.physical_key.path,
                'VersionId': self.entry_without_hash.physical_key.version_id,
            },
        )
        assert self.entry_without_hash.hash == {
            'type': 'SHA256',
            'value': invoke_hash_lambda_mock.return_value,
        }

    def test_invoke_hash_lambda(self):
        lambda_client_stubber = Stubber(t4_lambda_pkgpush.lambda_)
        lambda_client_stubber.activate()
        self.addCleanup(lambda_client_stubber.deactivate)
        test_hash = '0' * 64
        test_url = 'https://example.com'
        lambda_client_stubber.add_response(
            'invoke',
            service_response={
                'Payload': io.BytesIO(b'"%s"' % test_hash.encode()),
            },
            expected_params={
                'FunctionName': t4_lambda_pkgpush.S3_HASH_LAMBDA,
                'Payload': '"%s"' % test_url,
            },
        )
        assert t4_lambda_pkgpush.invoke_hash_lambda(test_url) == test_hash
        lambda_client_stubber.assert_no_pending_responses()

    def test_invoke_hash_lambda_error(self):
        lambda_client_stubber = Stubber(t4_lambda_pkgpush.lambda_)
        lambda_client_stubber.activate()
        self.addCleanup(lambda_client_stubber.deactivate)
        test_url = 'https://example.com'
        lambda_client_stubber.add_response(
            'invoke',
            service_response={
                'FunctionError': 'Unhandled',
                'Payload': io.BytesIO(b'some error info'),
            },
            expected_params={
                'FunctionName': t4_lambda_pkgpush.S3_HASH_LAMBDA,
                'Payload': '"%s"' % test_url,
            },
        )
        with pytest.raises(t4_lambda_pkgpush.S3HashLambdaUnhandledError):
            t4_lambda_pkgpush.invoke_hash_lambda(test_url)
        lambda_client_stubber.assert_no_pending_responses()
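# The two invoke_hash_lambda tests above pin down the helper's observable contract:
# it invokes S3_HASH_LAMBDA with the presigned URL JSON-encoded as the payload,
# returns the JSON-decoded hash string on success, and raises
# S3HashLambdaUnhandledError when the response carries 'FunctionError'. The sketch
# below is inferred from those stubs only; it is an illustration, not the actual
# t4_lambda_pkgpush implementation.
def _invoke_hash_lambda_sketch(url):
    import json

    # JSON-encode the presigned URL; for a plain string this matches '"%s"' % url.
    response = t4_lambda_pkgpush.lambda_.invoke(
        FunctionName=t4_lambda_pkgpush.S3_HASH_LAMBDA,
        Payload=json.dumps(url),
    )
    # boto3 reports unhandled lambda errors via the 'FunctionError' response field.
    if 'FunctionError' in response:
        raise t4_lambda_pkgpush.S3HashLambdaUnhandledError(response['Payload'].read())
    # The success payload is a JSON string containing the hex digest.
    return json.loads(response['Payload'].read())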
def _mock_package_build(self, entries, *, message=..., expected_workflow=...):
    if message is ...:
        message = self.dst_commit_message

    # Use a test package to verify manifest entries
    test_pkg = Package()
    test_pkg.set_meta(self.meta)

    # Mock hashing package objects
    for entry in entries:
        pkey = PhysicalKey.from_url(entry['physical_key'])
        hash_obj = {'type': 'SHA256', 'value': entry['hash']}
        test_entry = PackageEntry(pkey, entry['size'], hash_obj, entry.get('meta'))
        test_pkg.set(entry['logical_key'], entry=test_entry)

    mocked_workflow_data = 'some-workflow-data'
    test_pkg._workflow = mocked_workflow_data

    # Build the manifest from the test package
    test_pkg._set_commit_message(message)
    manifest = io.BytesIO()
    test_pkg.dump(manifest)
    manifest.seek(0)

    self.s3_stubber.add_response(
        'put_object',
        service_response={},
        expected_params={
            'Body': manifest.read(),
            'Bucket': self.dst_bucket,
            'Key': f'.quilt/packages/{test_pkg.top_hash}',
        },
    )
    self.s3_stubber.add_response(
        'put_object',
        service_response={},
        expected_params={
            'Body': str.encode(test_pkg.top_hash),
            'Bucket': self.dst_bucket,
            'Key': f'.quilt/named_packages/{self.dst_pkg_name}/{str(int(self.mock_timestamp))}',
        },
    )
    self.s3_stubber.add_response(
        'put_object',
        service_response={},
        expected_params={
            'Body': str.encode(test_pkg.top_hash),
            'Bucket': self.dst_bucket,
            'Key': f'.quilt/named_packages/{self.dst_pkg_name}/latest',
        },
    )

    with mock.patch(
        'quilt3.workflows.validate',
        return_value=mocked_workflow_data,
    ) as workflow_validate_mock:
        yield

    workflow_validate_mock.assert_called_once_with(
        registry=get_package_registry(self.dst_registry),
        workflow=expected_workflow,
        name=self.dst_pkg_name,
        pkg=mock.ANY,  # TODO: probably this should be more specific.
        message=message,
    )
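# Note: the helper above is a generator that yields once, so it is presumably wrapped
# with contextlib.contextmanager where it is defined or used. A hypothetical call site
# in a test would look roughly like:
#
#     with self._mock_package_build(entries, expected_workflow='gamma'):
#         ...  # invoke the packaging handler under test
#
# with the stubbed S3 put_object calls and the workflow validation asserted on exit.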
def create_package(
    manifest: pd.DataFrame,
    step_pkg_root: Path,
    filepath_columns: List[str] = ["filepath"],
    metadata_columns: List[str] = [],
) -> Tuple[Package, pd.DataFrame]:
    # Make a copy
    relative_manifest = manifest.copy(deep=True)

    # Create empty package
    pkg = Package()

    # Create associate mappings: List[Dict[str, str]]
    # This list is in index order, meaning that as the column values are walked we can
    # simply add a new associate to the already existing associate map at that list
    # index.
    associates = []

    # Create metadata reduction map.
    # This is used to clean up and standardize metadata access after object
    # construction. It maps each metadata column name to a boolean saying whether that
    # column's values should be reduced (collapsed) during the "clean up the package
    # metadata" step. If multiple files share the same metadata keys but, for one
    # reason or another, one packaged file's value for a key is a list while another's
    # is a single string, the result is a confusing, mixed return-value API for the
    # same _type_ of object. Example:
    #
    #   fov/
    #       obj1/
    #           {example_key: "hello"}
    #       obj2/
    #           {example_key: ["hello", "world"]}
    #
    # Commonly this happens when a manifest has rows of unique instances of a child
    # object but retains a reference to a parent object, e.g. rows of information
    # about unique cells that were all generated by the same algorithm, whose
    # information is stored in a column of each cell row. This can result in some
    # files (which only have one cell) getting a single string while other files
    # (which have more than one cell) getting a list of the same string repeated.
    # Why spend all this time reducing/collapsing the metadata? Besides sparing users
    # from calling `obj2.meta["example_key"][0]` every time they want the value, and
    # besides standardizing the metadata API, the biggest reason is that S3 objects
    # can only carry 2KB of metadata; without this reduction step, manifests are more
    # likely to hit that limit and cause a package distribution error.
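    # To make that collapse decision concrete, a purely illustrative example of the
    # check applied later in this function (the values below are made up, not from a
    # real manifest):
    #
    #   joined_values = ["algo-v1", "algo-v1", "algo-v1"]
    #   joined_values.count(joined_values[0]) == len(joined_values)  # True -> collapse to "algo-v1"
    #
    #   joined_values = ["cell-1", "cell-2"]
    #   joined_values.count(joined_values[0]) == len(joined_values)  # False -> keep the full list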
    metadata_reduction_map = {index_col: True for index_col in metadata_columns}

    # Set all files
    with tqdm(
        total=len(filepath_columns) * len(relative_manifest),
        desc="Constructing package",
    ) as pbar:
        for col in filepath_columns:
            # Update values to the logical key as they are set
            for i, val in enumerate(relative_manifest[col].values):
                # Fully resolve the path
                physical_key = Path(val).expanduser().resolve()

                # Try creating a logical key from the path of the file relative to the
                # step's local staging root.
                #
                # Ex:
                #   step_pkg_root = "local_staging/raw"
                #   physical_key = "local_staging/raw/images/some_file.tiff"
                #   produced logical_key = "images/some_file.tiff"
                try:
                    logical_key = str(
                        file_utils._filepath_rel2abs(physical_key).relative_to(
                            file_utils._filepath_rel2abs(step_pkg_root)
                        )
                    )
                except ValueError:
                    # Create a logical key by merging the column name and filename,
                    # removing any obvious "path"-type words from the column name.
                    #
                    # Ex:
                    #   physical_key = "/some/abs/path/some_file.tiff"
                    #   column = "SourceReadPath"
                    #   produced logical_key = "source/some_file.tiff"
                    stripped_col = col.lower().replace("read", "").replace("path", "")
                    logical_key = f"{stripped_col}/{physical_key.name}"

                if physical_key.is_file():
                    relative_manifest[col].values[i] = logical_key

                    # Create metadata dictionary to attach to object
                    meta = {}
                    for meta_col in metadata_columns:
                        # Short reference to current metadata value
                        v = relative_manifest[meta_col].values[i]

                        # Enforce a simple JSON-serializable type.
                        # First check whether the value is a numpy value; it likely is
                        # because pandas relies on numpy. All numpy types have the
                        # "dtype" attribute and can be cast to a Python type with the
                        # `item` function, details here:
                        # https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.item.html
                        if hasattr(v, "dtype"):
                            v = v.item()

                        # Cast to JSON serializable type
                        v = file_utils.make_json_serializable(
                            v, f"Value from column: {meta_col}, index: {i}"
                        )

                        # Update metadata with value
                        meta[meta_col] = [v]

                    # Check if object already exists
                    if logical_key in pkg:
                        # Join the two meta dictionaries
                        joined_meta = {}
                        for meta_col, curr_v in pkg[logical_key].meta.items():
                            # Join the values for the current iteration of the metadata
                            joined_values = [*curr_v, *meta[meta_col]]

                            # Only check whether the metadata at this index can be
                            # reduced while that is still undecided, i.e. while the
                            # boolean in the metadata reduction map is still True
                            # (this column can, so far, be reduced or collapsed).
                            # This also ensures we never override an earlier False
                            # value: if we first encounter an instance of the metadata
                            # that should not be reduced and later one that could be,
                            # this check prevents flipping back, keeping metadata
                            # access uniform across the dataset.
                            if metadata_reduction_map[meta_col]:
                                # Update the metadata reduction map for the current
                                # column: as long as it is still considered reducible
                                # (i.e. we entered this block), check whether it can
                                # still be reduced after the most recent addition.
                                # "We can reduce the metadata if the count of the
                                # first value (or any value) is the same as the length
                                # of the entire list of values."
                                # This runs quickly for small lists, as seen here:
                                # https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical
                                metadata_reduction_map[meta_col] = (
                                    joined_values.count(joined_values[0]) == len(joined_values)
                                )

                            # Attach the joined values to the joined metadata
                            joined_meta[meta_col] = joined_values

                        # Update meta
                        pkg[logical_key].set_meta(joined_meta)

                    # Object didn't already exist, simply set it
                    else:
                        pkg.set(logical_key, physical_key, meta)

                    # Update associates
                    try:
                        associates[i][col] = logical_key
                    except IndexError:
                        associates.append({col: logical_key})
                else:
                    relative_manifest[col].values[i] = logical_key
                    pkg.set_dir(logical_key, physical_key)

                # Update progress bar
                pbar.update()

    # Clean up package metadata
    pkg = _recursive_clean(pkg, metadata_reduction_map)

    # Attach associates
    for i, associate_mapping in tqdm(
        enumerate(associates),
        desc="Creating associate metadata blocks",
    ):
        for col, lk in associate_mapping.items():
            # Having dictionary expansion in this order means that "associates" will
            # override any prior existing `associates` key; this is assumed safe
            # because attach_associates was set to True.
            pkg[lk].set_meta({**pkg[lk].meta, "associates": associate_mapping})

    return pkg, relative_manifest
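# A minimal, hypothetical usage sketch of create_package. The column names, paths, and
# local staging layout below are illustrative only and assume the listed files exist
# on disk; the final collapsed-metadata shape also depends on _recursive_clean, which
# is defined elsewhere in this module.
if __name__ == "__main__":
    example_manifest = pd.DataFrame(
        {
            "filepath": [
                "local_staging/raw/images/a.tiff",
                "local_staging/raw/images/b.tiff",
            ],
            "Algorithm": ["algo-v1", "algo-v1"],
        }
    )
    example_pkg, example_relative_manifest = create_package(
        example_manifest,
        step_pkg_root=Path("local_staging/raw"),
        filepath_columns=["filepath"],
        metadata_columns=["Algorithm"],
    )
    # Each file lands under the "images/" logical prefix, and the "filepath" column of
    # example_relative_manifest now holds those logical keys.
    print(example_pkg)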