def should_log_s3_batch_response(
    self, head_object_mock: MagicMock, create_job_mock: MagicMock
) -> None:
    # Given
    create_job_mock.return_value = response = {"JobId": "Some Response"}
    expected_response_log = json.dumps({S3_BATCH_RESPONSE_KEY: response})
    head_object_mock.return_value = {"ETag": any_etag()}

    with Dataset() as dataset, patch.object(
        self.logger, "debug"
    ) as logger_mock, patch("backend.import_dataset.task.smart_open"):
        # When
        lambda_handler(
            {
                DATASET_ID_KEY: dataset.dataset_id,
                DATASET_PREFIX_KEY: dataset.dataset_prefix,
                METADATA_URL_KEY: any_s3_url(),
                VERSION_ID_KEY: any_dataset_version_id(),
            },
            any_lambda_context(),
        )

        # Then
        logger_mock.assert_any_call(expected_response_log)
def should_log_assets_added_to_manifest(
    self,
    head_object_mock: MagicMock,
    subtests: SubTests,
) -> None:
    # Given
    with Dataset() as dataset:
        version_id = any_dataset_version_id()
        asset_id = (
            f"{DATASET_ID_PREFIX}{dataset.dataset_id}"
            f"{DB_KEY_SEPARATOR}{VERSION_ID_PREFIX}{version_id}"
        )
        head_object_mock.return_value = {"ETag": any_etag()}

        with ProcessingAsset(
            asset_id=asset_id, multihash=None, url=any_s3_url()
        ) as metadata_processing_asset, ProcessingAsset(
            asset_id=asset_id,
            multihash=any_hex_multihash(),
            url=any_s3_url(),
        ) as processing_asset, patch.object(
            self.logger, "debug"
        ) as logger_mock, patch(
            "backend.import_dataset.task.smart_open"
        ), patch(
            "backend.import_dataset.task.S3CONTROL_CLIENT.create_job"
        ):
            expected_asset_log = dumps({"Adding file to manifest": processing_asset.url})
            expected_metadata_log = dumps(
                {"Adding file to manifest": metadata_processing_asset.url}
            )

            # When
            lambda_handler(
                {
                    DATASET_ID_KEY: dataset.dataset_id,
                    DATASET_PREFIX_KEY: dataset.dataset_prefix,
                    METADATA_URL_KEY: any_s3_url(),
                    VERSION_ID_KEY: version_id,
                },
                any_lambda_context(),
            )

            # Then
            with subtests.test():
                logger_mock.assert_any_call(expected_asset_log)
            with subtests.test():
                logger_mock.assert_any_call(expected_metadata_log)
def should_log_schema_validation_warning(self, validate_schema_mock: MagicMock) -> None:
    # Given
    error_message = "Some error message"
    validate_schema_mock.side_effect = ValidationError(error_message)
    expected_log = dumps({ERROR_KEY: error_message})

    with patch.object(self.logger, "warning") as logger_mock:
        # When
        lambda_handler(
            {
                METADATA_URL_KEY: any_s3_url(),
                VERSION_ID_KEY: any_dataset_version_id(),
            },
            any_lambda_context(),
        )

        # Then
        logger_mock.assert_any_call(expected_log)
def should_return_required_property_error_when_missing_version_id() -> None:
    # When
    response = lambda_handler(
        {DATASET_ID_KEY: any_dataset_id(), METADATA_URL_KEY: any_s3_url()},
        any_lambda_context(),
    )

    # Then
    assert response == {ERROR_MESSAGE_KEY: f"'{VERSION_ID_KEY}' is a required property"}
def should_log_payload(self, head_object_mock: MagicMock) -> None:
    # Given
    head_object_mock.return_value = {"ETag": any_etag()}

    with patch(
        "backend.import_dataset.task.S3CONTROL_CLIENT.create_job"
    ), Dataset() as dataset, patch.object(
        self.logger, "debug"
    ) as logger_mock, patch(
        "backend.import_dataset.task.validate"
    ), patch(
        "backend.import_dataset.task.smart_open"
    ):
        event = {
            DATASET_ID_KEY: dataset.dataset_id,
            DATASET_PREFIX_KEY: dataset.dataset_prefix,
            METADATA_URL_KEY: any_s3_url(),
            VERSION_ID_KEY: any_dataset_version_id(),
        }
        expected_payload_log = dumps({EVENT_KEY: event})

        # When
        lambda_handler(event, any_lambda_context())

        # Then
        logger_mock.assert_any_call(expected_payload_log)
def should_batch_copy_files_to_storage(
    s3_client: S3Client,
    s3_control_client: S3ControlClient,
    sts_client: STSClient,
    subtests: SubTests,
) -> None:  # pylint: disable=too-many-locals
    # Given two metadata files with an asset each, all within a prefix
    original_prefix = any_safe_filename()

    root_asset_name = any_asset_name()
    root_asset_filename = any_safe_filename()
    root_asset_content = any_file_contents()
    root_asset_multihash = sha256_hex_digest_to_multihash(
        sha256(root_asset_content).hexdigest()
    )

    child_asset_name = any_asset_name()
    child_asset_filename = any_safe_filename()
    child_asset_content = any_file_contents()
    child_asset_multihash = sha256_hex_digest_to_multihash(
        sha256(child_asset_content).hexdigest()
    )

    root_metadata_filename = any_safe_filename()
    child_metadata_filename = any_safe_filename()

    with S3Object(
        BytesIO(initial_bytes=root_asset_content),
        ResourceName.STAGING_BUCKET_NAME.value,
        f"{original_prefix}/{root_asset_filename}",
    ) as root_asset_s3_object, S3Object(
        BytesIO(initial_bytes=child_asset_content),
        ResourceName.STAGING_BUCKET_NAME.value,
        f"{original_prefix}/{child_asset_filename}",
    ) as child_asset_s3_object, S3Object(
        BytesIO(
            initial_bytes=dumps(
                {
                    **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                    STAC_ASSETS_KEY: {
                        child_asset_name: {
                            STAC_HREF_KEY: child_asset_s3_object.url,
                            STAC_FILE_CHECKSUM_KEY: child_asset_multihash,
                        }
                    },
                }
            ).encode()
        ),
        ResourceName.STAGING_BUCKET_NAME.value,
        f"{original_prefix}/{child_metadata_filename}",
    ) as child_metadata_s3_object, S3Object(
        BytesIO(
            initial_bytes=dumps(
                {
                    **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                    STAC_ASSETS_KEY: {
                        root_asset_name: {
                            STAC_HREF_KEY: root_asset_s3_object.url,
                            STAC_FILE_CHECKSUM_KEY: root_asset_multihash,
                        },
                    },
                    STAC_LINKS_KEY: [
                        {STAC_HREF_KEY: child_metadata_s3_object.url, "rel": "child"}
                    ],
                }
            ).encode()
        ),
        ResourceName.STAGING_BUCKET_NAME.value,
        f"{original_prefix}/{root_metadata_filename}",
    ) as root_metadata_s3_object, Dataset() as dataset:
        version_id = any_dataset_version_id()
        asset_id = (
            f"{DATASET_ID_PREFIX}{dataset.dataset_id}"
            f"{DB_KEY_SEPARATOR}{VERSION_ID_PREFIX}{version_id}"
        )

        with ProcessingAsset(
            asset_id=asset_id, url=root_metadata_s3_object.url
        ), ProcessingAsset(
            asset_id=asset_id, url=child_metadata_s3_object.url
        ), ProcessingAsset(
            asset_id=asset_id,
            url=root_asset_s3_object.url,
            multihash=root_asset_multihash,
        ), ProcessingAsset(
            asset_id=asset_id,
            url=child_asset_s3_object.url,
            multihash=child_asset_multihash,
        ):
            # When
            try:
                response = lambda_handler(
                    {
                        DATASET_ID_KEY: dataset.dataset_id,
                        DATASET_PREFIX_KEY: dataset.dataset_prefix,
                        VERSION_ID_KEY: version_id,
                        METADATA_URL_KEY: root_metadata_s3_object.url,
                    },
                    any_lambda_context(),
                )

                account_id = sts_client.get_caller_identity()["Account"]

                metadata_copy_job_result, asset_copy_job_result = wait_for_copy_jobs(
                    response,
                    account_id,
                    s3_control_client,
                    subtests,
                )
            finally:
                # Then
                new_prefix = (
                    f"{dataset.title}{DATASET_KEY_SEPARATOR}{dataset.dataset_id}/{version_id}"
                )
                storage_bucket_prefix = (
                    f"{S3_URL_PREFIX}{ResourceName.STORAGE_BUCKET_NAME.value}/"
                )

                new_root_metadata_key = f"{new_prefix}/{root_metadata_filename}"
                expected_root_metadata = dumps(
                    {
                        **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                        STAC_ASSETS_KEY: {
                            root_asset_name: {
                                STAC_HREF_KEY: root_asset_filename,
                                STAC_FILE_CHECKSUM_KEY: root_asset_multihash,
                            },
                        },
                        STAC_LINKS_KEY: [
                            {STAC_HREF_KEY: child_metadata_filename, "rel": "child"}
                        ],
                    }
                ).encode()
                with subtests.test(msg="Root metadata content"), smart_open(
                    f"{storage_bucket_prefix}{new_root_metadata_key}"
                ) as new_root_metadata_file:
                    assert expected_root_metadata == new_root_metadata_file.read()

                with subtests.test(msg="Delete root metadata object"):
                    delete_s3_key(
                        ResourceName.STORAGE_BUCKET_NAME.value,
                        new_root_metadata_key,
                        s3_client,
                    )

                new_child_metadata_key = f"{new_prefix}/{child_metadata_filename}"
                expected_child_metadata = dumps(
                    {
                        **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                        STAC_ASSETS_KEY: {
                            child_asset_name: {
                                STAC_HREF_KEY: child_asset_filename,
                                STAC_FILE_CHECKSUM_KEY: child_asset_multihash,
                            }
                        },
                    }
                ).encode()
                with subtests.test(msg="Child metadata content"), smart_open(
                    f"{storage_bucket_prefix}{new_child_metadata_key}"
                ) as new_child_metadata_file:
                    assert expected_child_metadata == new_child_metadata_file.read()

                with subtests.test(msg="Delete child metadata object"):
                    delete_s3_key(
                        ResourceName.STORAGE_BUCKET_NAME.value,
                        new_child_metadata_key,
                        s3_client,
                    )

                # Then the root asset file is in the root prefix
                with subtests.test(msg="Delete root asset object"):
                    delete_s3_key(
                        ResourceName.STORAGE_BUCKET_NAME.value,
                        f"{new_prefix}/{root_asset_filename}",
                        s3_client,
                    )

                # Then the child asset file is in the root prefix
                with subtests.test(msg="Delete child asset object"):
                    delete_s3_key(
                        ResourceName.STORAGE_BUCKET_NAME.value,
                        f"{new_prefix}/{child_asset_filename}",
                        s3_client,
                    )

                # Cleanup
                delete_copy_job_files(
                    metadata_copy_job_result,
                    asset_copy_job_result,
                    ResourceName.STORAGE_BUCKET_NAME.value,
                    s3_client,
                    subtests,
                )