def test_large_copy(self, num_parts=LAMBDA_PARALLELIZATION_FACTOR + 1):
    test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
    test_src_key = infra.generate_test_key()
    s3_client = boto3.client("s3")

    # Build a multipart source object with more parts than LAMBDA_PARALLELIZATION_FACTOR.
    mpu = s3_client.create_multipart_upload(Bucket=test_bucket, Key=test_src_key)

    # Upload the parts in parallel.
    with ThreadPoolExecutor(max_workers=8) as tpe:
        parts_futures = tpe.map(
            lambda part_id: TestS3ParallelCopy.upload_part(test_bucket, test_src_key, mpu['UploadId'], part_id),
            range(1, num_parts + 1))

    parts = [dict(ETag=part_etag, PartNumber=part_id)
             for part_id, part_etag in parts_futures]

    src_etag = s3_client.complete_multipart_upload(
        Bucket=test_bucket,
        Key=test_src_key,
        MultipartUpload=dict(Parts=parts),
        UploadId=mpu['UploadId'],
    )['ETag'].strip('"')

    # Kick off the S3 parallel-copy state machine and verify the destination ETag.
    test_dst_key = infra.generate_test_key()
    state = s3copyclient.copy_sfn_event(test_bucket, test_src_key, test_bucket, test_dst_key)
    execution_id = str(uuid.uuid4())
    stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}", execution_id, state)

    self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
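# Both test_large_copy above and test_zero_copy below finish with
# self._check_dst_key_etag(...), a helper that is not part of this listing.
# Below is a minimal sketch of such a helper (a method on the test class),
# assuming it polls the destination key until it exists and then compares
# ETags; the real helper's retry strategy and assertions may differ.
import time

import boto3
from botocore.exceptions import ClientError

def _check_dst_key_etag(self, bucket: str, key: str, expected_etag: str, timeout: float = 60.0):
    """Hypothetical sketch: wait for the copied object to appear, then compare ETags."""
    s3_client = boto3.client("s3")
    deadline = time.time() + timeout
    while True:
        try:
            resp = s3_client.head_object(Bucket=bucket, Key=key)
            break
        except ClientError:
            if time.time() > deadline:
                raise
            time.sleep(1.0)
    self.assertEqual(resp['ETag'].strip('"'), expected_etag)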
def parallel_copy(replica: Replica, source_bucket: str, source_key: str,
                  destination_bucket: str, destination_key: str) -> str:
    log.debug(f"Copy file from bucket {source_bucket} with key {source_key} to "
              f"bucket {destination_bucket} destination file: {destination_key}")

    # Dispatch to the S3 or GS copy state machine, depending on the replica.
    if replica == Replica.aws:
        state = s3copyclient.copy_sfn_event(
            source_bucket, source_key,
            destination_bucket, destination_key,
        )
        state_machine_name_template = "dss-s3-copy-sfn-{stage}"
    elif replica == Replica.gcp:
        state = gscopyclient.copy_sfn_event(source_bucket, source_key, destination_bucket, destination_key)
        state_machine_name_template = "dss-gs-copy-sfn-{stage}"
    else:
        raise ValueError("Unsupported replica")

    execution_id = get_execution_id()
    stepfunctions.step_functions_invoke(state_machine_name_template, execution_id, state)
    return execution_id
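# All of these call sites delegate to stepfunctions.step_functions_invoke with a
# state machine name template such as "dss-s3-copy-sfn-{stage}", an execution
# name, and a JSON-serializable input. A minimal sketch of what such a helper
# could look like with boto3 is shown below; it is not the DSS implementation,
# and the DSS_DEPLOYMENT_STAGE environment variable and the ARN construction are
# assumptions.
import json
import os

import boto3

def step_functions_invoke_sketch(state_machine_name_template: str, execution_name: str, input_data) -> dict:
    # Fill in the "{stage}" placeholder from the deployment stage.
    stage = os.environ["DSS_DEPLOYMENT_STAGE"]
    state_machine_name = state_machine_name_template.format(stage=stage)

    # Build the state machine ARN for the current account and region.
    region = boto3.session.Session().region_name
    account_id = boto3.client("sts").get_caller_identity()["Account"]
    state_machine_arn = f"arn:aws:states:{region}:{account_id}:stateMachine:{state_machine_name}"

    # Start the execution; Step Functions requires the input as a JSON string.
    sfn = boto3.client("stepfunctions")
    return sfn.start_execution(
        stateMachineArn=state_machine_arn,
        name=execution_name,
        input=json.dumps(input_data),
    )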
def post(uuid: str, json_request_body: dict, replica: str, version: str = None):
    assert replica is not None
    bundle = get_bundle(uuid, Replica[replica], version)
    execution_id = get_execution_id()

    # dss_bucket and STATE_MACHINE_NAME_TEMPLATE are defined elsewhere in this module.
    sfn_input = {
        "dss_bucket": dss_bucket,
        "bundle": uuid,
        "version": bundle["bundle"]["version"],
        "replica": replica,
        "execution_name": execution_id,
    }
    # Optional fields from the request body.
    if "destination" in json_request_body:
        sfn_input["bucket"] = json_request_body["destination"]
    if "email" in json_request_body:
        sfn_input["email"] = json_request_body["email"]

    put_status_started(execution_id)
    stepfunctions.step_functions_invoke(STATE_MACHINE_NAME_TEMPLATE, execution_id, sfn_input)

    return jsonify(dict(checkout_job_id=execution_id)), requests.codes.ok
def launch_exec(event, context):
    # Triggered by an SNS notification describing the scalability test run.
    msg = json.loads(event["Records"][0]["Sns"]["Message"])
    run_id = msg["run_id"]
    execution_id = msg["execution_id"]
    nextBatch = roundTime()
    test_input = {
        "execution_id": execution_id,
        "test_run_id": run_id,
        "batch": nextBatch.isoformat() + 'Z',
    }
    logger.debug("Starting execution %s", execution_id)
    stepfunctions.step_functions_invoke("dss-scalability-test-{stage}", execution_id, test_input)
def start_bundle_checkout(
        replica: Replica,
        bundle_uuid: str,
        bundle_version: typing.Optional[str],
        dst_bucket: str,
        email_address: typing.Optional[str] = None,
        *,
        sts_bucket: typing.Optional[str] = None,
) -> str:
    """
    Starts a bundle checkout.

    :param bundle_uuid: The UUID of the bundle to check out.
    :param bundle_version: The version of the bundle to check out.  If this is not provided, the latest version of
                           the bundle is checked out.
    :param replica: The replica to execute the checkout in.
    :param dst_bucket: Check out to this bucket.
    :param email_address: If provided, send a message to this email address with the status of the checkout.
    :param sts_bucket: If provided, write the status of the checkout to this bucket.  If not provided, write the
                       status to the default checkout bucket for the replica.
    :return: The execution ID of the request.
    """
    bundle = get_bundle_manifest(bundle_uuid, replica, bundle_version)
    if bundle is None:
        raise BundleNotFoundError()
    execution_id = get_execution_id()
    if sts_bucket is None:
        sts_bucket = replica.checkout_bucket

    sfn_input = {
        EventConstants.DSS_BUCKET: replica.bucket,
        EventConstants.STATUS_BUCKET: sts_bucket,
        EventConstants.BUNDLE_UUID: bundle_uuid,
        EventConstants.BUNDLE_VERSION: bundle[BundleMetadata.VERSION],
        EventConstants.REPLICA: replica.name,
        EventConstants.EXECUTION_ID: execution_id,
    }
    if dst_bucket is not None:
        sfn_input[EventConstants.DST_BUCKET] = dst_bucket
    if email_address is not None:
        sfn_input[EventConstants.EMAIL] = email_address

    mark_bundle_checkout_started(execution_id, replica, sts_bucket)

    stepfunctions.step_functions_invoke(STATE_MACHINE_NAME_TEMPLATE, execution_id, sfn_input)
    return execution_id
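# Example call of start_bundle_checkout. The import paths, bucket name, and UUID
# below are placeholders/assumptions and do not appear in this listing; the
# returned execution_id identifies the checkout for later status queries.
from dss import Replica                                    # assumed import path
from dss.storage.checkout import start_bundle_checkout     # assumed import path

execution_id = start_bundle_checkout(
    replica=Replica.aws,
    bundle_uuid="011c7340-9b3c-4d62-bf49-090d79daf198",    # placeholder UUID
    bundle_version=None,                                   # None -> latest version
    dst_bucket="example-checkout-bucket",                  # placeholder bucket
    email_address="user@example.com",                      # optional notification
)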
def test_zero_copy(self):
    test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
    test_src_key = infra.generate_test_key()
    s3_blobstore = Config.get_blobstore_handle(Replica.aws)

    # Upload an empty source object.
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.seek(0)
        s3_blobstore.upload_file_handle(test_bucket, test_src_key, fh)

    src_etag = s3_blobstore.get_cloud_checksum(test_bucket, test_src_key)

    # Copy it via the S3 copy state machine and verify the destination ETag.
    test_dst_key = infra.generate_test_key()
    state = s3copyclient.copy_sfn_event(test_bucket, test_src_key, test_bucket, test_dst_key)
    execution_id = str(uuid.uuid4())
    stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}", execution_id, state)

    self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
def test_simple_copy(self):
    dest_key = infra.generate_test_key()
    state = copy_sfn_event(self.test_bucket, self.src_key, self.test_bucket, dest_key)
    execution_id = str(uuid.uuid4())
    stepfunctions.step_functions_invoke("dss-gs-copy-sfn-{stage}", execution_id, state)

    # verify that the destination has the same checksum.
    src_checksum = self.gs_blobstore.get_cloud_checksum(self.test_bucket, self.src_key)

    @eventually(30.0, 1.0, {BlobNotFoundError, AssertionError})
    def test_output():
        dst_checksum = self.gs_blobstore.get_cloud_checksum(self.test_bucket, dest_key)
        self.assertEqual(src_checksum, dst_checksum)

    test_output()
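# test_output above is wrapped in @eventually(timeout, interval, errors), a retry
# decorator from the test infrastructure. A minimal sketch of a decorator with
# that signature is shown below; the real implementation may differ.
import functools
import time
import typing

def eventually(timeout: float, interval: float, errors: typing.Set[type]):
    """Retry the decorated callable until it stops raising one of `errors`,
    sleeping `interval` seconds between attempts, for at most `timeout` seconds."""
    def decorate(func):
        @functools.wraps(func)
        def call(*args, **kwargs):
            deadline = time.time() + timeout
            while True:
                try:
                    return func(*args, **kwargs)
                except tuple(errors):
                    if time.time() > deadline:
                        raise
                    time.sleep(interval)
        return call
    return decorate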
def put(uuid: str, json_request_body: dict, version: str = None):
    class CopyMode(Enum):
        NO_COPY = auto()
        COPY_INLINE = auto()
        COPY_ASYNC = auto()

    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    source_url = json_request_body['source_url']
    cre = re.compile(
        "^"
        "(?P<schema>(?:s3|gs|wasb))"
        "://"
        "(?P<bucket>[^/]+)"
        "/"
        "(?P<key>.+)"
        "$")
    mobj = cre.match(source_url)
    if mobj and mobj.group('schema') == "s3":
        replica = Replica.aws
    elif mobj and mobj.group('schema') == "gs":
        replica = Replica.gcp
    else:
        schema = mobj.group('schema')
        raise DSSException(
            requests.codes.bad_request,
            "unknown_source_schema",
            f"source_url schema {schema} not supported")

    handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    dst_bucket = replica.bucket

    src_bucket = mobj.group('bucket')
    src_key = mobj.group('key')

    metadata = handle.get_user_metadata(src_bucket, src_key)
    size = handle.get_size(src_bucket, src_key)
    content_type = handle.get_content_type(src_bucket, src_key)

    # format all the checksums so they're lower-case.
    for metadata_spec in HCABlobStore.MANDATORY_METADATA.values():
        if metadata_spec['downcase']:
            keyname = typing.cast(str, metadata_spec['keyname'])
            metadata[keyname] = metadata[keyname].lower()

    # what's the target object name for the actual data?
    dst_key = ("blobs/" + ".".join((
        metadata['hca-dss-sha256'],
        metadata['hca-dss-sha1'],
        metadata['hca-dss-s3_etag'],
        metadata['hca-dss-crc32c'],
    ))).lower()

    # does it exist? if so, we can skip the copy part.
    copy_mode = CopyMode.COPY_INLINE
    try:
        if hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata):
            copy_mode = CopyMode.NO_COPY
    except BlobNotFoundError:
        pass

    # build the json document for the file metadata.
    file_metadata = {
        FileMetadata.FORMAT: FileMetadata.FILE_FORMAT_VERSION,
        FileMetadata.BUNDLE_UUID: json_request_body['bundle_uuid'],
        FileMetadata.CREATOR_UID: json_request_body['creator_uid'],
        FileMetadata.VERSION: version,
        FileMetadata.CONTENT_TYPE: content_type,
        FileMetadata.SIZE: size,
        FileMetadata.CRC32C: metadata['hca-dss-crc32c'],
        FileMetadata.S3_ETAG: metadata['hca-dss-s3_etag'],
        FileMetadata.SHA1: metadata['hca-dss-sha1'],
        FileMetadata.SHA256: metadata['hca-dss-sha256'],
    }
    file_metadata_json = json.dumps(file_metadata)

    if copy_mode != CopyMode.NO_COPY and size > ASYNC_COPY_THRESHOLD:
        copy_mode = CopyMode.COPY_ASYNC

    if copy_mode == CopyMode.COPY_ASYNC:
        if replica == Replica.aws:
            state = s3copyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-s3-copy-write-metadata-sfn-{stage}"
        elif replica == Replica.gcp:
            state = gscopyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-gs-copy-write-metadata-sfn-{stage}"
        else:
            raise ValueError("Unhandled replica")

        execution_id = str(uuid4())
        stepfunctions.step_functions_invoke(state_machine_name_template, execution_id, state)
        return jsonify(dict(task_id=execution_id, version=version)), requests.codes.accepted
    elif copy_mode == CopyMode.COPY_INLINE:
        handle.copy(src_bucket, src_key, dst_bucket, dst_key)

        # verify the copy was done correctly.
        assert hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata)

    try:
        write_file_metadata(handle, dst_bucket, uuid, version, file_metadata_json)
        status_code = requests.codes.created
    except BlobAlreadyExistsError:
        # fetch the file metadata, compare it to what we have.
        existing_file_metadata = json.loads(
            handle.get(dst_bucket, "files/{}.{}".format(uuid, version)).decode("utf-8"))
        if existing_file_metadata != file_metadata:
            raise DSSException(
                requests.codes.conflict,
                "file_already_exists",
                f"file with UUID {uuid} and version {version} already exists")
        status_code = requests.codes.ok

    return jsonify(dict(version=version)), status_code
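# In put() above, the destination blob key is derived entirely from the object's
# checksums, so identical content always maps to the same "blobs/..." key and the
# copy can be skipped when that key already verifies. A small illustration with
# placeholder checksum values:
metadata = {
    'hca-dss-sha256': "ABC123SHA256",
    'hca-dss-sha1': "DEF456SHA1",
    'hca-dss-s3_etag': "0123ETAG",
    'hca-dss-crc32c': "89AB",
}
dst_key = ("blobs/" + ".".join((
    metadata['hca-dss-sha256'],
    metadata['hca-dss-sha1'],
    metadata['hca-dss-s3_etag'],
    metadata['hca-dss-crc32c'],
))).lower()
# dst_key == "blobs/abc123sha256.def456sha1.0123etag.89ab"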