Example #1
    def test_large_copy(self, num_parts=LAMBDA_PARALLELIZATION_FACTOR + 1):
        test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
        test_src_key = infra.generate_test_key()
        s3_client = boto3.client("s3")
        mpu = s3_client.create_multipart_upload(Bucket=test_bucket,
                                                Key=test_src_key)

        with ThreadPoolExecutor(max_workers=8) as tpe:
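            # upload the parts concurrently; each upload_part call returns a
            # (part_number, etag) tuple, which tpe.map yields back in order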
            parts_futures = tpe.map(
                lambda part_id: TestS3ParallelCopy.upload_part(
                    test_bucket, test_src_key, mpu['UploadId'], part_id),
                range(1, num_parts + 1))

        parts = [
            dict(ETag=part_etag, PartNumber=part_id)
            for part_id, part_etag in parts_futures
        ]

        src_etag = s3_client.complete_multipart_upload(
            Bucket=test_bucket,
            Key=test_src_key,
            MultipartUpload=dict(Parts=parts),
            UploadId=mpu['UploadId'],
        )['ETag'].strip('"')

        test_dst_key = infra.generate_test_key()
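        # kick off the parallel S3 copy step function, then verify that the
        # destination object's ETag matches the source's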
        state = s3copyclient.copy_sfn_event(test_bucket, test_src_key,
                                            test_bucket, test_dst_key)
        execution_id = str(uuid.uuid4())
        stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}",
                                            execution_id, state)

        self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
Example #2
def parallel_copy(replica: Replica, source_bucket: str, source_key: str,
                  destination_bucket: str, destination_key: str) -> str:
    log.debug(
        f"Copying key {source_key} from bucket {source_bucket} to "
        f"key {destination_key} in bucket {destination_bucket}")

    if replica == Replica.aws:
        state = s3copyclient.copy_sfn_event(
            source_bucket,
            source_key,
            destination_bucket,
            destination_key,
        )
        state_machine_name_template = "dss-s3-copy-sfn-{stage}"
    elif replica == Replica.gcp:
        state = gscopyclient.copy_sfn_event(source_bucket, source_key,
                                            destination_bucket,
                                            destination_key)
        state_machine_name_template = "dss-gs-copy-sfn-{stage}"
    else:
        raise ValueError("Unsupported replica")

    execution_id = get_execution_id()
    stepfunctions.step_functions_invoke(state_machine_name_template,
                                        execution_id, state)
    return execution_id
Example #3
def post(uuid: str,
         json_request_body: dict,
         replica: str,
         version: str = None):

    assert replica is not None

    bundle = get_bundle(uuid, Replica[replica], version)
    execution_id = get_execution_id()

    sfn_input = {
        "dss_bucket": dss_bucket,
        "bundle": uuid,
        "version": bundle["bundle"]["version"],
        "replica": replica,
        "execution_name": execution_id
    }
    if "destination" in json_request_body:
        sfn_input["bucket"] = json_request_body["destination"]

    if "email" in json_request_body:
        sfn_input["email"] = json_request_body["email"]

    put_status_started(execution_id)

    stepfunctions.step_functions_invoke(STATE_MACHINE_NAME_TEMPLATE,
                                        execution_id, sfn_input)
    return jsonify(dict(checkout_job_id=execution_id)), requests.codes.ok
Example #4
def launch_exec(event, context):
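    # triggered via SNS: the notification message carries the scalability-test
    # run id and the execution id used to start the state machine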
    msg = json.loads(event["Records"][0]["Sns"]["Message"])
    run_id = msg["run_id"]
    execution_id = msg["execution_id"]
    nextBatch = roundTime()
    test_input = {
        "execution_id": execution_id,
        "test_run_id": run_id,
        "batch": nextBatch.isoformat() + 'Z'
    }
    logger.debug("Starting execution %s", execution_id)
    stepfunctions.step_functions_invoke("dss-scalability-test-{stage}", execution_id, test_input)
Example #5
def start_bundle_checkout(
    replica: Replica,
    bundle_uuid: str,
    bundle_version: typing.Optional[str],
    dst_bucket: str,
    email_address: typing.Optional[str] = None,
    *,
    sts_bucket: typing.Optional[str] = None,
) -> str:
    """
    Starts a bundle checkout.

    :param bundle_uuid: The UUID of the bundle to check out.
    :param bundle_version: The version of the bundle to check out.  If this is not provided, the latest version of the
                           bundle is checked out.
    :param replica: The replica to execute the checkout in.
    :param dst_bucket: Check out to this bucket.
    :param email_address: If provided, send a message to this email address with the status of the checkout.
    :param sts_bucket: If provided, write the status of the checkout to this bucket.  If not provided, write the status
                       to the default checkout bucket for the replica.
    :return: The execution ID of the request.
    """

    bundle = get_bundle_manifest(bundle_uuid, replica, bundle_version)
    if bundle is None:
        raise BundleNotFoundError()
    execution_id = get_execution_id()
    if sts_bucket is None:
        sts_bucket = replica.checkout_bucket

    sfn_input = {
        EventConstants.DSS_BUCKET: replica.bucket,
        EventConstants.STATUS_BUCKET: sts_bucket,
        EventConstants.BUNDLE_UUID: bundle_uuid,
        EventConstants.BUNDLE_VERSION: bundle[BundleMetadata.VERSION],
        EventConstants.REPLICA: replica.name,
        EventConstants.EXECUTION_ID: execution_id
    }
    if dst_bucket is not None:
        sfn_input[EventConstants.DST_BUCKET] = dst_bucket

    if email_address is not None:
        sfn_input[EventConstants.EMAIL] = email_address

    mark_bundle_checkout_started(execution_id, replica, sts_bucket)

    stepfunctions.step_functions_invoke(STATE_MACHINE_NAME_TEMPLATE,
                                        execution_id, sfn_input)
    return execution_id
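
A minimal usage sketch (not part of the source) showing how start_bundle_checkout might be called; the bundle UUID, bucket name, and email address below are illustrative placeholders:

# check out the latest version of a bundle from the AWS replica into a
# caller-supplied bucket, asking for an email notification when it completes
execution_id = start_bundle_checkout(
    Replica.aws,
    "11111111-2222-3333-4444-555555555555",  # placeholder bundle UUID
    None,  # bundle_version=None checks out the latest version, per the docstring
    dst_bucket="example-checkout-bucket",
    email_address="user@example.com",
)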
Example #6
    def test_zero_copy(self):
        test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
        test_src_key = infra.generate_test_key()
        s3_blobstore = Config.get_blobstore_handle(Replica.aws)

        with tempfile.NamedTemporaryFile(delete=True) as fh:
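            # nothing is written to the temporary file, so the uploaded source
            # object is zero bytes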
            fh.seek(0)
            s3_blobstore.upload_file_handle(test_bucket, test_src_key, fh)

        src_etag = s3_blobstore.get_cloud_checksum(test_bucket, test_src_key)

        test_dst_key = infra.generate_test_key()
        state = s3copyclient.copy_sfn_event(test_bucket, test_src_key,
                                            test_bucket, test_dst_key)
        execution_id = str(uuid.uuid4())
        stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}",
                                            execution_id, state)

        self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
Example #7
    def test_simple_copy(self):
        dest_key = infra.generate_test_key()

        state = copy_sfn_event(self.test_bucket, self.src_key,
                               self.test_bucket, dest_key)
        execution_id = str(uuid.uuid4())
        stepfunctions.step_functions_invoke("dss-gs-copy-sfn-{stage}",
                                            execution_id, state)

        # verify that the destination has the same checksum.
        src_checksum = self.gs_blobstore.get_cloud_checksum(
            self.test_bucket, self.src_key)

        @eventually(30.0, 1.0, {BlobNotFoundError, AssertionError})
        def test_output():
            dst_checksum = self.gs_blobstore.get_cloud_checksum(
                self.test_bucket, dest_key)
            self.assertEqual(src_checksum, dst_checksum)

        test_output()
Example #8
def put(uuid: str, json_request_body: dict, version: str = None):
    class CopyMode(Enum):
        NO_COPY = auto()
        COPY_INLINE = auto()
        COPY_ASYNC = auto()

    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    source_url = json_request_body['source_url']
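    # parse source_url of the form scheme://bucket/key, where scheme is s3, gs, or wasb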
    cre = re.compile("^"
                     "(?P<schema>(?:s3|gs|wasb))"
                     "://"
                     "(?P<bucket>[^/]+)"
                     "/"
                     "(?P<key>.+)"
                     "$")
    mobj = cre.match(source_url)
    if mobj and mobj.group('schema') == "s3":
        replica = Replica.aws
    elif mobj and mobj.group('schema') == "gs":
        replica = Replica.gcp
    else:
        # mobj is None when source_url does not match the pattern at all
        schema = mobj.group('schema') if mobj else source_url
        raise DSSException(requests.codes.bad_request, "unknown_source_schema",
                           f"source_url schema {schema} not supported")

    handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    dst_bucket = replica.bucket

    src_bucket = mobj.group('bucket')
    src_key = mobj.group('key')

    metadata = handle.get_user_metadata(src_bucket, src_key)
    size = handle.get_size(src_bucket, src_key)
    content_type = handle.get_content_type(src_bucket, src_key)

    # format all the checksums so they're lower-case.
    for metadata_spec in HCABlobStore.MANDATORY_METADATA.values():
        if metadata_spec['downcase']:
            keyname = typing.cast(str, metadata_spec['keyname'])
            metadata[keyname] = metadata[keyname].lower()

    # what's the target object name for the actual data?
    dst_key = ("blobs/" + ".".join((
        metadata['hca-dss-sha256'],
        metadata['hca-dss-sha1'],
        metadata['hca-dss-s3_etag'],
        metadata['hca-dss-crc32c'],
    ))).lower()

    # does it exist? if so, we can skip the copy part.
    copy_mode = CopyMode.COPY_INLINE
    try:
        if hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata):
            copy_mode = CopyMode.NO_COPY
    except BlobNotFoundError:
        pass

    # build the json document for the file metadata.
    file_metadata = {
        FileMetadata.FORMAT: FileMetadata.FILE_FORMAT_VERSION,
        FileMetadata.BUNDLE_UUID: json_request_body['bundle_uuid'],
        FileMetadata.CREATOR_UID: json_request_body['creator_uid'],
        FileMetadata.VERSION: version,
        FileMetadata.CONTENT_TYPE: content_type,
        FileMetadata.SIZE: size,
        FileMetadata.CRC32C: metadata['hca-dss-crc32c'],
        FileMetadata.S3_ETAG: metadata['hca-dss-s3_etag'],
        FileMetadata.SHA1: metadata['hca-dss-sha1'],
        FileMetadata.SHA256: metadata['hca-dss-sha256'],
    }
    file_metadata_json = json.dumps(file_metadata)

    if copy_mode != CopyMode.NO_COPY and size > ASYNC_COPY_THRESHOLD:
        copy_mode = CopyMode.COPY_ASYNC

    if copy_mode == CopyMode.COPY_ASYNC:
        if replica == Replica.aws:
            state = s3copyclient.copy_write_metadata_sfn_event(
                src_bucket,
                src_key,
                dst_bucket,
                dst_key,
                uuid,
                version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-s3-copy-write-metadata-sfn-{stage}"
        elif replica == Replica.gcp:
            state = gscopyclient.copy_write_metadata_sfn_event(
                src_bucket,
                src_key,
                dst_bucket,
                dst_key,
                uuid,
                version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-gs-copy-write-metadata-sfn-{stage}"
        else:
            raise ValueError("Unhandled replica")

        execution_id = str(uuid4())
        stepfunctions.step_functions_invoke(state_machine_name_template,
                                            execution_id, state)
        return jsonify(dict(task_id=execution_id,
                            version=version)), requests.codes.accepted
    elif copy_mode == CopyMode.COPY_INLINE:
        handle.copy(src_bucket, src_key, dst_bucket, dst_key)

        # verify the copy was done correctly.
        assert hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata)

    try:
        write_file_metadata(handle, dst_bucket, uuid, version,
                            file_metadata_json)
        status_code = requests.codes.created
    except BlobAlreadyExistsError:
        # fetch the file metadata, compare it to what we have.
        existing_file_metadata = json.loads(
            handle.get(dst_bucket,
                       "files/{}.{}".format(uuid, version)).decode("utf-8"))
        if existing_file_metadata != file_metadata:
            raise DSSException(
                requests.codes.conflict, "file_already_exists",
                f"file with UUID {uuid} and version {version} already exists")
        status_code = requests.codes.ok

    return jsonify(dict(version=version)), status_code