Code example #1
def get_helper(uuid: str, replica: Replica, version: str = None):
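    """
    Resolve the file identified by uuid (and optionally version) and build a response:
    GET requests either redirect to a presigned URL for the underlying blob or are
    probabilistically bounced back to the same URL with a Retry-After header; other
    methods receive an empty 200 body. Non-retry responses carry X-DSS-* metadata headers.
    """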
    handle = Config.get_blobstore_handle(replica)
    bucket = replica.bucket

    if version is None:
        # list the files and find the one that is the most recent.
        prefix = "files/{}.".format(uuid)
        for matching_file in handle.list(bucket, prefix):
            matching_file = matching_file[len(prefix):]
            if version is None or matching_file > version:
                version = matching_file

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    # retrieve the file metadata.
    try:
        file_metadata = json.loads(
            handle.get(bucket, "files/{}.{}".format(uuid,
                                                    version)).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    blob_path = "blobs/" + ".".join((
        file_metadata[FileMetadata.SHA256],
        file_metadata[FileMetadata.SHA1],
        file_metadata[FileMetadata.S3_ETAG],
        file_metadata[FileMetadata.CRC32C],
    ))

    if request.method == "GET":
        """
        Probabilistically return "Retry-After" header
        The retry-after interval can be relatively short now, but it sets up downstream
        libraries / users for success when we start integrating this with the checkout service.
        """
        if random.randint(0, 100) < REDIRECT_PROBABILITY_PERCENTS:
            response = redirect(request.url, code=301)
            headers = response.headers
            headers['Retry-After'] = RETRY_AFTER_INTERVAL
            return response

        response = redirect(
            handle.generate_presigned_GET_url(bucket, blob_path))
    else:
        response = make_response('', 200)

    headers = response.headers
    headers['X-DSS-BUNDLE-UUID'] = file_metadata[FileMetadata.BUNDLE_UUID]
    headers['X-DSS-CREATOR-UID'] = file_metadata[FileMetadata.CREATOR_UID]
    headers['X-DSS-VERSION'] = version
    headers['X-DSS-CONTENT-TYPE'] = file_metadata[FileMetadata.CONTENT_TYPE]
    headers['X-DSS-SIZE'] = file_metadata[FileMetadata.SIZE]
    headers['X-DSS-CRC32C'] = file_metadata[FileMetadata.CRC32C]
    headers['X-DSS-S3-ETAG'] = file_metadata[FileMetadata.S3_ETAG]
    headers['X-DSS-SHA1'] = file_metadata[FileMetadata.SHA1]
    headers['X-DSS-SHA256'] = file_metadata[FileMetadata.SHA256]

    return response
Code example #2
def delete(uuid: str,
           replica: str,
           json_request_body: dict,
           version: str = None):
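    """
    Tombstone a bundle (or a single bundle version), provided the caller is an admin
    user. Returns 200 once the tombstone is written, 404 if no matching bundle exists,
    and 409 if a conflicting tombstone is already present.
    """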
    email = request.token_info['email']

    if email not in ADMIN_USER_EMAILS:
        raise DSSException(
            requests.codes.forbidden,
            "forbidden",
            f"You can't delete bundles with these credentials!",
        )

    uuid = uuid.lower()
    version = datetime_to_version_format(
        iso8601.parse_date(version)) if version else None

    tombstone_id = TombstoneID(uuid=uuid, version=version)
    bundle_prefix = tombstone_id.to_key_prefix()
    tombstone_object_data = _create_tombstone_data(
        email=email,
        reason=json_request_body.get('reason'),
        version=version,
    )

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket

    if test_object_exists(handle,
                          bucket,
                          bundle_prefix,
                          test_type=ObjectTest.PREFIX):
        created, idempotent = _idempotent_save(handle, bucket,
                                               tombstone_id.to_key(),
                                               tombstone_object_data)
        if not idempotent:
            raise DSSException(
                requests.codes.conflict,
                f"bundle_tombstone_already_exists",
                f"bundle tombstone with UUID {uuid} and version {version} already exists",
            )
        status_code = requests.codes.ok
        response_body = dict()  # type: dict
    else:
        status_code = requests.codes.not_found
        response_body = dict(title="bundle not found")

    return jsonify(response_body), status_code
Code example #3
def patch(uuid: str, json_request_body: dict, replica: str, version: str):
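    """
    Apply the "remove_files" and "add_files" lists from the request body to an existing
    bundle manifest, then save the result under a freshly generated version.
    """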
    bundle = get_bundle_manifest(uuid, Replica[replica], version)
    if bundle is None:
        raise DSSException(404, "not_found",
                           "Could not find bundle for UUID {}".format(uuid))

    remove_files_set = {
        bundle_file_id_metadata(f)
        for f in json_request_body.get("remove_files", [])
    }
    bundle['files'] = [
        f for f in bundle['files']
        if bundle_file_id_metadata(f) not in remove_files_set
    ]
    add_files = json_request_body.get("add_files", [])
    bundle['files'].extend(
        build_bundle_file_metadata(Replica[replica], add_files))
    detect_filename_collisions(bundle['files'])

    timestamp = datetime.datetime.utcnow()
    new_bundle_version = datetime_to_version_format(timestamp)
    bundle['version'] = new_bundle_version
    _save_bundle(Replica[replica], uuid, new_bundle_version, bundle)
    return jsonify(dict(uuid=uuid,
                        version=new_bundle_version)), requests.codes.ok
Code example #4
def _save_bundle(replica: Replica, uuid: str, version: str,
                 bundle_metadata: dict) -> int:
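    """
    Persist a bundle manifest, translating blobstore timeouts into 503 and conflicting
    writes into 409. Returns 201 if the manifest was newly created, 200 otherwise.
    """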
    try:
        created, idempotent = save_bundle_manifest(replica, uuid, version,
                                                   bundle_metadata)
    except BlobStoreTimeoutError:
        raise DSSException(
            requests.codes.unavailable, "service_unavailable",
            f"Service unavailable due to unusually high load/latency")

    if not idempotent:
        raise DSSException(
            requests.codes.conflict, "bundle_already_exists",
            f"bundle with UUID {uuid} and version {version} already exists")
    status_code = requests.codes.created if created else requests.codes.ok

    return status_code
Code example #5
def detect_filename_collisions(bundle_file_metadata):
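    """Raise a 400 DSSException if two files in the bundle share the same name."""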
    filenames: typing.Set[str] = set()
    for _file in bundle_file_metadata:
        name = _file[BundleFileMetadata.NAME]
        if name not in filenames:
            filenames.add(name)
        else:
            raise DSSException(
                requests.codes.bad_request, "duplicate_filename",
                f"Duplicate file name detected: {name}. This test fails on the first occurance. Please check bundle "
                "layout to ensure no duplicated file names are present.")
Code example #6
def validate(definitions: Definitions) -> None:
    """
    Validate the given attachment definitions. This should be called in a request-handling context, since it raises
    DSSException with an HTTP status code as well as an error code and description.
    """
    for name, definition in definitions.items():
        if name.startswith('_'):
            raise DSSException(
                requests.codes.bad_request, "invalid_attachment_name",
                f"Attachment names must not start with underscore ({name})")
        type_ = definition['type']
        if type_ == 'jmespath':
            expression = definition['expression']
            try:
                jmespath.compile(expression)
            except JMESPathError as e:
                raise DSSException(
                    requests.codes.bad_request,
                    "invalid_attachment_expression",
                    f"Unable to compile JMESPath expression for attachment {name}"
                ) from e
        else:
            assert False, type_
Code example #7
def delete(uuid: str,
           replica: str,
           json_request_body: dict,
           version: str = None):
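    """
    Tombstone a bundle (or a single bundle version) on the given replica. Only admin
    users may delete; a missing bundle yields 404 and an existing tombstone yields 409.
    """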
    email = security.get_token_email(request.token_info)

    if email not in ADMIN_USER_EMAILS:
        raise DSSForbiddenException(
            "You can't delete bundles with these credentials!")

    uuid = uuid.lower()
    tombstone_id = BundleTombstoneID(uuid=uuid, version=version)
    bundle_prefix = tombstone_id.to_key_prefix()
    tombstone_object_data = _create_tombstone_data(
        email=email,
        reason=json_request_body.get('reason'),
        version=version,
    )

    handle = Config.get_blobstore_handle(Replica[replica])
    if not test_object_exists(handle,
                              Replica[replica].bucket,
                              bundle_prefix,
                              test_type=ObjectTest.PREFIX):
        raise DSSException(404, "not_found", "Cannot find bundle!")

    created, idempotent = idempotent_save(
        handle, Replica[replica].bucket, tombstone_id.to_key(),
        json.dumps(tombstone_object_data).encode("utf-8"))
    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            f"bundle_tombstone_already_exists",
            f"bundle tombstone with UUID {uuid} and version {version} already exists",
        )

    return dict(), requests.codes.ok
Code example #8
File: files.py Project: HumanCellAtlas/data-store
def _verify_checkout(
        replica: Replica, token: typing.Optional[str], file_metadata: dict, blob_path: str,
) -> typing.Tuple[str, bool]:
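    """
    Check whether the blob is already checked out and its checksum is valid. Returns a
    (token, ready) tuple: ready=True with an empty token when the checked-out copy can
    be served, otherwise a checkout is started (or its token refreshed) and ready=False.
    """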
    cloud_handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)

    try:
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date = cloud_handle.get_creation_date(replica.checkout_bucket, blob_path)
        stale_after_date = creation_date + datetime.timedelta(days=int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS']))
        expiration_date = (creation_date
                           + datetime.timedelta(days=int(os.environ['DSS_BLOB_TTL_DAYS']))
                           - datetime.timedelta(hours=1))

        if now < expiration_date:
            if now > stale_after_date:
                start_file_checkout(replica, blob_path)
            if hca_handle.verify_blob_checksum_from_dss_metadata(replica.checkout_bucket,
                                                                 blob_path,
                                                                 file_metadata):
                return "", True
            else:
                logger.error(
                    f"Checksum verification failed for file {replica.checkout_bucket}/{blob_path}")
    except BlobNotFoundError:
        pass

    decoded_token: dict
    if token is None:
        execution_id = start_file_checkout(replica, blob_path)
        start_time = time.time()
        attempts = 0

        decoded_token = {
            CheckoutTokenKeys.EXECUTION_ID: execution_id,
            CheckoutTokenKeys.START_TIME: start_time,
            CheckoutTokenKeys.ATTEMPTS: attempts
        }
    else:
        try:
            decoded_token = json.loads(token)
            decoded_token[CheckoutTokenKeys.ATTEMPTS] += 1
        except (KeyError, ValueError) as ex:
            raise DSSException(requests.codes.bad_request, "illegal_token", "Could not understand token", ex)

    encoded_token = json.dumps(decoded_token)
    return encoded_token, False
Code example #9
def get(
    uuid: str,
    replica: str,
    per_page: int,
    version: str = None,
    directurls: bool = False,
    presignedurls: bool = False,
    token: str = None,
    start_at: int = 0,
):
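    """
    Return one page of a bundle's file listing, optionally decorating each file with a
    direct or presigned checkout URL. Redirects with a Retry-After header while checkout
    is still in progress, and paginates via a Link header when more files remain.
    """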
    if directurls and presignedurls:
        raise DSSException(
            requests.codes.bad_request, "only_one_urltype",
            "only enable one of `directurls` or `presignedurls`")

    _replica = Replica[replica]
    bundle_metadata = get_bundle_manifest(uuid, _replica, version)
    if bundle_metadata is None:
        raise DSSException(404, "not_found", "Cannot find bundle!")
    if version is None:
        version = bundle_metadata[BundleMetadata.VERSION]

    if directurls or presignedurls:
        try:
            token, ready = verify_checkout(_replica, uuid, version, token)
        except TokenError as ex:
            raise DSSException(requests.codes.bad_request, "illegal_token",
                               "Could not understand token", ex)
        except CheckoutError as ex:
            raise DSSException(requests.codes.server_error, "checkout_error",
                               "Could not complete checkout", ex)
        if not ready:
            builder = UrlBuilder(request.url)
            builder.replace_query("token", token)
            response = redirect(str(builder), code=requests.codes.moved)
            headers = response.headers
            headers['Retry-After'] = RETRY_AFTER_INTERVAL
            return response

    all_files = bundle_metadata[BundleMetadata.FILES]

    link = None
    if len(all_files) - start_at > per_page:
        next_url = UrlBuilder(request.url)
        next_url.replace_query("start_at", str(start_at + per_page))
        next_url.replace_query("version", version)
        next_url.replace_query("token", token)
        link = f"<{next_url}>; rel='next'"

    files = all_files[start_at:start_at + per_page]

    filesresponse = []  # type: typing.List[dict]
    for _file in files:
        file_version = {
            'name': _file[BundleFileMetadata.NAME],
            'content-type': _file[BundleFileMetadata.CONTENT_TYPE],
            'size': _file[BundleFileMetadata.SIZE],
            'uuid': _file[BundleFileMetadata.UUID],
            'version': _file[BundleFileMetadata.VERSION],
            'crc32c': _file[BundleFileMetadata.CRC32C],
            's3_etag': _file[BundleFileMetadata.S3_ETAG],
            'sha1': _file[BundleFileMetadata.SHA1],
            'sha256': _file[BundleFileMetadata.SHA256],
            'indexed': _file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=_replica.storage_schema,
                netloc=_replica.checkout_bucket,
                path="{}/{}".format(
                    get_dst_bundle_prefix(
                        uuid, bundle_metadata[BundleMetadata.VERSION]),
                    _file[BundleFileMetadata.NAME],
                ),
            ))
        elif presignedurls:
            handle = Config.get_blobstore_handle(_replica)
            file_version['url'] = handle.generate_presigned_GET_url(
                _replica.checkout_bucket,
                "{}/{}".format(
                    get_dst_bundle_prefix(
                        uuid, bundle_metadata[BundleMetadata.VERSION]),
                    _file[BundleFileMetadata.NAME],
                ),
            )
        filesresponse.append(file_version)

    response_body = dict(bundle=dict(
        uuid=uuid,
        version=bundle_metadata[BundleMetadata.VERSION],
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    ))

    if link is None:
        response = make_response(jsonify(response_body), requests.codes.ok)
        response.headers['X-OpenAPI-Pagination'] = 'false'
    else:
        response = make_response(jsonify(response_body),
                                 requests.codes.partial)
        response.headers['X-OpenAPI-Pagination'] = 'true'
        response.headers['Link'] = link

    response.headers['X-OpenAPI-Paginated-Content-Key'] = 'bundle.files'
    return response
Code example #10
def build_bundle_file_metadata(replica: Replica, user_supplied_files: dict):
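    """
    Fetch the stored file metadata for each user-supplied file (in parallel, retrying
    while request time remains) and merge it with the user-supplied fields into bundle
    file records.
    """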
    handle = Config.get_blobstore_handle(replica)

    time_left = nestedcontext.inject("time_left")

    # decode the list of files.
    files = [{
        'user_supplied_metadata': _file
    } for _file in user_supplied_files]

    def _get_file_metadata(_file):
        metadata_key = FileFQID(
            uuid=_file['user_supplied_metadata']['uuid'],
            version=_file['user_supplied_metadata']['version'],
        ).to_key()
        while True:
            try:
                file_metadata = handle.get(replica.bucket, metadata_key)
            except BlobNotFoundError:
                if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
                    time.sleep(1)
                else:
                    break
            else:
                return json.loads(file_metadata)
        return None

    # TODO: Consider scaling parallelization with Lambda size
    with ThreadPoolExecutor(max_workers=20) as e:
        futures = {
            e.submit(_get_file_metadata, _file): _file
            for _file in files
        }
        for future in as_completed(futures):
            _file = futures[future]
            res = future.result()
            if res is not None:
                _file['file_metadata'] = res
            else:
                missing_file_user_metadata = _file['user_supplied_metadata']
                raise DSSException(
                    requests.codes.bad_request, "file_missing",
                    f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
                )

    return [{
        BundleFileMetadata.NAME: _file['user_supplied_metadata']['name'],
        BundleFileMetadata.UUID: _file['user_supplied_metadata']['uuid'],
        BundleFileMetadata.VERSION: _file['user_supplied_metadata']['version'],
        BundleFileMetadata.CONTENT_TYPE: _file['file_metadata'][FileMetadata.CONTENT_TYPE],
        BundleFileMetadata.SIZE: _file['file_metadata'][FileMetadata.SIZE],
        BundleFileMetadata.INDEXED: _file['user_supplied_metadata']['indexed'],
        BundleFileMetadata.CRC32C: _file['file_metadata'][FileMetadata.CRC32C],
        BundleFileMetadata.S3_ETAG: _file['file_metadata'][FileMetadata.S3_ETAG],
        BundleFileMetadata.SHA1: _file['file_metadata'][FileMetadata.SHA1],
        BundleFileMetadata.SHA256: _file['file_metadata'][FileMetadata.SHA256],
    } for _file in files]
Code example #11
def put(uuid: str, json_request_body: dict, version: str = None):
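    """
    Create a file by copying it from source_url (s3:// or gs://) into the replica's
    bucket, then write the file metadata document. Large copies are handed off to a step
    function and return 202; otherwise 201 on first write, 200 if the identical file
    already exists, and 409 on a metadata conflict.
    """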
    class CopyMode(Enum):
        NO_COPY = auto()
        COPY_INLINE = auto()
        COPY_ASYNC = auto()

    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    source_url = json_request_body['source_url']
    cre = re.compile("^"
                     "(?P<schema>(?:s3|gs|wasb))"
                     "://"
                     "(?P<bucket>[^/]+)"
                     "/"
                     "(?P<key>.+)"
                     "$")
    mobj = cre.match(source_url)
    if mobj and mobj.group('schema') == "s3":
        replica = Replica.aws
    elif mobj and mobj.group('schema') == "gs":
        replica = Replica.gcp
    else:
        # mobj may be None if the URL does not match the expected pattern at all
        schema = mobj.group('schema') if mobj else source_url
        raise DSSException(requests.codes.bad_request, "unknown_source_schema",
                           f"source_url schema {schema} not supported")

    handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    dst_bucket = replica.bucket

    src_bucket = mobj.group('bucket')
    src_key = mobj.group('key')

    metadata = handle.get_user_metadata(src_bucket, src_key)
    size = handle.get_size(src_bucket, src_key)
    content_type = handle.get_content_type(src_bucket, src_key)

    # format all the checksums so they're lower-case.
    for metadata_spec in HCABlobStore.MANDATORY_METADATA.values():
        if metadata_spec['downcase']:
            keyname = typing.cast(str, metadata_spec['keyname'])
            metadata[keyname] = metadata[keyname].lower()

    # what's the target object name for the actual data?
    dst_key = ("blobs/" + ".".join((
        metadata['hca-dss-sha256'],
        metadata['hca-dss-sha1'],
        metadata['hca-dss-s3_etag'],
        metadata['hca-dss-crc32c'],
    ))).lower()

    # does it exist? if so, we can skip the copy part.
    copy_mode = CopyMode.COPY_INLINE
    try:
        if hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata):
            copy_mode = CopyMode.NO_COPY
    except BlobNotFoundError:
        pass

    # build the json document for the file metadata.
    file_metadata = {
        FileMetadata.FORMAT: FileMetadata.FILE_FORMAT_VERSION,
        FileMetadata.BUNDLE_UUID: json_request_body['bundle_uuid'],
        FileMetadata.CREATOR_UID: json_request_body['creator_uid'],
        FileMetadata.VERSION: version,
        FileMetadata.CONTENT_TYPE: content_type,
        FileMetadata.SIZE: size,
        FileMetadata.CRC32C: metadata['hca-dss-crc32c'],
        FileMetadata.S3_ETAG: metadata['hca-dss-s3_etag'],
        FileMetadata.SHA1: metadata['hca-dss-sha1'],
        FileMetadata.SHA256: metadata['hca-dss-sha256'],
    }
    file_metadata_json = json.dumps(file_metadata)

    if copy_mode != CopyMode.NO_COPY and size > ASYNC_COPY_THRESHOLD:
        copy_mode = CopyMode.COPY_ASYNC

    if copy_mode == CopyMode.COPY_ASYNC:
        if replica == Replica.aws:
            state = s3copyclient.copy_write_metadata_sfn_event(
                src_bucket,
                src_key,
                dst_bucket,
                dst_key,
                uuid,
                version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-s3-copy-write-metadata-sfn-{stage}"
        elif replica == Replica.gcp:
            state = gscopyclient.copy_write_metadata_sfn_event(
                src_bucket,
                src_key,
                dst_bucket,
                dst_key,
                uuid,
                version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-gs-copy-write-metadata-sfn-{stage}"
        else:
            raise ValueError("Unhandled replica")

        execution_id = str(uuid4())
        stepfunctions.step_functions_invoke(state_machine_name_template,
                                            execution_id, state)
        return jsonify(dict(task_id=execution_id,
                            version=version)), requests.codes.accepted
    elif copy_mode == CopyMode.COPY_INLINE:
        handle.copy(src_bucket, src_key, dst_bucket, dst_key)

        # verify the copy was done correctly.
        assert hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata)

    try:
        write_file_metadata(handle, dst_bucket, uuid, version,
                            file_metadata_json)
        status_code = requests.codes.created
    except BlobAlreadyExistsError:
        # fetch the file metadata, compare it to what we have.
        existing_file_metadata = json.loads(
            handle.get(dst_bucket,
                       "files/{}.{}".format(uuid, version)).decode("utf-8"))
        if existing_file_metadata != file_metadata:
            raise DSSException(
                requests.codes.conflict, "file_already_exists",
                f"file with UUID {uuid} and version {version} already exists")
        status_code = requests.codes.ok

    return jsonify(dict(version=version)), status_code
Code example #12
def post(json_request_body: dict,
         replica: str,
         per_page: int,
         output_format: str,
         _scroll_id: typing.Optional[str] = None) -> dict:
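    """
    Run an Elasticsearch query against the chosen replica's index and return one page of
    results, using a scroll id plus a Link header for pagination. Elasticsearch errors
    are translated into DSSExceptions.
    """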
    es_query = json_request_body['es_query']
    per_page = PerPageBounds.check(per_page)

    replica_enum = Replica[replica] if replica is not None else Replica.aws

    logger.debug(
        "Received POST for replica=%s, es_query=%s, per_page=%i, _scroll_id: %s",
        replica_enum.name, json.dumps(es_query,
                                      indent=4), per_page, _scroll_id)

    # TODO: (tsmith12) determine if a search operation timeout limit is needed
    # TODO: (tsmith12) allow users to retrieve previous search results
    # TODO: (tsmith12) if page returns 0 hits, then all results have been found. delete search id
    try:
        page = _es_search_page(es_query, replica_enum, per_page, _scroll_id,
                               output_format)
        request_dict = _format_request_body(page, es_query, replica_enum,
                                            output_format)
        request_body = jsonify(request_dict)

        if len(request_dict['results']) < per_page:
            response = make_response(request_body, requests.codes.ok)
        else:
            response = make_response(request_body, requests.codes.partial)
            next_url = _build_scroll_url(page['_scroll_id'], per_page,
                                         replica_enum, output_format)
            response.headers['Link'] = _build_link_header(
                {next_url: {
                    "rel": "next"
                }})
        return response
    except TransportError as ex:
        if ex.status_code == requests.codes.bad_request:
            logger.debug(f"Invalid Query Recieved. Exception: {ex}")
            raise DSSException(
                requests.codes.bad_request, "elasticsearch_bad_request",
                f"Invalid Elasticsearch query was received: {str(ex)}")
        elif ex.status_code == requests.codes.not_found:
            logger.debug(f"Search Context Error. Exception: {ex}")
            raise DSSException(
                requests.codes.not_found, "elasticsearch_context_not_found",
                "Elasticsearch context has returned all results or timeout has expired."
            )
        elif ex.status_code == 'N/A':
            logger.error(f"Elasticsearch Invalid Endpoint. Exception: {ex}")
            raise DSSException(
                requests.codes.service_unavailable, "service_unavailable",
                "Elasticsearch reached an invalid endpoint. Try again later.")
        else:
            logger.error(
                f"Elasticsearch Internal Server Error. Exception: {ex}")
            raise DSSException(requests.codes.internal_server_error,
                               "internal_server_error",
                               "Elasticsearch Internal Server Error")

    except ElasticsearchException as ex:
        logger.error(f"Elasticsearch Internal Server Error. Exception: {ex}")
        raise DSSException(requests.codes.internal_server_error,
                           "internal_server_error",
                           "Elasticsearch Internal Server Error")
Code example #13
def put(uuid: str, replica: str, json_request_body: dict, version: str = None):
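    """
    Create a bundle manifest from the files listed in the request body, waiting for file
    metadata that has not yet landed. Returns 201 when the manifest is newly created,
    200 when an identical manifest already exists, and 409 on conflicts.
    """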
    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket

    # what's the target object name for the bundle manifest?
    bundle_manifest_key = BundleFQID(uuid=uuid, version=version).to_key()

    # decode the list of files.
    files = [{'user_supplied_metadata': file} for file in json_request_body['files']]

    time_left = nestedcontext.inject("time_left")

    while True:  # each time through the outer while-loop, we try to gather up all the file metadata.
        for file in files:
            user_supplied_metadata = file['user_supplied_metadata']
            metadata_key = FileFQID(
                uuid=user_supplied_metadata['uuid'],
                version=user_supplied_metadata['version'],
            ).to_key()
            if 'file_metadata' not in file:
                try:
                    file_metadata = handle.get(bucket, metadata_key)
                except BlobNotFoundError:
                    continue
                file['file_metadata'] = json.loads(file_metadata)
                if uuid != file['file_metadata']['bundle_uuid']:
                    raise DSSException(
                        requests.codes.conflict,
                        "incorrect_file_bundle_uuid",
                        f"File bundle_uuid {file['file_metadata']['bundle_uuid']} does not equal bundle uuid {uuid}"
                    )

        # check to see if any file metadata is still not yet loaded.
        for file in files:
            if 'file_metadata' not in file:
                missing_file_user_metadata = file['user_supplied_metadata']
                break
        else:
            break

        # if there's still time left, wait a moment and retry; otherwise give up.
        if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
            time.sleep(1)
            continue

        raise DSSException(
            requests.codes.conflict,
            "file_missing",
            f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
        )

    # build a manifest consisting of all the files.
    bundle_metadata = {
        BundleMetadata.FORMAT: BundleMetadata.FILE_FORMAT_VERSION,
        BundleMetadata.VERSION: version,
        BundleMetadata.FILES: [
            {
                BundleFileMetadata.NAME: file['user_supplied_metadata']['name'],
                BundleFileMetadata.UUID: file['user_supplied_metadata']['uuid'],
                BundleFileMetadata.VERSION: file['user_supplied_metadata']['version'],
                BundleFileMetadata.CONTENT_TYPE: file['file_metadata'][FileMetadata.CONTENT_TYPE],
                BundleFileMetadata.SIZE: file['file_metadata'][FileMetadata.SIZE],
                BundleFileMetadata.INDEXED: file['user_supplied_metadata']['indexed'],
                BundleFileMetadata.CRC32C: file['file_metadata'][FileMetadata.CRC32C],
                BundleFileMetadata.S3_ETAG: file['file_metadata'][FileMetadata.S3_ETAG],
                BundleFileMetadata.SHA1: file['file_metadata'][FileMetadata.SHA1],
                BundleFileMetadata.SHA256: file['file_metadata'][FileMetadata.SHA256],
            }
            for file in files
        ],
        BundleMetadata.CREATOR_UID: json_request_body['creator_uid'],
    }

    created, idempotent = _idempotent_save(
        handle,
        bucket,
        bundle_manifest_key,
        bundle_metadata,
    )

    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            "bundle_already_exists",
            f"bundle with UUID {uuid} and version {version} already exists"
        )
    status_code = requests.codes.created if created else requests.codes.ok

    return jsonify(dict(version=version)), status_code
Code example #14
def get_bundle_from_bucket(uuid: str,
                           replica: Replica,
                           version: typing.Optional[str],
                           bucket: typing.Optional[str],
                           directurls: bool = False):
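    """
    Load a bundle manifest from the given bucket (or the replica's default), honoring
    tombstones and resolving the latest version when none is specified, and return the
    bundle contents as a dict, optionally with direct blob URLs for each file.
    """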
    uuid = uuid.lower()

    handle = Config.get_blobstore_handle(replica)
    default_bucket = replica.bucket

    # need the ability to use fixture bucket for testing
    bucket = default_bucket if bucket is None else bucket

    def tombstone_exists(uuid: str, version: typing.Optional[str]):
        return test_object_exists(
            handle, bucket,
            TombstoneID(uuid=uuid, version=version).to_key())

    # handle the following deletion cases
    # 1. the whole bundle is deleted
    # 2. the specific version of the bundle is deleted
    if tombstone_exists(uuid, None) or (version
                                        and tombstone_exists(uuid, version)):
        raise DSSException(404, "not_found", "EMPTY Cannot find file!")

    # handle the following deletion case
    # 3. no version is specified, we want the latest _non-deleted_ version
    if version is None:
        # list the files and find the one that is the most recent.
        prefix = f"bundles/{uuid}."
        object_names = handle.list(bucket, prefix)
        version = _latest_version_from_object_names(object_names)

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    bundle_fqid = BundleFQID(uuid=uuid, version=version)

    # retrieve the bundle metadata.
    try:
        bundle_metadata = json.loads(
            handle.get(
                bucket,
                bundle_fqid.to_key(),
            ).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    filesresponse = []  # type: typing.List[dict]
    for file in bundle_metadata[BundleMetadata.FILES]:
        file_version = {
            'name': file[BundleFileMetadata.NAME],
            'content-type': file[BundleFileMetadata.CONTENT_TYPE],
            'size': file[BundleFileMetadata.SIZE],
            'uuid': file[BundleFileMetadata.UUID],
            'version': file[BundleFileMetadata.VERSION],
            'crc32c': file[BundleFileMetadata.CRC32C],
            's3_etag': file[BundleFileMetadata.S3_ETAG],
            'sha1': file[BundleFileMetadata.SHA1],
            'sha256': file[BundleFileMetadata.SHA256],
            'indexed': file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=replica.storage_schema,
                netloc=bucket,
                path="blobs/{}.{}.{}.{}".format(
                    file[BundleFileMetadata.SHA256],
                    file[BundleFileMetadata.SHA1],
                    file[BundleFileMetadata.S3_ETAG],
                    file[BundleFileMetadata.CRC32C],
                ),
            ))
        filesresponse.append(file_version)

    return dict(bundle=dict(
        uuid=uuid,
        version=version,
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    ))
Code example #15
File: files.py Project: HumanCellAtlas/data-store
def get_helper(uuid: str, replica: Replica, version: str = None, token: str = None, directurl: bool = False,
               content_disposition: str = None):
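    """
    Resolve the file identified by uuid/version, ensure it is checked out, and respond:
    GET requests redirect to the checked-out blob (direct or presigned URL), or to a
    retry URL with a Retry-After header while checkout is in progress; other methods
    receive an empty 200 body. Non-retry responses carry X-DSS-* metadata headers.
    """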

    with tracing.Subsegment('parameterization'):
        handle = Config.get_blobstore_handle(replica)
        bucket = replica.bucket

    if version is None:
        with tracing.Subsegment('find_latest_version'):
            # list the files and find the one that is the most recent.
            prefix = "files/{}.".format(uuid)
            for matching_file in handle.list(bucket, prefix):
                matching_file = matching_file[len(prefix):]
                if version is None or matching_file > version:
                    version = matching_file
    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    # retrieve the file metadata.
    try:
        with tracing.Subsegment('load_file'):
            file_metadata = json.loads(
                handle.get(
                    bucket,
                    f"files/{uuid}.{version}"
                ).decode("utf-8"))
    except BlobNotFoundError:
        key = f"files/{uuid}.{version}"
        item = AsyncStateItem.get(key)
        if isinstance(item, S3CopyEtagError):
            raise DSSException(
                requests.codes.unprocessable,
                "missing_checksum",
                "Incorrect s3-etag"
            )
        elif isinstance(item, AsyncStateError):
            raise item
        else:
            raise DSSException(404, "not_found", "Cannot find file!")

    with tracing.Subsegment('make_path'):
        blob_path = compose_blob_key(file_metadata)

    if request.method == "GET":
        token, ready = _verify_checkout(replica, token, file_metadata, blob_path)
        if ready:
            if directurl:
                response = redirect(str(UrlBuilder().set(
                    scheme=replica.storage_schema,
                    netloc=replica.checkout_bucket,
                    path=get_dst_key(blob_path)
                )))
            else:
                if content_disposition:
                    # can tell a browser to treat the response link as a download rather than open a new tab
                    response = redirect(handle.generate_presigned_GET_url(
                                        replica.checkout_bucket,
                                        get_dst_key(blob_path),
                                        response_content_disposition=content_disposition))
                else:
                    response = redirect(handle.generate_presigned_GET_url(
                                        replica.checkout_bucket,
                                        get_dst_key(blob_path)))
        else:
            with tracing.Subsegment('make_retry'):
                builder = UrlBuilder(request.url)
                builder.replace_query("token", token)
                response = redirect(str(builder), code=301)
                headers = response.headers
                headers['Retry-After'] = RETRY_AFTER_INTERVAL
                return response

    else:
        response = make_response('', 200)

    with tracing.Subsegment('set_headers'):
        headers = response.headers
        headers['X-DSS-CREATOR-UID'] = file_metadata[FileMetadata.CREATOR_UID]
        headers['X-DSS-VERSION'] = version
        headers['X-DSS-CONTENT-TYPE'] = file_metadata[FileMetadata.CONTENT_TYPE]
        headers['X-DSS-SIZE'] = file_metadata[FileMetadata.SIZE]
        headers['X-DSS-CRC32C'] = file_metadata[FileMetadata.CRC32C]
        headers['X-DSS-S3-ETAG'] = file_metadata[FileMetadata.S3_ETAG]
        headers['X-DSS-SHA1'] = file_metadata[FileMetadata.SHA1]
        headers['X-DSS-SHA256'] = file_metadata[FileMetadata.SHA256]

    return response