def get_helper(uuid: str, replica: Replica, version: str = None):
    handle = Config.get_blobstore_handle(replica)
    bucket = replica.bucket

    if version is None:
        # list the files and find the one that is the most recent.
        prefix = "files/{}.".format(uuid)
        for matching_file in handle.list(bucket, prefix):
            matching_file = matching_file[len(prefix):]
            if version is None or matching_file > version:
                version = matching_file

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    # retrieve the file metadata.
    try:
        file_metadata = json.loads(
            handle.get(
                bucket,
                "files/{}.{}".format(uuid, version)
            ).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    blob_path = "blobs/" + ".".join((
        file_metadata[FileMetadata.SHA256],
        file_metadata[FileMetadata.SHA1],
        file_metadata[FileMetadata.S3_ETAG],
        file_metadata[FileMetadata.CRC32C],
    ))

    if request.method == "GET":
        # Probabilistically return a "Retry-After" header.
        # The retry-after interval can be relatively short now, but it sets up downstream
        # libraries / users for success when we start integrating this with the checkout service.
        if random.randint(0, 100) < REDIRECT_PROBABILITY_PERCENTS:
            response = redirect(request.url, code=301)
            headers = response.headers
            headers['Retry-After'] = RETRY_AFTER_INTERVAL
            return response

        response = redirect(handle.generate_presigned_GET_url(
            bucket,
            blob_path))
    else:
        response = make_response('', 200)

    headers = response.headers
    headers['X-DSS-BUNDLE-UUID'] = file_metadata[FileMetadata.BUNDLE_UUID]
    headers['X-DSS-CREATOR-UID'] = file_metadata[FileMetadata.CREATOR_UID]
    headers['X-DSS-VERSION'] = version
    headers['X-DSS-CONTENT-TYPE'] = file_metadata[FileMetadata.CONTENT_TYPE]
    headers['X-DSS-SIZE'] = file_metadata[FileMetadata.SIZE]
    headers['X-DSS-CRC32C'] = file_metadata[FileMetadata.CRC32C]
    headers['X-DSS-S3-ETAG'] = file_metadata[FileMetadata.S3_ETAG]
    headers['X-DSS-SHA1'] = file_metadata[FileMetadata.SHA1]
    headers['X-DSS-SHA256'] = file_metadata[FileMetadata.SHA256]

    return response

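# --- Illustrative sketch, not part of the service code above ---
# The listing loop in get_helper picks "the most recent" version with a plain string
# comparison. That works because DSS version strings are UTC timestamps rendered in a
# fixed-width layout, so lexicographic order matches chronological order. The format
# below is an assumption for illustration; datetime_to_version_format is authoritative.
def _version_ordering_example() -> None:
    import datetime as _dt
    fmt = "%Y-%m-%dT%H%M%S.%fZ"  # assumed fixed-width UTC layout
    older = _dt.datetime(2018, 9, 5, 12, 0, 0).strftime(fmt)
    newer = _dt.datetime(2018, 9, 5, 23, 58, 50).strftime(fmt)
    assert newer > older  # string comparison agrees with time order
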
def delete(uuid: str, replica: str, json_request_body: dict, version: str = None):
    email = request.token_info['email']

    if email not in ADMIN_USER_EMAILS:
        raise DSSException(
            requests.codes.forbidden,
            "forbidden",
            "You can't delete bundles with these credentials!",
        )

    uuid = uuid.lower()
    version = datetime_to_version_format(iso8601.parse_date(version)) if version else None

    tombstone_id = TombstoneID(uuid=uuid, version=version)
    bundle_prefix = tombstone_id.to_key_prefix()
    tombstone_object_data = _create_tombstone_data(
        email=email,
        reason=json_request_body.get('reason'),
        version=version,
    )

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket
    if test_object_exists(handle, bucket, bundle_prefix, test_type=ObjectTest.PREFIX):
        created, idempotent = _idempotent_save(handle, bucket, tombstone_id.to_key(), tombstone_object_data)
        if not idempotent:
            raise DSSException(
                requests.codes.conflict,
                "bundle_tombstone_already_exists",
                f"bundle tombstone with UUID {uuid} and version {version} already exists",
            )
        status_code = requests.codes.ok
        response_body = dict()  # type: dict
    else:
        status_code = requests.codes.not_found
        response_body = dict(title="bundle not found")

    return jsonify(response_body), status_code

def patch(uuid: str, json_request_body: dict, replica: str, version: str):
    bundle = get_bundle_manifest(uuid, Replica[replica], version)
    if bundle is None:
        raise DSSException(404, "not_found", "Could not find bundle for UUID {}".format(uuid))

    remove_files_set = {bundle_file_id_metadata(f) for f in json_request_body.get("remove_files", [])}
    bundle['files'] = [f for f in bundle['files'] if bundle_file_id_metadata(f) not in remove_files_set]

    add_files = json_request_body.get("add_files", [])
    bundle['files'].extend(build_bundle_file_metadata(Replica[replica], add_files))

    detect_filename_collisions(bundle['files'])

    timestamp = datetime.datetime.utcnow()
    new_bundle_version = datetime_to_version_format(timestamp)
    bundle['version'] = new_bundle_version
    _save_bundle(Replica[replica], uuid, new_bundle_version, bundle)

    return jsonify(dict(uuid=uuid, version=new_bundle_version)), requests.codes.ok

def _save_bundle(replica: Replica, uuid: str, version: str, bundle_metadata: dict) -> int:
    try:
        created, idempotent = save_bundle_manifest(replica, uuid, version, bundle_metadata)
    except BlobStoreTimeoutError:
        raise DSSException(
            requests.codes.unavailable,
            "service_unavailable",
            "Service unavailable due to unusually high load/latency")

    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            "bundle_already_exists",
            f"bundle with UUID {uuid} and version {version} already exists")

    status_code = requests.codes.created if created else requests.codes.ok
    return status_code

def detect_filename_collisions(bundle_file_metadata):
    filenames: typing.Set[str] = set()
    for _file in bundle_file_metadata:
        name = _file[BundleFileMetadata.NAME]
        if name not in filenames:
            filenames.add(name)
        else:
            raise DSSException(
                requests.codes.bad_request,
                "duplicate_filename",
                f"Duplicate file name detected: {name}. This check fails on the first occurrence. Please check "
                "the bundle layout to ensure no duplicate file names are present.")

def validate(definitions: Definitions) -> None:
    """
    Validate the given attachment definitions. This should be called in a request handling context, as it raises
    DSSException carrying an HTTP status code, as well as an error code and description.
    """
    for name, definition in definitions.items():
        if name.startswith('_'):
            raise DSSException(requests.codes.bad_request,
                               "invalid_attachment_name",
                               f"Attachment names must not start with underscore ({name})")
        type_ = definition['type']
        if type_ == 'jmespath':
            expression = definition['expression']
            try:
                jmespath.compile(expression)
            except JMESPathError as e:
                raise DSSException(requests.codes.bad_request,
                                   "invalid_attachment_expression",
                                   f"Unable to compile JMESPath expression for attachment {name}") from e
        else:
            assert False, type_

def delete(uuid: str, replica: str, json_request_body: dict, version: str = None):
    email = security.get_token_email(request.token_info)

    if email not in ADMIN_USER_EMAILS:
        raise DSSForbiddenException("You can't delete bundles with these credentials!")

    uuid = uuid.lower()
    tombstone_id = BundleTombstoneID(uuid=uuid, version=version)
    bundle_prefix = tombstone_id.to_key_prefix()
    tombstone_object_data = _create_tombstone_data(
        email=email,
        reason=json_request_body.get('reason'),
        version=version,
    )

    handle = Config.get_blobstore_handle(Replica[replica])
    if not test_object_exists(handle, Replica[replica].bucket, bundle_prefix, test_type=ObjectTest.PREFIX):
        raise DSSException(404, "not_found", "Cannot find bundle!")

    created, idempotent = idempotent_save(
        handle,
        Replica[replica].bucket,
        tombstone_id.to_key(),
        json.dumps(tombstone_object_data).encode("utf-8"),
    )
    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            "bundle_tombstone_already_exists",
            f"bundle tombstone with UUID {uuid} and version {version} already exists",
        )

    return dict(), requests.codes.ok

def _verify_checkout(
        replica: Replica, token: typing.Optional[str], file_metadata: dict, blob_path: str,
) -> typing.Tuple[str, bool]:
    cloud_handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)

    try:
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date = cloud_handle.get_creation_date(replica.checkout_bucket, blob_path)
        stale_after_date = creation_date + datetime.timedelta(days=int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS']))
        expiration_date = (creation_date
                           + datetime.timedelta(days=int(os.environ['DSS_BLOB_TTL_DAYS']))
                           - datetime.timedelta(hours=1))

        if now < expiration_date:
            if now > stale_after_date:
                start_file_checkout(replica, blob_path)
            if hca_handle.verify_blob_checksum_from_dss_metadata(replica.checkout_bucket, blob_path, file_metadata):
                return "", True
            else:
                logger.error(f"Checksum verification failed for file {replica.checkout_bucket}/{blob_path}")
    except BlobNotFoundError:
        pass

    decoded_token: dict
    if token is None:
        execution_id = start_file_checkout(replica, blob_path)
        start_time = time.time()
        attempts = 0

        decoded_token = {
            CheckoutTokenKeys.EXECUTION_ID: execution_id,
            CheckoutTokenKeys.START_TIME: start_time,
            CheckoutTokenKeys.ATTEMPTS: attempts
        }
    else:
        try:
            decoded_token = json.loads(token)
            decoded_token[CheckoutTokenKeys.ATTEMPTS] += 1
        except (KeyError, ValueError) as ex:
            raise DSSException(requests.codes.bad_request, "illegal_token", "Could not understand token", ex)

    encoded_token = json.dumps(decoded_token)
    return encoded_token, False

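# --- Illustrative client-side sketch, not part of the service code above ---
# Shows how the checkout token returned by _verify_checkout is meant to round-trip:
# the client follows the 301, waits for Retry-After, and retries with the opaque token
# already embedded in the Location query string, until the service stops asking it to
# wait (the final response is typically a redirect to a signed URL). The base URL and
# endpoint path here are assumptions for illustration.
import time as _time
import requests as _requests

def _poll_file_example(base_url: str, uuid: str, replica: str = "aws") -> _requests.Response:
    url = f"{base_url}/v1/files/{uuid}?replica={replica}"  # assumed endpoint shape
    while True:
        resp = _requests.get(url, allow_redirects=False)
        if resp.status_code == 301:
            # the Location header already carries the updated token query parameter
            _time.sleep(int(resp.headers.get("Retry-After", "10")))
            url = resp.headers["Location"]
            continue
        return resp
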
def get(
        uuid: str,
        replica: str,
        per_page: int,
        version: str = None,
        directurls: bool = False,
        presignedurls: bool = False,
        token: str = None,
        start_at: int = 0,
):
    if directurls and presignedurls:
        raise DSSException(
            requests.codes.bad_request,
            "only_one_urltype",
            "only enable one of `directurls` or `presignedurls`")

    _replica = Replica[replica]
    bundle_metadata = get_bundle_manifest(uuid, _replica, version)
    if bundle_metadata is None:
        raise DSSException(404, "not_found", "Cannot find bundle!")

    if version is None:
        version = bundle_metadata[BundleMetadata.VERSION]

    if directurls or presignedurls:
        try:
            token, ready = verify_checkout(_replica, uuid, version, token)
        except TokenError as ex:
            raise DSSException(requests.codes.bad_request, "illegal_token", "Could not understand token", ex)
        except CheckoutError as ex:
            raise DSSException(requests.codes.server_error, "checkout_error", "Could not complete checkout", ex)
        if not ready:
            builder = UrlBuilder(request.url)
            builder.replace_query("token", token)
            response = redirect(str(builder), code=requests.codes.moved)
            headers = response.headers
            headers['Retry-After'] = RETRY_AFTER_INTERVAL
            return response

    all_files = bundle_metadata[BundleMetadata.FILES]

    link = None
    if len(all_files) - start_at > per_page:
        next_url = UrlBuilder(request.url)
        next_url.replace_query("start_at", str(start_at + per_page))
        next_url.replace_query("version", version)
        next_url.replace_query("token", token)
        link = f"<{next_url}>; rel='next'"

    files = all_files[start_at:start_at + per_page]
    filesresponse = []  # type: typing.List[dict]

    for _file in files:
        file_version = {
            'name': _file[BundleFileMetadata.NAME],
            'content-type': _file[BundleFileMetadata.CONTENT_TYPE],
            'size': _file[BundleFileMetadata.SIZE],
            'uuid': _file[BundleFileMetadata.UUID],
            'version': _file[BundleFileMetadata.VERSION],
            'crc32c': _file[BundleFileMetadata.CRC32C],
            's3_etag': _file[BundleFileMetadata.S3_ETAG],
            'sha1': _file[BundleFileMetadata.SHA1],
            'sha256': _file[BundleFileMetadata.SHA256],
            'indexed': _file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=_replica.storage_schema,
                netloc=_replica.checkout_bucket,
                path="{}/{}".format(
                    get_dst_bundle_prefix(uuid, bundle_metadata[BundleMetadata.VERSION]),
                    _file[BundleFileMetadata.NAME],
                ),
            ))
        elif presignedurls:
            handle = Config.get_blobstore_handle(_replica)
            file_version['url'] = handle.generate_presigned_GET_url(
                _replica.checkout_bucket,
                "{}/{}".format(
                    get_dst_bundle_prefix(uuid, bundle_metadata[BundleMetadata.VERSION]),
                    _file[BundleFileMetadata.NAME],
                ),
            )
        filesresponse.append(file_version)

    response_body = dict(bundle=dict(
        uuid=uuid,
        version=bundle_metadata[BundleMetadata.VERSION],
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    ))

    if link is None:
        response = make_response(jsonify(response_body), requests.codes.ok)
        response.headers['X-OpenAPI-Pagination'] = 'false'
    else:
        response = make_response(jsonify(response_body), requests.codes.partial)
        response.headers['X-OpenAPI-Pagination'] = 'true'
        response.headers['Link'] = link
        response.headers['X-OpenAPI-Paginated-Content-Key'] = 'bundle.files'

    return response

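# --- Illustrative client-side sketch, not part of the service code above ---
# Demonstrates how a caller would walk the paginated bundle listing produced by get():
# keep following the Link header (rel='next') while the service answers with partial
# content. The first URL is supplied by the caller and is an assumption here; the Link
# parsing mirrors the f"<{next_url}>; rel='next'" format emitted above.
import re as _re
import requests as _requests

def _fetch_all_bundle_files_example(first_url: str) -> list:
    files, url = [], first_url
    while url:
        resp = _requests.get(url)
        resp.raise_for_status()
        files.extend(resp.json()["bundle"]["files"])
        link = resp.headers.get("Link", "")
        match = _re.search(r"<([^>]+)>;\s*rel='next'", link)
        url = match.group(1) if match else None
    return files
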
def build_bundle_file_metadata(replica: Replica, user_supplied_files: dict):
    handle = Config.get_blobstore_handle(replica)

    time_left = nestedcontext.inject("time_left")

    # decode the list of files.
    files = [{'user_supplied_metadata': _file} for _file in user_supplied_files]

    def _get_file_metadata(_file):
        metadata_key = FileFQID(
            uuid=_file['user_supplied_metadata']['uuid'],
            version=_file['user_supplied_metadata']['version'],
        ).to_key()
        while True:
            try:
                file_metadata = handle.get(replica.bucket, metadata_key)
            except BlobNotFoundError:
                if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
                    time.sleep(1)
                else:
                    break
            else:
                return json.loads(file_metadata)
        return None

    # TODO: Consider scaling parallelization with Lambda size
    with ThreadPoolExecutor(max_workers=20) as e:
        futures = {e.submit(_get_file_metadata, _file): _file for _file in files}
        for future in as_completed(futures):
            _file = futures[future]
            res = future.result()
            if res is not None:
                _file['file_metadata'] = res
            else:
                missing_file_user_metadata = _file['user_supplied_metadata']
                raise DSSException(
                    requests.codes.bad_request,
                    "file_missing",
                    f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
                )

    return [
        {
            BundleFileMetadata.NAME: _file['user_supplied_metadata']['name'],
            BundleFileMetadata.UUID: _file['user_supplied_metadata']['uuid'],
            BundleFileMetadata.VERSION: _file['user_supplied_metadata']['version'],
            BundleFileMetadata.CONTENT_TYPE: _file['file_metadata'][FileMetadata.CONTENT_TYPE],
            BundleFileMetadata.SIZE: _file['file_metadata'][FileMetadata.SIZE],
            BundleFileMetadata.INDEXED: _file['user_supplied_metadata']['indexed'],
            BundleFileMetadata.CRC32C: _file['file_metadata'][FileMetadata.CRC32C],
            BundleFileMetadata.S3_ETAG: _file['file_metadata'][FileMetadata.S3_ETAG],
            BundleFileMetadata.SHA1: _file['file_metadata'][FileMetadata.SHA1],
            BundleFileMetadata.SHA256: _file['file_metadata'][FileMetadata.SHA256],
        }
        for _file in files
    ]

def put(uuid: str, json_request_body: dict, version: str = None):
    class CopyMode(Enum):
        NO_COPY = auto()
        COPY_INLINE = auto()
        COPY_ASYNC = auto()

    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    source_url = json_request_body['source_url']
    cre = re.compile(
        "^"
        "(?P<schema>(?:s3|gs|wasb))"
        "://"
        "(?P<bucket>[^/]+)"
        "/"
        "(?P<key>.+)"
        "$")
    mobj = cre.match(source_url)
    if mobj is None:
        # guard against URLs that do not match the pattern at all, so we fail with a 400 instead of an AttributeError.
        raise DSSException(
            requests.codes.bad_request,
            "unknown_source_schema",
            f"source_url {source_url} is not a supported source URL")
    if mobj.group('schema') == "s3":
        replica = Replica.aws
    elif mobj.group('schema') == "gs":
        replica = Replica.gcp
    else:
        schema = mobj.group('schema')
        raise DSSException(
            requests.codes.bad_request,
            "unknown_source_schema",
            f"source_url schema {schema} not supported")

    handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    dst_bucket = replica.bucket

    src_bucket = mobj.group('bucket')
    src_key = mobj.group('key')

    metadata = handle.get_user_metadata(src_bucket, src_key)
    size = handle.get_size(src_bucket, src_key)
    content_type = handle.get_content_type(src_bucket, src_key)

    # format all the checksums so they're lower-case.
    for metadata_spec in HCABlobStore.MANDATORY_METADATA.values():
        if metadata_spec['downcase']:
            keyname = typing.cast(str, metadata_spec['keyname'])
            metadata[keyname] = metadata[keyname].lower()

    # what's the target object name for the actual data?
    dst_key = ("blobs/" + ".".join((
        metadata['hca-dss-sha256'],
        metadata['hca-dss-sha1'],
        metadata['hca-dss-s3_etag'],
        metadata['hca-dss-crc32c'],
    ))).lower()

    # does it exist? if so, we can skip the copy part.
    copy_mode = CopyMode.COPY_INLINE
    try:
        if hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata):
            copy_mode = CopyMode.NO_COPY
    except BlobNotFoundError:
        pass

    # build the json document for the file metadata.
    file_metadata = {
        FileMetadata.FORMAT: FileMetadata.FILE_FORMAT_VERSION,
        FileMetadata.BUNDLE_UUID: json_request_body['bundle_uuid'],
        FileMetadata.CREATOR_UID: json_request_body['creator_uid'],
        FileMetadata.VERSION: version,
        FileMetadata.CONTENT_TYPE: content_type,
        FileMetadata.SIZE: size,
        FileMetadata.CRC32C: metadata['hca-dss-crc32c'],
        FileMetadata.S3_ETAG: metadata['hca-dss-s3_etag'],
        FileMetadata.SHA1: metadata['hca-dss-sha1'],
        FileMetadata.SHA256: metadata['hca-dss-sha256'],
    }
    file_metadata_json = json.dumps(file_metadata)

    if copy_mode != CopyMode.NO_COPY and size > ASYNC_COPY_THRESHOLD:
        copy_mode = CopyMode.COPY_ASYNC

    if copy_mode == CopyMode.COPY_ASYNC:
        if replica == Replica.aws:
            state = s3copyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-s3-copy-write-metadata-sfn-{stage}"
        elif replica == Replica.gcp:
            state = gscopyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-gs-copy-write-metadata-sfn-{stage}"
        else:
            raise ValueError("Unhandled replica")

        execution_id = str(uuid4())
        stepfunctions.step_functions_invoke(state_machine_name_template, execution_id, state)
        return jsonify(dict(task_id=execution_id, version=version)), requests.codes.accepted
    elif copy_mode == CopyMode.COPY_INLINE:
        handle.copy(src_bucket, src_key, dst_bucket, dst_key)

        # verify the copy was done correctly.
        assert hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata)

    try:
        write_file_metadata(handle, dst_bucket, uuid, version, file_metadata_json)
        status_code = requests.codes.created
    except BlobAlreadyExistsError:
        # fetch the file metadata, compare it to what we have.
        existing_file_metadata = json.loads(
            handle.get(dst_bucket, "files/{}.{}".format(uuid, version)).decode("utf-8"))
        if existing_file_metadata != file_metadata:
            raise DSSException(
                requests.codes.conflict,
                "file_already_exists",
                f"file with UUID {uuid} and version {version} already exists")
        status_code = requests.codes.ok

    return jsonify(dict(version=version)), status_code

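# --- Illustrative sketch, not part of the service code above ---
# Shows what the source_url pattern in put() accepts: an s3://, gs://, or wasb:// URL
# split into schema, bucket, and key groups. The sample URL is made up for illustration.
import re as _re

def _parse_source_url_example() -> None:
    pattern = _re.compile(r"^(?P<schema>(?:s3|gs|wasb))://(?P<bucket>[^/]+)/(?P<key>.+)$")
    match = pattern.match("s3://example-upload-bucket/staging/my-file.fastq.gz")
    assert match is not None
    assert match.group("schema") == "s3"
    assert match.group("bucket") == "example-upload-bucket"
    assert match.group("key") == "staging/my-file.fastq.gz"
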
def post(json_request_body: dict,
         replica: str,
         per_page: int,
         output_format: str,
         _scroll_id: typing.Optional[str] = None) -> dict:
    es_query = json_request_body['es_query']
    per_page = PerPageBounds.check(per_page)
    replica_enum = Replica[replica] if replica is not None else Replica.aws

    logger.debug("Received POST for replica=%s, es_query=%s, per_page=%i, _scroll_id: %s",
                 replica_enum.name, json.dumps(es_query, indent=4), per_page, _scroll_id)

    # TODO: (tsmith12) determine if a search operation timeout limit is needed
    # TODO: (tsmith12) allow users to retrieve previous search results
    # TODO: (tsmith12) if page returns 0 hits, then all results have been found. delete search id
    try:
        page = _es_search_page(es_query, replica_enum, per_page, _scroll_id, output_format)
        request_dict = _format_request_body(page, es_query, replica_enum, output_format)
        request_body = jsonify(request_dict)

        if len(request_dict['results']) < per_page:
            response = make_response(request_body, requests.codes.ok)
        else:
            response = make_response(request_body, requests.codes.partial)
            next_url = _build_scroll_url(page['_scroll_id'], per_page, replica_enum, output_format)
            response.headers['Link'] = _build_link_header({next_url: {"rel": "next"}})
        return response
    except TransportError as ex:
        if ex.status_code == requests.codes.bad_request:
            logger.debug(f"Invalid query received. Exception: {ex}")
            raise DSSException(requests.codes.bad_request,
                               "elasticsearch_bad_request",
                               f"Invalid Elasticsearch query was received: {str(ex)}")
        elif ex.status_code == requests.codes.not_found:
            logger.debug(f"Search context error. Exception: {ex}")
            raise DSSException(requests.codes.not_found,
                               "elasticsearch_context_not_found",
                               "Elasticsearch context has returned all results or timeout has expired.")
        elif ex.status_code == 'N/A':
            logger.error(f"Elasticsearch invalid endpoint. Exception: {ex}")
            raise DSSException(requests.codes.service_unavailable,
                               "service_unavailable",
                               "Elasticsearch reached an invalid endpoint. Try again later.")
        else:
            logger.error(f"Elasticsearch internal server error. Exception: {ex}")
            raise DSSException(requests.codes.internal_server_error,
                               "internal_server_error",
                               "Elasticsearch Internal Server Error")
    except ElasticsearchException as ex:
        logger.error(f"Elasticsearch internal server error. Exception: {ex}")
        raise DSSException(requests.codes.internal_server_error,
                           "internal_server_error",
                           "Elasticsearch Internal Server Error")

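# --- Illustrative client-side sketch, not part of the service code above ---
# Shows the request shape the search handler above expects: a JSON body carrying an
# "es_query" key with an Elasticsearch query. The base URL and route are assumptions
# for illustration; the results come back under the "results" key as built above.
import requests as _requests

def _search_example(base_url: str) -> dict:
    body = {"es_query": {"query": {"match_all": {}}}}
    resp = _requests.post(f"{base_url}/v1/search?replica=aws&per_page=10", json=body)  # assumed route
    resp.raise_for_status()
    return resp.json()
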
def put(uuid: str, replica: str, json_request_body: dict, version: str = None):
    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket

    # what's the target object name for the bundle manifest?
    bundle_manifest_key = BundleFQID(uuid=uuid, version=version).to_key()

    # decode the list of files.
    files = [{'user_supplied_metadata': file} for file in json_request_body['files']]

    time_left = nestedcontext.inject("time_left")

    while True:  # each time through the outer while-loop, we try to gather up all the file metadata.
        for file in files:
            user_supplied_metadata = file['user_supplied_metadata']
            metadata_key = FileFQID(
                uuid=user_supplied_metadata['uuid'],
                version=user_supplied_metadata['version'],
            ).to_key()
            if 'file_metadata' not in file:
                try:
                    file_metadata = handle.get(bucket, metadata_key)
                except BlobNotFoundError:
                    continue
                file['file_metadata'] = json.loads(file_metadata)
                if uuid != file['file_metadata']['bundle_uuid']:
                    raise DSSException(
                        requests.codes.conflict,
                        "incorrect_file_bundle_uuid",
                        f"File bundle_uuid {file['file_metadata']['bundle_uuid']} does not equal bundle uuid {uuid}"
                    )

        # check to see if any file metadata is still not yet loaded.
        for file in files:
            if 'file_metadata' not in file:
                missing_file_user_metadata = file['user_supplied_metadata']
                break
        else:
            break

        # if we still have time left, wait and retry; otherwise give up.
        if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
            time.sleep(1)
            continue

        raise DSSException(
            requests.codes.conflict,
            "file_missing",
            f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
        )

    # build a manifest consisting of all the files.
    bundle_metadata = {
        BundleMetadata.FORMAT: BundleMetadata.FILE_FORMAT_VERSION,
        BundleMetadata.VERSION: version,
        BundleMetadata.FILES: [
            {
                BundleFileMetadata.NAME: file['user_supplied_metadata']['name'],
                BundleFileMetadata.UUID: file['user_supplied_metadata']['uuid'],
                BundleFileMetadata.VERSION: file['user_supplied_metadata']['version'],
                BundleFileMetadata.CONTENT_TYPE: file['file_metadata'][FileMetadata.CONTENT_TYPE],
                BundleFileMetadata.SIZE: file['file_metadata'][FileMetadata.SIZE],
                BundleFileMetadata.INDEXED: file['user_supplied_metadata']['indexed'],
                BundleFileMetadata.CRC32C: file['file_metadata'][FileMetadata.CRC32C],
                BundleFileMetadata.S3_ETAG: file['file_metadata'][FileMetadata.S3_ETAG],
                BundleFileMetadata.SHA1: file['file_metadata'][FileMetadata.SHA1],
                BundleFileMetadata.SHA256: file['file_metadata'][FileMetadata.SHA256],
            }
            for file in files
        ],
        BundleMetadata.CREATOR_UID: json_request_body['creator_uid'],
    }

    created, idempotent = _idempotent_save(
        handle,
        bucket,
        bundle_manifest_key,
        bundle_metadata,
    )

    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            "bundle_already_exists",
            f"bundle with UUID {uuid} and version {version} already exists"
        )
    status_code = requests.codes.created if created else requests.codes.ok

    return jsonify(dict(version=version)), status_code

def get_bundle_from_bucket(
        uuid: str,
        replica: Replica,
        version: typing.Optional[str],
        bucket: typing.Optional[str],
        directurls: bool = False):
    uuid = uuid.lower()

    handle = Config.get_blobstore_handle(replica)
    default_bucket = replica.bucket

    # need the ability to use fixture bucket for testing
    bucket = default_bucket if bucket is None else bucket

    def tombstone_exists(uuid: str, version: typing.Optional[str]):
        return test_object_exists(handle, bucket, TombstoneID(uuid=uuid, version=version).to_key())

    # handle the following deletion cases
    # 1. the whole bundle is deleted
    # 2. the specific version of the bundle is deleted
    if tombstone_exists(uuid, None) or (version and tombstone_exists(uuid, version)):
        raise DSSException(404, "not_found", "EMPTY Cannot find file!")

    # handle the following deletion case
    # 3. no version is specified, we want the latest _non-deleted_ version
    if version is None:
        # list the files and find the one that is the most recent.
        prefix = f"bundles/{uuid}."
        object_names = handle.list(bucket, prefix)
        version = _latest_version_from_object_names(object_names)

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    bundle_fqid = BundleFQID(uuid=uuid, version=version)

    # retrieve the bundle metadata.
    try:
        bundle_metadata = json.loads(
            handle.get(
                bucket,
                bundle_fqid.to_key(),
            ).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    filesresponse = []  # type: typing.List[dict]
    for file in bundle_metadata[BundleMetadata.FILES]:
        file_version = {
            'name': file[BundleFileMetadata.NAME],
            'content-type': file[BundleFileMetadata.CONTENT_TYPE],
            'size': file[BundleFileMetadata.SIZE],
            'uuid': file[BundleFileMetadata.UUID],
            'version': file[BundleFileMetadata.VERSION],
            'crc32c': file[BundleFileMetadata.CRC32C],
            's3_etag': file[BundleFileMetadata.S3_ETAG],
            'sha1': file[BundleFileMetadata.SHA1],
            'sha256': file[BundleFileMetadata.SHA256],
            'indexed': file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=replica.storage_schema,
                netloc=bucket,
                path="blobs/{}.{}.{}.{}".format(
                    file[BundleFileMetadata.SHA256],
                    file[BundleFileMetadata.SHA1],
                    file[BundleFileMetadata.S3_ETAG],
                    file[BundleFileMetadata.CRC32C],
                ),
            ))
        filesresponse.append(file_version)

    return dict(bundle=dict(
        uuid=uuid,
        version=version,
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    ))

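# --- Illustrative sketch, not part of the service code above ---
# The direct URLs built above point at content-addressed blob keys of the form
# "blobs/<sha256>.<sha1>.<s3_etag>.<crc32c>". This helper is a hypothetical stand-in
# showing that layout; the service composes the same path from checksum metadata.
def _example_blob_key(sha256: str, sha1: str, s3_etag: str, crc32c: str) -> str:
    return "blobs/" + ".".join((sha256, sha1, s3_etag, crc32c))
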
def get_helper(uuid: str, replica: Replica, version: str = None, token: str = None, directurl: bool = False,
               content_disposition: str = None):
    with tracing.Subsegment('parameterization'):
        handle = Config.get_blobstore_handle(replica)
        bucket = replica.bucket

    if version is None:
        with tracing.Subsegment('find_latest_version'):
            # list the files and find the one that is the most recent.
            prefix = "files/{}.".format(uuid)
            for matching_file in handle.list(bucket, prefix):
                matching_file = matching_file[len(prefix):]
                if version is None or matching_file > version:
                    version = matching_file

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    # retrieve the file metadata.
    try:
        with tracing.Subsegment('load_file'):
            file_metadata = json.loads(
                handle.get(
                    bucket,
                    f"files/{uuid}.{version}"
                ).decode("utf-8"))
    except BlobNotFoundError:
        key = f"files/{uuid}.{version}"
        item = AsyncStateItem.get(key)
        if isinstance(item, S3CopyEtagError):
            raise DSSException(
                requests.codes.unprocessable,
                "missing_checksum",
                "Incorrect s3-etag"
            )
        elif isinstance(item, AsyncStateError):
            raise item
        else:
            raise DSSException(404, "not_found", "Cannot find file!")

    with tracing.Subsegment('make_path'):
        blob_path = compose_blob_key(file_metadata)

    if request.method == "GET":
        token, ready = _verify_checkout(replica, token, file_metadata, blob_path)
        if ready:
            if directurl:
                response = redirect(str(UrlBuilder().set(
                    scheme=replica.storage_schema,
                    netloc=replica.checkout_bucket,
                    path=get_dst_key(blob_path)
                )))
            else:
                if content_disposition:
                    # can tell a browser to treat the response link as a download rather than open a new tab
                    response = redirect(handle.generate_presigned_GET_url(
                        replica.checkout_bucket,
                        get_dst_key(blob_path),
                        response_content_disposition=content_disposition))
                else:
                    response = redirect(handle.generate_presigned_GET_url(
                        replica.checkout_bucket,
                        get_dst_key(blob_path)))
        else:
            with tracing.Subsegment('make_retry'):
                builder = UrlBuilder(request.url)
                builder.replace_query("token", token)
                response = redirect(str(builder), code=301)
                headers = response.headers
                headers['Retry-After'] = RETRY_AFTER_INTERVAL
                return response
    else:
        response = make_response('', 200)

    with tracing.Subsegment('set_headers'):
        headers = response.headers
        headers['X-DSS-CREATOR-UID'] = file_metadata[FileMetadata.CREATOR_UID]
        headers['X-DSS-VERSION'] = version
        headers['X-DSS-CONTENT-TYPE'] = file_metadata[FileMetadata.CONTENT_TYPE]
        headers['X-DSS-SIZE'] = file_metadata[FileMetadata.SIZE]
        headers['X-DSS-CRC32C'] = file_metadata[FileMetadata.CRC32C]
        headers['X-DSS-S3-ETAG'] = file_metadata[FileMetadata.S3_ETAG]
        headers['X-DSS-SHA1'] = file_metadata[FileMetadata.SHA1]
        headers['X-DSS-SHA256'] = file_metadata[FileMetadata.SHA256]

    return response