def _verify_checkout(
        replica: Replica, token: typing.Optional[str], file_metadata: dict, blob_path: str,
) -> typing.Tuple[str, bool]:
    cloud_handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    try:
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date = cloud_handle.get_creation_date(replica.checkout_bucket, blob_path)
        # after DSS_BLOB_PUBLIC_TTL_DAYS the blob is stale: kick off a fresh checkout,
        # but keep serving the existing copy until it actually expires.
        stale_after_date = creation_date + datetime.timedelta(days=int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS']))
        # treat the blob as expired an hour early, leaving a safety margin before the
        # checkout bucket's TTL removes the object out from under the caller.
        expiration_date = (creation_date
                           + datetime.timedelta(days=int(os.environ['DSS_BLOB_TTL_DAYS']))
                           - datetime.timedelta(hours=1))
        if now < expiration_date:
            if now > stale_after_date:
                start_file_checkout(replica, blob_path)
            if hca_handle.verify_blob_checksum_from_dss_metadata(
                    replica.checkout_bucket, blob_path, file_metadata):
                return "", True
            else:
                logger.error(
                    f"Checksum verification failed for file {replica.checkout_bucket}/{blob_path}")
    except BlobNotFoundError:
        pass

    decoded_token: dict
    if token is None:
        # first request for this file: start a checkout and hand the client a token
        # it can present on subsequent polls.
        execution_id = start_file_checkout(replica, blob_path)
        start_time = time.time()
        attempts = 0
        decoded_token = {
            CheckoutTokenKeys.EXECUTION_ID: execution_id,
            CheckoutTokenKeys.START_TIME: start_time,
            CheckoutTokenKeys.ATTEMPTS: attempts,
        }
    else:
        try:
            decoded_token = json.loads(token)
            decoded_token[CheckoutTokenKeys.ATTEMPTS] += 1
        except (KeyError, TypeError, ValueError) as ex:
            # TypeError covers tokens that decode to something other than a dict.
            raise DSSException(requests.codes.bad_request, "illegal_token", "Could not understand token", ex)
    encoded_token = json.dumps(decoded_token)
    return encoded_token, False
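
# A minimal, illustrative sketch of how a caller could drive _verify_checkout as a
# polling loop. This is an assumption for illustration only: the real route handler
# returns the encoded token to the client and lets it retry, rather than blocking
# server-side. The function name and the retry interval below are hypothetical.
def _example_poll_checkout(replica: Replica, file_metadata: dict, blob_path: str) -> str:
    token: typing.Optional[str] = None
    while True:
        token, ready = _verify_checkout(replica, token, file_metadata, blob_path)
        if ready:
            # the blob is present in the checkout bucket and its checksum verified
            return blob_path
        time.sleep(10)  # assumed client retry interval, not a service constant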
def put(uuid: str, json_request_body: dict, version: typing.Optional[str] = None):
    class CopyMode(Enum):
        NO_COPY = auto()
        COPY_INLINE = auto()
        COPY_ASYNC = auto()

    uuid = uuid.lower()
    if version is not None:
        # convert it to a datetime so we can format it exactly as the system requires
        # (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    source_url = json_request_body['source_url']
    cre = re.compile(
        "^"
        "(?P<schema>(?:s3|gs|wasb))"
        "://"
        "(?P<bucket>[^/]+)"
        "/"
        "(?P<key>.+)"
        "$")
    mobj = cre.match(source_url)
    if mobj and mobj.group('schema') == "s3":
        replica = Replica.aws
    elif mobj and mobj.group('schema') == "gs":
        replica = Replica.gcp
    else:
        # mobj is None when source_url doesn't match the pattern at all, so guard
        # before reading the capture group. (wasb matches the pattern but has no
        # replica, so it also lands here.)
        schema = mobj.group('schema') if mobj else source_url
        raise DSSException(
            requests.codes.bad_request,
            "unknown_source_schema",
            f"source_url schema {schema} not supported")

    handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    dst_bucket = replica.bucket

    src_bucket = mobj.group('bucket')
    src_key = mobj.group('key')

    metadata = handle.get_user_metadata(src_bucket, src_key)
    size = handle.get_size(src_bucket, src_key)
    content_type = handle.get_content_type(src_bucket, src_key)

    # format all the checksums so they're lower-case.
    for metadata_spec in HCABlobStore.MANDATORY_METADATA.values():
        if metadata_spec['downcase']:
            keyname = typing.cast(str, metadata_spec['keyname'])
            metadata[keyname] = metadata[keyname].lower()

    # what's the target object name for the actual data?
    dst_key = ("blobs/" + ".".join((
        metadata['hca-dss-sha256'],
        metadata['hca-dss-sha1'],
        metadata['hca-dss-s3_etag'],
        metadata['hca-dss-crc32c'],
    ))).lower()

    # does it exist? if so, we can skip the copy part.
    copy_mode = CopyMode.COPY_INLINE
    try:
        if hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata):
            copy_mode = CopyMode.NO_COPY
    except BlobNotFoundError:
        pass

    # build the json document for the file metadata.
    file_metadata = {
        FileMetadata.FORMAT: FileMetadata.FILE_FORMAT_VERSION,
        FileMetadata.BUNDLE_UUID: json_request_body['bundle_uuid'],
        FileMetadata.CREATOR_UID: json_request_body['creator_uid'],
        FileMetadata.VERSION: version,
        FileMetadata.CONTENT_TYPE: content_type,
        FileMetadata.SIZE: size,
        FileMetadata.CRC32C: metadata['hca-dss-crc32c'],
        FileMetadata.S3_ETAG: metadata['hca-dss-s3_etag'],
        FileMetadata.SHA1: metadata['hca-dss-sha1'],
        FileMetadata.SHA256: metadata['hca-dss-sha256'],
    }
    file_metadata_json = json.dumps(file_metadata)

    # objects above the threshold are copied by a step-function-driven async copier.
    if copy_mode != CopyMode.NO_COPY and size > ASYNC_COPY_THRESHOLD:
        copy_mode = CopyMode.COPY_ASYNC

    if copy_mode == CopyMode.COPY_ASYNC:
        if replica == Replica.aws:
            state = s3copyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-s3-copy-write-metadata-sfn-{stage}"
        elif replica == Replica.gcp:
            state = gscopyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-gs-copy-write-metadata-sfn-{stage}"
        else:
            raise ValueError("Unhandled replica")

        execution_id = str(uuid4())
        stepfunctions.step_functions_invoke(state_machine_name_template, execution_id, state)
        return jsonify(dict(task_id=execution_id, version=version)), requests.codes.accepted
    elif copy_mode == CopyMode.COPY_INLINE:
        handle.copy(src_bucket, src_key, dst_bucket, dst_key)

        # verify the copy was done correctly.
        assert hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata)

    try:
        write_file_metadata(handle, dst_bucket, uuid, version, file_metadata_json)
        status_code = requests.codes.created
    except BlobAlreadyExistsError:
        # fetch the existing file metadata and compare it to what we have; an
        # identical re-put is idempotent, a differing one is a conflict.
        existing_file_metadata = json.loads(
            handle.get(dst_bucket, "files/{}.{}".format(uuid, version)).decode("utf-8"))
        if existing_file_metadata != file_metadata:
            raise DSSException(
                requests.codes.conflict,
                "file_already_exists",
                f"file with UUID {uuid} and version {version} already exists")
        status_code = requests.codes.ok

    return jsonify(dict(version=version)), status_code
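
# A minimal client-side sketch of exercising this handler over the REST API, assuming
# the standard DSS route "PUT /v1/files/{uuid}" with "version" as a query parameter.
# The host value, the creator_uid of 0, and the helper name are illustrative assumptions.
def _example_put_file(host: str, file_uuid: str, source_url: str, bundle_uuid: str) -> dict:
    resp = requests.put(
        f"{host}/v1/files/{file_uuid}",
        params={"version": datetime_to_version_format(datetime.datetime.utcnow())},
        json={
            "source_url": source_url,   # e.g. "s3://staging-bucket/path/to/key"
            "bundle_uuid": bundle_uuid,
            "creator_uid": 0,           # assumed placeholder uid
        },
    )
    # 201 created (inline copy), 202 accepted (async copy; body carries task_id),
    # or 200 ok (an identical file version already existed).
    resp.raise_for_status()
    return resp.json()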