def test_to_key_prefix(self):
    uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
    version = "2017-12-05T235728.441373Z"
    self.assertEqual(
        BundleFQID(uuid=uuid, version=version).to_key_prefix(),
        f"{BUNDLE_PREFIX}/{uuid}.{version}")
    self.assertEqual(
        BundleFQID(uuid=uuid, version=None).to_key_prefix(),
        f"{BUNDLE_PREFIX}/{uuid}.")
def _read_bundle_manifest(cls, replica: Replica, fqid: BundleFQID) -> dict:
    handle = Config.get_blobstore_handle(replica)
    bucket_name = replica.bucket
    manifest_string = handle.get(bucket_name, fqid.to_key()).decode("utf-8")
    logger.debug(
        "Read bundle manifest from bucket %s with bundle key %s: %s",
        bucket_name, fqid.to_key(), manifest_string)
    # json.loads() no longer accepts an `encoding` argument (removed in
    # Python 3.9); the string is already decoded above.
    manifest = json.loads(manifest_string)
    return manifest
def _get_bundle_manifest(
        uuid: str,
        replica: Replica,
        version: typing.Optional[str],
        *,
        bucket: typing.Optional[str] = None) -> typing.Optional[dict]:
    """
    Return the contents of the bundle manifest file from cloud storage,
    subject to the rules of tombstoning. If version is None, return the
    latest version, once again subject to the rules of tombstoning.
    If the bundle cannot be found, return None.
    """
    uuid = uuid.lower()

    handle = Config.get_blobstore_handle(replica)
    default_bucket = replica.bucket

    # need the ability to use a fixture bucket for testing
    bucket = default_bucket if bucket is None else bucket

    def tombstone_exists(uuid: str, version: typing.Optional[str]):
        return test_object_exists(handle, bucket, BundleTombstoneID(uuid=uuid, version=version).to_key())

    # Handle the following deletion cases:
    # 1. the whole bundle is deleted
    # 2. the specific version of the bundle is deleted
    if tombstone_exists(uuid, None) or (version and tombstone_exists(uuid, version)):
        return None

    # Handle the following deletion case:
    # 3. no version is specified; we want the latest _non-deleted_ version
    if version is None:
        # list the files and find the one that is the most recent
        prefix = f"bundles/{uuid}."
        object_names = handle.list(bucket, prefix)
        version = _latest_version_from_object_names(object_names)

    if version is None:
        # no matches!
        return None

    bundle_fqid = BundleFQID(uuid=uuid, version=version)

    # retrieve the bundle metadata
    try:
        bundle_manifest_blob = handle.get(bucket, bundle_fqid.to_key()).decode("utf-8")
        return json.loads(bundle_manifest_blob)
    except BlobNotFoundError:
        return None
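# A minimal sketch (not part of the module) of the flat key layout that the
# tombstone checks above depend on; the uuid/version values are placeholders
# borrowed from the tests in this section:
def _tombstone_key_layout_example():
    uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
    version = "2017-12-05T235728.441373Z"
    # one key per bundle version
    print(BundleFQID(uuid=uuid, version=version).to_key())         # bundles/<uuid>.<version>
    # a versioned tombstone shadows exactly that version
    print(BundleTombstoneID(uuid=uuid, version=version).to_key())  # bundles/<uuid>.<version>.dead
    # an unversioned tombstone shadows every version of the bundle
    print(BundleTombstoneID(uuid=uuid, version=None).to_key())     # bundles/<uuid>.dead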
def test_from_key(self):
    """
    Test that the from_key method correctly returns the right types of identifiers.
    """
    uuid = "ca11ab1e-0000-4a6b-8f0d-a7d2105c23be"
    version = "2017-12-05T235728.441373Z"
    self.assertEqual(
        BundleFQID(uuid, version),
        ObjectIdentifier.from_key(f"{BUNDLE_PREFIX}/{uuid}.{version}"),
    )
    self.assertEqual(
        FileFQID(uuid, version),
        ObjectIdentifier.from_key(f"{FILE_PREFIX}/{uuid}.{version}"),
    )
    self.assertEqual(
        CollectionFQID(uuid, version),
        ObjectIdentifier.from_key(f"{COLLECTION_PREFIX}/{uuid}.{version}"),
    )
    self.assertEqual(
        CollectionTombstoneID(uuid, version),
        ObjectIdentifier.from_key(f"{COLLECTION_PREFIX}/{uuid}.{version}.dead"),
    )
    self.assertEqual(
        BundleTombstoneID(uuid, version),
        ObjectIdentifier.from_key(f"{BUNDLE_PREFIX}/{uuid}.{version}.dead"),
    )
    self.assertRaises(
        ValueError,
        lambda: ObjectIdentifier.from_key(f"{BUNDLE_PREFIX}/trash"),
    )
    self.assertRaises(
        ValueError,
        lambda: ObjectIdentifier.from_key(f"trash/{uuid}.{version}.dead"),
    )
def get_deleted_bundle_metadata_document(replica: Replica, key: str) -> dict:
    """
    Build the bundle metadata document associated with a non-existent key.
    """
    fqid = BundleFQID.from_key(key)
    return dict(event_type="DELETE", bundle_info=dict(uuid=fqid.uuid, version=fqid.version))
def save_bundle_manifest(replica: Replica, uuid: str, version: str, bundle: dict) -> typing.Tuple[bool, bool]:
    handle = Config.get_blobstore_handle(replica)
    data = json.dumps(bundle).encode("utf-8")
    fqid = BundleFQID(uuid, version).to_key()
    created, idempotent = idempotent_save(handle, replica.bucket, fqid, data)
    if created and idempotent:
        cache_key = _cache_key_template.format(replica=replica.name, fqid=fqid)
        _bundle_manifest_cache[cache_key] = bundle
    return created, idempotent
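# A hypothetical caller of save_bundle_manifest, reading the (created,
# idempotent) pair the way put() below treats _idempotent_save: `created`
# distinguishes a first write from a repeat of identical content, and a
# non-idempotent save means a different manifest already occupies this
# uuid/version. This is an illustrative sketch, not code from this repo.
def _save_bundle_manifest_example(replica: Replica, uuid: str, version: str, bundle: dict):
    created, idempotent = save_bundle_manifest(replica, uuid, version, bundle)
    if not idempotent:
        raise DSSException(requests.codes.conflict, "bundle_already_exists",
                           f"bundle with UUID {uuid} and version {version} already exists")
    return requests.codes.created if created else requests.codes.ok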
def test_tombstone_to_bundle_fqid(self):
    uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
    version = "2017-12-05T235728.441373Z"
    # assertEqual, not assertTrue: assertTrue would treat the second argument
    # as a failure message and never compare the two identifiers.
    self.assertEqual(
        BundleTombstoneID(uuid=uuid, version=version).to_fqid(),
        BundleFQID(uuid=uuid, version=version),
    )
    self.assertRaises(
        ValueError,
        lambda: BundleTombstoneID(uuid=uuid, version=None).to_fqid(),
    )
def test_to_str(self):
    uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
    version = "2017-12-05T235728.441373Z"
    self.assertEqual(str(BundleFQID(uuid=uuid, version=version)), f"{uuid}.{version}")
    self.assertEqual(str(FileFQID(uuid=uuid, version=version)), f"{uuid}.{version}")
    self.assertEqual(str(TombstoneID(uuid=uuid, version=version)), f"{uuid}.{version}.dead")
    self.assertEqual(str(TombstoneID(uuid=uuid, version=None)), f"{uuid}.dead")
class Iterator:
    keys = [BundleFQID(uuid=uuid.uuid4(),
                       version=datetime_to_version_format(datetime.datetime.utcnow())).to_key()
            for _ in range(10)]

    def __init__(self, *args, **kwargs):
        self.start_after_key = None
        self.token = 'frank'

    def __iter__(self):
        for key in self.keys:
            self.start_after_key = key
            yield self.start_after_key
def get_bundle_manifest(
        uuid: str,
        replica: Replica,
        version: typing.Optional[str],
        *,
        bucket: typing.Optional[str] = None) -> typing.Optional[dict]:
    cache_key = _cache_key_template.format(replica=replica.name, fqid=BundleFQID(uuid, version).to_key())
    if cache_key in _bundle_manifest_cache:
        return _bundle_manifest_cache[cache_key]
    else:
        bundle = _get_bundle_manifest(uuid, replica, version, bucket=bucket)
        if bundle is not None:
            _bundle_manifest_cache[cache_key] = bundle
        return bundle
def __iter__(self):
    for key in self._keys():
        fqid = BundleFQID.from_key(key)
        if fqid.uuid != self.bundle_info['uuid']:
            # a new bundle uuid: flush the living fqids of the previous bundle
            for bundle_fqid in self._living_fqids_in_bundle_info():
                yield bundle_fqid
            self._init_bundle_info(fqid)
        else:
            if not fqid.is_fully_qualified():
                self.bundle_info['contains_unversioned_tombstone'] = True
            else:
                self.bundle_info['fqids'][fqid] = isinstance(fqid, BundleTombstoneID)
    # flush the final bundle once the key stream is exhausted
    for bundle_fqid in self._living_fqids_in_bundle_info():
        yield bundle_fqid
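# A worked trace (made-up keys) of the iterator above: listed keys arrive
# sorted, so a change of uuid flushes the previous bundle's living fqids,
# and the loop after the for-statement flushes the last bundle:
#
#   bundles/aaaa....2017-01-01T000000.000000Z       -> recorded as live
#   bundles/aaaa....2017-01-01T000000.000000Z.dead  -> same version marked dead
#   bundles/bbbb....2018-02-02T000000.000000Z       -> new uuid: yield aaaa's survivors
#   (end of keys)                                   -> yield bbbb's survivors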
def build_bundle_metadata_document(replica: Replica, key: str) -> dict:
    """
    This returns a JSON document with bundle manifest and metadata files
    suitable for JMESPath filters.
    """
    handle = Config.get_blobstore_handle(replica)
    manifest = json.loads(handle.get(replica.bucket, key).decode("utf-8"))
    fqid = BundleFQID.from_key(key)
    bundle_info = dict(uuid=fqid.uuid, version=fqid.version)
    if key.endswith(TOMBSTONE_SUFFIX):
        return dict(event_type="TOMBSTONE", bundle_info=bundle_info, **manifest)
    else:
        lock = threading.Lock()
        files: dict = defaultdict(list)

        def _read_file(file_metadata):
            blob_key = "blobs/{}.{}.{}.{}".format(
                file_metadata['sha256'],
                file_metadata['sha1'],
                file_metadata['s3-etag'],
                file_metadata['crc32c'],
            )
            contents = handle.get(replica.bucket, blob_key).decode("utf-8")
            try:
                file_info = json.loads(contents)
            except json.decoder.JSONDecodeError:
                logging.info(f"{file_metadata['name']} not json decodable")
            else:
                # Modify the name to avoid confusion with JMESPath syntax
                name = _dot_to_underscore_and_strip_numeric_suffix(file_metadata['name'])
                with lock:
                    files[name].append(file_info)

        # TODO: Consider scaling parallelization with Lambda size
        with ThreadPoolExecutor(max_workers=4) as e:
            e.map(_read_file, [file_metadata
                               for file_metadata in manifest['files']
                               if file_metadata['content-type'].startswith("application/json")])

        return dict(event_type="CREATE", bundle_info=bundle_info, manifest=manifest, files=dict(files))
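# A minimal sketch of filtering the document built above, since the docstring
# promises JMESPath-friendly output. The expression is an illustrative
# assumption (it requires the jmespath package), not a filter shipped with
# this code.
import jmespath

def _jmespath_filter_example(replica: Replica, key: str):
    doc = build_bundle_metadata_document(replica, key)
    # match CREATE events whose manifest lists at least one indexed file
    return jmespath.search("event_type == 'CREATE' && length(manifest.files[?indexed]) > `0`", doc)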
def dependencies_exist(source_replica: Replica, dest_replica: Replica, key: str):
    """
    Given a source replica and manifest key, checks if all dependencies of the
    corresponding DSS object are present in dest_replica:
      - Given a file manifest key, checks if blobs exist in dest_replica.
      - Given a bundle manifest key, checks if file manifests exist in dest_replica.
      - Given a collection key, checks if all collection contents exist in dest_replica.
    Returns true if all dependencies exist in dest_replica, false otherwise.
    """
    source_handle = Config.get_blobstore_handle(source_replica)
    dest_handle = Config.get_blobstore_handle(dest_replica)
    if key.endswith(TOMBSTONE_SUFFIX):
        return True
    elif key.startswith(FILE_PREFIX):
        file_id = FileFQID.from_key(key)
        file_manifest = get_json_metadata(
            entity_type="file",
            uuid=file_id.uuid,
            version=file_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        blob_path = compose_blob_key(file_manifest)
        if exists(dest_replica, blob_path):
            return True
    elif key.startswith(BUNDLE_PREFIX):
        # head all file manifests
        bundle_id = BundleFQID.from_key(key)
        bundle_manifest = get_json_metadata(
            entity_type="bundle",
            uuid=bundle_id.uuid,
            version=bundle_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            with ThreadPoolExecutor(max_workers=20) as e:
                futures = list()
                for file in bundle_manifest[BundleMetadata.FILES]:
                    file_uuid = file[BundleFileMetadata.UUID]
                    file_version = file[BundleFileMetadata.VERSION]
                    futures.append(
                        e.submit(get_json_metadata,
                                 entity_type="file",
                                 uuid=file_uuid,
                                 version=file_version,
                                 replica=dest_replica,
                                 # read via the destination handle, since we are
                                 # checking for presence in dest_replica
                                 blobstore_handle=dest_handle,
                                 max_metadata_size=max_syncable_metadata_size))
                for future in as_completed(futures):
                    future.result()
            return True
        except Exception:
            pass
    elif key.startswith(COLLECTION_PREFIX):
        collection_id = CollectionFQID.from_key(key)
        collection_manifest = get_json_metadata(
            entity_type="collection",
            uuid=collection_id.uuid,
            version=collection_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            verify_collection(
                contents=collection_manifest["contents"],
                replica=dest_replica,
                blobstore_handle=dest_handle)
            return True
        except Exception:
            pass
    else:
        raise NotImplementedError("Unknown prefix for key {}".format(key))
    return False
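# A usage sketch (hypothetical uuid/version placeholders; Replica.gcp assumed
# as the destination) of the three key shapes dependencies_exist dispatches on:
def _dependencies_exist_examples(uuid: str, version: str):
    for key in (f"{FILE_PREFIX}/{uuid}.{version}",         # file: needs its blob on dest
                f"{BUNDLE_PREFIX}/{uuid}.{version}",       # bundle: needs its file manifests on dest
                f"{COLLECTION_PREFIX}/{uuid}.{version}"):  # collection: needs its contents on dest
        print(key, dependencies_exist(Replica.aws, Replica.gcp, key))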
def put(uuid: str, replica: str, json_request_body: dict, version: str = None):
    uuid = uuid.lower()
    if version is not None:
        # convert it to a datetime so we can format it exactly as the system requires
        # (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket

    # what's the target object name for the bundle manifest?
    bundle_manifest_key = BundleFQID(uuid=uuid, version=version).to_key()

    # decode the list of files
    files = [{'user_supplied_metadata': file} for file in json_request_body['files']]

    time_left = nestedcontext.inject("time_left")

    while True:
        # each time through the outer while-loop, we try to gather up all the file metadata
        for file in files:
            user_supplied_metadata = file['user_supplied_metadata']
            metadata_key = FileFQID(
                uuid=user_supplied_metadata['uuid'],
                version=user_supplied_metadata['version'],
            ).to_key()
            if 'file_metadata' not in file:
                try:
                    file_metadata = handle.get(bucket, metadata_key)
                except BlobNotFoundError:
                    continue
                file['file_metadata'] = json.loads(file_metadata)
                if uuid != file['file_metadata']['bundle_uuid']:
                    raise DSSException(
                        requests.codes.conflict,
                        "incorrect_file_bundle_uuid",
                        f"File bundle_uuid {file['file_metadata']['bundle_uuid']} does not equal bundle uuid {uuid}"
                    )

        # check to see if any file metadata is still not yet loaded
        for file in files:
            if 'file_metadata' not in file:
                missing_file_user_metadata = file['user_supplied_metadata']
                break
        else:
            break

        # if time remains, wait and retry
        if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
            time.sleep(1)
            continue

        # we're out of time; give up
        raise DSSException(
            requests.codes.conflict,
            "file_missing",
            f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
        )

    # build a manifest consisting of all the files
    bundle_metadata = {
        BundleMetadata.FORMAT: BundleMetadata.FILE_FORMAT_VERSION,
        BundleMetadata.VERSION: version,
        BundleMetadata.FILES: [
            {
                BundleFileMetadata.NAME: file['user_supplied_metadata']['name'],
                BundleFileMetadata.UUID: file['user_supplied_metadata']['uuid'],
                BundleFileMetadata.VERSION: file['user_supplied_metadata']['version'],
                BundleFileMetadata.CONTENT_TYPE: file['file_metadata'][FileMetadata.CONTENT_TYPE],
                BundleFileMetadata.SIZE: file['file_metadata'][FileMetadata.SIZE],
                BundleFileMetadata.INDEXED: file['user_supplied_metadata']['indexed'],
                BundleFileMetadata.CRC32C: file['file_metadata'][FileMetadata.CRC32C],
                BundleFileMetadata.S3_ETAG: file['file_metadata'][FileMetadata.S3_ETAG],
                BundleFileMetadata.SHA1: file['file_metadata'][FileMetadata.SHA1],
                BundleFileMetadata.SHA256: file['file_metadata'][FileMetadata.SHA256],
            }
            for file in files
        ],
        BundleMetadata.CREATOR_UID: json_request_body['creator_uid'],
    }

    created, idempotent = _idempotent_save(
        handle,
        bucket,
        bundle_manifest_key,
        bundle_metadata,
    )

    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            "bundle_already_exists",
            f"bundle with UUID {uuid} and version {version} already exists"
        )
    status_code = requests.codes.created if created else requests.codes.ok

    return jsonify(dict(version=version)), status_code
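# An illustrative json_request_body for put(), with the shape inferred from
# the fields the handler reads above; the uuid and version are placeholders:
example_json_request_body = {
    "creator_uid": 0,
    "files": [
        {
            "uuid": "ca11ab1e-0000-4a6b-8f0d-a7d2105c23be",
            "version": "2017-12-05T235728.441373Z",
            "name": "dissociation_protocol_0.json",
            "indexed": True,
        },
    ],
}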
def get_bundle_from_bucket(
        uuid: str,
        replica: Replica,
        version: typing.Optional[str],
        bucket: typing.Optional[str],
        directurls: bool = False):
    uuid = uuid.lower()

    handle = Config.get_blobstore_handle(replica)
    default_bucket = replica.bucket

    # need the ability to use a fixture bucket for testing
    bucket = default_bucket if bucket is None else bucket

    def tombstone_exists(uuid: str, version: typing.Optional[str]):
        return test_object_exists(handle, bucket, TombstoneID(uuid=uuid, version=version).to_key())

    # Handle the following deletion cases:
    # 1. the whole bundle is deleted
    # 2. the specific version of the bundle is deleted
    if tombstone_exists(uuid, None) or (version and tombstone_exists(uuid, version)):
        raise DSSException(404, "not_found", "EMPTY Cannot find file!")

    # Handle the following deletion case:
    # 3. no version is specified; we want the latest _non-deleted_ version
    if version is None:
        # list the files and find the one that is the most recent
        prefix = f"bundles/{uuid}."
        object_names = handle.list(bucket, prefix)
        version = _latest_version_from_object_names(object_names)

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    bundle_fqid = BundleFQID(uuid=uuid, version=version)

    # retrieve the bundle metadata
    try:
        bundle_metadata = json.loads(
            handle.get(
                bucket,
                bundle_fqid.to_key(),
            ).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    filesresponse = []  # type: typing.List[dict]

    for file in bundle_metadata[BundleMetadata.FILES]:
        file_version = {
            'name': file[BundleFileMetadata.NAME],
            'content-type': file[BundleFileMetadata.CONTENT_TYPE],
            'size': file[BundleFileMetadata.SIZE],
            'uuid': file[BundleFileMetadata.UUID],
            'version': file[BundleFileMetadata.VERSION],
            'crc32c': file[BundleFileMetadata.CRC32C],
            's3_etag': file[BundleFileMetadata.S3_ETAG],
            'sha1': file[BundleFileMetadata.SHA1],
            'sha256': file[BundleFileMetadata.SHA256],
            'indexed': file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=replica.storage_schema,
                netloc=bucket,
                path="blobs/{}.{}.{}.{}".format(
                    file[BundleFileMetadata.SHA256],
                    file[BundleFileMetadata.SHA1],
                    file[BundleFileMetadata.S3_ETAG],
                    file[BundleFileMetadata.CRC32C],
                ),
            ))
        filesresponse.append(file_version)

    return dict(bundle=dict(
        uuid=uuid,
        version=version,
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    ))
def get_bundle_fqid() -> BundleFQID:
    return BundleFQID(uuid=str(uuid.uuid4()), version=get_version())
def test_dependencies_exist(self):
    file_uuid, file_version = str(uuid.uuid4()), get_version()
    bundle_uuid, bundle_version = str(uuid.uuid4()), get_version()
    collection_data = {
        "contents": [
            {"type": "bundle", "uuid": bundle_uuid, "version": bundle_version},
            {"type": "file", "uuid": file_uuid, "version": file_version},
        ]
    }
    bundle_data = {
        BundleMetadata.FILES: [
            {BundleFileMetadata.UUID: file_uuid, BundleFileMetadata.VERSION: file_version}
        ]
    }
    file_data = {
        FileMetadata.SHA256: "sync_test",
        FileMetadata.SHA1: "sync_test",
        FileMetadata.S3_ETAG: "sync_test",
        FileMetadata.CRC32C: str(uuid.uuid4()),
    }

    with self.subTest("collection without deps"):
        collection_key = "{}/{}".format(COLLECTION_PREFIX, get_collection_fqid())
        collection_blob = self.s3_bucket.Object(collection_key)
        collection_blob.put(Body=json.dumps(collection_data).encode())
        self.assertFalse(sync.dependencies_exist(Replica.aws, Replica.aws, collection_key))

    with self.subTest("bundle without deps"):
        bundle_key = "{}/{}".format(BUNDLE_PREFIX, BundleFQID(uuid=bundle_uuid, version=bundle_version))
        bundle_blob = self.s3_bucket.Object(bundle_key)
        bundle_blob.put(Body=json.dumps(bundle_data).encode())
        self.assertFalse(sync.dependencies_exist(Replica.aws, Replica.aws, collection_key))
        self.assertFalse(sync.dependencies_exist(Replica.aws, Replica.aws, bundle_key))

    with self.subTest("file without deps"):
        file_key = "{}/{}".format(FILE_PREFIX, FileFQID(uuid=file_uuid, version=file_version))
        file_blob = self.s3_bucket.Object(file_key)
        file_blob.put(Body=json.dumps(file_data).encode())

        @eventually(timeout=8, interval=1, errors={Exception})
        def check_file_revdeps():
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, collection_key))
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, bundle_key))
            self.assertFalse(sync.dependencies_exist(Replica.aws, Replica.aws, file_key))

        check_file_revdeps()

    with self.subTest("blob presence causes all dependencies to be resolved"):
        blob_key = compose_blob_key(file_data)
        blob_blob = self.s3_bucket.Object(blob_key)
        blob_blob.put(Body=b"sync_test")

        @eventually(timeout=8, interval=1, errors={Exception})
        def check_blob_revdeps():
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, collection_key))
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, bundle_key))
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, file_key))

        check_blob_revdeps()