Example #1
def test_to_key_prefix(self):
    uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
    version = "2017-12-05T235728.441373Z"
    self.assertEqual(
        BundleFQID(uuid=uuid, version=version).to_key_prefix(),
        f"{BUNDLE_PREFIX}/{uuid}.{version}")
    self.assertEqual(
        BundleFQID(uuid=uuid, version=None).to_key_prefix(),
        f"{BUNDLE_PREFIX}/{uuid}.")
Example #2
@classmethod
def _read_bundle_manifest(cls, replica: Replica, fqid: BundleFQID) -> dict:
    handle = Config.get_blobstore_handle(replica)
    bucket_name = replica.bucket
    manifest_string = handle.get(bucket_name,
                                 fqid.to_key()).decode("utf-8")
    logger.debug(
        "Read bundle manifest from bucket %s with bundle key %s: %s",
        bucket_name, fqid.to_key(), manifest_string)
    manifest = json.loads(manifest_string)
    return manifest
Example #3
def _get_bundle_manifest(
        uuid: str,
        replica: Replica,
        version: typing.Optional[str],
        *,
        bucket: typing.Optional[str] = None) -> typing.Optional[dict]:
    """
    Return the contents of the bundle manifest file from cloud storage, subject to the rules of tombstoning.  If version
    is None, return the latest version, once again, subject to the rules of tombstoning.

    If the bundle cannot be found, return None
    """
    uuid = uuid.lower()

    handle = Config.get_blobstore_handle(replica)
    default_bucket = replica.bucket

    # need the ability to use fixture bucket for testing
    bucket = default_bucket if bucket is None else bucket

    def tombstone_exists(uuid: str, version: typing.Optional[str]):
        return test_object_exists(
            handle, bucket,
            BundleTombstoneID(uuid=uuid, version=version).to_key())

    # handle the following deletion cases
    # 1. the whole bundle is deleted
    # 2. the specific version of the bundle is deleted
    if tombstone_exists(uuid, None) or (version
                                        and tombstone_exists(uuid, version)):
        return None

    # handle the following deletion case
    # 3. no version is specified, we want the latest _non-deleted_ version
    if version is None:
        # list the files and find the one that is the most recent.
        prefix = f"bundles/{uuid}."
        object_names = handle.list(bucket, prefix)
        version = _latest_version_from_object_names(object_names)

    if version is None:
        # no matches!
        return None

    bundle_fqid = BundleFQID(uuid=uuid, version=version)

    # retrieve the bundle metadata.
    try:
        bundle_manifest_blob = handle.get(bucket,
                                          bundle_fqid.to_key()).decode("utf-8")
        return json.loads(bundle_manifest_blob)
    except BlobNotFoundError:
        return None
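A hypothetical sketch of the _latest_version_from_object_names helper used above (the real implementation may differ): DSS version strings such as "2017-12-05T235728.441373Z" sort lexicographically in chronological order, so the latest version is simply the maximum of the surviving version strings.

import typing

def _latest_version_from_object_names(object_names) -> typing.Optional[str]:
    versions = []
    for name in object_names:  # keys look like "bundles/<uuid>.<version>"
        _, _, version = name.partition(".")  # the uuid contains no dots
        # skip tombstone keys: "<uuid>.dead" and "<uuid>.<version>.dead"
        if version and version != "dead" and not version.endswith(".dead"):
            versions.append(version)
    return max(versions, default=None)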
Example #4
def test_from_key(self):
    """
    Test that the from_key method returns the right types of identifiers.
    """
    uuid = "ca11ab1e-0000-4a6b-8f0d-a7d2105c23be"
    version = "2017-12-05T235728.441373Z"
    self.assertEqual(
        BundleFQID(uuid, version),
        ObjectIdentifier.from_key(f"{BUNDLE_PREFIX}/{uuid}.{version}"),
    )
    self.assertEqual(
        FileFQID(uuid, version),
        ObjectIdentifier.from_key(f"{FILE_PREFIX}/{uuid}.{version}"),
    )
    self.assertEqual(
        CollectionFQID(uuid, version),
        ObjectIdentifier.from_key(f"{COLLECTION_PREFIX}/{uuid}.{version}"),
    )
    self.assertEqual(
        CollectionTombstoneID(uuid, version),
        ObjectIdentifier.from_key(f"{COLLECTION_PREFIX}/{uuid}.{version}.dead"),
    )
    self.assertEqual(
        BundleTombstoneID(uuid, version),
        ObjectIdentifier.from_key(f"{BUNDLE_PREFIX}/{uuid}.{version}.dead"),
    )
    self.assertRaises(
        ValueError,
        lambda: ObjectIdentifier.from_key(f"{BUNDLE_PREFIX}/trash"),
    )
    self.assertRaises(
        ValueError,
        lambda: ObjectIdentifier.from_key(f"trash/{uuid}.{version}.dead"),
    )
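A hypothetical sketch of the dispatch this test pins down (the repository's real implementation may differ): the key prefix picks the identifier class, a ".dead" suffix picks the tombstone variant, and anything unparseable raises ValueError. The identifier classes and *_PREFIX constants are the ones used throughout these examples.

def from_key(key: str):
    prefix, _, rest = key.partition("/")
    classes = {
        BUNDLE_PREFIX: (BundleFQID, BundleTombstoneID),
        FILE_PREFIX: (FileFQID, None),
        COLLECTION_PREFIX: (CollectionFQID, CollectionTombstoneID),
    }
    if prefix not in classes or "." not in rest:
        raise ValueError(f"cannot parse object key: {key}")
    fqid_cls, tombstone_cls = classes[prefix]
    if rest.endswith(".dead"):
        rest, fqid_cls = rest[:-len(".dead")], tombstone_cls
    uuid, _, version = rest.partition(".")  # the uuid contains no dots
    return fqid_cls(uuid, version or None)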
Example #5
def get_deleted_bundle_metadata_document(replica: Replica, key: str) -> dict:
    """
    Build the bundle metadata document associated with a non-existent key.
    """
    fqid = BundleFQID.from_key(key)
    return dict(event_type="DELETE",
                bundle_info=dict(uuid=fqid.uuid, version=fqid.version))
Example #6
def save_bundle_manifest(replica: Replica, uuid: str, version: str,
                         bundle: dict) -> typing.Tuple[bool, bool]:
    handle = Config.get_blobstore_handle(replica)
    data = json.dumps(bundle).encode("utf-8")
    fqid = BundleFQID(uuid, version).to_key()
    created, idempotent = idempotent_save(handle, replica.bucket, fqid, data)
    if created and idempotent:
        cache_key = _cache_key_template.format(replica=replica.name, fqid=fqid)
        _bundle_manifest_cache[cache_key] = bundle
    return created, idempotent
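save_bundle_manifest only populates the cache when the write both created the object and was idempotent. A minimal sketch of the (created, idempotent) contract it assumes; test_object_exists is the helper seen in Examples #3 and #15, while upload_file_handle is an assumed blobstore call:

import io
import typing

def idempotent_save(handle, bucket: str, key: str, data: bytes) -> typing.Tuple[bool, bool]:
    if test_object_exists(handle, bucket, key):
        # an object is already there: the save is idempotent only if its bytes match
        return False, handle.get(bucket, key) == data
    handle.upload_file_handle(bucket, key, io.BytesIO(data))  # assumed API
    return True, True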
Example #7
def test_tombstone_to_bundle_fqid(self):
    uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
    version = "2017-12-05T235728.441373Z"
    self.assertEqual(
        BundleTombstoneID(uuid=uuid, version=version).to_fqid(),
        BundleFQID(uuid=uuid, version=version),
    )
    self.assertRaises(
        ValueError,
        lambda: BundleTombstoneID(uuid=uuid, version=None).to_fqid(),
    )

def test_to_str(self):
    uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
    version = "2017-12-05T235728.441373Z"
    self.assertEqual(str(BundleFQID(uuid=uuid, version=version)),
                     f"{uuid}.{version}")
    self.assertEqual(str(FileFQID(uuid=uuid, version=version)),
                     f"{uuid}.{version}")
    self.assertEqual(str(TombstoneID(uuid=uuid, version=version)),
                     f"{uuid}.{version}.dead")
    self.assertEqual(str(TombstoneID(uuid=uuid, version=None)),
                     f"{uuid}.dead")
    class Iterator:
        keys = [BundleFQID(uuid=uuid.uuid4(),
                           version=datetime_to_version_format(datetime.datetime.utcnow())).to_key()
                for i in range(10)]

        def __init__(self, *args, **kwargs):
            self.start_after_key = None
            self.token = 'frank'

        def __iter__(self):
            for key in self.keys:
                self.start_after_key = key
                yield self.start_after_key
Example #10
def get_bundle_manifest(
        uuid: str,
        replica: Replica,
        version: typing.Optional[str],
        *,
        bucket: typing.Optional[str] = None) -> typing.Optional[dict]:
    cache_key = _cache_key_template.format(
        replica=replica.name,
        fqid=BundleFQID(uuid, version).to_key())
    if cache_key in _bundle_manifest_cache:
        return _bundle_manifest_cache[cache_key]
    else:
        bundle = _get_bundle_manifest(uuid, replica, version, bucket=bucket)
        if bundle is not None:
            _bundle_manifest_cache[cache_key] = bundle
        return bundle
Example #11
    def __iter__(self):
        for key in self._keys():
            fqid = BundleFQID.from_key(key)
            if fqid.uuid != self.bundle_info['uuid']:
                for bundle_fqid in self._living_fqids_in_bundle_info():
                    yield bundle_fqid
                self._init_bundle_info(fqid)
            else:
                if not fqid.is_fully_qualified():
                    self.bundle_info['contains_unversioned_tombstone'] = True
                else:
                    self.bundle_info['fqids'][fqid] = isinstance(
                        fqid, BundleTombstoneID)

        for bundle_fqid in self._living_fqids_in_bundle_info():
            yield bundle_fqid
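One plausible shape for the _living_fqids_in_bundle_info helper this iterator assumes, given that bundle_info tracks a contains_unversioned_tombstone flag and an fqids map of identifier to is-tombstone (a guess at the contract, not the actual code):

def _living_fqids_in_bundle_info(self):
    # an unversioned tombstone kills every version of the bundle
    if not self.bundle_info['contains_unversioned_tombstone']:
        for fqid, is_tombstone in self.bundle_info['fqids'].items():
            if not is_tombstone:
                yield fqid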
Example #12
def build_bundle_metadata_document(replica: Replica, key: str) -> dict:
    """
    This returns a JSON document with bundle manifest and metadata files suitable for JMESPath filters.
    """
    handle = Config.get_blobstore_handle(replica)
    manifest = json.loads(handle.get(replica.bucket, key).decode("utf-8"))
    fqid = BundleFQID.from_key(key)
    bundle_info = dict(uuid=fqid.uuid, version=fqid.version)
    if key.endswith(TOMBSTONE_SUFFIX):
        return dict(event_type="TOMBSTONE",
                    bundle_info=bundle_info,
                    **manifest)
    else:
        lock = threading.Lock()
        files: dict = defaultdict(list)

        def _read_file(file_metadata):
            blob_key = "blobs/{}.{}.{}.{}".format(
                file_metadata['sha256'],
                file_metadata['sha1'],
                file_metadata['s3-etag'],
                file_metadata['crc32c'],
            )
            contents = handle.get(replica.bucket, blob_key).decode("utf-8")
            try:
                file_info = json.loads(contents)
            except json.decoder.JSONDecodeError:
                logging.info(f"{file_metadata['name']} not json decodable")
            else:
                # Modify name to avoid confusion with JMESPath syntax
                name = _dot_to_underscore_and_strip_numeric_suffix(
                    file_metadata['name'])
                with lock:
                    files[name].append(file_info)

        # TODO: Consider scaling parallelization with Lambda size
        with ThreadPoolExecutor(max_workers=4) as e:
            e.map(_read_file, [
                file_metadata for file_metadata in manifest['files']
                if file_metadata['content-type'].startswith("application/json")
            ])

        return dict(event_type="CREATE",
                    bundle_info=bundle_info,
                    manifest=manifest,
                    files=dict(files))
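The blob key format hard-coded in _read_file above is the same layout compose_blob_key produces in Examples #13 and #17. A sketch consistent with that format string, assuming the FileMetadata constants map to these literal key names:

def compose_blob_key(file_metadata: dict) -> str:
    # blob objects are addressed by the four checksums of their contents
    return "blobs/{}.{}.{}.{}".format(
        file_metadata['sha256'],
        file_metadata['sha1'],
        file_metadata['s3-etag'],
        file_metadata['crc32c'],
    )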
Example #13
def dependencies_exist(source_replica: Replica, dest_replica: Replica,
                       key: str):
    """
    Given a source replica and manifest key, checks if all dependencies of the corresponding DSS object are present in
    dest_replica:
     - Given a file manifest key, checks if blobs exist in dest_replica.
     - Given a bundle manifest key, checks if file manifests exist in dest_replica.
     - Given a collection key, checks if all collection contents exist in dest_replica.
    Returns true if all dependencies exist in dest_replica, false otherwise.
    """
    source_handle = Config.get_blobstore_handle(source_replica)
    dest_handle = Config.get_blobstore_handle(dest_replica)
    if key.endswith(TOMBSTONE_SUFFIX):
        return True
    elif key.startswith(FILE_PREFIX):
        file_id = FileFQID.from_key(key)
        file_manifest = get_json_metadata(
            entity_type="file",
            uuid=file_id.uuid,
            version=file_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        blob_path = compose_blob_key(file_manifest)
        if exists(dest_replica, blob_path):
            return True
    elif key.startswith(BUNDLE_PREFIX):
        # head all file manifests
        bundle_id = BundleFQID.from_key(key)
        bundle_manifest = get_json_metadata(
            entity_type="bundle",
            uuid=bundle_id.uuid,
            version=bundle_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            with ThreadPoolExecutor(max_workers=20) as e:
                futures = list()
                for file in bundle_manifest[BundleMetadata.FILES]:
                    file_uuid = file[BundleFileMetadata.UUID]
                    file_version = file[BundleFileMetadata.VERSION]
                    futures.append(
                        e.submit(get_json_metadata,
                                 entity_type="file",
                                 uuid=file_uuid,
                                 version=file_version,
                                 replica=dest_replica,
                                 blobstore_handle=dest_handle,
                                 max_metadata_size=max_syncable_metadata_size))
                for future in as_completed(futures):
                    future.result()
            return True
        except Exception:
            pass
    elif key.startswith(COLLECTION_PREFIX):
        collection_id = CollectionFQID.from_key(key)
        collection_manifest = get_json_metadata(
            entity_type="collection",
            uuid=collection_id.uuid,
            version=collection_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            verify_collection(contents=collection_manifest["contents"],
                              replica=dest_replica,
                              blobstore_handle=dest_handle)
            return True
        except Exception:
            pass
    else:
        raise NotImplementedError("Unknown prefix for key {}".format(key))
    return False
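A plausible sketch of the exists helper used in the file branch above, expressed in terms of the test_object_exists helper seen in Examples #3 and #15 (the real helper may work differently):

def exists(replica: Replica, key: str) -> bool:
    return test_object_exists(Config.get_blobstore_handle(replica),
                              replica.bucket, key)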
Example #14
def put(uuid: str, replica: str, json_request_body: dict, version: str = None):
    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket

    # what's the target object name for the bundle manifest?
    bundle_manifest_key = BundleFQID(uuid=uuid, version=version).to_key()

    # decode the list of files.
    files = [{'user_supplied_metadata': file} for file in json_request_body['files']]

    time_left = nestedcontext.inject("time_left")

    while True:  # each time through the outer while-loop, we try to gather up all the file metadata.
        for file in files:
            user_supplied_metadata = file['user_supplied_metadata']
            metadata_key = FileFQID(
                uuid=user_supplied_metadata['uuid'],
                version=user_supplied_metadata['version'],
            ).to_key()
            if 'file_metadata' not in file:
                try:
                    file_metadata = handle.get(bucket, metadata_key)
                except BlobNotFoundError:
                    continue
                file['file_metadata'] = json.loads(file_metadata)
                if uuid != file['file_metadata']['bundle_uuid']:
                    raise DSSException(
                        requests.codes.conflict,
                        "incorrect_file_bundle_uuid",
                        f"File bundle_uuid {file['file_metadata']['bundle_uuid']} does not equal bundle uuid {uuid}"
                    )

        # check to see if any file metadata is still not yet loaded.
        for file in files:
            if 'file_metadata' not in file:
                missing_file_user_metadata = file['user_supplied_metadata']
                break
        else:
            break

        # if there is still time left, wait briefly and retry; otherwise fall through and give up.
        if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
            time.sleep(1)
            continue

        raise DSSException(
            requests.codes.conflict,
            "file_missing",
            f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
        )

    # build a manifest consisting of all the files.
    bundle_metadata = {
        BundleMetadata.FORMAT: BundleMetadata.FILE_FORMAT_VERSION,
        BundleMetadata.VERSION: version,
        BundleMetadata.FILES: [
            {
                BundleFileMetadata.NAME: file['user_supplied_metadata']['name'],
                BundleFileMetadata.UUID: file['user_supplied_metadata']['uuid'],
                BundleFileMetadata.VERSION: file['user_supplied_metadata']['version'],
                BundleFileMetadata.CONTENT_TYPE: file['file_metadata'][FileMetadata.CONTENT_TYPE],
                BundleFileMetadata.SIZE: file['file_metadata'][FileMetadata.SIZE],
                BundleFileMetadata.INDEXED: file['user_supplied_metadata']['indexed'],
                BundleFileMetadata.CRC32C: file['file_metadata'][FileMetadata.CRC32C],
                BundleFileMetadata.S3_ETAG: file['file_metadata'][FileMetadata.S3_ETAG],
                BundleFileMetadata.SHA1: file['file_metadata'][FileMetadata.SHA1],
                BundleFileMetadata.SHA256: file['file_metadata'][FileMetadata.SHA256],
            }
            for file in files
        ],
        BundleMetadata.CREATOR_UID: json_request_body['creator_uid'],
    }

    created, idempotent = _idempotent_save(
        handle,
        bucket,
        bundle_manifest_key,
        bundle_metadata,
    )

    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            "bundle_already_exists",
            f"bundle with UUID {uuid} and version {version} already exists"
        )
    status_code = requests.codes.created if created else requests.codes.ok

    return jsonify(dict(version=version)), status_code
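Unlike idempotent_save in Example #6, _idempotent_save is called here with a dict rather than bytes. A plausible wrapper under the assumption that it simply serializes the manifest first:

import json
import typing

def _idempotent_save(handle, bucket: str, key: str, manifest: dict) -> typing.Tuple[bool, bool]:
    # serialize the manifest, then delegate to the byte-level save
    return idempotent_save(handle, bucket, key,
                           json.dumps(manifest).encode("utf-8"))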
Example #15
def get_bundle_from_bucket(uuid: str,
                           replica: Replica,
                           version: typing.Optional[str],
                           bucket: typing.Optional[str],
                           directurls: bool = False):
    uuid = uuid.lower()

    handle = Config.get_blobstore_handle(replica)
    default_bucket = replica.bucket

    # need the ability to use fixture bucket for testing
    bucket = default_bucket if bucket is None else bucket

    def tombstone_exists(uuid: str, version: typing.Optional[str]):
        return test_object_exists(
            handle, bucket,
            TombstoneID(uuid=uuid, version=version).to_key())

    # handle the following deletion cases
    # 1. the whole bundle is deleted
    # 2. the specific version of the bundle is deleted
    if tombstone_exists(uuid, None) or (version
                                        and tombstone_exists(uuid, version)):
        raise DSSException(404, "not_found", "EMPTY Cannot find file!")

    # handle the following deletion case
    # 3. no version is specified, we want the latest _non-deleted_ version
    if version is None:
        # list the files and find the one that is the most recent.
        prefix = f"bundles/{uuid}."
        object_names = handle.list(bucket, prefix)
        version = _latest_version_from_object_names(object_names)

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    bundle_fqid = BundleFQID(uuid=uuid, version=version)

    # retrieve the bundle metadata.
    try:
        bundle_metadata = json.loads(
            handle.get(
                bucket,
                bundle_fqid.to_key(),
            ).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    filesresponse = []  # type: typing.List[dict]
    for file in bundle_metadata[BundleMetadata.FILES]:
        file_version = {
            'name': file[BundleFileMetadata.NAME],
            'content-type': file[BundleFileMetadata.CONTENT_TYPE],
            'size': file[BundleFileMetadata.SIZE],
            'uuid': file[BundleFileMetadata.UUID],
            'version': file[BundleFileMetadata.VERSION],
            'crc32c': file[BundleFileMetadata.CRC32C],
            's3_etag': file[BundleFileMetadata.S3_ETAG],
            'sha1': file[BundleFileMetadata.SHA1],
            'sha256': file[BundleFileMetadata.SHA256],
            'indexed': file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=replica.storage_schema,
                netloc=bucket,
                path="blobs/{}.{}.{}.{}".format(
                    file[BundleFileMetadata.SHA256],
                    file[BundleFileMetadata.SHA1],
                    file[BundleFileMetadata.S3_ETAG],
                    file[BundleFileMetadata.CRC32C],
                ),
            ))
        filesresponse.append(file_version)

    return dict(bundle=dict(
        uuid=uuid,
        version=version,
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    ))
Example #16
def get_bundle_fqid() -> BundleFQID:
    return BundleFQID(uuid=str(uuid.uuid4()), version=get_version())
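get_version is presumably the same timestamp-to-version conversion used by the Iterator fixture in Example #7; a sketch under that assumption:

import datetime

def get_version() -> str:
    return datetime_to_version_format(datetime.datetime.utcnow())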
Example #17
    def test_dependencies_exist(self):
        file_uuid, file_version = str(uuid.uuid4()), get_version()
        bundle_uuid, bundle_version = str(uuid.uuid4()), get_version()
        collection_data = {
            "contents": [{
                "type": "bundle",
                "uuid": bundle_uuid,
                "version": bundle_version
            }, {
                "type": "file",
                "uuid": file_uuid,
                "version": file_version
            }]
        }
        bundle_data = {
            BundleMetadata.FILES: [{
                BundleFileMetadata.UUID: file_uuid,
                BundleFileMetadata.VERSION: file_version
            }]
        }
        file_data = {
            FileMetadata.SHA256: "sync_test",
            FileMetadata.SHA1: "sync_test",
            FileMetadata.S3_ETAG: "sync_test",
            FileMetadata.CRC32C: str(uuid.uuid4())
        }

        with self.subTest("collection without deps"):
            collection_key = "{}/{}".format(COLLECTION_PREFIX,
                                            get_collection_fqid())
            collection_blob = self.s3_bucket.Object(collection_key)
            collection_blob.put(Body=json.dumps(collection_data).encode())
            self.assertFalse(
                sync.dependencies_exist(Replica.aws, Replica.aws,
                                        collection_key))

        with self.subTest("bundle without deps"):
            bundle_key = "{}/{}".format(
                BUNDLE_PREFIX,
                BundleFQID(uuid=bundle_uuid, version=bundle_version))
            bundle_blob = self.s3_bucket.Object(bundle_key)
            bundle_blob.put(Body=json.dumps(bundle_data).encode())

            self.assertFalse(
                sync.dependencies_exist(Replica.aws, Replica.aws,
                                        collection_key))
            self.assertFalse(
                sync.dependencies_exist(Replica.aws, Replica.aws, bundle_key))

        with self.subTest("file without deps"):
            file_key = "{}/{}".format(
                FILE_PREFIX, FileFQID(uuid=file_uuid, version=file_version))
            file_blob = self.s3_bucket.Object(file_key)
            file_blob.put(Body=json.dumps(file_data).encode())

            @eventually(timeout=8, interval=1, errors={Exception})
            def check_file_revdeps():
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            collection_key))
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            bundle_key))
                self.assertFalse(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            file_key))

            check_file_revdeps()

        with self.subTest(
                "blob presence causes all dependencies to be resolved"):
            blob_key = compose_blob_key(file_data)
            blob_blob = self.s3_bucket.Object(blob_key)
            blob_blob.put(Body=b"sync_test")

            @eventually(timeout=8, interval=1, errors={Exception})
            def check_blob_revdeps():
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            collection_key))
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            bundle_key))
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            file_key))

            check_blob_revdeps()
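check_file_revdeps and check_blob_revdeps lean on the eventually decorator to absorb eventual consistency in the storage layer. A minimal sketch matching the (timeout, interval, errors) signature used above (an assumption about the helper, not its actual source):

import functools
import time

def eventually(timeout: float, interval: float, errors=frozenset({Exception})):
    def decorate(func):
        @functools.wraps(func)
        def call(*args, **kwargs):
            # retry until the wrapped assertion stops raising or time runs out
            deadline = time.time() + timeout
            while True:
                try:
                    return func(*args, **kwargs)
                except tuple(errors):
                    if time.time() >= deadline:
                        raise
                    time.sleep(interval)
        return call
    return decorate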