コード例 #1
0
 def test_to_key_prefix(self):
     """
     Check that ``BundleFQID.to_key_prefix`` renders ``<BUNDLE_PREFIX>/<uuid>.<version>``.

     With ``version=None`` the expected prefix stops right after the dot that follows the UUID.
     """
     uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
     version = "2017-12-05T235728.441373Z"
     # assertEqual replaces the deprecated assertEquals alias, which no longer exists in Python 3.12.
     self.assertEqual(
         BundleFQID(uuid=uuid, version=version).to_key_prefix(),
         f"{BUNDLE_PREFIX}/{uuid}.{version}")
     self.assertEqual(
         BundleFQID(uuid=uuid, version=None).to_key_prefix(),
         f"{BUNDLE_PREFIX}/{uuid}.")
コード例 #2
0
 def _read_bundle_manifest(cls, replica: Replica, fqid: BundleFQID) -> dict:
     """
     Fetch and parse the bundle manifest stored under ``fqid.to_key()`` in the replica's bucket.

     :param replica: replica whose blobstore handle and bucket are used for the read
     :param fqid: bundle identifier; its ``to_key()`` result is the object key that is fetched
     :return: the manifest as parsed from its JSON text
     """
     handle = Config.get_blobstore_handle(replica)
     bucket_name = replica.bucket
     # Compute the key once so the fetch and the log line are guaranteed to agree.
     manifest_key = fqid.to_key()
     manifest_string = handle.get(bucket_name, manifest_key).decode("utf-8")
     logger.debug(
         "Read bundle manifest from bucket %s with bundle key %s: %s",
         bucket_name, manifest_key, manifest_string)
     # The text is already decoded; json.loads() ignored `encoding` and rejects it since Python 3.9.
     manifest = json.loads(manifest_string)
     return manifest
コード例 #3
0
ファイル: bundles.py プロジェクト: HumanCellAtlas/data-store
def _get_bundle_manifest(
        uuid: str,
        replica: Replica,
        version: typing.Optional[str],
        *,
        bucket: typing.Optional[str] = None) -> typing.Optional[dict]:
    """
    Load a bundle manifest from cloud storage while honouring tombstones.

    When `version` is None, the version picked by `_latest_version_from_object_names` from the objects listed
    under the bundle's key prefix is used.

    None is returned when the bundle as a whole or the requested version has a tombstone, when no version can be
    determined, or when the manifest blob does not exist.
    """
    uuid = uuid.lower()
    handle = Config.get_blobstore_handle(replica)

    # Tests may supply a fixture bucket; otherwise use the replica's own bucket.
    default_bucket = replica.bucket
    if bucket is None:
        bucket = default_bucket

    def is_tombstoned(candidate_version: typing.Optional[str]):
        tombstone_key = BundleTombstoneID(uuid=uuid, version=candidate_version).to_key()
        return test_object_exists(handle, bucket, tombstone_key)

    # Deletion case 1: the whole bundle is deleted.
    if is_tombstoned(None):
        return None

    # Deletion case 2: the specific version of the bundle is deleted.
    if version and is_tombstoned(version):
        return None

    # Deletion case 3: no version was requested, so look for the latest one among the stored objects.
    if version is None:
        listing = handle.list(bucket, f"bundles/{uuid}.")
        version = _latest_version_from_object_names(listing)
        if version is None:
            # Nothing matched the prefix.
            return None

    manifest_key = BundleFQID(uuid=uuid, version=version).to_key()

    # Fetch and parse the manifest; a missing blob is reported as "not found" rather than as an error.
    try:
        return json.loads(handle.get(bucket, manifest_key).decode("utf-8"))
    except BlobNotFoundError:
        return None
コード例 #4
0
 def test_from_key(self):
     """
     Test that the from key method correctly returns the right types of identifiers
     """
     uuid = "ca11ab1e-0000-4a6b-8f0d-a7d2105c23be"
     version = "2017-12-05T235728.441373Z"
     # assertEqual replaces the deprecated assertEquals alias, which no longer exists in Python 3.12.
     # Plain (non-tombstone) keys map onto the FQID class that matches their prefix.
     self.assertEqual(
         BundleFQID(uuid, version),
         ObjectIdentifier.from_key(f"{BUNDLE_PREFIX}/{uuid}.{version}"),
     )
     self.assertEqual(
         FileFQID(uuid, version),
         ObjectIdentifier.from_key(f"{FILE_PREFIX}/{uuid}.{version}"),
     )
     self.assertEqual(
         CollectionFQID(uuid, version),
         ObjectIdentifier.from_key(f"{COLLECTION_PREFIX}/{uuid}.{version}"),
     )
     # Keys carrying the ".dead" suffix map onto the matching tombstone class.
     self.assertEqual(
         CollectionTombstoneID(uuid, version),
         ObjectIdentifier.from_key(f"{COLLECTION_PREFIX}/{uuid}.{version}.dead"),
     )
     self.assertEqual(
         BundleTombstoneID(uuid, version),
         ObjectIdentifier.from_key(f"{BUNDLE_PREFIX}/{uuid}.{version}.dead"),
     )
     # A key without a uuid/version part and a key with an unknown prefix must both be rejected.
     with self.assertRaises(ValueError):
         ObjectIdentifier.from_key(f"{BUNDLE_PREFIX}/trash")
     with self.assertRaises(ValueError):
         ObjectIdentifier.from_key(f"trash/{uuid}.{version}.dead")
コード例 #5
0
ファイル: __init__.py プロジェクト: HumanCellAtlas/data-store
def get_deleted_bundle_metadata_document(replica: Replica, key: str) -> dict:
    """
    Build the bundle metadata document associated with a non-existent key.

    Only `key` is consulted: the document carries the "DELETE" event type plus the uuid and version parsed from it.
    """
    bundle_id = BundleFQID.from_key(key)
    bundle_info = {"uuid": bundle_id.uuid, "version": bundle_id.version}
    return {"event_type": "DELETE", "bundle_info": bundle_info}
コード例 #6
0
ファイル: bundles.py プロジェクト: HumanCellAtlas/data-store
def save_bundle_manifest(replica: Replica, uuid: str, version: str,
                         bundle: dict) -> typing.Tuple[bool, bool]:
    """
    Serialize `bundle` to JSON and store it under the bundle's key via `idempotent_save`.

    The manifest is also placed in the in-memory manifest cache, but only when the save reports both
    `created` and `idempotent`.

    :return: the `(created, idempotent)` pair reported by `idempotent_save`
    """
    blobstore = Config.get_blobstore_handle(replica)
    payload = json.dumps(bundle).encode("utf-8")
    manifest_key = BundleFQID(uuid, version).to_key()

    created, idempotent = idempotent_save(blobstore, replica.bucket, manifest_key, payload)
    if not (created and idempotent):
        return created, idempotent

    # Freshly written manifest: remember it so later reads can skip the blobstore.
    cache_key = _cache_key_template.format(replica=replica.name, fqid=manifest_key)
    _bundle_manifest_cache[cache_key] = bundle
    return created, idempotent
コード例 #7
0
 def test_tombstone_to_bundle_fqid(self):
     """
     Check that a versioned bundle tombstone converts to the matching ``BundleFQID`` and that an unversioned
     tombstone refuses the conversion with ``ValueError``.
     """
     uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
     version = "2017-12-05T235728.441373Z"
     # assertTrue(a, b) would treat the expected FQID as the failure message and only check that the
     # result is truthy; assertEqual actually compares the two identifiers.
     self.assertEqual(
         BundleTombstoneID(uuid=uuid, version=version).to_fqid(),
         BundleFQID(uuid=uuid, version=version),
     )
     with self.assertRaises(ValueError):
         BundleTombstoneID(uuid=uuid, version=None).to_fqid()
コード例 #8
0
 def test_to_str(self):
     """
     Check the ``str()`` form of identifiers: ``<uuid>.<version>`` for FQIDs, with a trailing ``.dead`` for
     tombstones, and ``<uuid>.dead`` for a tombstone without a version.
     """
     uuid = "0ddba11-0000-4a6b-8f0d-a7d2105c23be"
     version = "2017-12-05T235728.441373Z"
     # assertEqual replaces the deprecated assertEquals alias, which no longer exists in Python 3.12.
     self.assertEqual(str(BundleFQID(uuid=uuid, version=version)),
                      f"{uuid}.{version}")
     self.assertEqual(str(FileFQID(uuid=uuid, version=version)),
                      f"{uuid}.{version}")
     self.assertEqual(str(TombstoneID(uuid=uuid, version=version)),
                      f"{uuid}.{version}.dead")
     self.assertEqual(str(TombstoneID(uuid=uuid, version=None)),
                      f"{uuid}.dead")
コード例 #9
0
    class Iterator:
        """
        Iterable over ten generated bundle keys that remembers the most recent key it handed out.

        Constructor arguments are accepted and ignored.
        """

        # Built once when the class body runs, so every instance shares the same ten keys.
        keys = [
            BundleFQID(
                uuid=uuid.uuid4(),
                version=datetime_to_version_format(datetime.datetime.utcnow()),
            ).to_key()
            for _ in range(10)
        ]

        def __init__(self, *args, **kwargs):
            self.token = 'frank'
            self.start_after_key = None

        def __iter__(self):
            # Record each key as the resume point before yielding it.
            for current in self.keys:
                self.start_after_key = current
                yield current
コード例 #10
0
ファイル: bundles.py プロジェクト: HumanCellAtlas/data-store
def get_bundle_manifest(
        uuid: str,
        replica: Replica,
        version: typing.Optional[str],
        *,
        bucket: typing.Optional[str] = None) -> typing.Optional[dict]:
    """
    Return the bundle manifest, consulting the in-memory manifest cache before the blobstore.

    The cache key combines the replica name with the bundle key built from `uuid` and `version`. On a cache
    miss the manifest is loaded with `_get_bundle_manifest`; a result of None is returned but never cached.
    """
    replica_name = replica.name
    bundle_key = BundleFQID(uuid, version).to_key()
    cache_key = _cache_key_template.format(replica=replica_name, fqid=bundle_key)

    if cache_key in _bundle_manifest_cache:
        return _bundle_manifest_cache[cache_key]

    manifest = _get_bundle_manifest(uuid, replica, version, bucket=bucket)
    if manifest is not None:
        _bundle_manifest_cache[cache_key] = manifest
    return manifest
コード例 #11
0
ファイル: bundles.py プロジェクト: HumanCellAtlas/data-store
    def __iter__(self):
        """
        Walk the keys from `self._keys()`, grouping them by bundle uuid in `self.bundle_info`, and yield whatever
        `self._living_fqids_in_bundle_info()` produces each time a group is complete.
        """
        for key in self._keys():
            fqid = BundleFQID.from_key(key)
            if fqid.uuid != self.bundle_info['uuid']:
                # A different uuid begins: emit the finished group, then start a new one from this fqid.
                yield from self._living_fqids_in_bundle_info()
                self._init_bundle_info(fqid)
            elif fqid.is_fully_qualified():
                # Record the versioned entry together with whether it is a tombstone.
                self.bundle_info['fqids'][fqid] = isinstance(fqid, BundleTombstoneID)
            else:
                self.bundle_info['contains_unversioned_tombstone'] = True

        # Emit the last group, which no uuid change will flush.
        yield from self._living_fqids_in_bundle_info()
コード例 #12
0
ファイル: __init__.py プロジェクト: HumanCellAtlas/data-store
def build_bundle_metadata_document(replica: Replica, key: str) -> dict:
    """
    Build a JSON document with the bundle manifest and metadata files, suitable for JMESPath filters.

    For a tombstone key the document has event type "TOMBSTONE" and the stored object's fields are merged into
    its top level. Otherwise it has event type "CREATE" and carries the manifest plus the parsed contents of
    every manifest file whose content type starts with "application/json", grouped by normalized file name.
    """
    handle = Config.get_blobstore_handle(replica)
    manifest = json.loads(handle.get(replica.bucket, key).decode("utf-8"))
    fqid = BundleFQID.from_key(key)
    bundle_info = dict(uuid=fqid.uuid, version=fqid.version)

    if key.endswith(TOMBSTONE_SUFFIX):
        return dict(event_type="TOMBSTONE", bundle_info=bundle_info, **manifest)

    files_lock = threading.Lock()
    files_by_name: dict = defaultdict(list)
    checksum_fields = ('sha256', 'sha1', 's3-etag', 'crc32c')

    def _load_json_file(entry):
        checksums = [entry[field] for field in checksum_fields]
        blob_key = "blobs/{}.{}.{}.{}".format(*checksums)
        raw = handle.get(replica.bucket, blob_key).decode("utf-8")
        try:
            parsed = json.loads(raw)
        except json.decoder.JSONDecodeError:
            logging.info(f"{entry['name']} not json decodable")
            return
        # Modify name to avoid confusion with JMESPath syntax
        name = _dot_to_underscore_and_strip_numeric_suffix(entry['name'])
        # Workers run concurrently, so appends to the shared mapping are serialized.
        with files_lock:
            files_by_name[name].append(parsed)

    json_entries = [
        entry for entry in manifest['files']
        if entry['content-type'].startswith("application/json")
    ]

    # TODO: Consider scaling parallelization with Lambda size
    # NOTE(review): the iterator returned by map() is never consumed, so an exception raised inside a worker
    # is not re-raised here - confirm that dropping such files silently is intended.
    with ThreadPoolExecutor(max_workers=4) as pool:
        pool.map(_load_json_file, json_entries)

    return dict(event_type="CREATE",
                bundle_info=bundle_info,
                manifest=manifest,
                files=dict(files_by_name))
コード例 #13
0
ファイル: sync.py プロジェクト: HumanCellAtlas/data-store
def dependencies_exist(source_replica: Replica, dest_replica: Replica,
                       key: str):
    """
    Given a source replica and manifest key, checks if all dependencies of the corresponding DSS object are present in
    dest_replica:
     - Given a file manifest key, checks if blobs exist in dest_replica.
     - Given a bundle manifest key, checks if file manifests exist in dest_replica.
     - Given a collection key, checks if all collection contents exist in dest_replica.
    Tombstone keys have no dependencies and always yield true.
    Returns true if all dependencies exist in dest_replica, false otherwise.

    :raises NotImplementedError: if the key is not a tombstone and has none of the known prefixes
    """
    source_handle = Config.get_blobstore_handle(source_replica)
    dest_handle = Config.get_blobstore_handle(dest_replica)
    if key.endswith(TOMBSTONE_SUFFIX):
        return True
    elif key.startswith(FILE_PREFIX):
        file_id = FileFQID.from_key(key)
        file_manifest = get_json_metadata(
            entity_type="file",
            uuid=file_id.uuid,
            version=file_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        blob_path = compose_blob_key(file_manifest)
        if exists(dest_replica, blob_path):
            return True
    elif key.startswith(BUNDLE_PREFIX):
        # head all file manifests
        bundle_id = BundleFQID.from_key(key)
        bundle_manifest = get_json_metadata(
            entity_type="bundle",
            uuid=bundle_id.uuid,
            version=bundle_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            with ThreadPoolExecutor(max_workers=20) as e:
                futures = list()
                for file in bundle_manifest[BundleMetadata.FILES]:
                    file_uuid = file[BundleFileMetadata.UUID]
                    file_version = file[BundleFileMetadata.VERSION]
                    # The file manifests are looked up in dest_replica, so the handle must be the
                    # destination's as well; pairing dest_replica with source_handle only works when
                    # both replicas share a blobstore.
                    futures.append(
                        e.submit(get_json_metadata,
                                 entity_type="file",
                                 uuid=file_uuid,
                                 version=file_version,
                                 replica=dest_replica,
                                 blobstore_handle=dest_handle,
                                 max_metadata_size=max_syncable_metadata_size))
                for future in as_completed(futures):
                    # result() re-raises any lookup failure, which lands in the except below.
                    future.result()
            return True
        except Exception:
            # Any failure while reading a file manifest is treated as "dependency missing".
            pass
    elif key.startswith(COLLECTION_PREFIX):
        collection_id = CollectionFQID.from_key(key)
        collection_manifest = get_json_metadata(
            entity_type="collection",
            uuid=collection_id.uuid,
            version=collection_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            verify_collection(contents=collection_manifest["contents"],
                              replica=dest_replica,
                              blobstore_handle=dest_handle)
            return True
        except Exception:
            # Any verification failure is treated as "dependency missing".
            pass
    else:
        raise NotImplementedError("Unknown prefix for key {}".format(key))
    return False
コード例 #14
0
def put(uuid: str, replica: str, json_request_body: dict, version: str = None):
    """
    Create a bundle manifest from the files listed in the request body and store it in the replica's bucket.

    The uuid is lower-cased and the version is normalized with `datetime_to_version_format`; when no version is
    given, the current UTC time is used. The metadata of every listed file is then read from the bucket,
    retrying once per second for files that are not there yet for as long as `time_left()` exceeds
    `PUT_TIME_ALLOWANCE_SECONDS`.

    :param uuid: bundle UUID
    :param replica: name of the replica, used to look up the `Replica` member
    :param json_request_body: request body; its 'files' list and 'creator_uid' entry are read
    :param version: optional version timestamp, parsed with `iso8601.parse_date`
    :return: a `(response, status_code)` pair; the response is the JSON-encoded version and the status is
        "created" when the manifest was newly written and "ok" otherwise
    :raises DSSException: with a conflict status when a file's bundle_uuid differs from `uuid`, when a file's
        metadata is still missing once time has run out, or when the save reports a non-idempotent outcome
    """
    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket

    # what's the target object name for the bundle manifest?
    bundle_manifest_key = BundleFQID(uuid=uuid, version=version).to_key()

    # decode the list of files.
    # each entry starts with only the caller's data; 'file_metadata' is added once it has been read.
    files = [{'user_supplied_metadata': file} for file in json_request_body['files']]

    # callable injected from the surrounding context; presumably reports the remaining time in seconds.
    time_left = nestedcontext.inject("time_left")

    while True:  # each time through the outer while-loop, we try to gather up all the file metadata.
        for file in files:
            user_supplied_metadata = file['user_supplied_metadata']
            metadata_key = FileFQID(
                uuid=user_supplied_metadata['uuid'],
                version=user_supplied_metadata['version'],
            ).to_key()
            # only fetch metadata that an earlier pass has not already loaded.
            if 'file_metadata' not in file:
                try:
                    file_metadata = handle.get(bucket, metadata_key)
                except BlobNotFoundError:
                    # not there (yet); leave the entry unloaded so a later pass can retry it.
                    continue
                file['file_metadata'] = json.loads(file_metadata)
                # a file may only be added to the bundle it was uploaded for.
                if uuid != file['file_metadata']['bundle_uuid']:
                    raise DSSException(
                        requests.codes.conflict,
                        "incorrect_file_bundle_uuid",
                        f"File bundle_uuid {file['file_metadata']['bundle_uuid']} does not equal bundle uuid {uuid}"
                    )

        # check to see if any file metadata is still not yet loaded.
        for file in files:
            if 'file_metadata' not in file:
                # remember the first missing file for the error message below.
                missing_file_user_metadata = file['user_supplied_metadata']
                break
        else:
            # every file has its metadata; leave the retry loop.
            break

        # while enough time remains, wait a second and try again; otherwise give up below.
        if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
            time.sleep(1)
            continue

        raise DSSException(
            requests.codes.conflict,
            "file_missing",
            f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
        )

    # build a manifest consisting of all the files.
    # name, uuid, version and indexed come from the request; the remaining fields come from the stored metadata.
    bundle_metadata = {
        BundleMetadata.FORMAT: BundleMetadata.FILE_FORMAT_VERSION,
        BundleMetadata.VERSION: version,
        BundleMetadata.FILES: [
            {
                BundleFileMetadata.NAME: file['user_supplied_metadata']['name'],
                BundleFileMetadata.UUID: file['user_supplied_metadata']['uuid'],
                BundleFileMetadata.VERSION: file['user_supplied_metadata']['version'],
                BundleFileMetadata.CONTENT_TYPE: file['file_metadata'][FileMetadata.CONTENT_TYPE],
                BundleFileMetadata.SIZE: file['file_metadata'][FileMetadata.SIZE],
                BundleFileMetadata.INDEXED: file['user_supplied_metadata']['indexed'],
                BundleFileMetadata.CRC32C: file['file_metadata'][FileMetadata.CRC32C],
                BundleFileMetadata.S3_ETAG: file['file_metadata'][FileMetadata.S3_ETAG],
                BundleFileMetadata.SHA1: file['file_metadata'][FileMetadata.SHA1],
                BundleFileMetadata.SHA256: file['file_metadata'][FileMetadata.SHA256],
            }
            for file in files
        ],
        BundleMetadata.CREATOR_UID: json_request_body['creator_uid'],
    }

    created, idempotent = _idempotent_save(
        handle,
        bucket,
        bundle_manifest_key,
        bundle_metadata,
    )

    # a non-idempotent outcome is reported to the caller as "bundle already exists".
    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            "bundle_already_exists",
            f"bundle with UUID {uuid} and version {version} already exists"
        )
    status_code = requests.codes.created if created else requests.codes.ok

    return jsonify(dict(version=version)), status_code
コード例 #15
0
def get_bundle_from_bucket(uuid: str,
                           replica: Replica,
                           version: typing.Optional[str],
                           bucket: typing.Optional[str],
                           directurls: bool = False):
    """
    Read a bundle manifest from `bucket` (or the replica's bucket when `bucket` is None) and describe the bundle.

    When `version` is None, the version picked by `_latest_version_from_object_names` from the objects listed
    under the bundle's key prefix is used. With `directurls` set, every file entry also gets a 'url' that
    points at the file's blob in the bucket.

    :return: a dict whose 'bundle' entry holds the uuid, version, file descriptions and creator_uid
    :raises DSSException: 404 when the bundle or the requested version has a tombstone, when no version can be
        determined, or when the manifest blob does not exist
    """
    uuid = uuid.lower()
    handle = Config.get_blobstore_handle(replica)

    # Tests may supply a fixture bucket; otherwise use the replica's own bucket.
    default_bucket = replica.bucket
    if bucket is None:
        bucket = default_bucket

    def is_tombstoned(candidate_version: typing.Optional[str]):
        tombstone_key = TombstoneID(uuid=uuid, version=candidate_version).to_key()
        return test_object_exists(handle, bucket, tombstone_key)

    # Deletion cases 1 and 2: the whole bundle, or the specific version requested, is deleted.
    deleted = is_tombstoned(None) or (version and is_tombstoned(version))
    if deleted:
        raise DSSException(404, "not_found", "EMPTY Cannot find file!")

    # Deletion case 3: no version was requested, so look for the latest one among the stored objects.
    if version is None:
        listing = handle.list(bucket, f"bundles/{uuid}.")
        version = _latest_version_from_object_names(listing)
        if version is None:
            # Nothing matched the prefix.
            raise DSSException(404, "not_found", "Cannot find file!")

    manifest_key = BundleFQID(uuid=uuid, version=version).to_key()

    # Fetch and parse the bundle manifest.
    try:
        bundle_metadata = json.loads(handle.get(bucket, manifest_key).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    def blob_url(entry) -> str:
        blob_path = "blobs/{}.{}.{}.{}".format(
            entry[BundleFileMetadata.SHA256],
            entry[BundleFileMetadata.SHA1],
            entry[BundleFileMetadata.S3_ETAG],
            entry[BundleFileMetadata.CRC32C],
        )
        builder = UrlBuilder().set(
            scheme=replica.storage_schema,
            netloc=bucket,
            path=blob_path,
        )
        return str(builder)

    def describe(entry) -> dict:
        description = {
            'name': entry[BundleFileMetadata.NAME],
            'content-type': entry[BundleFileMetadata.CONTENT_TYPE],
            'size': entry[BundleFileMetadata.SIZE],
            'uuid': entry[BundleFileMetadata.UUID],
            'version': entry[BundleFileMetadata.VERSION],
            'crc32c': entry[BundleFileMetadata.CRC32C],
            's3_etag': entry[BundleFileMetadata.S3_ETAG],
            'sha1': entry[BundleFileMetadata.SHA1],
            'sha256': entry[BundleFileMetadata.SHA256],
            'indexed': entry[BundleFileMetadata.INDEXED],
        }
        if directurls:
            description['url'] = blob_url(entry)
        return description

    filesresponse: typing.List[dict] = [
        describe(entry) for entry in bundle_metadata[BundleMetadata.FILES]
    ]

    bundle = dict(
        uuid=uuid,
        version=version,
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    )
    return dict(bundle=bundle)
コード例 #16
0
ファイル: __init__.py プロジェクト: hannes-ucsc/data-store
def get_bundle_fqid() -> BundleFQID:
    """Create a `BundleFQID` from a freshly generated random UUID and the version given by `get_version()`."""
    fresh_uuid = str(uuid.uuid4())
    return BundleFQID(uuid=fresh_uuid, version=get_version())
コード例 #17
0
    def test_dependencies_exist(self):
        """
        Upload a collection, a bundle, a file manifest and finally a blob, one per sub-test, and check after
        each upload which of their keys `sync.dependencies_exist` reports as having all dependencies present.
        """
        file_uuid, file_version = str(uuid.uuid4()), get_version()
        bundle_uuid, bundle_version = str(uuid.uuid4()), get_version()

        file_ref = {"type": "file", "uuid": file_uuid, "version": file_version}
        bundle_ref = {"type": "bundle", "uuid": bundle_uuid, "version": bundle_version}
        collection_data = {"contents": [bundle_ref, file_ref]}
        bundle_data = {
            BundleMetadata.FILES: [{
                BundleFileMetadata.UUID: file_uuid,
                BundleFileMetadata.VERSION: file_version,
            }]
        }
        file_data = {
            FileMetadata.SHA256: "sync_test",
            FileMetadata.SHA1: "sync_test",
            FileMetadata.S3_ETAG: "sync_test",
            FileMetadata.CRC32C: str(uuid.uuid4()),
        }

        def deps_exist(key):
            # Source and destination are the same replica throughout this test.
            return sync.dependencies_exist(Replica.aws, Replica.aws, key)

        def upload(key, body):
            self.s3_bucket.Object(key).put(Body=body)

        with self.subTest("collection without deps"):
            collection_key = "{}/{}".format(COLLECTION_PREFIX, get_collection_fqid())
            upload(collection_key, json.dumps(collection_data).encode())
            self.assertFalse(deps_exist(collection_key))

        with self.subTest("bundle without deps"):
            bundle_fqid = BundleFQID(uuid=bundle_uuid, version=bundle_version)
            bundle_key = "{}/{}".format(BUNDLE_PREFIX, bundle_fqid)
            upload(bundle_key, json.dumps(bundle_data).encode())

            self.assertFalse(deps_exist(collection_key))
            self.assertFalse(deps_exist(bundle_key))

        with self.subTest("file without deps"):
            file_fqid = FileFQID(uuid=file_uuid, version=file_version)
            file_key = "{}/{}".format(FILE_PREFIX, file_fqid)
            upload(file_key, json.dumps(file_data).encode())

            # Retried for up to 8 seconds until the assertions stop raising.
            @eventually(timeout=8, interval=1, errors={Exception})
            def check_file_revdeps():
                self.assertTrue(deps_exist(collection_key))
                self.assertTrue(deps_exist(bundle_key))
                # The file's own blob has not been uploaded yet.
                self.assertFalse(deps_exist(file_key))

            check_file_revdeps()

        with self.subTest("blob presence causes all dependencies to be resolved"):
            blob_key = compose_blob_key(file_data)
            upload(blob_key, b"sync_test")

            @eventually(timeout=8, interval=1, errors={Exception})
            def check_blob_revdeps():
                for key in (collection_key, bundle_key, file_key):
                    self.assertTrue(deps_exist(key))

            check_blob_revdeps()