Ejemplo n.º 1
0
def _patch_client_for_direct_access(client: DSSClient):
    """
    Monkey-patch the given client so that files and bundles are first
    requested through a `MiniDSS` instance, falling back to the client's
    original methods whenever that direct attempt raises.

    `client.get_file` is replaced with a bound method and `client.get_bundle`
    with an object that only offers `paginate()`. The client is modified in
    place and nothing is returned.

    :param client: The DSS client instance to patch
    """
    # Keep the original implementations around for the fallback path
    old_get_file = client.get_file
    old_get_bundle = client.get_bundle
    mini_dss = MiniDSS(config.dss_endpoint)

    def new_get_file(self, uuid, replica, version=None):
        # The function is bound to `client` below, so `self` must be `client`
        assert client is self
        try:
            blob = mini_dss.get_file(uuid, version, replica)
        except Exception:
            # Any failure of the direct route triggers the fallback. The
            # traceback is logged so that the cause of the failure is not lost.
            logger.warning(
                'Failed getting file %s, version %s directly. '
                'Falling back to official method', uuid, version,
                exc_info=True)
            return old_get_file(uuid=uuid, version=version, replica=replica)
        else:
            return blob

    class NewGetBundle:
        """
        Stand-in for the original `client.get_bundle` that only implements
        `paginate()`.
        """

        def paginate(self, *args, **kwargs):
            """
            Return a single-element list holding one page for the requested
            bundle. `uuid`, `version` and `replica` must be passed as keyword
            arguments; a missing one raises KeyError.
            """
            uuid, version, replica = kwargs['uuid'], kwargs['version'], kwargs[
                'replica']
            try:
                bundle = mini_dss.get_bundle(uuid, version, replica)
            except Exception:
                # Same fallback policy as for files, traceback included
                logger.warning(
                    'Failed getting bundle file %s, version %s directly. '
                    'Falling back to official method', uuid, version,
                    exc_info=True)
                return old_get_bundle.paginate(*args, **kwargs)
            else:
                page = {'bundle': bundle, 'version': version, 'uuid': uuid}
                return [page]

    new_get_bundle = NewGetBundle()
    client.get_file = types.MethodType(new_get_file, client)
    client.get_bundle = new_get_bundle
Ejemplo n.º 2
0
class DataStoreAgent:
    """
    Convenience wrapper around a `DSSClient` that is configured for one
    deployment of the data store. Every method that talks to the data store
    takes a `replica` argument that defaults to 'aws'.
    """

    # Swagger URL for every deployment other than "prod"
    DSS_SWAGGER_URL_TEMPLATE = "https://dss.{deployment}.data.humancellatlas.org/v1/swagger.json"
    # "prod" has no deployment component in its host name
    DSS_PROD_SWAGGER_URL = "https://dss.data.humancellatlas.org/v1/swagger.json"

    def __init__(self, deployment):
        """
        :param deployment: Name of the deployment; "prod" selects
                           DSS_PROD_SWAGGER_URL, any other value is
                           substituted into DSS_SWAGGER_URL_TEMPLATE.
        """
        self.deployment = deployment
        if self.deployment == "prod":
            swagger_url = self.DSS_PROD_SWAGGER_URL
        else:
            swagger_url = self.DSS_SWAGGER_URL_TEMPLATE.format(
                deployment=deployment)
        self.client = DSSClient(swagger_url=swagger_url)

    def search(self, query, replica='aws'):
        """
        Run `query` through the client's `post_search` and return the
        'results' entry of the response. An empty list is returned if the
        client raises SwaggerAPIException.
        """
        try:
            response = self.client.post_search(replica=replica, es_query=query)
        except SwaggerAPIException:
            return []
        else:
            return response['results']

    def search_iterate(self, query, replica='aws'):
        """
        Yield every hit produced by the client's `post_search.iterate` for
        `query`.
        """
        yield from self.client.post_search.iterate(replica=replica,
                                                   es_query=query)

    def download_bundle(self, bundle_uuid, target_folder, replica='aws'):
        """
        Download all files listed in the manifest of a bundle.

        :param bundle_uuid: UUID of the bundle; also used as the name of the
                            folder created inside `target_folder`
        :param target_folder: Folder in which the bundle folder is created
        :param replica: Replica used for the manifest and for every file
        :return: Path of the folder the files were written to
        """
        Progress.report(f"Downloading bundle {bundle_uuid}:\n")
        manifest = self.bundle_manifest(bundle_uuid, replica=replica)
        bundle_folder = os.path.join(target_folder, bundle_uuid)
        # Re-using an existing folder is fine. Anything else in the way (e.g.
        # a regular file of that name) is reported right here.
        os.makedirs(bundle_folder, exist_ok=True)

        for entry in manifest['bundle']['files']:
            self.download_file(entry['uuid'],
                               save_as=os.path.join(bundle_folder,
                                                    entry['name']),
                               replica=replica)
        return bundle_folder

    def bundle_manifest(self, bundle_uuid, replica='aws'):
        """
        Return the client's `get_bundle` response for the given bundle.
        """
        return self.client.get_bundle(replica=replica, uuid=bundle_uuid)

    def download_file(self, file_uuid, save_as, replica='aws'):
        """
        Stream the file with the given UUID into the local file `save_as`,
        overwriting it if it exists.
        """
        # Imported locally so this method does not depend on a module-level
        # import being present.
        import shutil

        Progress.report(f"Downloading file {file_uuid} to {save_as}\n")
        with self.client.get_file.stream(replica=replica,
                                         uuid=file_uuid) as fh:
            with open(save_as, "wb") as f:
                # Copies in chunks until the stream is exhausted
                shutil.copyfileobj(fh.raw, f)

    def tombstone_bundle(self, bundle_uuid, replica='aws'):
        """
        Ask the client to delete the given bundle, using a fixed reason.
        """
        self.client.delete_bundle(replica=replica,
                                  uuid=bundle_uuid,
                                  reason="DCP-wide integration test")
Ejemplo n.º 3
0
from hca import HCAConfig
from hca.dss import DSSClient

# Point the client at the `dev` deployment of the DSS
hca_config = HCAConfig()
hca_config[
    'DSSClient'].swagger_url = 'https://dss.dev.data.humancellatlas.org/v1/swagger.json'
dss = DSSClient(config=hca_config)

# Print the file listing of the first search hit whose bundle can be fetched
for hit in dss.post_search.iterate(replica='aws', es_query={}):
    uuid, version = hit['bundle_fqid'].split('.', 1)
    try:
        files = dss.get_bundle(replica='aws', uuid=uuid,
                               version=version)['bundle']['files']
        lines = [f'Bundle: {uuid}.{version}']
        lines.extend(f"    File: {entry['uuid']}.{entry['version']}"
                     for entry in files)
    except Exception:
        # Best effort: a bundle that cannot be fetched or listed is skipped
        # and the next hit is tried. Catching `Exception` instead of using a
        # bare `except` keeps Ctrl-C and interpreter exit working.
        continue
    print('\n'.join(lines))
    break
Ejemplo n.º 4
0
def fetch_bundle():
    """
    Fetch one fixed, hard-coded bundle version from the `aws` replica through
    a `DSSClient` created with default arguments and return the client's
    response.
    """
    bundle_coordinates = {
        "uuid": "002aeac5-4d74-462d-baea-88f5c620cb50",
        "version": "2019-08-01T200147.836900Z",
    }
    client = DSSClient()
    return client.get_bundle(replica="aws", **bundle_coordinates)
Ejemplo n.º 5
0
def download_bundle_metadata(
        client: DSSClient,
        replica: str,
        uuid: str,
        version: Optional[str] = None,
        directurls: bool = False,
        presignedurls: bool = False,
        num_workers: Optional[int] = None) -> Tuple[str, List[JSON], JSON]:
    """
    Download the metadata for a given bundle from the HCA data store (DSS).

    The bundle manifest is requested first. Every manifest entry whose
    `indexed` flag is set is treated as a metadata file and downloaded from
    the same replica as the manifest.

    :param client: A DSS API client instance

    :param replica: The name of the DSS replica to use, both for the bundle manifest and for the metadata files

    :param uuid: The UUID of the bundle in DSS

    :param version: The version of the bundle. If None, the most recent version of the bundle will be downloaded.

    :param directurls: Whether to include direct-access URLs in the response. This is mutually
                       exclusive with the presignedurls parameter. Note: including `directurls` and `presignedurls` in
                       the function call will cause the DSS to copy metadata and data files in the bundle to another
                       bucket first. That could be time-consuming and/or inefficient for users who only want to work
                       with the metadata instead of the files. It is very likely `directurls` and `presignedurls` will
                       be removed or changed in the future.

    :param presignedurls: Whether to include presigned URLs in the response. This is mutually exclusive with the
                          directurls parameter. Like `directurls`, this is a temporary parameter that is not
                          guaranteed to remain available in the future.

    :param num_workers: The size of the thread pool to use for downloading metadata files in parallel. If None, the
                        default pool size will be used, typically a small multiple of the number of cores on the system
                        executing this function. If 0, no thread pool will be used and all files will be downloaded
                        sequentially by the current thread.

    :return: A tuple consisting of the version of the downloaded bundle, a list of the manifest entries for all files
             in the bundle (data and metadata) and a dictionary mapping the file name of each metadata file in the
             bundle to the JSON contents of that file.
    """
    if directurls or presignedurls:
        logger.warning(
            "PendingDeprecationWarning: `directurls` and `presignedurls` are temporary parameters and not"
            " guaranteed to stay in the code base in the future!")

    logger.debug("Getting bundle %s.%s from DSS.", uuid, version)
    # noinspection PyUnresolvedReferences
    response = client.get_bundle(uuid=uuid,
                                 version=version,
                                 replica=replica,
                                 directurls=directurls,
                                 presignedurls=presignedurls)
    bundle = response['bundle']
    manifest = bundle['files']
    # Only manifest entries flagged as indexed are downloaded
    metadata_entries = {f["name"]: f for f in manifest if f["indexed"]}

    def download_file(item):
        file_name, manifest_entry = item
        file_uuid = manifest_entry['uuid']
        file_version = manifest_entry['version']
        logger.debug("Getting file '%s' (%s.%s) from DSS.", file_name,
                     file_uuid, file_version)
        # Use the replica requested by the caller so that the files come from
        # the same replica as the manifest they are listed in.
        # noinspection PyUnresolvedReferences
        return file_name, client.get_file(uuid=file_uuid,
                                          version=file_version,
                                          replica=replica)

    if num_workers == 0:
        # Lazy; the downloads happen while dict() below consumes the iterator
        downloaded = map(download_file, metadata_entries.items())
    else:
        # All downloads have completed by the time the `with` block is left;
        # an exception raised by a worker surfaces when dict() below consumes
        # the results.
        with ThreadPoolExecutor(num_workers) as tpe:
            downloaded = tpe.map(download_file, metadata_entries.items())

    return bundle['version'], manifest, dict(downloaded)