Example 1
def _patch_client_for_direct_access(client: DSSClient):
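    """
    Patch the given DSSClient instance so that its get_file and get_bundle
    methods first try direct access via MiniDSS, falling back to the official
    client methods if the direct attempt raises.
    """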
    old_get_file = client.get_file
    old_get_bundle = client.get_bundle
    mini_dss = MiniDSS(config.dss_endpoint)

    def new_get_file(self, uuid, replica, version=None):
        assert client is self
        try:
            blob = mini_dss.get_file(uuid, version, replica)
        except Exception:
            logger.warning(
                'Failed getting file %s, version %s directly. '
                'Falling back to official method', uuid, version)
            return old_get_file(uuid=uuid, version=version, replica=replica)
        else:
            return blob

    class NewGetBundle:
        def paginate(self, *args, **kwargs):
            uuid, version, replica = kwargs['uuid'], kwargs['version'], kwargs[
                'replica']
            try:
                bundle = mini_dss.get_bundle(uuid, version, replica)
            except Exception:
                logger.warning(
                    'Failed getting bundle file %s, version %s directly. '
                    'Falling back to official method', uuid, version)
                return old_get_bundle.paginate(*args, **kwargs)
            else:
                page = {'bundle': bundle, 'version': version, 'uuid': uuid}
                return [page]

    new_get_bundle = NewGetBundle()
    client.get_file = types.MethodType(new_get_file, client)
    client.get_bundle = new_get_bundle
Example 2
    def test_swagger_client_no_refresh(self):
        """
        Instantiates the normal DSSClient with a 3600 second expiration token so that we can check
        that it successfully uses the same token for both requests.
        """
        dss = DSSClient(
            swagger_url=
            'https://dss.dev.data.humancellatlas.org/v1/swagger.json')
        assert dss._authenticated_session is None

        # we use collections to test because it's an authenticated endpoint
        r = dss.get_collections()
        assert 'collections' in r
        token_one = dss._authenticated_session.token['access_token']
        expires_at = dss._authenticated_session.token[
            'expires_at'] - time.time()
        assert expires_at < 3600
        assert expires_at > 3590

        time.sleep(2)

        r = dss.get_collections()
        assert 'collections' in r
        token_two = dss._authenticated_session.token['access_token']
        expires_at = dss._authenticated_session.token[
            'expires_at'] - time.time()
        assert expires_at < 3600
        assert expires_at > 3590

        assert token_one == token_two  # we used one long-lived token for both requests
Example 3
    def __init__(self, deployment):
        self.deployment = deployment
        if self.deployment == "prod":
            swagger_url = self.DSS_PROD_SWAGGER_URL
        else:
            swagger_url = self.DSS_SWAGGER_URL_TEMPLATE.format(
                deployment=deployment)
        self.client = DSSClient(swagger_url=swagger_url)
Example 4
class DataStoreAgent:
    DSS_SWAGGER_URL_TEMPLATE = "https://dss.{deployment}.data.humancellatlas.org/v1/swagger.json"
    DSS_PROD_SWAGGER_URL = "https://dss.data.humancellatlas.org/v1/swagger.json"

    def __init__(self, deployment):
        self.deployment = deployment
        if self.deployment == "prod":
            swagger_url = self.DSS_PROD_SWAGGER_URL
        else:
            swagger_url = self.DSS_SWAGGER_URL_TEMPLATE.format(
                deployment=deployment)
        self.client = DSSClient(swagger_url=swagger_url)

    def search(self, query, replica='aws'):
        try:
            response = self.client.post_search(replica=replica, es_query=query)
            return response['results']
        except SwaggerAPIException:
            return []

    def search_iterate(self, query, replica='aws'):
        for hit in self.client.post_search.iterate(replica=replica,
                                                   es_query=query):
            yield hit

    def download_bundle(self, bundle_uuid, target_folder):
        Progress.report(f"Downloading bundle {bundle_uuid}:\n")
        manifest = self.bundle_manifest(bundle_uuid)
        bundle_folder = os.path.join(target_folder, bundle_uuid)
        try:
            os.makedirs(bundle_folder)
        except FileExistsError:
            pass

        for f in manifest['bundle']['files']:
            self.download_file(f['uuid'],
                               save_as=os.path.join(bundle_folder, f['name']))
        return bundle_folder

    def bundle_manifest(self, bundle_uuid, replica='aws'):
        return self.client.get_bundle(replica=replica, uuid=bundle_uuid)

    def download_file(self, file_uuid, save_as, replica='aws'):
        Progress.report(f"Downloading file {file_uuid} to {save_as}\n")
        with self.client.get_file.stream(replica=replica,
                                         uuid=file_uuid) as fh:
            with open(save_as, "wb") as f:
                while True:
                    chunk = fh.raw.read(1024)
                    if chunk:
                        f.write(chunk)
                    else:
                        break

    def tombstone_bundle(self, bundle_uuid, replica='aws'):
        self.client.delete_bundle(replica=replica,
                                  uuid=bundle_uuid,
                                  reason="DCP-wide integration test")
Example 5
def _get_dss_client():
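    # `stage` is assumed to be a module-level deployment name such as
    # 'dev', 'integration', 'staging' or 'prod'.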
    if stage == "prod":
        dss_client = DSSClient(
            swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")
    else:
        dss_client = DSSClient(
            swagger_url=
            f"https://dss.{stage}.data.humancellatlas.org/v1/swagger.json")
    return dss_client
Example 6
def get_dss_generator(query):
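    # Returns a lazy iterator over all matching bundles together with the
    # total hit count, which is taken from a separate non-iterating search.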
    dss_client = DSSClient(
        swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")
    bundle_generator = dss_client.post_search.iterate(replica="aws",
                                                      es_query=query,
                                                      output_format="raw")
    total_hits = dss_client.post_search(replica="aws",
                                        es_query=query,
                                        output_format="raw").get('total_hits')
    return (bundle_generator, total_hits)
Example 7
def manage_subscriptions(plugin: RepositoryPlugin,
                         dss_client: DSSClient,
                         subscribe=True):
    response = dss_client.get_subscriptions(replica='aws',
                                            subscription_type='elasticsearch')
    current_subscriptions = freeze(response['subscriptions'])

    key, key_id = deployment.aws.get_hmac_key_and_id()

    if subscribe:
        base_url = config.indexer_endpoint()
        prefix = config.dss_query_prefix
        new_subscriptions = [
            freeze(
                dict(replica='aws',
                     es_query=query(prefix),
                     callback_url=furl(url=base_url,
                                       path=(config.default_catalog, action)),
                     hmac_key_id=key_id))
            for query, action in [(
                plugin.dss_subscription_query,
                'add'), (plugin.dss_deletion_subscription_query, 'delete')]
        ]
    else:
        new_subscriptions = []

    for subscription in current_subscriptions:
        # Note the use of <= to allow for the fact that DSS returns subscriptions with additional attributes, more
        # than were originally supplied. If the subscription returned by DSS is a superset of the subscription we want
        # to create, we can skip the update.
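        # For example, on plain dicts:
        #   dict(replica='aws').items() <= dict(replica='aws', uuid='…').items()  # True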
        matching_subscription = next(
            (new_subscription for new_subscription in new_subscriptions
             if new_subscription.items() <= subscription.items()), None)
        if matching_subscription:
            logger.info('Already subscribed: %r', thaw(subscription))
            new_subscriptions.remove(matching_subscription)
        else:
            subscription = thaw(subscription)
            logger.info('Removing stale subscription: %r', subscription)
            dss_client.delete_subscription(uuid=subscription['uuid'],
                                           replica=subscription['replica'],
                                           subscription_type='elasticsearch')

    for subscription in new_subscriptions:
        subscription = thaw(subscription)
        response = dss_client.put_subscription(**subscription,
                                               hmac_secret_key=key)
        subscription['uuid'] = response['uuid']
        logger.info('Registered subscription %r.', subscription)
Example 8
def dss_client(deployment: Optional[str] = None) -> DSSClient:
    """
    Return a DSS client to DSS production or the specified DSS deployment.

    :param deployment: The name of a DSS deployment like `dev`, `integration` or `staging`. If None, the production
                       deployment (`prod`) will be used.
    """
    # Work around https://github.com/HumanCellAtlas/dcp-cli/issues/142
    hca_config = HCAConfig()
    deployment = deployment + "." if deployment else ""
    hca_config[
        'DSSClient'].swagger_url = f'https://dss.{deployment}data.humancellatlas.org/v1/swagger.json'
    # Clear the cached swagger specs that may come from a different deployment. This work-around isn't thread safe but
    # neither is the caching itself.
    DSSClient._swagger_spec = None
    client = DSSClient(config=hca_config)
    client.timeout_policy = Timeout(connect=10, read=40)
    return client
Example 9
def main():
    logging.basicConfig(level=logging.INFO)
    hca_config = HCAConfig()
    hca_config[
        "DSSClient"].swagger_url = f"https://dss.dev.data.humancellatlas.org/v1/swagger.json"
    dss = DSSClient(config=hca_config)

    projects = get_target_project_dirs(follow_links=True)

    for project in projects:
        log.info('Uploading %s', project)
        bundle_uuid = project.name
        assert str(UUID(bundle_uuid)) == bundle_uuid
        bundle = project / 'bundle'

        def file_uuid_callback(file_path: str):
            file_path = Path(file_path)
            file_name = file_path.name
            file_uuid = generate_file_uuid(bundle_uuid, file_name)
            log.info('Allocated UUID %s for file %s', file_uuid, file_path)
            if file_name.endswith('.json'):
                with file_path.open('rt') as f:
                    document = json.load(f)
                    if file_name == 'links.json':
                        pass
                    elif file_name == 'project_0.json':
                        assert document['provenance'][
                            'document_id'] == bundle_uuid
                    else:
                        assert document['provenance'][
                            'document_id'] == file_uuid
            return file_uuid

        if bundle.is_dir():
            response = dss.upload(src_dir=str(bundle),
                                  replica='aws',
                                  staging_bucket='lon-test-data',
                                  bundle_uuid=bundle_uuid,
                                  file_uuid_callback=file_uuid_callback)
            print(
                f'Successful upload.  Bundle information is:\n{json.dumps(response, indent=4)}'
            )
        else:
            log.warning('Skipping %s because metadata is missing', project)
Example 10
    def _test_dss_client(self, direct: bool, query: JSON,
                         dss_client: DSSClient, replica: str, fallback: bool):
        with self.subTest(direct=direct, replica=replica, fallback=fallback):
            response = dss_client.post_search(es_query=query,
                                              replica=replica,
                                              per_page=10)
            bundle_uuid, _, bundle_version = response['results'][0][
                'bundle_fqid'].partition('.')
            with mock.patch('azul.dss.logger') as captured_log:
                _, manifest, metadata = download_bundle_metadata(
                    client=dss_client,
                    replica=replica,
                    uuid=bundle_uuid,
                    version=bundle_version,
                    num_workers=config.num_dss_workers)
            log.info('Captured log calls: %r', captured_log.mock_calls)
            self.assertGreater(len(metadata), 0)
            self.assertGreater(set(f['name'] for f in manifest),
                               set(metadata.keys()))
            for f in manifest:
                self.assertIn('s3_etag', f)
            # Extract the log method name and the first three words of the
            # logged message. Note that the PyCharm debugger will call
            # certain dunder methods on the variable, leading to failed
            # assertions.
            actual = [(m, ' '.join(re.split(r'[\s,]', a[0])[:3]))
                      for m, a, k in captured_log.mock_calls]
            if direct:
                if replica == 'aws':
                    if fallback:
                        expected = [('debug', 'Loading bundle %s'),
                                    ('debug', 'Loading object %s'),
                                    ('warning', 'Error accessing bundle'),
                                    ('warning', 'Failed getting bundle')] + [
                                        ('debug', 'Loading file %s'),
                                        ('debug', 'Loading object %s'),
                                        ('warning', 'Error accessing file'),
                                        ('warning', 'Failed getting file')
                                    ] * len(metadata)
                    else:
                        expected = [('debug', 'Loading bundle %s'),
                                    ('debug', 'Loading object %s')] + [
                                        ('debug', 'Loading file %s'),
                                        ('debug', 'Loading object %s'),  # file
                                        ('debug', 'Loading object %s')  # blob
                                    ] * len(metadata)

                else:
                    # On `gcp` the precondition check fails right away, preventing any attempt at direct access
                    expected = [
                        ('warning', 'Failed getting bundle')
                    ] + [('warning', 'Failed getting file')] * len(metadata)
            else:
                expected = []
            self.assertSequenceEqual(sorted(expected), sorted(actual))
Example 11
    def __init__(self, dss_host):
        """
        Create an instance of the DataExtractor.

        It takes the formatted URL of the DSS to talk to
        (e.g. https://dss.staging.data.humancellatlas.org/v1).

        :param dss_host: The formatted url for the DSS
        """
        self.dss_client = DSSClient()
        self.dss_client.host = dss_host
        self.log = logging.getLogger(indexer_name + ".indexer.DataExtractor")
Example 12
def get_dss_generator(hca_project_uuid):
    # files.project_json.provenance.document_id: the project UUID you want to retrieve
    # exists files.project_json.provenance.document_id: filters out test bundles
    # "files.analysis_process_json.type.text": "analysis": excluded so that only primary bundles are returned, not analysis bundles

    query = {
        "query": {
            "bool": {
                "must": [{
                    "match": {
                        "files.project_json.provenance.document_id": ""
                    }
                }, {
                    "exists": {
                        "field": "files.project_json.provenance.document_id"
                    }
                }],
                "must_not": [{
                    "match": {
                        "files.analysis_process_json.type.text": "analysis"
                    }
                }]
            }
        }
    }

    query.get("query").get("bool").get("must")[0].get("match")[
        "files.project_json.provenance.document_id"] = hca_project_uuid

    dss_client = DSSClient(
        swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")
    bundle_generator = dss_client.post_search.iterate(replica="aws",
                                                      es_query=query,
                                                      output_format="raw")
    total_hits = dss_client.post_search(replica="aws",
                                        es_query=query,
                                        output_format="raw").get('total_hits')
    return (bundle_generator, total_hits)
Example 13
    def setUpClass(cls):
        super().setUpClass()
        cls.dss_endpoint = os.getenv("TEST_DSS_ENDPOINT", "https://hca-dss-4.ucsc-cgp-dev.org/v1")
        cls.staging_bucket = os.getenv('DSS_S3_STAGING_BUCKET', 'commons-dss-upload')

        # Work around problems with DSSClient initialization when there is
        # existing HCA configuration. The following issue has been submitted:
        # Problems accessing an alternate DSS from user scripts or unit tests #170
        # https://github.com/HumanCellAtlas/dcp-cli/issues/170
        monkey_patch_hca_config()
        HCAConfig._user_config_home = '/tmp/'
        dss_config = HCAConfig(name='loader-test', save_on_exit=False, autosave=False)
        dss_config['DSSClient'].swagger_url = f'{cls.dss_endpoint}/swagger.json'
        cls.dss_client = DSSClient(config=dss_config)
Example 14
    def __init__(self, dss_endpoint: str, staging_bucket: str,
                 google_project_id: str, dry_run: bool) -> None:
        """
        Functions for uploading files to a given DSS.

        :param dss_endpoint: The URL to a Swagger DSS API.  e.g. "https://commons-dss.ucsc-cgp-dev.org/v1"
        :param staging_bucket: The name of the AWS S3 bucket to be used when staging files for uploading
        to the DSS. As an example, local files are uploaded to the staging bucket, then file metadata tags
        required by the DSS are assigned to it, then the file is loaded into the DSS (by copy).
        The bucket must be accessible by the DSS, e.g. 'commons-dss-upload'.
        :param google_project_id: A Google `Project ID` to be used when accessing GCP requester pays buckets.
        e.g. "platform-dev-178517"
        One way to find a `Project ID` is provided here:
        https://console.cloud.google.com/cloud-resource-manager
        :param dry_run: If True, log the actions that would be performed but don't actually execute them.
        Otherwise, actually perform the operations.
        """
        self.dss_endpoint = dss_endpoint
        self.staging_bucket = staging_bucket
        self.google_project_id = google_project_id
        self.dry_run = dry_run
        self.s3_client = boto3.client("s3")
        self.s3_blobstore = s3.S3BlobStore(self.s3_client)
        self.gs_client = Client()

        # Work around problems with DSSClient initialization when there is
        # existing HCA configuration. The following issue has been submitted:
        # Problems accessing an alternate DSS from user scripts or unit tests #170
        # https://github.com/HumanCellAtlas/dcp-cli/issues/170
        monkey_patch_hca_config()
        HCAConfig._user_config_home = '/tmp/'
        dss_config = HCAConfig(name='loader',
                               save_on_exit=False,
                               autosave=False)
        dss_config[
            'DSSClient'].swagger_url = f'{self.dss_endpoint}/swagger.json'
        self.dss_client = DSSClient(config=dss_config)
Example 15
from hca.dss import DSSClient

dss = DSSClient()

dss.download(
    bundle_uuid="ffffaf55-f19c-40e3-aa81-a6c69d357265",
    version="2019-08-01T200147.836832Z",
    replica="aws",
    download_dir="download_test",
)
Example 16
def get_client():
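    # Lazily create a single shared DSSClient on first use; the hca import is
    # deferred until the client is actually needed.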
    global client
    if client is None:
        from hca.dss import DSSClient
        client = DSSClient()
    return client
Example 17
from hca.dss import DSSClient
import json

dss = DSSClient()

json_response = dss.get_file(replica="aws",
                             uuid="666ff3f0-67a1-4ead-82e9-3f96a8c0a9b1")

for content in json_response:
    print(f"{content}: {json.dumps(json_response[content], indent=4)}")
Example 18
from hca.dss import DSSClient

dss = DSSClient()

UUID = "ffffaf55-f19c-40e3-aa81-a6c69d357265"
VERSION = "ffffaf55-f19c-40e3-aa81-a6c69d357265"

# Download the metadata only
dss.download(bundle_uuid=UUID,
             version=VERSION,
             replica="aws",
             download_dir=".hca_metadata_only")

# Download the data only
dss.download(bundle_uuid=UUID,
             version=VERSION,
             replica="aws",
             download_dir=".hca_data_only")
Example 19
from hca import HCAConfig
from hca.dss import DSSClient

hca_config = HCAConfig()

hca_config[
    "DSSClient"].swagger_url = f"https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

print(
    dss.put_file(
        uuid="ead6434d-efb5-4554-98bc-027e160547c5",
        version="2019-07-30T174916.268875Z",
        creator_uid=0,
        source_url=
        "s3://jeffwu-test/ead6434d-efb5-4554-98bc-027e160547c5/get_bundle.json",
    ))
Example 20
from hca.dss import DSSClient

dss = DSSClient()

dss.create_version()
Example 21
from hca.dss import DSSClient

dss = DSSClient()

checkout_id = dss.post_bundles_checkout(
    uuid="fff746b3-e3eb-496a-88a3-5fa1fa358392", replica="aws")
print(checkout_id)
Example 22
from hca.dss import DSSClient

dss = DSSClient()

# Note:
# Passing es_query={} runs an empty search, which will match all bundles.

# Iterable post_search
for results in dss.post_search.iterate(replica="aws", es_query={}):
    print(results)
    break

# Non-iterable (first page only) post_search
print(dss.post_search(replica='aws', es_query={}))
Example 23
def bundle_url_iterator():

    dss_client = DSSClient(swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")
    q = {
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "files.library_preparation_protocol_json.library_construction_approach.ontology": "EFO:0008931"
                        }
                    },
                    {
                        "match": {
                            "files.sequencing_protocol_json.paired_end": 'true'
                        }
                    },
                    {
                        "match": {
                            "files.donor_organism_json.biomaterial_core.ncbi_taxon_id": 9606
                        }
                    }
                ],
                "should": [
                    {
                        "match": {
                            "files.dissociation_protocol_json.dissociation_method.ontology": "EFO:0009128"
                        }
                    },
                    {
                        "match": {
                            "files.dissociation_protocol_json.dissociation_method.text": "mouth pipette"
                        }
                    }
                ],
                "must_not": [
                    {
                        "terms": {
                            "files.project_json.provenance.document_id": [
                                "1630e3dc-5501-4faf-9726-2e2c0b4da6d7",
                                "fd1d163d-d6a7-41cd-b3bc-9d77ba9a36fe",
                                "2a0faf83-e342-4b1c-bb9b-cf1d1147f3bb",
                                "cf8439db-fcc9-44a8-b66f-8ffbf729bffa",
                                "6b9f514d-d738-403f-a9c2-62580bbe5c83",
                                "311d013c-01e4-42c0-9c2d-25472afa9cbc",
                                "d237ed6a-3a7f-4a91-b300-b070888a8542",
                                "e6cc0b02-2125-4faa-9903-a9025a62efec",
                                "e4dbcb98-0562-4071-8bea-5e8de5f3c147",
                                "e79e9284-c337-4dfd-853d-66fa3facfbbd",
                                "560cd061-9165-4699-bc6e-8253e164c079",
                                "e83fda0e-6515-4f13-82cb-a5860ecfc2d4",
                                "9a60e8c2-32ea-4586-bc1f-7ee58f462b07",
                                "71a6e049-4846-4c2a-8823-cc193c573efc",
                                "4b5a2268-507c-46e6-bab0-3efb30145e85",
                                "364ebb73-652e-4d32-8938-1c922d0b2584",
                                "11f5d59b-0e2c-4f01-85ac-8d8dd3db53be",
                                "c1996526-6466-40ff-820f-dad4d63492ec",
                                "c281dedc-e838-4464-bf51-1cc4efae3fb9",
                                "40afcf6b-422a-47ba-ba7a-33678c949b5c",
                                "71a6e049-4846-4c2a-8823-cc193c573efc",
                                "9a60e8c2-32ea-4586-bc1f-7ee58f462b07",
                                "0facfacd-5b0c-4228-8be5-37aa1f3a269d",
                                "76c209df-42bf-41dc-a5f5-3d27193ca7a6",
                                "bb409c34-bb87-4ed2-adaf-6d1ef10610b5",
                                "1a6b5e5d-914f-4dd6-8817-a1f9b7f364d5",
                                "dd401943-1059-4b2d-b187-7a9e11822f95"
                            ]
                        }
                    }
                ]
            }
        }
    }

    q2 = {
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "files.library_preparation_protocol_json.library_construction_approach.ontology": "EFO:0008931"
                        }
                    },
                    {
                        "match": {
                            "files.donor_organism_json.biomaterial_core.ncbi_taxon_id": 9606
                        }
                    }
                ],
                "must_not": [
                    {
                        "terms": {
                            "files.project_json.provenance.document_id": [
                                "1630e3dc-5501-4faf-9726-2e2c0b4da6d7",
                                "fd1d163d-d6a7-41cd-b3bc-9d77ba9a36fe",
                                "2a0faf83-e342-4b1c-bb9b-cf1d1147f3bb",
                                "cf8439db-fcc9-44a8-b66f-8ffbf729bffa",
                                "6b9f514d-d738-403f-a9c2-62580bbe5c83",
                                "311d013c-01e4-42c0-9c2d-25472afa9cbc",
                                "d237ed6a-3a7f-4a91-b300-b070888a8542",
                                "e6cc0b02-2125-4faa-9903-a9025a62efec",
                                "e4dbcb98-0562-4071-8bea-5e8de5f3c147",
                                "e79e9284-c337-4dfd-853d-66fa3facfbbd",
                                "560cd061-9165-4699-bc6e-8253e164c079",
                                "e83fda0e-6515-4f13-82cb-a5860ecfc2d4",
                                "9a60e8c2-32ea-4586-bc1f-7ee58f462b07",
                                "71a6e049-4846-4c2a-8823-cc193c573efc",
                                "4b5a2268-507c-46e6-bab0-3efb30145e85",
                                "364ebb73-652e-4d32-8938-1c922d0b2584",
                                "11f5d59b-0e2c-4f01-85ac-8d8dd3db53be",
                                "c1996526-6466-40ff-820f-dad4d63492ec",
                                "c281dedc-e838-4464-bf51-1cc4efae3fb9",
                                "40afcf6b-422a-47ba-ba7a-33678c949b5c",
                                "71a6e049-4846-4c2a-8823-cc193c573efc",
                                "9a60e8c2-32ea-4586-bc1f-7ee58f462b07",
                                "0facfacd-5b0c-4228-8be5-37aa1f3a269d",
                                "76c209df-42bf-41dc-a5f5-3d27193ca7a6",
                                "bb409c34-bb87-4ed2-adaf-6d1ef10610b5",
                                "1a6b5e5d-914f-4dd6-8817-a1f9b7f364d5",
                                "dd401943-1059-4b2d-b187-7a9e11822f95"
                            ]
                        }
                    }
                ]
            }
        }
    }


    # return dss_client.post_search.iterate(replica="aws", es_query=q2) #iterator of bundles
    return dss_client.post_search.iterate(replica="aws", es_query=q2, output_format="raw")
Example 24
class DssUploader:
    def __init__(self, dss_endpoint: str, staging_bucket: str,
                 google_project_id: str, dry_run: bool) -> None:
        """
        Functions for uploading files to a given DSS.

        :param dss_endpoint: The URL to a Swagger DSS API.  e.g. "https://commons-dss.ucsc-cgp-dev.org/v1"
        :param staging_bucket: The name of the AWS S3 bucket to be used when staging files for uploading
        to the DSS. As an example, local files are uploaded to the staging bucket, then file metadata tags
        required by the DSS are assigned to it, then the file is loaded into the DSS (by copy).
        The bucket must be accessible by the DSS, e.g. 'commons-dss-upload'.
        :param google_project_id: A Google `Project ID` to be used when accessing GCP requester pays buckets.
        e.g. "platform-dev-178517"
        One way to find a `Project ID` is provided here:
        https://console.cloud.google.com/cloud-resource-manager
        :param dry_run: If True, log the actions that would be performed but don't actually execute them.
        Otherwise, actually perform the operations.
        """
        self.dss_endpoint = dss_endpoint
        self.staging_bucket = staging_bucket
        self.google_project_id = google_project_id
        self.dry_run = dry_run
        self.s3_client = boto3.client("s3")
        self.s3_blobstore = s3.S3BlobStore(self.s3_client)
        self.gs_client = Client()

        # Work around problems with DSSClient initialization when there is
        # existing HCA configuration. The following issue has been submitted:
        # Problems accessing an alternate DSS from user scripts or unit tests #170
        # https://github.com/HumanCellAtlas/dcp-cli/issues/170
        monkey_patch_hca_config()
        HCAConfig._user_config_home = '/tmp/'
        dss_config = HCAConfig(name='loader',
                               save_on_exit=False,
                               autosave=False)
        dss_config[
            'DSSClient'].swagger_url = f'{self.dss_endpoint}/swagger.json'
        self.dss_client = DSSClient(config=dss_config)

    def upload_cloud_file_by_reference(self,
                                       filename: str,
                                       file_uuid: str,
                                       file_cloud_urls: set,
                                       bundle_uuid: str,
                                       guid: str,
                                       file_version: str = None) -> tuple:
        """
        Loads the given cloud file into the DSS by reference, rather than by copying it into the DSS.
        Because the HCA DSS per se does not support loading by reference, this is currently implemented
        using the approach described here:
        https://docs.google.com/document/d/1QSa7Ubw-muyD_u0X_dq9WeKyK_dCJXi4Ex7S_pil1uk/edit#heading=h.exnqjy2n2q78

        This is conceptually similar to creating a "symbolic link" to the cloud file rather than copying the
        source file into the DSS.
        The file's metadata is obtained, formatted as a dictionary, then this dictionary is uploaded
        as a JSON file with content type `dss-type=fileref` into the DSS.

        A request has been made for the HCA data-store to support loading by reference as a feature of the
        data store, here: https://github.com/HumanCellAtlas/data-store/issues/912

        :param filename: The name of the file in the bucket.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file
        :param file_cloud_urls: A set of 'gs://' and 's3://' bucket links.
                                e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :param guid: An optional additional/alternate data identifier/alias to associate with the file
        e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
        :param file_version: a RFC3339 compliant datetime string
        :return: file_uuid: str, file_version: str, filename: str, already_present: bool
        """
        def _create_file_reference(file_cloud_urls: set, guid: str) -> dict:
            """
            Format a file's metadata into a dictionary for uploading as a json to support the approach
            described here:
            https://docs.google.com/document/d/1QSa7Ubw-muyD_u0X_dq9WeKyK_dCJXi4Ex7S_pil1uk/edit#heading=h.exnqjy2n2q78

            :param file_cloud_urls: A set of 'gs://' and 's3://' bucket links.
                                    e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
            :param guid: An optional additional/alternate data identifier/alias to associate with the file
            e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
            :param file_version: RFC3339 formatted timestamp.
            :return: A dictionary of metadata values.
            """
            s3_metadata = None
            gs_metadata = None
            for cloud_url in file_cloud_urls:
                url = urlparse(cloud_url)
                bucket = url.netloc
                key = url.path[1:]
                if url.scheme == "s3":
                    s3_metadata = _get_s3_file_metadata(bucket, key)
                elif url.scheme == "gs":
                    gs_metadata = _get_gs_file_metadata(bucket, key)
                else:
                    raise FileURLError(
                        f"Unsupported cloud URL scheme: {cloud_url}")
            return _consolidate_metadata(file_cloud_urls, s3_metadata,
                                         gs_metadata, guid)

        def _get_s3_file_metadata(bucket: str, key: str) -> dict:
            """
            Format an S3 file's metadata into a dictionary for uploading as a json.

            :param bucket: Name of an S3 bucket
            :param key: S3 file to upload.  e.g. 'output.txt' or 'data/output.txt'
            :return: A dictionary of metadata values.
            """
            metadata = dict()
            try:
                response = self.s3_client.head_object(Bucket=bucket,
                                                      Key=key,
                                                      RequestPayer="requester")
                metadata['content-type'] = response['ContentType']
                metadata['s3_etag'] = response['ETag']
                metadata['size'] = response['ContentLength']
            except Exception as e:
                raise FileURLError(
                    f"Error accessing s3://{bucket}/{key}") from e
            return metadata

        def _get_gs_file_metadata(bucket: str, key: str) -> dict:
            """
            Format a GS file's metadata into a dictionary for uploading as a JSON file.

            :param bucket: Name of a GS bucket.
            :param key: GS file to upload.  e.g. 'output.txt' or 'data/output.txt'
            :return: A dictionary of metadata values.
            """
            metadata = dict()
            try:
                gs_bucket = self.gs_client.bucket(bucket,
                                                  self.google_project_id)
                blob_obj = gs_bucket.get_blob(key)
                metadata['content-type'] = blob_obj.content_type
                metadata['crc32c'] = binascii.hexlify(
                    base64.b64decode(blob_obj.crc32c)).decode("utf-8").lower()
                metadata['size'] = blob_obj.size
            except Exception as e:
                raise FileURLError(
                    f"Error accessing gs://{bucket}/{key}") from e
            return metadata

        def _consolidate_metadata(file_cloud_urls: set,
                                  s3_metadata: Optional[Dict[str, Any]],
                                  gs_metadata: Optional[Dict[str, Any]],
                                  guid: str) -> dict:
            """
            Consolidates cloud file metadata to create the JSON used to load by reference
            into the DSS.

            :param file_cloud_urls: A set of 'gs://' and 's3://' bucket URLs.
                                    e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
            :param s3_metadata: Dictionary of metadata produced by _get_s3_file_metadata().
            :param gs_metadata: Dictionary of metadata produced by _get_gs_file_metadata().
            :param guid: An optional additional/alternate data identifier/alias to associate with the file
            e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
            :return: A dictionary of cloud file metadata values
            """
            consolidated_metadata = dict()
            if s3_metadata:
                consolidated_metadata.update(s3_metadata)
            if gs_metadata:
                consolidated_metadata.update(gs_metadata)
            consolidated_metadata['url'] = list(file_cloud_urls)
            consolidated_metadata['aliases'] = [str(guid)]
            return consolidated_metadata

        if self.dry_run:
            logger.info(
                f"DRY RUN: upload_cloud_file_by_reference: {filename} {str(file_cloud_urls)} {bundle_uuid}"
            )

        file_reference = _create_file_reference(file_cloud_urls, guid)
        return self.upload_dict_as_file(
            file_reference,
            filename,
            file_uuid,
            bundle_uuid,
            file_version=file_version,
            content_type="application/json; dss-type=fileref")

    def upload_dict_as_file(
            self,
            value: dict,
            filename: str,
            file_uuid: str,
            bundle_uuid: str,
            file_version: str = None,  # RFC3339
            content_type=None):
        """
        Create a JSON file in the DSS containing the given dict.

        :param value: A dictionary representing the JSON content of the file to be created.
        :param filename: The basename of the file in the bucket.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :param content_type: Content description e.g. "application/json; dss-type=fileref".
        :param file_version: a RFC3339 compliant datetime string
        :return: file_uuid: str, file_version: str, filename: str, already_present: bool
        """
        tempdir = mkdtemp()
        file_path = "/".join([tempdir, filename])
        with open(file_path, "w") as fh:
            fh.write(json.dumps(value, indent=4))
        result = self.upload_local_file(file_path,
                                        file_uuid,
                                        bundle_uuid,
                                        file_version=file_version,
                                        content_type=content_type)
        os.remove(file_path)
        os.rmdir(tempdir)
        return result

    def upload_local_file(self,
                          path: str,
                          file_uuid: str,
                          bundle_uuid: str,
                          file_version: str = None,
                          content_type=None):
        """
        Upload a file from the local file system to the DSS.

        :param path: Path to a local file.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :param content_type: Content type identifier, for example: "application/json; dss-type=fileref".
        :param file_version: a RFC3339 compliant datetime string
        :return: file_uuid: str, file_version: str, filename: str, already_present: bool
        """
        file_uuid, key = self._upload_local_file_to_staging(
            path, file_uuid, content_type)
        return self._upload_tagged_cloud_file_to_dss_by_copy(
            self.staging_bucket,
            key,
            file_uuid,
            bundle_uuid,
            file_version=file_version)

    def load_bundle(self, file_info_list: list, bundle_uuid: str):
        """
        Loads a bundle to the DSS that contains the specified files.

        :param file_info_list:
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :return: A full qualified bundle id e.g. "{bundle_uuid}.{version}"
        """
        kwargs = dict(replica="aws",
                      creator_uid=CREATOR_ID,
                      files=file_info_list,
                      uuid=bundle_uuid,
                      version=tz_utc_now())
        if not self.dry_run:
            response = self.dss_client.put_bundle(**kwargs)
            version = response['version']
        else:
            logger.info("DRY RUN: DSS put bundle: " + str(kwargs))
            version = None
        bundle_fqid = f"{bundle_uuid}.{version}"
        logger.info(f"Loaded bundle: {bundle_fqid}")
        return bundle_fqid

    @staticmethod
    def get_filename_from_key(key: str):
        assert not key.endswith(
            '/'
        ), 'Please specify a filename, not a directory ({} cannot end in "/").'.format(
            key)
        return key.split("/")[-1]

    def _upload_local_file_to_staging(self, path: str, file_uuid: str,
                                      content_type):
        """
        Upload a local file to the staging bucket, computing the DSS-required checksums
        in the process, then tag the file in the staging bucket with the checksums.
        This is in preparation for subsequently uploading the file from the staging
        bucket into the DSS.

        :param path: Path to a local file.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file.
        :param content_type: Content description, for example: "application/json; dss-type=fileref".
        :return: file_uuid: str, key_name: str
        """
        def _encode_tags(tags):
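            # Convert a {key: value} dict into the TagSet list format expected
            # by S3's put_object_tagging call below.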
            return [dict(Key=k, Value=v) for k, v in tags.items()]

        def _mime_type(filename):
            type_, encoding = mimetypes.guess_type(filename)
            if encoding:
                return encoding
            if type_:
                return type_
            return "application/octet-stream"

        file_size = os.path.getsize(path)
        multipart_chunksize = s3_multipart.get_s3_multipart_chunk_size(
            file_size)
        tx_cfg = TransferConfig(
            multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
            multipart_chunksize=multipart_chunksize)
        s3 = boto3.resource("s3")

        destination_bucket = s3.Bucket(self.staging_bucket)
        with open(path, "rb") as file_handle, ChecksummingBufferedReader(
                file_handle, multipart_chunksize) as fh:
            key_name = "{}/{}".format(file_uuid, os.path.basename(fh.raw.name))
            destination_bucket.upload_fileobj(
                fh,
                key_name,
                Config=tx_cfg,
                ExtraArgs={
                    'ContentType':
                    content_type
                    if content_type is not None else _mime_type(fh.raw.name)
                })
            sums = fh.get_checksums()
            metadata = {
                "hca-dss-s3_etag": sums["s3_etag"],
                "hca-dss-sha1": sums["sha1"],
                "hca-dss-sha256": sums["sha256"],
                "hca-dss-crc32c": sums["crc32c"],
            }

            s3.meta.client.put_object_tagging(
                Bucket=destination_bucket.name,
                Key=key_name,
                Tagging=dict(TagSet=_encode_tags(metadata)))
        return file_uuid, key_name

    def _upload_tagged_cloud_file_to_dss_by_copy(self,
                                                 source_bucket: str,
                                                 source_key: str,
                                                 file_uuid: str,
                                                 bundle_uuid: str,
                                                 file_version: str = None,
                                                 timeout_seconds=1200):
        """
        Uploads a tagged file contained in a cloud bucket to the DSS by copy.
        This is typically used to update a tagged file from a staging bucket into the DSS.

        :param source_bucket: Name of an S3 bucket.  e.g. 'commons-dss-upload'
        :param source_key: S3 file to upload.  e.g. 'output.txt' or 'data/output.txt'
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file.
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :param file_version: a RFC3339 compliant datetime string
        :param timeout_seconds:  Amount of time to continue attempting an async copy.
        :return: file_uuid: str, file_version: str, filename: str, file_present: bool
        """
        source_url = f"s3://{source_bucket}/{source_key}"
        filename = self.get_filename_from_key(source_key)

        if self.dry_run:
            logger.info(
                f"DRY RUN: _upload_tagged_cloud_file_to_dss: {source_bucket} {source_key} {file_uuid} {bundle_uuid}"
            )
            return file_uuid, file_version, filename

        request_parameters = dict(uuid=file_uuid,
                                  version=file_version,
                                  bundle_uuid=bundle_uuid,
                                  creator_uid=CREATOR_ID,
                                  source_url=source_url)
        if self.dry_run:
            print("DRY RUN: put file: " + str(request_parameters))
            return file_uuid, file_version, filename

        copy_start_time = time.time()
        response = self.dss_client.put_file._request(request_parameters)

        # The version we get back here is formatted the way the DSS expects,
        # and we need it in this format later when loading the bundle.
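        # A DSS version looks like '2019-08-01T200147.836832Z' (an RFC3339-style timestamp).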
        file_version = response.json().get('version', "blank")

        # from dss swagger docs:
        # 200 Returned when the file is already present and is identical to the file being uploaded.
        already_present = response.status_code == requests.codes.ok
        if response.status_code == requests.codes.ok:
            logger.info("File %s: Already exists -> %s (%d seconds)",
                        source_url, file_version,
                        (time.time() - copy_start_time))
        elif response.status_code == requests.codes.created:
            logger.info("File %s: Sync copy -> %s (%d seconds)", source_url,
                        file_version, (time.time() - copy_start_time))
        elif response.status_code == requests.codes.accepted:
            logger.info("File %s: Starting async copy -> %s", source_url,
                        file_version)

            timeout = time.time() + timeout_seconds
            wait = 1.0
            # TODO: busy wait could hopefully be replaced with asyncio
            while time.time() < timeout:
                try:
                    self.dss_client.head_file(uuid=file_uuid,
                                              replica="aws",
                                              version=file_version)
                    logger.info(
                        "File %s: Finished async copy -> %s (approximately %d seconds)",
                        source_url, file_version,
                        (time.time() - copy_start_time))
                    break
                except SwaggerAPIException as e:
                    if e.code != requests.codes.not_found:
                        msg = "File {}: Unexpected server response during registration"
                        raise RuntimeError(msg.format(source_url))
                    time.sleep(wait)
                    wait = min(10.0,
                               wait * self.dss_client.UPLOAD_BACKOFF_FACTOR)
            else:
                # timed out. :(
                raise RuntimeError(
                    "File {}: registration FAILED".format(source_url))
            logger.debug("Successfully uploaded file")
        else:
            raise UnexpectedResponseError(
                f'Received unexpected response code {response.status_code}')

        return file_uuid, file_version, filename, already_present
Example 25
from hca import HCAConfig
from hca.dss import DSSClient

hca_config = HCAConfig()
hca_config[
    "DSSClient"].swagger_url = f"https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

# Creates a subscription given a replica and a callback URL
subscription = dss.put_subscription(
    replica="aws",
    callback_url=
    " https://dcp-cli-tutorials-put-delete-get-sub-api.humancellatlas.org`")

callback, owner, replica, uuid = (
    subscription["callback_url"],
    subscription["owner"],
    subscription["replica"],
    subscription["uuid"],
)

# Lists all subscriptions created
print(dss.get_subscriptions(replica="aws"))

# Gets a single subscription by UUID
print(dss.get_subscription(replica="aws", uuid=uuid))

# Deletes a subscription by UUID
print(dss.delete_subscription(replica="aws", uuid=uuid))
Example 26
from hca import HCAConfig
from hca.dss import DSSClient
import uuid
import os

hca_config = HCAConfig()
hca_config["DSSClient"].swagger_url = f"https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

# Creates a new collection
collection = dss.put_collection(
    uuid=str(uuid.uuid4()),
    version="2018-09-17T161441.564206Z",  # arbitrary
    description="foo",
    details={},
    replica="aws",
    name="bar",
    contents=[
        {
            "type": "bundle",
            "uuid": "ff818282-9735-45fa-a094-e9f2d3d0a954",  # overwrite if necessary
            "version": "2019-08-06T170839.843085Z",  # arbitrary
            "path": "https://dss.dev.data.humancellatlas.org/v1/bundles/ff818282-9735-45fa-a094-e9f2d3d0a954?version=2019-08-06T170839.843085Z&replica=aws",
        }
    ],
)

uuid, version = collection["uuid"], collection["version"]

# Gets a list of collections
print(dss.get_collections(replica="aws"))
Example 27
from hca import HCAConfig
from hca.dss import DSSClient
import os

hca_config = HCAConfig()

hca_config[
    "DSSClient"].swagger_url = f"https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

dss.put_bundle(
    creator_uid=0,
    uuid="98f6c379-cb78-4a61-9310-f8cc0341c0ea",
    version="2019-08-02T202456.025543Z",
    replica="aws",
    files=[{
        "uuid": "2196a626-38da-4489-8b2f-645d342f6aab",
        "version": "2019-07-10T001103.121000Z",
        "name": "process_1.json1",
        "indexed": False,
    }],
)
Example 28
from hca import HCAConfig
from hca.dss import DSSClient

hca_config = HCAConfig()
hca_config[
    "DSSClient"].swagger_url = f"https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

print(dss.get_collections())
Example 29
from hca import HCAConfig
from hca.dss import DSSClient

hca_config = HCAConfig()
hca_config[
    'DSSClient'].swagger_url = f'https://dss.dev.data.humancellatlas.org/v1/swagger.json'
dss = DSSClient(config=hca_config)
for i in dss.post_search.iterate(replica='aws', es_query={}):
    uuid, version = i['bundle_fqid'].split('.', 1)
    try:
        s = f'Bundle: {uuid}.{version}\n'
        for j in dss.get_bundle(replica='aws', uuid=uuid,
                                version=version)['bundle']['files']:
            file_version = j['version']
            file_uuid = j['uuid']
            s += f'    File: {file_uuid}.{file_version}\n'
        print(s[:-1])
        break
    except Exception:
        pass  # print(f'Does not exist: {uuid}.{version}')
Example 30
        """Return the url of the dss."""
        return self._dss_url

    @property
    def indexer_url(self):
        """Return the url of the indexer."""
        return self._indexer_url

    @property
    def es_query(self):
        """Return the ElasticSearch query."""
        return self._es_query


default = DefaultProperties()
dss_client = DSSClient()

parser = argparse.ArgumentParser(
    description='Process options the finder of golden bundles.')
parser.add_argument('--dss-url',
                    dest='dss_url',
                    action='store',
                    default=default.dss_url,
                    help='The url for the storage system.')
parser.add_argument('--indexer-url',
                    dest='indexer_url',
                    action='store',
                    default=default.indexer_url,
                    help='The indexer URL')
parser.add_argument('--es-query',
                    dest='es_query',