Beispiel #1
0
class DataStoreAgent:
    """Convenience wrapper around a ``DSSClient`` bound to one deployment.

    The Swagger URL is derived from the deployment name; the ``"prod"``
    deployment uses a dedicated URL without a deployment sub-domain.
    """

    DSS_SWAGGER_URL_TEMPLATE = "https://dss.{deployment}.data.humancellatlas.org/v1/swagger.json"
    DSS_PROD_SWAGGER_URL = "https://dss.data.humancellatlas.org/v1/swagger.json"

    def __init__(self, deployment):
        """Create a client for *deployment* (e.g. ``"prod"``)."""
        self.deployment = deployment
        if deployment == "prod":
            swagger_url = self.DSS_PROD_SWAGGER_URL
        else:
            swagger_url = self.DSS_SWAGGER_URL_TEMPLATE.format(
                deployment=deployment)
        self.client = DSSClient(swagger_url=swagger_url)

    def search(self, query, replica='aws'):
        """Return the ``results`` of a single search request.

        This is deliberately best-effort: if the client raises
        ``SwaggerAPIException`` an empty list is returned instead.
        """
        # Keep the ``try`` body limited to the call that can actually raise.
        try:
            response = self.client.post_search(replica=replica, es_query=query)
        except SwaggerAPIException:
            return []
        return response['results']

    def search_iterate(self, query, replica='aws'):
        """Yield every hit produced by the client's iterating search."""
        yield from self.client.post_search.iterate(replica=replica,
                                                   es_query=query)

    def download_bundle(self, bundle_uuid, target_folder):
        """Download all files of a bundle into ``target_folder/bundle_uuid``.

        The folder is created if necessary and an already existing folder is
        reused. Returns the path of the bundle folder.
        """
        Progress.report(f"Downloading bundle {bundle_uuid}:\n")
        manifest = self.bundle_manifest(bundle_uuid)
        bundle_folder = os.path.join(target_folder, bundle_uuid)
        os.makedirs(bundle_folder, exist_ok=True)

        for f in manifest['bundle']['files']:
            self.download_file(f['uuid'],
                               save_as=os.path.join(bundle_folder, f['name']))
        return bundle_folder

    def bundle_manifest(self, bundle_uuid, replica='aws'):
        """Return whatever the client's ``get_bundle`` returns for the bundle."""
        return self.client.get_bundle(replica=replica, uuid=bundle_uuid)

    def download_file(self, file_uuid, save_as, replica='aws',
                      chunk_size=64 * 1024):
        """Stream the file *file_uuid* to the local path *save_as*.

        The content is copied in pieces of at most *chunk_size* bytes so the
        whole file never has to be held in memory.
        """
        Progress.report(f"Downloading file {file_uuid} to {save_as}\n")
        with self.client.get_file.stream(replica=replica,
                                         uuid=file_uuid) as fh:
            with open(save_as, "wb") as out:
                while True:
                    chunk = fh.raw.read(chunk_size)
                    if not chunk:
                        # A falsy chunk signals the end of the stream.
                        break
                    out.write(chunk)

    def tombstone_bundle(self, bundle_uuid, replica='aws',
                         reason="DCP-wide integration test"):
        """Ask the client to delete the bundle, passing *reason* along.

        *reason* defaults to the message that was previously hard-coded, so
        existing callers behave exactly as before.
        """
        self.client.delete_bundle(replica=replica,
                                  uuid=bundle_uuid,
                                  reason=reason)
def get_dss_generator(query):
    """Search the DSS at the hard-coded Swagger URL on the ``aws`` replica.

    Returns a ``(bundle_generator, total_hits)`` tuple. The first element is
    the object returned by the client's iterating search; the second is the
    ``total_hits`` entry of a separate, non-iterating search response (or
    ``None`` if that entry is missing).
    """
    client = DSSClient(
        swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")
    # Both requests use exactly the same search arguments.
    search_args = dict(replica="aws", es_query=query, output_format="raw")
    hits = client.post_search.iterate(**search_args)
    first_page = client.post_search(**search_args)
    return (hits, first_page.get('total_hits'))
Beispiel #3
0
    def _test_dss_client(self, direct: bool, query: JSON,
                         dss_client: DSSClient, replica: str, fallback: bool):
        """
        Download the metadata of the first bundle matching `query` and check
        both the downloaded content and the log calls made while doing so.

        The expected log calls depend on the combination of `direct`,
        `replica` and `fallback`.
        """
        with self.subTest(direct=direct, replica=replica, fallback=fallback):
            search_response = dss_client.post_search(es_query=query,
                                                     replica=replica,
                                                     per_page=10)
            first_fqid = search_response['results'][0]['bundle_fqid']
            bundle_uuid, _, bundle_version = first_fqid.partition('.')
            with mock.patch('azul.dss.logger') as captured_log:
                _, manifest, metadata = download_bundle_metadata(
                    client=dss_client,
                    replica=replica,
                    uuid=bundle_uuid,
                    version=bundle_version,
                    num_workers=config.num_dss_workers)
            log.info('Captured log calls: %r', captured_log.mock_calls)
            self.assertGreater(len(metadata), 0)
            # The manifest must name strictly more files than there are
            # metadata entries.
            manifest_names = {entry['name'] for entry in manifest}
            self.assertGreater(manifest_names, set(metadata.keys()))
            for entry in manifest:
                self.assertIn('s3_etag', entry)
            # Reduce every captured call to the name of the log method and the
            # first three words of the logged message. Note that the PyCharm
            # debugger will call certain dunder methods on the variable,
            # leading to failed assertions.
            actual = []
            for method_name, call_args, _call_kwargs in captured_log.mock_calls:
                words = re.split(r'[\s,]', call_args[0])
                actual.append((method_name, ' '.join(words[:3])))
            if not direct:
                expected = []
            elif replica != 'aws':
                # On `gcp` the precondition check fails right away, preventing
                # any attempts of direct access
                per_bundle = [('warning', 'Failed getting bundle')]
                per_file = [('warning', 'Failed getting file')]
                expected = per_bundle + per_file * len(metadata)
            elif fallback:
                per_bundle = [('debug', 'Loading bundle %s'),
                              ('debug', 'Loading object %s'),
                              ('warning', 'Error accessing bundle'),
                              ('warning', 'Failed getting bundle')]
                per_file = [('debug', 'Loading file %s'),
                            ('debug', 'Loading object %s'),
                            ('warning', 'Error accessing file'),
                            ('warning', 'Failed getting file')]
                expected = per_bundle + per_file * len(metadata)
            else:
                per_bundle = [('debug', 'Loading bundle %s'),
                              ('debug', 'Loading object %s')]
                per_file = [('debug', 'Loading file %s'),
                            ('debug', 'Loading object %s'),  # file
                            ('debug', 'Loading object %s')]  # blob
                expected = per_bundle + per_file * len(metadata)
            # Order is irrelevant, only the multiset of calls is compared.
            self.assertSequenceEqual(sorted(expected), sorted(actual))
Beispiel #4
0
def get_dss_generator(hca_project_uuid):
    """Search the DSS for the bundles of one project.

    Returns a ``(bundle_generator, total_hits)`` tuple. The first element is
    the object returned by the client's iterating search; the second is the
    ``total_hits`` entry of a separate, non-iterating search response (or
    ``None`` if that entry is missing).
    """
    project_id_field = "files.project_json.provenance.document_id"

    # must / match:  the UUID of the project you want to retrieve.
    # must / exists: requiring the project document id removes test bundles.
    # must_not:      look at primary bundles only, don't return analysis
    #                bundles.
    match_project = {"match": {project_id_field: hca_project_uuid}}
    has_project_id = {"exists": {"field": project_id_field}}
    is_analysis = {
        "match": {"files.analysis_process_json.type.text": "analysis"}
    }
    query = {
        "query": {
            "bool": {
                "must": [match_project, has_project_id],
                "must_not": [is_analysis],
            }
        }
    }

    client = DSSClient(
        swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")
    # Both requests use exactly the same search arguments.
    search_args = dict(replica="aws", es_query=query, output_format="raw")
    hits = client.post_search.iterate(**search_args)
    first_page = client.post_search(**search_args)
    return (hits, first_page.get('total_hits'))
Beispiel #5
0
from hca.dss import DSSClient

dss = DSSClient()

# Note:
# An empty query (es_query={}) runs an empty search, which matches all
# bundles.

# Iterable post_search: walk over the results, but stop after printing the
# first one.
all_results = dss.post_search.iterate(replica="aws", es_query={})
for results in all_results:
    print(results)
    break

# Non-iterable post_search: a single call returns the first page only.
first_page = dss.post_search(replica='aws', es_query={})
print(first_page)