import os

from hca.dss import DSSClient
# Assumed import path within the hca (dcp-cli) package for the Swagger error type.
from hca.util.exceptions import SwaggerAPIException

# `Progress` is assumed to be a reporting helper defined elsewhere in the original project.


class DataStoreAgent:

    DSS_SWAGGER_URL_TEMPLATE = "https://dss.{deployment}.data.humancellatlas.org/v1/swagger.json"
    DSS_PROD_SWAGGER_URL = "https://dss.data.humancellatlas.org/v1/swagger.json"

    def __init__(self, deployment):
        self.deployment = deployment
        if self.deployment == "prod":
            swagger_url = self.DSS_PROD_SWAGGER_URL
        else:
            swagger_url = self.DSS_SWAGGER_URL_TEMPLATE.format(deployment=deployment)
        self.client = DSSClient(swagger_url=swagger_url)

    def search(self, query, replica='aws'):
        try:
            response = self.client.post_search(replica=replica, es_query=query)
            return response['results']
        except SwaggerAPIException:
            return []

    def search_iterate(self, query, replica='aws'):
        for hit in self.client.post_search.iterate(replica=replica, es_query=query):
            yield hit

    def download_bundle(self, bundle_uuid, target_folder):
        Progress.report(f"Downloading bundle {bundle_uuid}:\n")
        manifest = self.bundle_manifest(bundle_uuid)
        bundle_folder = os.path.join(target_folder, bundle_uuid)
        try:
            os.makedirs(bundle_folder)
        except FileExistsError:
            pass

        for f in manifest['bundle']['files']:
            self.download_file(f['uuid'], save_as=os.path.join(bundle_folder, f['name']))
        return bundle_folder

    def bundle_manifest(self, bundle_uuid, replica='aws'):
        return self.client.get_bundle(replica=replica, uuid=bundle_uuid)

    def download_file(self, file_uuid, save_as, replica='aws'):
        Progress.report(f"Downloading file {file_uuid} to {save_as}\n")
        with self.client.get_file.stream(replica=replica, uuid=file_uuid) as fh:
            with open(save_as, "wb") as f:
                while True:
                    chunk = fh.raw.read(1024)
                    if chunk:
                        f.write(chunk)
                    else:
                        break

    def tombstone_bundle(self, bundle_uuid, replica='aws'):
        self.client.delete_bundle(replica=replica,
                                  uuid=bundle_uuid,
                                  reason="DCP-wide integration test")
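# Usage sketch (hedged): a minimal example of driving DataStoreAgent against the
# "staging" deployment. The match_all query and the /tmp/bundles target folder are
# illustrative assumptions, not part of the original class.
agent = DataStoreAgent(deployment="staging")
hits = agent.search({"query": {"match_all": {}}})
if hits:
    first_bundle_uuid = hits[0]['bundle_fqid'].partition('.')[0]
    agent.download_bundle(first_bundle_uuid, target_folder="/tmp/bundles")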
from hca.dss import DSSClient


def get_dss_generator(query):
    # Return a generator over the matching bundles and the total hit count for
    # the given Elasticsearch query against the production DSS.
    dss_client = DSSClient(
        swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")

    bundle_generator = dss_client.post_search.iterate(replica="aws",
                                                      es_query=query,
                                                      output_format="raw")

    total_hits = dss_client.post_search(replica="aws",
                                        es_query=query,
                                        output_format="raw").get('total_hits')

    return (bundle_generator, total_hits)
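# Usage sketch (hedged): consuming get_dss_generator. The match_all query is an
# illustrative assumption; any Elasticsearch query accepted by the DSS /search
# endpoint could be passed instead.
bundle_generator, total_hits = get_dss_generator({"query": {"match_all": {}}})
print(f"query matched {total_hits} bundles")
for hit in bundle_generator:
    print(hit['bundle_fqid'])
    break  # only inspect the first hit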
def _test_dss_client(self, direct: bool, query: JSON, dss_client: DSSClient,
                     replica: str, fallback: bool):
    with self.subTest(direct=direct, replica=replica, fallback=fallback):
        response = dss_client.post_search(es_query=query, replica=replica, per_page=10)
        bundle_uuid, _, bundle_version = response['results'][0]['bundle_fqid'].partition('.')
        with mock.patch('azul.dss.logger') as captured_log:
            _, manifest, metadata = download_bundle_metadata(client=dss_client,
                                                             replica=replica,
                                                             uuid=bundle_uuid,
                                                             version=bundle_version,
                                                             num_workers=config.num_dss_workers)
        log.info('Captured log calls: %r', captured_log.mock_calls)
        self.assertGreater(len(metadata), 0)
        self.assertGreater(set(f['name'] for f in manifest), set(metadata.keys()))
        for f in manifest:
            self.assertIn('s3_etag', f)
        # Extract the log method name and the first three words of the log
        # message logged. Note that the PyCharm debugger will call certain
        # dunder methods on the variable, leading to failed assertions.
        actual = [(m, ' '.join(re.split(r'[\s,]', a[0])[:3]))
                  for m, a, k in captured_log.mock_calls]
        if direct:
            if replica == 'aws':
                if fallback:
                    expected = [('debug', 'Loading bundle %s'),
                                ('debug', 'Loading object %s'),
                                ('warning', 'Error accessing bundle'),
                                ('warning', 'Failed getting bundle')] + [
                                   ('debug', 'Loading file %s'),
                                   ('debug', 'Loading object %s'),
                                   ('warning', 'Error accessing file'),
                                   ('warning', 'Failed getting file')
                               ] * len(metadata)
                else:
                    expected = [('debug', 'Loading bundle %s'),
                                ('debug', 'Loading object %s')] + [
                                   ('debug', 'Loading file %s'),
                                   ('debug', 'Loading object %s'),  # file
                                   ('debug', 'Loading object %s')   # blob
                               ] * len(metadata)
            else:
                # On `gcp` the precondition check fails right away, preventing
                # any attempt at direct access.
                expected = [('warning', 'Failed getting bundle')] + \
                           [('warning', 'Failed getting file')] * len(metadata)
        else:
            expected = []
        self.assertSequenceEqual(sorted(expected), sorted(actual))
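# Driver sketch (hedged): one way a test case might exercise _test_dss_client
# across replicas. The match_all query and the self.dss_client fixture are
# hypothetical names introduced here for illustration and are not taken from
# the original test.
def test_dss_client(self):
    query = {"query": {"match_all": {}}}
    for replica in ('aws', 'gcp'):
        self._test_dss_client(direct=False,
                              query=query,
                              dss_client=self.dss_client,  # hypothetical fixture
                              replica=replica,
                              fallback=False)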
from hca.dss import DSSClient


def get_dss_generator(hca_project_uuid):
    # match files.project_json.provenance.document_id: the project UUID to retrieve
    # exists files.project_json.provenance.document_id: excludes test bundles
    # must_not "files.analysis_process_json.type.text": "analysis": return only
    # primary bundles, not analysis bundles
    query = {
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "files.project_json.provenance.document_id": hca_project_uuid
                        }
                    },
                    {
                        "exists": {
                            "field": "files.project_json.provenance.document_id"
                        }
                    }
                ],
                "must_not": [
                    {
                        "match": {
                            "files.analysis_process_json.type.text": "analysis"
                        }
                    }
                ]
            }
        }
    }

    dss_client = DSSClient(
        swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")

    bundle_generator = dss_client.post_search.iterate(replica="aws",
                                                      es_query=query,
                                                      output_format="raw")

    total_hits = dss_client.post_search(replica="aws",
                                        es_query=query,
                                        output_format="raw").get('total_hits')

    return (bundle_generator, total_hits)
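# Usage sketch (hedged): list primary bundles for a single project. The project
# UUID below is a made-up placeholder, not a real project identifier.
bundle_generator, total_hits = get_dss_generator("00000000-0000-0000-0000-000000000000")
print(f"{total_hits} primary bundles for project")
for hit in bundle_generator:
    print(hit['bundle_fqid'])
    break  # only inspect the first hit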
from hca.dss import DSSClient

dss = DSSClient()

# Note: passing es_query={} runs an empty search, which will match all bundles.

# Iterable post_search
for results in dss.post_search.iterate(replica="aws", es_query={}):
    print(results)
    break

# Non-iterable (first page only) post_search
print(dss.post_search(replica='aws', es_query={}))