def _patch_client_for_direct_access(client: DSSClient):
    """
    Replace ``client.get_file`` and ``client.get_bundle`` with variants that
    first try to read through a ``MiniDSS`` instance and only use the
    client's original methods when that direct read raises.

    :param client: the DSS client to patch in place
    """
    official_get_file = client.get_file
    official_get_bundle = client.get_bundle
    direct_dss = MiniDSS(config.dss_endpoint)

    def get_file_directly(self, uuid, replica, version=None):
        # The replacement is bound to exactly one client instance.
        assert client is self
        try:
            return direct_dss.get_file(uuid, version, replica)
        except Exception:
            logger.warning(
                'Failed getting file %s, version %s directly. '
                'Falling back to official method', uuid, version)
            return official_get_file(uuid=uuid,
                                     version=version,
                                     replica=replica)

    class _DirectBundleGetter:
        """Stand-in for ``client.get_bundle`` exposing only ``paginate``."""

        def paginate(self, *args, **kwargs):
            uuid = kwargs['uuid']
            version = kwargs['version']
            replica = kwargs['replica']
            try:
                bundle = direct_dss.get_bundle(uuid, version, replica)
            except Exception:
                logger.warning(
                    'Failed getting bundle file %s, version %s directly. '
                    'Falling back to official method', uuid, version)
                return official_get_bundle.paginate(*args, **kwargs)
            # A direct read yields the whole bundle as one single page.
            return [{'bundle': bundle, 'version': version, 'uuid': uuid}]

    client.get_file = types.MethodType(get_file_directly, client)
    client.get_bundle = _DirectBundleGetter()
def test_swagger_client_no_refresh(self):
    """
    Instantiates the normal DSSClient with a 3600 second expiration token
    and checks that the very same token is used for two consecutive
    authenticated requests.
    """
    dss = DSSClient(
        swagger_url='https://dss.dev.data.humancellatlas.org/v1/swagger.json')
    assert dss._authenticated_session is None

    def request_and_read_token():
        # Collections are used because it is an authenticated endpoint.
        r = dss.get_collections()
        assert 'collections' in r
        access_token = dss._authenticated_session.token['access_token']
        remaining = (dss._authenticated_session.token['expires_at']
                     - time.time())
        assert remaining < 3600
        assert remaining > 3590
        return access_token

    token_one = request_and_read_token()
    time.sleep(2)
    token_two = request_and_read_token()

    # One long-lived token served both requests.
    assert token_one == token_two
def __init__(self, deployment):
    """
    Create a DSS client for the given deployment.

    :param deployment: deployment name; ``"prod"`` selects
                       ``DSS_PROD_SWAGGER_URL``, any other value is
                       substituted into ``DSS_SWAGGER_URL_TEMPLATE``
    """
    self.deployment = deployment
    swagger_url = (
        self.DSS_PROD_SWAGGER_URL
        if self.deployment == "prod"
        else self.DSS_SWAGGER_URL_TEMPLATE.format(deployment=deployment)
    )
    self.client = DSSClient(swagger_url=swagger_url)
class DataStoreAgent:
    """Thin convenience wrapper around a ``DSSClient`` for one deployment."""

    DSS_SWAGGER_URL_TEMPLATE = "https://dss.{deployment}.data.humancellatlas.org/v1/swagger.json"
    DSS_PROD_SWAGGER_URL = "https://dss.data.humancellatlas.org/v1/swagger.json"

    def __init__(self, deployment):
        """
        :param deployment: deployment name; ``"prod"`` uses the production
                           URL, anything else is put into the URL template
        """
        self.deployment = deployment
        swagger_url = (
            self.DSS_PROD_SWAGGER_URL
            if self.deployment == "prod"
            else self.DSS_SWAGGER_URL_TEMPLATE.format(deployment=deployment)
        )
        self.client = DSSClient(swagger_url=swagger_url)

    def search(self, query, replica='aws'):
        """Return the ``results`` of one search call, or ``[]`` when the
        client raises ``SwaggerAPIException``."""
        try:
            return self.client.post_search(replica=replica,
                                           es_query=query)['results']
        except SwaggerAPIException:
            return []

    def search_iterate(self, query, replica='aws'):
        """Lazily yield every hit produced by ``post_search.iterate``."""
        yield from self.client.post_search.iterate(replica=replica,
                                                   es_query=query)

    def download_bundle(self, bundle_uuid, target_folder):
        """
        Download all files listed in the bundle manifest into
        ``<target_folder>/<bundle_uuid>`` and return that folder's path.
        """
        Progress.report(f"Downloading bundle {bundle_uuid}:\n")
        manifest = self.bundle_manifest(bundle_uuid)
        destination = os.path.join(target_folder, bundle_uuid)
        try:
            os.makedirs(destination)
        except FileExistsError:
            # Re-using an existing folder is fine.
            pass
        for entry in manifest['bundle']['files']:
            self.download_file(entry['uuid'],
                               save_as=os.path.join(destination,
                                                    entry['name']))
        return destination

    def bundle_manifest(self, bundle_uuid, replica='aws'):
        """Return whatever ``get_bundle`` returns for the given bundle."""
        return self.client.get_bundle(replica=replica, uuid=bundle_uuid)

    def download_file(self, file_uuid, save_as, replica='aws'):
        """Stream a file to ``save_as`` in 1024 byte chunks."""
        Progress.report(f"Downloading file {file_uuid} to {save_as}\n")
        with self.client.get_file.stream(replica=replica,
                                         uuid=file_uuid) as response:
            with open(save_as, "wb") as out:
                chunk = response.raw.read(1024)
                while chunk:
                    out.write(chunk)
                    chunk = response.raw.read(1024)

    def tombstone_bundle(self, bundle_uuid, replica='aws'):
        """Delete the bundle, giving the integration test as the reason."""
        self.client.delete_bundle(replica=replica,
                                  uuid=bundle_uuid,
                                  reason="DCP-wide integration test")
def _get_dss_client():
    """
    Build a DSSClient for the deployment named by the module-level ``stage``.

    ``"prod"`` maps to the production Swagger URL; any other stage is
    inserted into the host name as a sub-domain.
    """
    if stage == "prod":
        swagger_url = "https://dss.data.humancellatlas.org/v1/swagger.json"
    else:
        swagger_url = f"https://dss.{stage}.data.humancellatlas.org/v1/swagger.json"
    return DSSClient(swagger_url=swagger_url)
def get_dss_generator(query):
    """
    Run ``query`` against the production DSS on the ``aws`` replica.

    :param query: Elasticsearch query passed as ``es_query``
    :return: tuple of (iterator over raw search hits, the ``total_hits``
             value of a separate, non-iterating search call)
    """
    client = DSSClient(
        swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")
    search_args = dict(replica="aws", es_query=query, output_format="raw")
    hits = client.post_search.iterate(**search_args)
    # The total is read from a second, regular search request.
    total = client.post_search(**search_args).get('total_hits')
    return hits, total
def manage_subscriptions(plugin: RepositoryPlugin,
                         dss_client: DSSClient,
                         subscribe=True):
    """
    Reconcile the ``elasticsearch`` subscriptions registered on the ``aws``
    replica with the ones this deployment wants.

    :param plugin: supplies the subscription query factories
    :param dss_client: client used to list, delete and create subscriptions
    :param subscribe: when true, an 'add' and a 'delete' subscription are
                      wanted; when false, none are, so every existing
                      subscription is removed
    """
    listing = dss_client.get_subscriptions(replica='aws',
                                           subscription_type='elasticsearch')
    existing = freeze(listing['subscriptions'])

    key, key_id = deployment.aws.get_hmac_key_and_id()

    wanted = []
    if subscribe:
        base_url = config.indexer_endpoint()
        prefix = config.dss_query_prefix
        query_actions = (
            (plugin.dss_subscription_query, 'add'),
            (plugin.dss_deletion_subscription_query, 'delete'),
        )
        for query, action in query_actions:
            wanted.append(
                freeze(
                    dict(replica='aws',
                         es_query=query(prefix),
                         callback_url=furl(url=base_url,
                                           path=(config.default_catalog,
                                                 action)),
                         hmac_key_id=key_id)))

    for subscription in existing:
        # Note the use of <= to allow for the fact that DSS returns
        # subscriptions with additional attributes, more than were
        # originally supplied. If the subscription returned by DSS is a
        # superset of the subscription we want to create, we can skip the
        # update.
        match = next((candidate
                      for candidate in wanted
                      if candidate.items() <= subscription.items()), None)
        if not match:
            stale = thaw(subscription)
            logger.info('Removing stale subscription: %r', stale)
            dss_client.delete_subscription(uuid=stale['uuid'],
                                           replica=stale['replica'],
                                           subscription_type='elasticsearch')
            continue
        logger.info('Already subscribed: %r', thaw(subscription))
        wanted.remove(match)

    # Whatever is still wanted at this point has no counterpart in DSS.
    for subscription in wanted:
        registration = thaw(subscription)
        response = dss_client.put_subscription(**registration,
                                               hmac_secret_key=key)
        registration['uuid'] = response['uuid']
        logger.info('Registered subscription %r.', registration)
def dss_client(deployment: Optional[str] = None) -> DSSClient:
    """
    Return a DSS client to DSS production or the specified DSS deployment.

    :param deployment: The name of a DSS deployment like `dev`, `integration`
                       or `staging`. If None, the production deployment
                       (`prod`) will be used.
    """
    # Work around https://github.com/HumanCellAtlas/dcp-cli/issues/142
    client_config = HCAConfig()
    subdomain = deployment + "." if deployment else ""
    swagger_url = f'https://dss.{subdomain}data.humancellatlas.org/v1/swagger.json'
    client_config['DSSClient'].swagger_url = swagger_url

    # Clear the cached swagger specs that may come from a different
    # deployment. This work-around isn't thread safe but neither is the
    # caching itself.
    DSSClient._swagger_spec = None

    result = DSSClient(config=client_config)
    result.timeout_policy = Timeout(connect=10, read=40)
    return result
def main():
    """
    Upload the ``bundle`` directory of every target project to the ``dev``
    DSS, using the project directory name as the bundle UUID.
    """
    logging.basicConfig(level=logging.INFO)

    hca_config = HCAConfig()
    hca_config["DSSClient"].swagger_url = "https://dss.dev.data.humancellatlas.org/v1/swagger.json"
    dss = DSSClient(config=hca_config)

    for project in get_target_project_dirs(follow_links=True):
        log.info('Uploading %s', project)
        bundle_uuid = project.name
        # The directory name must already be a canonically formatted UUID.
        assert str(UUID(bundle_uuid)) == bundle_uuid
        bundle = project / 'bundle'

        if not bundle.is_dir():
            log.warning('Skipping %s because metadata is missing', project)
            continue

        def file_uuid_callback(file_path: str):
            """Allocate the UUID for one file and, for JSON files other
            than links.json, check it against the document's provenance."""
            file_path = Path(file_path)
            file_name = file_path.name
            file_uuid = generate_file_uuid(bundle_uuid, file_name)
            log.info('Allocated UUID %s for file %s', file_uuid, file_path)
            if file_name.endswith('.json'):
                with file_path.open('rt') as f:
                    document = json.load(f)
                if file_name != 'links.json':
                    # The project document carries the bundle's UUID, every
                    # other document carries its own file UUID.
                    expected_id = (bundle_uuid
                                   if file_name == 'project_0.json'
                                   else file_uuid)
                    assert document['provenance']['document_id'] == expected_id
            return file_uuid

        response = dss.upload(src_dir=str(bundle),
                              replica='aws',
                              staging_bucket='lon-test-data',
                              bundle_uuid=bundle_uuid,
                              file_uuid_callback=file_uuid_callback)
        print(
            f'Successful upload. Bundle information is:\n{json.dumps(response, indent=4)}'
        )
def _test_dss_client(self, direct: bool, query: JSON, dss_client: DSSClient,
                     replica: str, fallback: bool):
    """
    Download the metadata of the first bundle matching ``query`` and check
    both the result and the calls made on the patched ``azul.dss.logger``.

    :param direct: whether direct access is expected to be attempted
    :param query: search query used to find a bundle
    :param dss_client: the client under test
    :param replica: replica to search and download from
    :param fallback: whether the direct attempts are expected to fail over
                     to the official method
    """
    with self.subTest(direct=direct, replica=replica, fallback=fallback):
        response = dss_client.post_search(es_query=query,
                                          replica=replica,
                                          per_page=10)
        first_fqid = response['results'][0]['bundle_fqid']
        bundle_uuid, _, bundle_version = first_fqid.partition('.')

        with mock.patch('azul.dss.logger') as captured_log:
            _, manifest, metadata = download_bundle_metadata(
                client=dss_client,
                replica=replica,
                uuid=bundle_uuid,
                version=bundle_version,
                num_workers=config.num_dss_workers)
        log.info('Captured log calls: %r', captured_log.mock_calls)

        self.assertGreater(len(metadata), 0)
        self.assertGreater({f['name'] for f in manifest},
                           set(metadata.keys()))
        for f in manifest:
            self.assertIn('s3_etag', f)

        # Extract the log method name and the first three words of log
        # message logged. Note that the PyCharm debugger will call
        # certain dunder methods on the variable, leading to failed
        # assertions.
        actual = [(m, ' '.join(re.split(r'[\s,]', a[0])[:3]))
                  for m, a, k in captured_log.mock_calls]

        num_files = len(metadata)
        if not direct:
            expected = []
        elif replica != 'aws':
            # On `gcp` the precondition check fails right away, preventing
            # any attempts of direct access
            expected = (
                [('warning', 'Failed getting bundle')]
                + [('warning', 'Failed getting file')] * num_files
            )
        elif fallback:
            expected = [
                ('debug', 'Loading bundle %s'),
                ('debug', 'Loading object %s'),
                ('warning', 'Error accessing bundle'),
                ('warning', 'Failed getting bundle'),
            ] + [
                ('debug', 'Loading file %s'),
                ('debug', 'Loading object %s'),
                ('warning', 'Error accessing file'),
                ('warning', 'Failed getting file'),
            ] * num_files
        else:
            expected = [
                ('debug', 'Loading bundle %s'),
                ('debug', 'Loading object %s'),
            ] + [
                ('debug', 'Loading file %s'),
                ('debug', 'Loading object %s'),  # file
                ('debug', 'Loading object %s'),  # blob
            ] * num_files

        # Only the multiset of calls is compared, not their order.
        self.assertSequenceEqual(sorted(expected), sorted(actual))
def __init__(self, dss_host):
    """
    Create an instance of the DataExtractor.

    :param dss_host: The formatted url of the DSS to talk to
                     (e.g. https://dss.staging.data.humancellatlas.org/v1)
    """
    client = DSSClient()
    # Point the default client at the requested DSS.
    client.host = dss_host
    self.dss_client = client
    self.log = logging.getLogger(indexer_name + ".indexer.DataExtractor")
def get_dss_generator(hca_project_uuid):
    """
    Search the production DSS (``aws`` replica) for the bundles of one
    project.

    :param hca_project_uuid: project UUID matched against
                             ``files.project_json.provenance.document_id``
    :return: tuple of (iterator over raw search hits, the ``total_hits``
             value of a separate, non-iterating search call)
    """
    project_id_field = "files.project_json.provenance.document_id"
    query = {
        "query": {
            "bool": {
                "must": [
                    # the project whose bundles should be retrieved
                    {"match": {project_id_field: hca_project_uuid}},
                    # requiring the field to exist removes test bundles
                    {"exists": {"field": project_id_field}},
                ],
                "must_not": [
                    # look at primary bundles only, skip analysis bundles
                    {"match": {"files.analysis_process_json.type.text": "analysis"}},
                ],
            }
        }
    }

    client = DSSClient(
        swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")
    search_args = dict(replica="aws", es_query=query, output_format="raw")
    hits = client.post_search.iterate(**search_args)
    total = client.post_search(**search_args).get('total_hits')
    return hits, total
def setUpClass(cls):
    """
    Read the DSS endpoint and staging bucket from the environment (with
    defaults) and create a DSSClient for that endpoint.
    """
    super().setUpClass()
    cls.dss_endpoint = os.getenv("TEST_DSS_ENDPOINT",
                                 "https://hca-dss-4.ucsc-cgp-dev.org/v1")
    cls.staging_bucket = os.getenv('DSS_S3_STAGING_BUCKET',
                                   'commons-dss-upload')

    # Work around problems with DSSClient initialization when there is
    # existing HCA configuration. The following issue has been submitted:
    # Problems accessing an alternate DSS from user scripts or unit tests #170
    # https://github.com/HumanCellAtlas/dcp-cli/issues/170
    monkey_patch_hca_config()
    HCAConfig._user_config_home = '/tmp/'
    test_config = HCAConfig(name='loader-test',
                            save_on_exit=False,
                            autosave=False)
    swagger_url = f'{cls.dss_endpoint}/swagger.json'
    test_config['DSSClient'].swagger_url = swagger_url
    cls.dss_client = DSSClient(config=test_config)
def __init__(self, dss_endpoint: str, staging_bucket: str,
             google_project_id: str, dry_run: bool) -> None:
    """
    Functions for uploading files to a given DSS.

    :param dss_endpoint: The URL to a Swagger DSS API.
        e.g. "https://commons-dss.ucsc-cgp-dev.org/v1"
    :param staging_bucket: The name of the AWS S3 bucket used for staging
        files on their way into the DSS: local files are uploaded to the
        staging bucket, tagged with the metadata the DSS requires, then
        loaded into the DSS (by copy). The bucket must be accessible by the
        DSS. e.g. 'commons-dss-upload'
    :param google_project_id: A Google `Project ID` to be used when
        accessing GCP requester pays buckets. e.g. "platform-dev-178517"
        One way to find a `Project ID` is provided here:
        https://console.cloud.google.com/cloud-resource-manager
    :param dry_run: If True, log the actions that would be performed yet
        don't actually execute them. Otherwise, actually perform the
        operations.
    """
    self.dss_endpoint = dss_endpoint
    self.staging_bucket = staging_bucket
    self.google_project_id = google_project_id
    self.dry_run = dry_run

    # Cloud storage clients.
    self.s3_client = boto3.client("s3")
    self.s3_blobstore = s3.S3BlobStore(self.s3_client)
    self.gs_client = Client()

    # Work around problems with DSSClient initialization when there is
    # existing HCA configuration. The following issue has been submitted:
    # Problems accessing an alternate DSS from user scripts or unit tests #170
    # https://github.com/HumanCellAtlas/dcp-cli/issues/170
    monkey_patch_hca_config()
    HCAConfig._user_config_home = '/tmp/'
    loader_config = HCAConfig(name='loader',
                              save_on_exit=False,
                              autosave=False)
    swagger_url = f'{self.dss_endpoint}/swagger.json'
    loader_config['DSSClient'].swagger_url = swagger_url
    self.dss_client = DSSClient(config=loader_config)
from hca.dss import DSSClient

# Download one specific version of a bundle from the AWS replica into the
# local `download_test` directory.
download_args = dict(
    bundle_uuid="ffffaf55-f19c-40e3-aa81-a6c69d357265",
    version="2019-08-01T200147.836832Z",
    replica="aws",
    download_dir="download_test",
)

dss = DSSClient()
dss.download(**download_args)
def get_client():
    """
    Return the module-wide DSSClient, creating it on the first call.

    The ``hca.dss`` import is deferred until a client is actually needed.
    """
    global client
    if client is not None:
        return client
    from hca.dss import DSSClient
    client = DSSClient()
    return client
from hca.dss import DSSClient
import json

FILE_UUID = "666ff3f0-67a1-4ead-82e9-3f96a8c0a9b1"

dss = DSSClient()

# Fetch the file from the AWS replica and pretty-print each top-level entry
# of the response.
json_response = dss.get_file(replica="aws", uuid=FILE_UUID)
for field in json_response:
    rendered = json.dumps(json_response[field], indent=4)
    print(f"{field}: {rendered}")
from hca.dss import DSSClient

dss = DSSClient()

UUID = "ffffaf55-f19c-40e3-aa81-a6c69d357265"
# A bundle version is a timestamp, not a UUID. The previous value was a copy
# of UUID, which cannot identify a version of the bundle.
VERSION = "2019-08-01T200147.836832Z"

# NOTE(review): the two calls below differ only in their target directory;
# neither passes an argument restricting the download to metadata or to
# data. The filter parameter names depend on the installed `hca` version --
# confirm them against the DSSClient.download documentation and add them.

# Download the metadata only
dss.download(bundle_uuid=UUID,
             version=VERSION,
             replica="aws",
             download_dir=".hca_metadata_only")

# Download the data only
dss.download(bundle_uuid=UUID,
             version=VERSION,
             replica="aws",
             download_dir=".hca_data_only")
from hca import HCAConfig
from hca.dss import DSSClient

# Point the client at the `dev` deployment of the DSS.
hca_config = HCAConfig()
hca_config["DSSClient"].swagger_url = "https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

# Register a file in the DSS from an object that already sits in S3 and show
# the service's response.
put_file_response = dss.put_file(
    uuid="ead6434d-efb5-4554-98bc-027e160547c5",
    version="2019-07-30T174916.268875Z",
    creator_uid=0,
    source_url="s3://jeffwu-test/ead6434d-efb5-4554-98bc-027e160547c5/get_bundle.json",
)
print(put_file_response)
from hca.dss import DSSClient

# Ask a default-configured client for a new version identifier.
# NOTE(review): the return value is not used here; presumably the method
# reports the version itself -- confirm against the hca package.
client = DSSClient()
client.create_version()
from hca.dss import DSSClient

BUNDLE_UUID = "fff746b3-e3eb-496a-88a3-5fa1fa358392"

dss = DSSClient()

# Start a checkout of the bundle on the AWS replica and show what the
# service returned for it.
checkout_id = dss.post_bundles_checkout(uuid=BUNDLE_UUID, replica="aws")
print(checkout_id)
from hca.dss import DSSClient

dss = DSSClient()

# Note:
# Passing es_query={} runs an empty search, which will match all bundles.
match_all = {}

# Iterable post_search: print only the first item the iterator yields
for hit in dss.post_search.iterate(replica="aws", es_query=match_all):
    print(hit)
    break

# Non-iterable (first page only) post_search
first_page = dss.post_search(replica='aws', es_query=match_all)
print(first_page)
def bundle_url_iterator():
    """
    Return an iterator over the raw search hits for the bundles of interest
    in the production DSS (``aws`` replica).

    A bundle matches when its library construction approach ontology is
    ``EFO:0008931`` and its donor organism taxon id is ``9606``, unless its
    project is one of the explicitly excluded projects.

    :return: the iterator produced by ``post_search.iterate``
    """
    dss_client = DSSClient(swagger_url="https://dss.data.humancellatlas.org/v1/swagger.json")

    # Projects whose bundles must not be returned. Two ids appear twice;
    # that is harmless for a `terms` clause and kept as it was.
    excluded_project_ids = [
        "1630e3dc-5501-4faf-9726-2e2c0b4da6d7",
        "fd1d163d-d6a7-41cd-b3bc-9d77ba9a36fe",
        "2a0faf83-e342-4b1c-bb9b-cf1d1147f3bb",
        "cf8439db-fcc9-44a8-b66f-8ffbf729bffa",
        "6b9f514d-d738-403f-a9c2-62580bbe5c83",
        "311d013c-01e4-42c0-9c2d-25472afa9cbc",
        "d237ed6a-3a7f-4a91-b300-b070888a8542",
        "e6cc0b02-2125-4faa-9903-a9025a62efec",
        "e4dbcb98-0562-4071-8bea-5e8de5f3c147",
        "e79e9284-c337-4dfd-853d-66fa3facfbbd",
        "560cd061-9165-4699-bc6e-8253e164c079",
        "e83fda0e-6515-4f13-82cb-a5860ecfc2d4",
        "9a60e8c2-32ea-4586-bc1f-7ee58f462b07",
        "71a6e049-4846-4c2a-8823-cc193c573efc",
        "4b5a2268-507c-46e6-bab0-3efb30145e85",
        "364ebb73-652e-4d32-8938-1c922d0b2584",
        "11f5d59b-0e2c-4f01-85ac-8d8dd3db53be",
        "c1996526-6466-40ff-820f-dad4d63492ec",
        "c281dedc-e838-4464-bf51-1cc4efae3fb9",
        "40afcf6b-422a-47ba-ba7a-33678c949b5c",
        "71a6e049-4846-4c2a-8823-cc193c573efc",
        "9a60e8c2-32ea-4586-bc1f-7ee58f462b07",
        "0facfacd-5b0c-4228-8be5-37aa1f3a269d",
        "76c209df-42bf-41dc-a5f5-3d27193ca7a6",
        "bb409c34-bb87-4ed2-adaf-6d1ef10610b5",
        "1a6b5e5d-914f-4dd6-8817-a1f9b7f364d5",
        "dd401943-1059-4b2d-b187-7a9e11822f95",
    ]

    # An earlier, never-submitted variant of this query was removed as dead
    # code. On top of the clauses below it required
    # `files.sequencing_protocol_json.paired_end` to match 'true' and had
    # `should` clauses on the dissociation method (ontology "EFO:0009128",
    # text "mouth pipette").
    query = {
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "files.library_preparation_protocol_json.library_construction_approach.ontology": "EFO:0008931"
                        }
                    },
                    {
                        "match": {
                            "files.donor_organism_json.biomaterial_core.ncbi_taxon_id": 9606
                        }
                    },
                ],
                "must_not": [
                    {
                        "terms": {
                            "files.project_json.provenance.document_id": excluded_project_ids
                        }
                    }
                ],
            }
        }
    }

    return dss_client.post_search.iterate(replica="aws", es_query=query, output_format="raw")
class DssUploader:
    """Uploads local files, cloud file references and bundles to a DSS."""

    def __init__(self, dss_endpoint: str, staging_bucket: str,
                 google_project_id: str, dry_run: bool) -> None:
        """
        Functions for uploading files to a given DSS.

        :param dss_endpoint: The URL to a Swagger DSS API.
            e.g. "https://commons-dss.ucsc-cgp-dev.org/v1"
        :param staging_bucket: The name of the AWS S3 bucket to be used when
            staging files for uploading to the DSS. As an example, local
            files are uploaded to the staging bucket, then file metadata
            tags required by the DSS are assigned to it, then the file is
            loaded into the DSS (by copy). The bucket must be accessible by
            the DSS. e.g. 'commons-dss-upload'
        :param google_project_id: A Google `Project ID` to be used when
            accessing GCP requester pays buckets. e.g. "platform-dev-178517"
            One way to find a `Project ID` is provided here:
            https://console.cloud.google.com/cloud-resource-manager
        :param dry_run: If True, log the actions that would be performed yet
            don't actually execute them. Otherwise, actually perform the
            operations.
        """
        self.dss_endpoint = dss_endpoint
        self.staging_bucket = staging_bucket
        self.google_project_id = google_project_id
        self.dry_run = dry_run
        self.s3_client = boto3.client("s3")
        self.s3_blobstore = s3.S3BlobStore(self.s3_client)
        self.gs_client = Client()

        # Work around problems with DSSClient initialization when there is
        # existing HCA configuration. The following issue has been submitted:
        # Problems accessing an alternate DSS from user scripts or unit tests #170
        # https://github.com/HumanCellAtlas/dcp-cli/issues/170
        monkey_patch_hca_config()
        HCAConfig._user_config_home = '/tmp/'
        dss_config = HCAConfig(name='loader',
                               save_on_exit=False,
                               autosave=False)
        dss_config['DSSClient'].swagger_url = f'{self.dss_endpoint}/swagger.json'
        self.dss_client = DSSClient(config=dss_config)

    def upload_cloud_file_by_reference(self,
                                       filename: str,
                                       file_uuid: str,
                                       file_cloud_urls: set,
                                       bundle_uuid: str,
                                       guid: str,
                                       file_version: Optional[str] = None) -> tuple:
        """
        Loads the given cloud file into the DSS by reference, rather than by
        copying it into the DSS. Because the HCA DSS per se does not support
        loading by reference, this is currently implemented using the
        approach described here:
        https://docs.google.com/document/d/1QSa7Ubw-muyD_u0X_dq9WeKyK_dCJXi4Ex7S_pil1uk/edit#heading=h.exnqjy2n2q78

        This is conceptually similar to creating a "symbolic link" to the
        cloud file rather than copying the source file into the DSS. The
        file's metadata is obtained, formatted as a dictionary, then this
        dictionary is uploaded as a json file with content type
        `dss-type=fileref` into the DSS.

        A request has been made for the HCA data-store to support loading by
        reference as a feature of the data store, here:
        https://github.com/HumanCellAtlas/data-store/issues/912

        NOTE(review): with ``dry_run`` set this method still reads the cloud
        objects' metadata and stages the reference file in S3; only the
        final DSS request is skipped. Confirm that this is intended.

        :param filename: The name of the file in the bucket.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify
            the file
        :param file_cloud_urls: A set of 'gs://' and 's3://' bucket links.
            e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify
            the bundle containing the file
        :param guid: An optional additional/alternate data identifier/alias
            to associate with the file
            e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
        :param file_version: a RFC3339 compliant datetime string
        :raises FileURLError: for a URL whose scheme is neither ``s3`` nor
            ``gs``, or whose object cannot be accessed
        :return: file_uuid: str, file_version: str, filename: str,
            already_present: bool
        """

        def _create_file_reference(file_cloud_urls: set, guid: str) -> dict:
            """
            Format a file's metadata into a dictionary for uploading as a
            json to support the approach described here:
            https://docs.google.com/document/d/1QSa7Ubw-muyD_u0X_dq9WeKyK_dCJXi4Ex7S_pil1uk/edit#heading=h.exnqjy2n2q78

            :param file_cloud_urls: A set of 'gs://' and 's3://' bucket links.
            :param guid: An optional additional/alternate data
                identifier/alias to associate with the file
            :return: A dictionary of metadata values.
            """
            s3_metadata = None
            gs_metadata = None
            for cloud_url in file_cloud_urls:
                url = urlparse(cloud_url)
                bucket = url.netloc
                key = url.path[1:]  # drop the leading "/"
                if url.scheme == "s3":
                    s3_metadata = _get_s3_file_metadata(bucket, key)
                elif url.scheme == "gs":
                    gs_metadata = _get_gs_file_metadata(bucket, key)
                else:
                    # The message used to lack its f-prefix and therefore
                    # reported the literal text "{cloud_url}".
                    raise FileURLError(
                        f"Unsupported cloud URL scheme: {cloud_url}")
            return _consolidate_metadata(file_cloud_urls, s3_metadata,
                                         gs_metadata, guid)

        def _get_s3_file_metadata(bucket: str, key: str) -> dict:
            """
            Format an S3 file's metadata into a dictionary for uploading as
            a json.

            :param bucket: Name of an S3 bucket
            :param key: S3 file to upload. e.g. 'output.txt' or
                'data/output.txt'
            :return: A dictionary of metadata values.
            """
            metadata = dict()
            try:
                response = self.s3_client.head_object(Bucket=bucket,
                                                      Key=key,
                                                      RequestPayer="requester")
                metadata['content-type'] = response['ContentType']
                metadata['s3_etag'] = response['ETag']
                metadata['size'] = response['ContentLength']
            except Exception as e:
                raise FileURLError(
                    f"Error accessing s3://{bucket}/{key}") from e
            return metadata

        def _get_gs_file_metadata(bucket: str, key: str) -> dict:
            """
            Format a GS file's metadata into a dictionary for uploading as a
            JSON file.

            :param bucket: Name of a GS bucket.
            :param key: GS file to upload. e.g. 'output.txt' or
                'data/output.txt'
            :return: A dictionary of metadata values.
            """
            metadata = dict()
            try:
                gs_bucket = self.gs_client.bucket(bucket,
                                                  self.google_project_id)
                blob_obj = gs_bucket.get_blob(key)
                metadata['content-type'] = blob_obj.content_type
                # The checksum is base64 decoded and stored as lower-case hex.
                metadata['crc32c'] = binascii.hexlify(
                    base64.b64decode(blob_obj.crc32c)).decode("utf-8").lower()
                metadata['size'] = blob_obj.size
            except Exception as e:
                raise FileURLError(
                    f"Error accessing gs://{bucket}/{key}") from e
            return metadata

        def _consolidate_metadata(file_cloud_urls: set,
                                  s3_metadata: Optional[Dict[str, Any]],
                                  gs_metadata: Optional[Dict[str, Any]],
                                  guid: str) -> dict:
            """
            Consolidates cloud file metadata to create the JSON used to load
            by reference into the DSS.

            :param file_cloud_urls: A set of 'gs://' and 's3://' bucket URLs.
            :param s3_metadata: Dictionary of meta data produced by
                _get_s3_file_metadata().
            :param gs_metadata: Dictionary of meta data produced by
                _get_gs_file_metadata().
            :param guid: An optional additional/alternate data
                identifier/alias to associate with the file
            :return: A dictionary of cloud file metadata values
            """
            consolidated_metadata = dict()
            if s3_metadata:
                consolidated_metadata.update(s3_metadata)
            if gs_metadata:
                # GS values win for keys present in both dictionaries.
                consolidated_metadata.update(gs_metadata)
            consolidated_metadata['url'] = list(file_cloud_urls)
            consolidated_metadata['aliases'] = [str(guid)]
            return consolidated_metadata

        if self.dry_run:
            logger.info(
                f"DRY RUN: upload_cloud_file_by_reference: {filename} {str(file_cloud_urls)} {bundle_uuid}"
            )

        file_reference = _create_file_reference(file_cloud_urls, guid)
        return self.upload_dict_as_file(
            file_reference,
            filename,
            file_uuid,
            bundle_uuid,
            file_version=file_version,
            content_type="application/json; dss-type=fileref")

    def upload_dict_as_file(
            self,
            value: dict,
            filename: str,
            file_uuid: str,
            bundle_uuid: str,
            file_version: Optional[str] = None,  # RFC3339
            content_type=None):
        """
        Create a JSON file in the DSS containing the given dict.

        :param value: A dictionary representing the JSON content of the file
            to be created.
        :param filename: The basename of the file in the bucket.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify
            the file
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify
            the bundle containing the file
        :param content_type: Content description
            e.g. "application/json; dss-type=fileref".
        :param file_version: a RFC3339 compliant datetime string
        :return: file_uuid: str, file_version: str, filename: str,
            already_present: bool
        """
        tempdir = mkdtemp()
        file_path = "/".join([tempdir, filename])
        try:
            with open(file_path, "w") as fh:
                fh.write(json.dumps(value, indent=4))
            return self.upload_local_file(file_path,
                                          file_uuid,
                                          bundle_uuid,
                                          file_version=file_version,
                                          content_type=content_type)
        finally:
            # Clean up even when writing or uploading fails, so a failed
            # upload no longer leaves the temporary directory behind.
            if os.path.exists(file_path):
                os.remove(file_path)
            os.rmdir(tempdir)

    def upload_local_file(self,
                          path: str,
                          file_uuid: str,
                          bundle_uuid: str,
                          file_version: Optional[str] = None,
                          content_type=None):
        """
        Upload a file from the local file system to the DSS.

        :param path: Path to a local file.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify
            the file
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify
            the bundle containing the file
        :param content_type: Content type identifier, for example:
            "application/json; dss-type=fileref".
        :param file_version: a RFC3339 compliant datetime string
        :return: file_uuid: str, file_version: str, filename: str,
            already_present: bool
        """
        file_uuid, key = self._upload_local_file_to_staging(
            path, file_uuid, content_type)
        return self._upload_tagged_cloud_file_to_dss_by_copy(
            self.staging_bucket,
            key,
            file_uuid,
            bundle_uuid,
            file_version=file_version)

    def load_bundle(self, file_info_list: list, bundle_uuid: str):
        """
        Loads a bundle to the DSS that contains the specified files.

        :param file_info_list: passed to the DSS as the bundle's ``files``
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify
            the bundle containing the file
        :return: A full qualified bundle id e.g. "{bundle_uuid}.{version}";
            in a dry run the version part is the text "None"
        """
        kwargs = dict(replica="aws",
                      creator_uid=CREATOR_ID,
                      files=file_info_list,
                      uuid=bundle_uuid,
                      version=tz_utc_now())
        if not self.dry_run:
            response = self.dss_client.put_bundle(**kwargs)
            version = response['version']
        else:
            logger.info("DRY RUN: DSS put bundle: " + str(kwargs))
            version = None
        bundle_fqid = f"{bundle_uuid}.{version}"
        logger.info(f"Loaded bundle: {bundle_fqid}")
        return bundle_fqid

    @staticmethod
    def get_filename_from_key(key: str):
        """Return the part of ``key`` after its last "/"; ``key`` must not
        end in "/"."""
        assert not key.endswith(
            '/'
        ), 'Please specify a filename, not a directory ({} cannot end in "/").'.format(
            key)
        return key.split("/")[-1]

    def _upload_local_file_to_staging(self, path: str, file_uuid: str,
                                      content_type):
        """
        Upload a local file to the staging bucket, computing the
        DSS-required checksums in the process, then tag the file in the
        staging bucket with the checksums. This is in preparation for
        subsequently uploading the file from the staging bucket into the DSS.

        :param path: Path to a local file.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify
            the file.
        :param content_type: Content description, for example:
            "application/json; dss-type=fileref".
        :return: file_uuid: str, key_name: str
        """

        def _encode_tags(tags):
            return [dict(Key=k, Value=v) for k, v in tags.items()]

        def _mime_type(filename):
            # A guessed encoding takes precedence over the guessed type.
            type_, encoding = mimetypes.guess_type(filename)
            if encoding:
                return encoding
            if type_:
                return type_
            return "application/octet-stream"

        file_size = os.path.getsize(path)
        multipart_chunksize = s3_multipart.get_s3_multipart_chunk_size(
            file_size)
        tx_cfg = TransferConfig(
            multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
            multipart_chunksize=multipart_chunksize)
        # Named so that it does not shadow the `s3` module used in __init__.
        s3_resource = boto3.resource("s3")

        destination_bucket = s3_resource.Bucket(self.staging_bucket)
        with open(path, "rb") as file_handle, ChecksummingBufferedReader(
                file_handle, multipart_chunksize) as fh:
            key_name = "{}/{}".format(file_uuid,
                                      os.path.basename(fh.raw.name))
            destination_bucket.upload_fileobj(
                fh,
                key_name,
                Config=tx_cfg,
                ExtraArgs={
                    'ContentType':
                    content_type
                    if content_type is not None else _mime_type(fh.raw.name)
                })
            sums = fh.get_checksums()
            metadata = {
                "hca-dss-s3_etag": sums["s3_etag"],
                "hca-dss-sha1": sums["sha1"],
                "hca-dss-sha256": sums["sha256"],
                "hca-dss-crc32c": sums["crc32c"],
            }
            s3_resource.meta.client.put_object_tagging(
                Bucket=destination_bucket.name,
                Key=key_name,
                Tagging=dict(TagSet=_encode_tags(metadata)))
        return file_uuid, key_name

    def _upload_tagged_cloud_file_to_dss_by_copy(self,
                                                 source_bucket: str,
                                                 source_key: str,
                                                 file_uuid: str,
                                                 bundle_uuid: str,
                                                 file_version: Optional[str] = None,
                                                 timeout_seconds=1200):
        """
        Uploads a tagged file contained in a cloud bucket to the DSS by
        copy. This is typically used to update a tagged file from a staging
        bucket into the DSS.

        :param source_bucket: Name of an S3 bucket. e.g. 'commons-dss-upload'
        :param source_key: S3 file to upload. e.g. 'output.txt' or
            'data/output.txt'
        :param file_uuid: An RFC4122-compliant UUID to be used to identify
            the file.
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify
            the bundle containing the file
        :param file_version: a RFC3339 compliant datetime string
        :param timeout_seconds: Amount of time to continue attempting an
            async copy.
        :raises RuntimeError: when polling for an async copy gets an
            unexpected response or times out
        :raises UnexpectedResponseError: for any other status code of the
            initial request
        :return: file_uuid: str, file_version: str, filename: str,
            file_present: bool
        """
        source_url = f"s3://{source_bucket}/{source_key}"
        filename = self.get_filename_from_key(source_key)

        if self.dry_run:
            logger.info(
                f"DRY RUN: _upload_tagged_cloud_file_to_dss: {source_bucket} {source_key} {file_uuid} {bundle_uuid}"
            )
            # Same shape as the real result (this used to be a 3-tuple), so
            # callers can unpack it identically; nothing was looked up, so
            # the file is reported as not already present.
            return file_uuid, file_version, filename, False

        request_parameters = dict(uuid=file_uuid,
                                  version=file_version,
                                  bundle_uuid=bundle_uuid,
                                  creator_uid=CREATOR_ID,
                                  source_url=source_url)

        copy_start_time = time.time()
        response = self.dss_client.put_file._request(request_parameters)

        # the version we get back here is formatted in the way DSS likes
        # and we need this format update when doing load bundle
        file_version = response.json().get('version', "blank")

        # from dss swagger docs:
        # 200 Returned when the file is already present and is identical to
        # the file being uploaded.
        already_present = response.status_code == requests.codes.ok
        if response.status_code == requests.codes.ok:
            logger.info("File %s: Already exists -> %s (%d seconds)",
                        source_url, file_version,
                        (time.time() - copy_start_time))
        elif response.status_code == requests.codes.created:
            logger.info("File %s: Sync copy -> %s (%d seconds)", source_url,
                        file_version, (time.time() - copy_start_time))
        elif response.status_code == requests.codes.accepted:
            logger.info("File %s: Starting async copy -> %s", source_url,
                        file_version)

            timeout = time.time() + timeout_seconds
            wait = 1.0
            # TODO: busy wait could hopefully be replaced with asyncio
            while time.time() < timeout:
                try:
                    self.dss_client.head_file(uuid=file_uuid,
                                              replica="aws",
                                              version=file_version)
                    logger.info(
                        "File %s: Finished async copy -> %s (approximately %d seconds)",
                        source_url, file_version,
                        (time.time() - copy_start_time))
                    break
                except SwaggerAPIException as e:
                    # "Not found" just means the copy has not finished yet.
                    if e.code != requests.codes.not_found:
                        msg = "File {}: Unexpected server response during registration"
                        raise RuntimeError(msg.format(source_url)) from e
                    time.sleep(wait)
                    wait = min(10.0,
                               wait * self.dss_client.UPLOAD_BACKOFF_FACTOR)
            else:
                # timed out. :(
                raise RuntimeError(
                    "File {}: registration FAILED".format(source_url))
            logger.debug("Successfully uploaded file")
        else:
            raise UnexpectedResponseError(
                f'Received unexpected response code {response.status_code}')

        return file_uuid, file_version, filename, already_present
# Example script: create a DSS subscription on the dev deployment, list all
# subscriptions, fetch the new one by UUID, then delete it.
from hca import HCAConfig
from hca.dss import DSSClient

# Point the client at the dev deployment's swagger definition.
hca_config = HCAConfig()
hca_config["DSSClient"].swagger_url = "https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

# Creates a subscription for the given replica and callback URL.
# The URL must not carry surrounding whitespace or stray markup characters;
# the previous literal had a leading space and a trailing backtick.
subscription = dss.put_subscription(
    replica="aws",
    callback_url="https://dcp-cli-tutorials-put-delete-get-sub-api.humancellatlas.org",
)

# Pull the fields of interest out of the response.
callback, owner, replica, uuid = (
    subscription["callback_url"],
    subscription["owner"],
    subscription["replica"],
    subscription["uuid"],
)

# Lists all subscriptions created
print(dss.get_subscriptions(replica="aws"))

# Fetches the single subscription created above
print(dss.get_subscription(replica="aws", uuid=uuid))

# Deletes the subscription identified by its UUID
print(dss.delete_subscription(replica="aws", uuid=uuid))
# Example script: create a collection containing one bundle on the dev
# deployment, then list the collections visible on the "aws" replica.
import os
import uuid

from hca import HCAConfig
from hca.dss import DSSClient

# Point the client at the dev deployment's swagger definition.
hca_config = HCAConfig()
hca_config["DSSClient"].swagger_url = "https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

# Creates a new collection
collection = dss.put_collection(
    uuid=str(uuid.uuid4()),
    version="2018-09-17T161441.564206Z",  # arbitrary
    description="foo",
    details={},
    replica="aws",
    name="bar",
    contents=[
        {
            "type": "bundle",
            "uuid": "ff818282-9735-45fa-a094-e9f2d3d0a954",  # overwrite if necessary
            "version": "2019-08-06T170839.843085Z",  # arbitrary
            "path": "https://dss.dev.data.humancellatlas.org/v1/bundles/ff818282-9735-45fa-a094-e9f2d3d0a954?version=2019-08-06T170839.843085Z&replica=aws",
        }
    ],
)

# Keep the identifiers returned by the service under names that do not
# shadow the imported `uuid` module (rebinding `uuid` to a string would make
# any later `uuid.uuid4()` call fail).
collection_uuid, collection_version = collection["uuid"], collection["version"]

# Gets a list of collections
print(dss.get_collections(replica="aws"))
# Example script: register a bundle made of a single file on the dev
# deployment's "aws" replica.
from hca import HCAConfig
from hca.dss import DSSClient
import os

# Configure the client against the dev deployment's swagger definition.
hca_config = HCAConfig()
dss_settings = hca_config["DSSClient"]
dss_settings.swagger_url = "https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

# The one file that makes up the bundle.
bundle_files = [
    {
        "uuid": "2196a626-38da-4489-8b2f-645d342f6aab",
        "version": "2019-07-10T001103.121000Z",
        "name": "process_1.json1",
        "indexed": False,
    },
]

# Create the bundle under a fixed UUID/version pair.
dss.put_bundle(
    creator_uid=0,
    uuid="98f6c379-cb78-4a61-9310-f8cc0341c0ea",
    version="2019-08-02T202456.025543Z",
    replica="aws",
    files=bundle_files,
)
# Example script: print the collections returned by the dev deployment.
from hca import HCAConfig
from hca.dss import DSSClient

# Configure the client against the dev deployment's swagger definition.
hca_config = HCAConfig()
dss_settings = hca_config["DSSClient"]
dss_settings.swagger_url = "https://dss.dev.data.humancellatlas.org/v1/swagger.json"
dss = DSSClient(config=hca_config)

# Fetch the collections, then display the raw response.
collections_response = dss.get_collections()
print(collections_response)
# Example script: walk the search results of the dev deployment and print the
# files of the first bundle that can actually be fetched, then stop.
from hca import HCAConfig
from hca.dss import DSSClient

# Point the client at the dev deployment's swagger definition.
hca_config = HCAConfig()
hca_config['DSSClient'].swagger_url = 'https://dss.dev.data.humancellatlas.org/v1/swagger.json'
dss = DSSClient(config=hca_config)

for hit in dss.post_search.iterate(replica='aws', es_query={}):
    # A bundle FQID has the form "<uuid>.<version>"; split on the first dot only.
    uuid, version = hit['bundle_fqid'].split('.', 1)
    try:
        bundle_files = dss.get_bundle(replica='aws', uuid=uuid, version=version)['bundle']['files']
        lines = [f'Bundle: {uuid}.{version}']
        for bundle_file in bundle_files:
            file_version = bundle_file['version']
            file_uuid = bundle_file['uuid']
            lines.append(f' File: {file_uuid}.{file_version}')
    except Exception:
        # Best effort: a bundle that cannot be fetched or parsed is skipped and
        # the next search hit is tried. Catching `Exception` rather than using
        # a bare `except` keeps Ctrl-C and interpreter shutdown working.
        continue
    print('\n'.join(lines))
    # Only the first retrievable bundle is reported.
    break
"""Return the url of the dss.""" return self._dss_url @property def indexer_url(self): """Return the url of the indexer.""" return self._indexer_url @property def es_query(self): """Return the ElasticSearch query.""" return self._es_query default = DefaultProperties() dss_client = DSSClient() parser = argparse.ArgumentParser( description='Process options the finder of golden bundles.') parser.add_argument('--dss-url', dest='dss_url', action='store', default=default.dss_url, help='The url for the storage system.') parser.add_argument('--indexer-url', dest='indexer_url', action='store', default=default.indexer_url, help='The indexer URL') parser.add_argument('--es-query', dest='es_query',