def _upload_bundle(self, replica, uuid=None):
    if replica == Replica.aws:
        test_fixtures_bucket = get_env('DSS_S3_BUCKET_TEST_FIXTURES')
    else:
        test_fixtures_bucket = get_env('DSS_GS_BUCKET_TEST_FIXTURES')
    bundle_uuid = uuid if uuid else str(uuid4())
    file_uuid_1 = str(uuid4())
    file_uuid_2 = str(uuid4())
    filenames = ["file_1", "file_2"]
    resp_obj_1 = self.upload_file_wait(
        f"{replica.storage_schema}://{test_fixtures_bucket}/test_good_source_data/0",
        replica,
        file_uuid_1,
        bundle_uuid=bundle_uuid,
    )
    resp_obj_2 = self.upload_file_wait(
        f"{replica.storage_schema}://{test_fixtures_bucket}/test_good_source_data/1",
        replica,
        file_uuid_2,
        bundle_uuid=bundle_uuid,
    )
    file_version_1 = resp_obj_1.json['version']
    file_version_2 = resp_obj_2.json['version']
    bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid_1, file_version_1, filenames[0]),
         (file_uuid_2, file_version_2, filenames[1])],
        bundle_version,
    )
    return bundle_uuid, bundle_version
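# Usage sketch (hypothetical, not from the original suite): tests built on
# _upload_bundle typically consume the returned pair directly. assertGetResponse is
# assumed by analogy with the assertPutResponse helper used elsewhere in these tests.
def test_bundle_roundtrip_sketch(self):
    bundle_uuid, bundle_version = self._upload_bundle(Replica.aws)
    url = UrlBuilder().set(path=f"/v1/bundles/{bundle_uuid}")
    url.add_query("version", bundle_version)
    url.add_query("replica", "aws")
    self.assertGetResponse(str(url), requests.codes.ok)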
def setUp(self): self.test_bucket = infra.get_env("S3_BUCKET") self.test_fixtures_bucket = infra.get_env("S3_BUCKET_FIXTURES") self.test_us_east_1_bucket = infra.get_env("S3_BUCKET_US_EAST_1") self.test_non_us_east_1_bucket = infra.get_env( "S3_BUCKET_NON_US_EAST_1") self.handle = S3BlobStore.from_environment()
def test_queue_notification(self):
    replica = Replica.aws
    bucket = get_env('DSS_S3_BUCKET_TEST')
    key = f"notification-v2/{uuid4()}"
    post = self.s3.generate_presigned_post(
        Bucket=bucket,
        Key=key,
        ExpiresIn=60,
        Fields={'Content-Type': "application/json"},
        Conditions=[{'Content-Type': "application/json"}])
    subscription = self._put_subscription(
        {
            'payload_form_field': "file",
            'form_fields': post['fields'],
            'callback_url': post['url'],
            'encoding': "multipart/form-data",
        },
        replica)
    with SQSMessenger(get_queue_url(notify_v2.notification_queue_name)) as mq:
        msg = notify_v2._format_sqs_message(
            replica,
            subscription,
            "CREATE",
            "bundles/a47b90b2-0967-4fbf-87bc-c6c12db3fedf.2017-07-12T055120.037644Z",
        )
        mq.send(msg, delay_seconds=0)
    notification = self._get_notification_from_s3_object(bucket, key)
    self.assertEqual(notification['subscription_id'], subscription['uuid'])
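# Sketch of the _get_notification_from_s3_object helper used above (hypothetical
# reconstruction; the polling interval and timeout are assumptions). It waits for the
# notifier to deposit the notification payload at the presigned S3 location, then
# decodes it as JSON.
def _get_notification_from_s3_object(self, bucket, key, timeout=120):
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            body = self.s3.get_object(Bucket=bucket, Key=key)['Body'].read()
            return json.loads(body)
        except self.s3.exceptions.NoSuchKey:
            time.sleep(2)
    raise RuntimeError(f"notification never arrived at s3://{bucket}/{key}")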
def upload_file(self, contents):
    s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
    src_key = generate_test_key()
    s3 = boto3.resource('s3')
    with io.BytesIO(json.dumps(contents).encode()) as fh, ChecksummingSink() as sink:
        sink.write(fh.read())
        sums = sink.get_checksums()
        metadata = {
            'hca-dss-crc32c': sums['crc32c'].lower(),
            'hca-dss-s3_etag': sums['s3_etag'].lower(),
            'hca-dss-sha1': sums['sha1'].lower(),
            'hca-dss-sha256': sums['sha256'].lower(),
        }
        fh.seek(0)
        # TODO: consider switching to unmanaged uploader (putobject w/blob)
        s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(
            fh, ExtraArgs={"Metadata": metadata})
    source_url = f"s3://{s3_test_bucket}/{src_key}"
    file_uuid = str(uuid4())
    version = datetime_to_version_format(datetime.utcnow())
    urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
    urlbuilder.add_query("version", version)
    resp_obj = self.assertPutResponse(
        str(urlbuilder),
        requests.codes.created,
        json_request_body=dict(creator_uid=0, source_url=source_url))
    return file_uuid, resp_obj.json["version"]
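# Sanity-check sketch (not part of the original test): confirm the checksum metadata
# landed on the uploaded object. head_object returns user metadata with the
# x-amz-meta- prefix already stripped, so the keys match the dict above.
def _assert_checksum_metadata(bucket: str, key: str, expected_sha256: str):
    head = boto3.client('s3').head_object(Bucket=bucket, Key=key)
    assert head['Metadata']['hca-dss-sha256'] == expected_sha256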
def test_large_copy(self, num_parts=LAMBDA_PARALLELIZATION_FACTOR + 1):
    test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
    test_src_key = infra.generate_test_key()
    s3_client = boto3.client("s3")
    mpu = s3_client.create_multipart_upload(Bucket=test_bucket, Key=test_src_key)
    with ThreadPoolExecutor(max_workers=8) as tpe:
        parts_futures = tpe.map(
            lambda part_id: TestS3ParallelCopy.upload_part(
                test_bucket, test_src_key, mpu['UploadId'], part_id),
            range(1, num_parts + 1))
        parts = [dict(ETag=part_etag, PartNumber=part_id)
                 for part_id, part_etag in parts_futures]
    src_etag = s3_client.complete_multipart_upload(
        Bucket=test_bucket,
        Key=test_src_key,
        MultipartUpload=dict(Parts=parts),
        UploadId=mpu['UploadId'],
    )['ETag'].strip('"')
    test_dst_key = infra.generate_test_key()
    state = s3copyclient.copy_sfn_event(
        test_bucket, test_src_key, test_bucket, test_dst_key)
    execution_id = str(uuid.uuid4())
    stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}", execution_id, state)
    self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
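# Sketch of the upload_part helper referenced above (hypothetical; the real
# TestS3ParallelCopy.upload_part may differ). Every part except the last must be at
# least 5 MiB, and S3 returns an ETag per part that complete_multipart_upload accepts.
@staticmethod
def upload_part(bucket: str, key: str, upload_id: str, part_id: int):
    part = boto3.client("s3").upload_part(
        Bucket=bucket,
        Key=key,
        PartNumber=part_id,
        UploadId=upload_id,
        Body=os.urandom(5 * 1024 * 1024),
    )
    return part_id, part['ETag']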
def upload_file(app, contents, replica):
    src_key = generate_test_key()
    encoded = json.dumps(contents).encode()
    chunk_size = get_s3_multipart_chunk_size(len(encoded))
    with io.BytesIO(encoded) as fh, ChecksummingSink(write_chunk_size=chunk_size) as sink:
        sink.write(fh.read())
        sums = sink.get_checksums()
        metadata = {
            'hca-dss-crc32c': sums['crc32c'].lower(),
            'hca-dss-s3_etag': sums['s3_etag'].lower(),
            'hca-dss-sha1': sums['sha1'].lower(),
            'hca-dss-sha256': sums['sha256'].lower(),
        }
        fh.seek(0)
        if replica == 'gcp':
            gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")
            gcp_client = gs_storage.Client.from_service_account_json(
                os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))
            gs_bucket = gcp_client.bucket(gs_test_bucket)
            blob = gs_bucket.blob(src_key)
            blob.upload_from_file(fh, content_type="application/json")
            blob.metadata = metadata
            blob.patch()
            source_url = f"gs://{gs_test_bucket}/{src_key}"
        if replica == 'aws':
            # TODO: consider switching to unmanaged uploader (putobject w/blob)
            s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
            s3 = boto3.resource('s3')
            s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(
                fh, ExtraArgs={"Metadata": metadata})
            source_url = f"s3://{s3_test_bucket}/{src_key}"
    file_uuid = str(uuid4())
    version = datetime_to_version_format(datetime.utcnow())
    urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
    urlbuilder.add_query("version", version)
    resp_obj = app.put(
        str(urlbuilder),
        json=dict(creator_uid=0, source_url=source_url),
        headers=get_auth_header())
    resp_obj.raise_for_status()
    return file_uuid, resp_obj.json()["version"]
def setUp(self):
    dss.Config.set_config(dss.BucketConfig.TEST)
    self.s3_test_fixtures_bucket = get_env("DSS_S3_BUCKET_TEST_FIXTURES")
    self.gs_test_fixtures_bucket = get_env("DSS_GS_BUCKET_TEST_FIXTURES")
    self.s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
    self.gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")
    self.s3_test_checkout_bucket = get_env("DSS_S3_CHECKOUT_BUCKET_TEST")
    self.gs_test_checkout_bucket = get_env("DSS_GS_CHECKOUT_BUCKET_TEST")
def setUp(self):
    self.remaining_time = SpecificRemainingTime(10)
    Config.set_config(BucketConfig.TEST)
    self.s3_test_fixtures_bucket = get_env("DSS_S3_BUCKET_TEST_FIXTURES")
    self.gs_test_fixtures_bucket = get_env("DSS_GS_BUCKET_TEST_FIXTURES")
    self.s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
    self.gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")

    class VT(Visitation):
        def walker_walk(self):
            pass

    registered_visitations.registered_visitations['VT'] = VT

    self.job_state = {
        '_visitation_class_name': 'VT',
        'work_ids': ['1', '2', '3', '4'],
        '_number_of_workers': 3,
    }
    self.walker_state = {
        '_visitation_class_name': 'VT',
        'work_ids': [['1', '2'], ['3', '4']],
    }
def setUp(self):
    self.context = MockLambdaContext()
    dss.Config.set_config(dss.BucketConfig.TEST)
    self.s3_test_fixtures_bucket = get_env("DSS_S3_BUCKET_TEST_FIXTURES")
    self.gs_test_fixtures_bucket = get_env("DSS_GS_BUCKET_TEST_FIXTURES")
    self.s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
    self.gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")

    class VT(Visitation):
        def walker_walk(self):
            pass

    registered_visitations.registered_visitations['VT'] = VT

    self.job_state = {
        '_visitation_class_name': 'VT',
        'work_ids': ['1', '2', '3', '4'],
        '_number_of_workers': 3,
    }
    self.walker_state = {
        '_visitation_class_name': 'VT',
        'work_ids': [['1', '2'], ['3', '4']],
    }
def _test_bundle_notification(self, replica):
    bucket = get_env('DSS_S3_BUCKET_TEST')
    key = f"notification-v2/{uuid4()}"
    url = self.s3.generate_presigned_url(
        ClientMethod='put_object',
        Params=dict(Bucket=bucket, Key=key, ContentType="application/json"))
    subscription = self._put_subscription(
        {
            'callback_url': url,
            'method': "PUT",
        },
        replica)
    # upload test bundle from test fixtures bucket
    bundle_uuid, bundle_version = self._upload_bundle(replica)
    notification = self._get_notification_from_s3_object(bucket, key)
    self.assertEqual(notification['subscription_id'], subscription['uuid'])
def test_zero_copy(self):
    test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
    test_src_key = infra.generate_test_key()
    s3_blobstore = Config.get_blobstore_handle(Replica.aws)
    # upload an empty (zero-byte) source object
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.seek(0)
        s3_blobstore.upload_file_handle(test_bucket, test_src_key, fh)
    src_etag = s3_blobstore.get_cloud_checksum(test_bucket, test_src_key)
    test_dst_key = infra.generate_test_key()
    state = s3copyclient.copy_sfn_event(
        test_bucket, test_src_key, test_bucket, test_dst_key)
    execution_id = str(uuid.uuid4())
    stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}", execution_id, state)
    self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
def setUp(self, rounds=3):
    Config.set_config(BucketConfig.TEST)
    self.test_bucket = infra.get_env("DSS_GS_BUCKET_TEST")
    self.gs_blobstore = Config.get_blobstore_handle(Replica.gcp)
    test_src_keys = [infra.generate_test_key() for _ in range(rounds)]
    final_key = infra.generate_test_key()
    bucket_obj = self.gs_blobstore.gcp_client.bucket(self.test_bucket)
    self.gs_blobstore.upload_file_handle(
        self.test_bucket, test_src_keys[0], io.BytesIO(os.urandom(1024 * 1024)))
    for ix in range(len(test_src_keys) - 1):
        src_blob_obj = bucket_obj.get_blob(test_src_keys[ix])
        blobs = [src_blob_obj for _ in range(16)]
        dst_blob_obj = bucket_obj.blob(test_src_keys[ix + 1])
        dst_blob_obj.content_type = "application/octet-stream"
        dst_blob_obj.compose(blobs)

    # Set the storage class to NEARLINE.
    # NOTE: compose(...) does not seem to support setting a storage class. The canonical
    # way of changing storage class is to call update_storage_class(...), but Google's
    # library does not seem to handle update_storage_class(...) calls for large objects.
    final_blob_obj = bucket_obj.blob(final_key)
    final_blob_obj.storage_class = "NEARLINE"
    final_blob_src = bucket_obj.get_blob(test_src_keys[-1])
    token = None
    while True:
        result = final_blob_obj.rewrite(final_blob_src, token=token)
        if result[0] is None:  # done!
            break
        token = result[0]
    self.src_key = final_key
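# The rewrite loop above, factored as a reusable sketch. Blob.rewrite() copies
# server-side in chunks and returns (token, bytes_rewritten, total_bytes); the token
# is None once the copy is complete, so the destination's storage class takes effect.
def rewrite_blob(dst_blob, src_blob):
    token = None
    while True:
        token, _, _ = dst_blob.rewrite(src_blob, token=token)
        if token is None:
            break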
def test_versioned_tombstone_notifications(self, replica=Replica.aws):
    bucket = get_env('DSS_S3_BUCKET_TEST')
    notification_object_key = f"notification-v2/{uuid4()}"
    url = self.s3.generate_presigned_url(
        ClientMethod='put_object',
        Params=dict(Bucket=bucket, Key=notification_object_key,
                    ContentType="application/json"))
    subscription = self._put_subscription(
        {
            'callback_url': url,
            'method': "PUT",
            'jmespath_query': "admin_deleted==`true`",
        },
        replica)
    bundle_uuid, bundle_version = self._upload_bundle(replica)
    self._tombstone_bundle(replica, bundle_uuid, bundle_version)
    notification = self._get_notification_from_s3_object(bucket, notification_object_key)
    self.assertEqual(notification['subscription_id'], subscription['uuid'])
    self.assertEqual(notification['match']['bundle_uuid'], bundle_uuid)
    self.assertEqual(notification['match']['bundle_version'], f"{bundle_version}")
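# Illustration (the document shape is an assumption): the jmespath_query above makes
# the subscription fire only for tombstoned bundles, whose indexed documents carry
# admin_deleted == true.
import jmespath
assert jmespath.search("admin_deleted==`true`", {"admin_deleted": True})
assert not jmespath.search("admin_deleted==`true`", {"uuid": "deadbeef"})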
import os
import sys
import logging
import tempfile

from getm import default_chunk_size

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from terra_notebook_utils import WORKSPACE_BUCKET
from terra_notebook_utils.blobstore import BlobNotFoundError
from terra_notebook_utils.blobstore.gs import GSBlobStore, GSBlob
from terra_notebook_utils.blobstore.local import LocalBlobStore, LocalBlob
from terra_notebook_utils.blobstore.url import URLBlob
from terra_notebook_utils.blobstore import BlobStore, copy_client
from tests import infra

gs_blobstore = GSBlobStore(infra.get_env("TNU_BLOBSTORE_TEST_GS_BUCKET"))
local_test_tempdir = tempfile.TemporaryDirectory()
local_test_bucket = local_test_tempdir.name
local_blobstore = LocalBlobStore(local_test_tempdir.name)

logging.basicConfig(stream=sys.stderr, level=logging.INFO)
copy_client.logger.setLevel(logging.DEBUG)


class TestData:
    def __init__(self, oneshot_size: int = 7,
                 multipart_size: int = 2 * default_chunk_size + 1):
        self.oneshot_size = oneshot_size
        self.multipart_size = multipart_size
import os
import sys
import json
import time
import unittest
import tempfile
import subprocess
from uuid import uuid4
from functools import wraps

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from tests.infra import get_env

TNU_REPO = "https://github.com/DataBiosphere/terra-notebook-utils"
TNU_TEST_WORKSPACE = get_env("TNU_TEST_WORKSPACE")
TNU_TEST_WORKSPACE_NAMESPACE = get_env("TNU_TEST_WORKSPACE_NAMESPACE")
TNU_TEST_BUCKET = get_env("TNU_BLOBSTORE_TEST_GS_BUCKET")
WORKSPACE_ARGS = f"--workspace {TNU_TEST_WORKSPACE} --workspace-namespace {TNU_TEST_WORKSPACE_NAMESPACE}"

VENV_DIR = "venv"
VENV_BIN = os.path.join(VENV_DIR, "bin")
TNU = os.path.join(VENV_BIN, "tnu")

DRS_URI_370_KB = "drs://dg.4503/6ffc2f59-2596-405c-befd-9634dc0ed837"  # 1000 Genomes, 370.38 KB
DRS_URI_021_MB = "drs://dg.4503/48286908-b079-4407-8773-5ab8ab42df12"  # 1000 Genomes, 20.62 MB
DRS_URI_240_MB = "drs://dg.4503/06ea6ade-f1cf-42b1-b6be-5a6f912ab965"  # 1000 Genomes, 240.53 MB
DRS_URI_702_MB = "drs://dg.4503/5cc56e78-cb80-4e3c-aa41-63ea3297d1f3"  # 1000 Genomes, 702.57 MB
DRS_URI_002_GB = "drs://dg.4503/076be06a-4251-4fe5-b02f-43600e909534"  # 1000 Genomes, 1.66 GB
DRS_URI_006_GB = "drs://dg.4503/ccae5e23-014d-47b1-89d3-049745a10120"  # 1000 Genomes, 5.75 GB
DRS_URI_025_GB = "drs://dg.4503/3e8438ec-9a7f-4215-8c23-de2c321aeb42"  # 1000 Genomes, 24.82 GB
def setUp(self):
    self.credentials = infra.get_env("GOOGLE_APPLICATION_CREDENTIALS")
    self.test_bucket = infra.get_env("GS_BUCKET")
    self.test_fixtures_bucket = infra.get_env("GS_BUCKET_FIXTURES")
    self.handle = GSBlobStore.from_auth_credentials(self.credentials)
@classmethod
def setUpClass(cls):
    with open(get_env('GOOGLE_APPLICATION_CREDENTIALS'), "r") as fh:
        cls.owner = json.loads(fh.read())['client_email']
    cls.app = ThreadedLocalServer(handler_cls=MyHandlerClass)
    cls.app.start()
    cls.s3 = boto3.client('s3')
import os
import sys
import unittest

import jsonschema
import google_crc32c

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from tests import config  # initialize the test environment
from tests import CLITestMixin
from tests.infra import SuppressWarningsMixin, get_env
from tests.infra.testmode import testmode
from terra_notebook_utils import drs, gs, WORKSPACE_BUCKET, WORKSPACE_NAME, WORKSPACE_NAMESPACE
import terra_notebook_utils.cli.commands.drs

TNU_TEST_GS_BUCKET = get_env("TNU_BLOBSTORE_TEST_GS_BUCKET")

DRS_URI_500_KB = "drs://dg.4503/5ec0e501-432e-4cad-808d-1a4e9100b7de"  # 1000 Genomes, 500.15 KB
DRS_URI_370_KB = "drs://dg.4503/6ffc2f59-2596-405c-befd-9634dc0ed837"  # 1000 Genomes, 370.38 KB
DRS_URI_003_MB = "drs://dg.4503/0f26beeb-d468-405e-abb7-412eb7bf8b19"  # 1000 Genomes, 2.5 MB


# These tests run only under the `make dev_env_access_test` command, since they test
# DRS against the Terra dev environment.
@testmode("dev_env_access")
class TestTerraNotebookUtilsDRSInDev(SuppressWarningsMixin, unittest.TestCase):
    jade_dev_url = "drs://jade.datarepo-dev.broadinstitute.org/v1_0c86170e-312d-4b39-a0a4-" \
                   "2a2bfaa24c7a_c0e40912-8b14-43f6-9a2f-b278144d0060"

    def test_resolve_drs_for_google_storage(self):
        info = drs.get_drs_info(self.jade_dev_url)
        self.assertEqual(info.bucket_name, "broad-jade-dev-data-bucket")
def get_test_fixture_bucket(self, replica: Replica) -> str:
    if replica == Replica.aws:
        bucket = get_env("DSS_S3_BUCKET_TEST_FIXTURES")
    elif replica == Replica.gcp:
        bucket = get_env("DSS_GS_BUCKET_TEST_FIXTURES")
    return bucket
import os
import sys
import datetime
import tempfile
from uuid import uuid4

from getm import checksum, default_chunk_size
from google.cloud import storage

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from terra_notebook_utils.blobstore import BlobStore, BlobNotFoundError
from terra_notebook_utils.blobstore.gs import GSBlobStore
from terra_notebook_utils.blobstore.local import LocalBlobStore
from terra_notebook_utils.blobstore.url import URLBlobStore
from tests import infra

gs_blobstore = GSBlobStore(infra.get_env("TNU_BLOBSTORE_TEST_GS_BUCKET"))
local_test_tempdir = tempfile.TemporaryDirectory()
local_test_bucket = local_test_tempdir.name
local_blobstore = LocalBlobStore(local_test_tempdir.name)
url_blobstore = URLBlobStore()
gs_client = storage.Client.from_service_account_json(
    infra.get_env("TNU_GOOGLE_APPLICATION_CREDENTIALS"))


def _gen_gs_signed_url(bucket_name: str, key: str) -> str:
    blob = gs_client.bucket(bucket_name).blob(key)
    return blob.generate_signed_url(datetime.timedelta(days=1), version="v4")


def _put_blob(bs: BlobStore, data: bytes) -> str:
    # Body is an assumption: the original was truncated at this definition. Write
    # `data` under a fresh key and return the key so callers can fetch the blob back.
    key = f"test-blob-{uuid4()}"
    bs.blob(key).put(data)
    return key
def get_test_fixture_bucket(replica: str) -> str:
    if replica == 'aws':
        return get_env("DSS_S3_BUCKET_TEST_FIXTURES")
    else:
        return get_env("DSS_GS_BUCKET_TEST_FIXTURES")
def setUp(self): self.test_bucket = infra.get_env("DSS_S3_BUCKET_TEST") self.test_fixtures_bucket = infra.get_env( "DSS_S3_BUCKET_TEST_FIXTURES") self.blobhandle = S3BlobStore.from_environment() self.hcahandle = S3HCABlobStore(self.blobhandle)