Example #1
    def _upload_bundle(self, replica, uuid=None):
        if replica == Replica.aws:
            test_fixtures_bucket = get_env('DSS_S3_BUCKET_TEST_FIXTURES')
        else:
            test_fixtures_bucket = get_env('DSS_GS_BUCKET_TEST_FIXTURES')
        bundle_uuid = uuid if uuid else str(uuid4())
        file_uuid_1 = str(uuid4())
        file_uuid_2 = str(uuid4())
        filenames = ["file_1", "file_2"]
        resp_obj_1 = self.upload_file_wait(
            f"{replica.storage_schema}://{test_fixtures_bucket}/test_good_source_data/0",
            replica,
            file_uuid_1,
            bundle_uuid=bundle_uuid,
        )
        resp_obj_2 = self.upload_file_wait(
            f"{replica.storage_schema}://{test_fixtures_bucket}/test_good_source_data/1",
            replica,
            file_uuid_2,
            bundle_uuid=bundle_uuid,
        )
        file_version_1 = resp_obj_1.json['version']
        file_version_2 = resp_obj_2.json['version']
        bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
        self.put_bundle(
            replica,
            bundle_uuid,
            [(file_uuid_1, file_version_1, filenames[0]),
             (file_uuid_2, file_version_2, filenames[1])],
            bundle_version,
        )
        return bundle_uuid, bundle_version
Example #2
    def setUp(self):
        self.test_bucket = infra.get_env("S3_BUCKET")
        self.test_fixtures_bucket = infra.get_env("S3_BUCKET_FIXTURES")
        self.test_us_east_1_bucket = infra.get_env("S3_BUCKET_US_EAST_1")
        self.test_non_us_east_1_bucket = infra.get_env(
            "S3_BUCKET_NON_US_EAST_1")

        self.handle = S3BlobStore.from_environment()
Example #3
    def test_queue_notification(self):
        replica = Replica.aws
        bucket = get_env('DSS_S3_BUCKET_TEST')
        key = f"notification-v2/{uuid4()}"
        post = self.s3.generate_presigned_post(
            Bucket=bucket,
            Key=key,
            ExpiresIn=60,
            Fields={'Content-Type': "application/json"},
            Conditions=[{
                'Content-Type': "application/json"
            }])
        subscription = self._put_subscription(
            {
                'payload_form_field': "file",
                'form_fields': post['fields'],
                'callback_url': post['url'],
                'encoding': "multipart/form-data",
            }, replica)

        with SQSMessenger(get_queue_url(
                notify_v2.notification_queue_name)) as mq:
            msg = notify_v2._format_sqs_message(
                replica,
                subscription,
                "CREATE",
                "bundles/a47b90b2-0967-4fbf-87bc-c6c12db3fedf.2017-07-12T055120.037644Z",
            )
            mq.send(msg, delay_seconds=0)
        notification = self._get_notification_from_s3_object(bucket, key)
        self.assertEquals(notification['subscription_id'],
                          subscription['uuid'])
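This test ends by reading the delivered payload back with a `_get_notification_from_s3_object` helper (also used in later examples) that the listing does not include. A minimal sketch of such a helper, assuming `json` and `time` are imported in the test module; the timeout and polling interval are illustrative, not values from the project:

    def _get_notification_from_s3_object(self, bucket, key, timeout=120):
        # Poll until the subscription callback has written the notification
        # payload to the presigned S3 location, then parse it as JSON.
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                obj = self.s3.get_object(Bucket=bucket, Key=key)
                return json.loads(obj['Body'].read().decode())
            except self.s3.exceptions.NoSuchKey:
                time.sleep(5)
        raise RuntimeError(f"notification for {key} was not delivered within {timeout}s")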
Example #4
    def upload_file(self, contents):
        s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
        src_key = generate_test_key()
        s3 = boto3.resource('s3')
        with io.BytesIO(json.dumps(
                contents).encode()) as fh, ChecksummingSink() as sink:
            sink.write(fh.read())
            sums = sink.get_checksums()
            metadata = {
                'hca-dss-crc32c': sums['crc32c'].lower(),
                'hca-dss-s3_etag': sums['s3_etag'].lower(),
                'hca-dss-sha1': sums['sha1'].lower(),
                'hca-dss-sha256': sums['sha256'].lower()
            }
            fh.seek(0)
            # TODO: consider switching to unmanaged uploader (putobject w/blob)
            s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(
                fh, ExtraArgs={"Metadata": metadata})
        source_url = f"s3://{s3_test_bucket}/{src_key}"
        file_uuid = str(uuid4())
        version = datetime_to_version_format(datetime.utcnow())
        urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
        urlbuilder.add_query("version", version)

        resp_obj = self.assertPutResponse(str(urlbuilder),
                                          requests.codes.created,
                                          json_request_body=dict(
                                              creator_uid=0,
                                              source_url=source_url))
        return file_uuid, resp_obj.json["version"]
Example #5
    def test_large_copy(self, num_parts=LAMBDA_PARALLELIZATION_FACTOR + 1):
        test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
        test_src_key = infra.generate_test_key()
        s3_client = boto3.client("s3")
        mpu = s3_client.create_multipart_upload(Bucket=test_bucket,
                                                Key=test_src_key)

        with ThreadPoolExecutor(max_workers=8) as tpe:
            parts_futures = tpe.map(
                lambda part_id: TestS3ParallelCopy.upload_part(
                    test_bucket, test_src_key, mpu['UploadId'], part_id),
                range(1, num_parts + 1))

        parts = [
            dict(ETag=part_etag, PartNumber=part_id)
            for part_id, part_etag in parts_futures
        ]

        src_etag = s3_client.complete_multipart_upload(
            Bucket=test_bucket,
            Key=test_src_key,
            MultipartUpload=dict(Parts=parts),
            UploadId=mpu['UploadId'],
        )['ETag'].strip('"')

        test_dst_key = infra.generate_test_key()
        state = s3copyclient.copy_sfn_event(test_bucket, test_src_key,
                                            test_bucket, test_dst_key)
        execution_id = str(uuid.uuid4())
        stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}",
                                            execution_id, state)

        self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
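The parallel upload above relies on a `TestS3ParallelCopy.upload_part` helper that is not part of the listing. A rough sketch of what it could look like, assuming each part is synthetic random data of 5 MiB (the minimum S3 allows for every part but the last; the exact size used by the project is an assumption):

    @staticmethod
    def upload_part(bucket, key, upload_id, part_id, part_size=5 * 1024 * 1024):
        # Upload one part of the multipart upload and hand back (part_id, etag)
        # so the caller can assemble the Parts list for complete_multipart_upload.
        s3_client = boto3.client("s3")
        resp = s3_client.upload_part(
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
            PartNumber=part_id,
            Body=os.urandom(part_size),
        )
        return part_id, resp['ETag'].strip('"')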
Example #6
    def upload_file(app, contents, replica):
        src_key = generate_test_key()
        encoded = json.dumps(contents).encode()
        chunk_size = get_s3_multipart_chunk_size(len(encoded))
        with io.BytesIO(encoded) as fh, ChecksummingSink(
                write_chunk_size=chunk_size) as sink:
            sink.write(fh.read())
            sums = sink.get_checksums()
            metadata = {
                'hca-dss-crc32c': sums['crc32c'].lower(),
                'hca-dss-s3_etag': sums['s3_etag'].lower(),
                'hca-dss-sha1': sums['sha1'].lower(),
                'hca-dss-sha256': sums['sha256'].lower()
            }
            fh.seek(0)

            if replica == 'gcp':
                gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")
                gcp_client = gs_storage.Client.from_service_account_json(
                    os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))
                gs_bucket = gcp_client.bucket(gs_test_bucket)
                blob = gs_bucket.blob(src_key)
                blob.upload_from_file(fh, content_type="application/json")
                blob.metadata = metadata
                blob.patch()
                source_url = f"gs://{gs_test_bucket}/{src_key}"

            if replica == 'aws':
                # TODO: consider switching to unmanaged uploader (putobject w/blob)
                s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
                s3 = boto3.resource('s3')
                s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(
                    fh, ExtraArgs={"Metadata": metadata})
                source_url = f"s3://{s3_test_bucket}/{src_key}"

        file_uuid = str(uuid4())
        version = datetime_to_version_format(datetime.utcnow())
        urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
        urlbuilder.add_query("version", version)

        resp_obj = app.put(str(urlbuilder),
                           json=dict(creator_uid=0, source_url=source_url),
                           headers=get_auth_header())
        resp_obj.raise_for_status()
        return file_uuid, resp_obj.json()["version"]
Example #7
    def setUp(self):
        dss.Config.set_config(dss.BucketConfig.TEST)
        self.s3_test_fixtures_bucket = get_env("DSS_S3_BUCKET_TEST_FIXTURES")
        self.gs_test_fixtures_bucket = get_env("DSS_GS_BUCKET_TEST_FIXTURES")
        self.s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
        self.gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")
        self.s3_test_checkout_bucket = get_env("DSS_S3_CHECKOUT_BUCKET_TEST")
        self.gs_test_checkout_bucket = get_env("DSS_GS_CHECKOUT_BUCKET_TEST")
Example #8
    def setUp(self):
        self.remaining_time = SpecificRemainingTime(10)
        Config.set_config(BucketConfig.TEST)
        self.s3_test_fixtures_bucket = get_env("DSS_S3_BUCKET_TEST_FIXTURES")
        self.gs_test_fixtures_bucket = get_env("DSS_GS_BUCKET_TEST_FIXTURES")
        self.s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
        self.gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")

        class VT(Visitation):
            def walker_walk(self):
                pass

        registered_visitations.registered_visitations['VT'] = VT

        self.job_state = {
            '_visitation_class_name': 'VT',
            'work_ids': ['1', '2', '3', '4'],
            '_number_of_workers': 3,
        }

        self.walker_state = {
            '_visitation_class_name': 'VT',
            'work_ids': [['1', '2'], ['3', '4']],
        }
Example #9
    def setUp(self):
        self.context = MockLambdaContext()
        dss.Config.set_config(dss.BucketConfig.TEST)
        self.s3_test_fixtures_bucket = get_env("DSS_S3_BUCKET_TEST_FIXTURES")
        self.gs_test_fixtures_bucket = get_env("DSS_GS_BUCKET_TEST_FIXTURES")
        self.s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
        self.gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")

        class VT(Visitation):
            def walker_walk(self):
                pass

        registered_visitations.registered_visitations['VT'] = VT

        self.job_state = {
            '_visitation_class_name': 'VT',
            'work_ids': ['1', '2', '3', '4'],
            '_number_of_workers': 3,
        }

        self.walker_state = {
            '_visitation_class_name': 'VT',
            'work_ids': [['1', '2'], ['3', '4']],
        }
Example #10
    def _test_bundle_notification(self, replica):
        bucket = get_env('DSS_S3_BUCKET_TEST')
        key = f"notification-v2/{uuid4()}"
        url = self.s3.generate_presigned_url(
            ClientMethod='put_object',
            Params=dict(Bucket=bucket, Key=key,
                        ContentType="application/json"))
        subscription = self._put_subscription(
            {
                'callback_url': url,
                'method': "PUT",
            }, replica)

        # upload test bundle from test fixtures bucket
        bundle_uuid, bundle_version = self._upload_bundle(replica)

        notification = self._get_notification_from_s3_object(bucket, key)
        self.assertEquals(notification['subscription_id'],
                          subscription['uuid'])
Example #11
    def test_zero_copy(self):
        test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
        test_src_key = infra.generate_test_key()
        s3_blobstore = Config.get_blobstore_handle(Replica.aws)

        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.seek(0)
            s3_blobstore.upload_file_handle(test_bucket, test_src_key, fh)

        src_etag = s3_blobstore.get_cloud_checksum(test_bucket, test_src_key)

        test_dst_key = infra.generate_test_key()
        state = s3copyclient.copy_sfn_event(test_bucket, test_src_key,
                                            test_bucket, test_dst_key)
        execution_id = str(uuid.uuid4())
        stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}",
                                            execution_id, state)

        self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
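Both copy tests finish with `self._check_dst_key_etag`, which is likewise not shown. A minimal sketch, assuming the copy runs asynchronously in the step function and that `time` and cloud_blobstore's `BlobNotFoundError` are available in the test module; the retry cadence is illustrative:

    def _check_dst_key_etag(self, bucket: str, key: str, expected_etag: str, retries=60):
        # Wait for the destination key to appear, then compare its cloud
        # checksum against the source ETag.
        handle = Config.get_blobstore_handle(Replica.aws)
        for _ in range(retries):
            try:
                self.assertEqual(handle.get_cloud_checksum(bucket, key), expected_etag)
                return
            except BlobNotFoundError:
                time.sleep(10)
        self.fail(f"{key} was not copied within the allotted time")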
Example #12
    def setUp(self, rounds=3):
        Config.set_config(BucketConfig.TEST)

        self.test_bucket = infra.get_env("DSS_GS_BUCKET_TEST")
        self.gs_blobstore = Config.get_blobstore_handle(Replica.gcp)
        test_src_keys = [infra.generate_test_key() for _ in range(rounds)]
        final_key = infra.generate_test_key()

        bucket_obj = self.gs_blobstore.gcp_client.bucket(self.test_bucket)

        self.gs_blobstore.upload_file_handle(
            self.test_bucket, test_src_keys[0],
            io.BytesIO(os.urandom(1024 * 1024)))

        for ix in range(len(test_src_keys) - 1):
            src_blob_obj = bucket_obj.get_blob(test_src_keys[ix])
            blobs = [src_blob_obj for _ in range(16)]
            dst_blob_obj = bucket_obj.blob(test_src_keys[ix + 1])

            dst_blob_obj.content_type = "application/octet-stream"
            dst_blob_obj.compose(blobs)

        # Set the storage class to nearline.
        # NOTE: compose(…) does not seem to support setting a storage class.  The canonical way of changing
        # storage class is to call update_storage_class(…), but Google's client libraries do not seem to
        # handle update_storage_class(…) calls for large objects.
        final_blob_obj = bucket_obj.blob(final_key)
        final_blob_obj.storage_class = "NEARLINE"
        final_blob_src = bucket_obj.get_blob(test_src_keys[-1])
        token = None
        while True:
            result = final_blob_obj.rewrite(final_blob_src, token=token)
            if result[0] is None:
                # done!
                break
            token = result[0]

        self.src_key = final_key
Example #13
    def test_versioned_tombstone_notifications(self, replica=Replica.aws):
        bucket = get_env('DSS_S3_BUCKET_TEST')
        notification_object_key = f"notification-v2/{uuid4()}"
        url = self.s3.generate_presigned_url(
            ClientMethod='put_object',
            Params=dict(Bucket=bucket,
                        Key=notification_object_key,
                        ContentType="application/json"))
        subscription = self._put_subscription(
            {
                'callback_url': url,
                'method': "PUT",
                'jmespath_query': "admin_deleted==`true`"
            }, replica)
        bundle_uuid, bundle_version = self._upload_bundle(replica)
        self._tombstone_bundle(replica, bundle_uuid, bundle_version)

        notification = self._get_notification_from_s3_object(
            bucket, notification_object_key)
        self.assertEquals(notification['subscription_id'],
                          subscription['uuid'])
        self.assertEquals(notification['match']['bundle_uuid'], bundle_uuid)
        self.assertEquals(notification['match']['bundle_version'],
                          f"{bundle_version}")
Example #14
import os
import sys
import logging
import tempfile

from getm import default_chunk_size

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                        '..'))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from terra_notebook_utils import WORKSPACE_BUCKET
from terra_notebook_utils.blobstore import BlobNotFoundError
from terra_notebook_utils.blobstore.gs import GSBlobStore, GSBlob
from terra_notebook_utils.blobstore.local import LocalBlobStore, LocalBlob
from terra_notebook_utils.blobstore.url import URLBlob
from terra_notebook_utils.blobstore import BlobStore, copy_client

from tests import infra

gs_blobstore = GSBlobStore(infra.get_env("TNU_BLOBSTORE_TEST_GS_BUCKET"))
local_test_tempdir = tempfile.TemporaryDirectory()
local_test_bucket = local_test_tempdir.name
local_blobstore = LocalBlobStore(local_test_tempdir.name)

logging.basicConfig(stream=sys.stderr, level=logging.INFO)
copy_client.logger.setLevel(logging.DEBUG)


class TestData:
    def __init__(self,
                 oneshot_size: int = 7,
                 multipart_size: int = 2 * default_chunk_size + 1):
        self.oneshot_size = oneshot_size
        self.multipart_size = multipart_size
Example #15
import os
import sys
import json
import time
import unittest
import tempfile
import subprocess
from uuid import uuid4
from functools import wraps

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                        '..'))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from tests.infra import get_env

TNU_REPO = "https://github.com/DataBiosphere/terra-notebook-utils"
TNU_TEST_WORKSPACE = get_env("TNU_TEST_WORKSPACE")
TNU_TEST_WORKSPACE_NAMESPACE = get_env("TNU_TEST_WORKSPACE_NAMESPACE")
TNU_TEST_BUCKET = get_env("TNU_BLOBSTORE_TEST_GS_BUCKET")
WORKSPACE_ARGS = f"--workspace {TNU_TEST_WORKSPACE} --workspace-namespace {TNU_TEST_WORKSPACE_NAMESPACE}"

VENV_DIR = "venv"
VENV_BIN = os.path.join(VENV_DIR, "bin")
TNU = os.path.join(VENV_BIN, "tnu")

DRS_URI_370_KB = "drs://dg.4503/6ffc2f59-2596-405c-befd-9634dc0ed837"  # 1000 Genomes, 370.38 KB
DRS_URI_021_MB = "drs://dg.4503/48286908-b079-4407-8773-5ab8ab42df12"  # 1000 Genomes, 20.62 MB
DRS_URI_240_MB = "drs://dg.4503/06ea6ade-f1cf-42b1-b6be-5a6f912ab965"  # 1000 Genomes, 240.53 MB
DRS_URI_702_MB = "drs://dg.4503/5cc56e78-cb80-4e3c-aa41-63ea3297d1f3"  # 1000 Genomes, 702.57 MB
DRS_URI_002_GB = "drs://dg.4503/076be06a-4251-4fe5-b02f-43600e909534"  # 1000 Genomes, 1.66 GB
DRS_URI_006_GB = "drs://dg.4503/ccae5e23-014d-47b1-89d3-049745a10120"  # 1000 Genomes, 5.75 GB
DRS_URI_025_GB = "drs://dg.4503/3e8438ec-9a7f-4215-8c23-de2c321aeb42"  # 1000 Genomes, 24.82 GB
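The constants above point at a `tnu` CLI installed into a local virtualenv, plus a test workspace and DRS URIs of increasing size. A small illustrative helper for shelling out to that CLI (this helper is not part of the original file, and any subcommands or flags passed through it would come from the tnu CLI's own documentation):

    def _run_tnu(command: str) -> str:
        # Run the venv-installed CLI and return its stdout, raising on a
        # non-zero exit status so test failures surface immediately.
        completed = subprocess.run(f"{TNU} {command}", shell=True, check=True,
                                   capture_output=True, text=True)
        return completed.stdout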
Example #16
    def setUp(self):
        self.credentials = infra.get_env("GOOGLE_APPLICATION_CREDENTIALS")
        self.test_bucket = infra.get_env("GS_BUCKET")
        self.test_fixtures_bucket = infra.get_env("GS_BUCKET_FIXTURES")
        self.handle = GSBlobStore.from_auth_credentials(self.credentials)
Example #17
    @classmethod
    def setUpClass(cls):
        with open(get_env('GOOGLE_APPLICATION_CREDENTIALS'), "r") as fh:
            cls.owner = json.loads(fh.read())['client_email']
        cls.app = ThreadedLocalServer(handler_cls=MyHandlerClass)
        cls.app.start()
        cls.s3 = boto3.client('s3')
Example #18
import os
import sys
import unittest
import jsonschema
import google_crc32c

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                        '..'))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from tests import config  # initialize the test environment
from tests import CLITestMixin
from tests.infra import SuppressWarningsMixin, get_env
from tests.infra.testmode import testmode
from terra_notebook_utils import drs, gs, WORKSPACE_BUCKET, WORKSPACE_NAME, WORKSPACE_NAMESPACE
import terra_notebook_utils.cli.commands.drs

TNU_TEST_GS_BUCKET = get_env("TNU_BLOBSTORE_TEST_GS_BUCKET")

DRS_URI_500_KB = "drs://dg.4503/5ec0e501-432e-4cad-808d-1a4e9100b7de"  # 1000 Genomes, 500.15 KB
DRS_URI_370_KB = "drs://dg.4503/6ffc2f59-2596-405c-befd-9634dc0ed837"  # 1000 Genomes, 370.38 KB
DRS_URI_003_MB = "drs://dg.4503/0f26beeb-d468-405e-abb7-412eb7bf8b19"  # 1000 Genomes, 2.5 MB


# These tests only run via `make dev_env_access_test`, as they exercise DRS against the Terra dev environment
@testmode("dev_env_access")
class TestTerraNotebookUtilsDRSInDev(SuppressWarningsMixin, unittest.TestCase):
    jade_dev_url = "drs://jade.datarepo-dev.broadinstitute.org/v1_0c86170e-312d-4b39-a0a4-" \
                   "2a2bfaa24c7a_c0e40912-8b14-43f6-9a2f-b278144d0060"

    def test_resolve_drs_for_google_storage(self):
        info = drs.get_drs_info(self.jade_dev_url)
        self.assertEqual(info.bucket_name, "broad-jade-dev-data-bucket")
Example #19
    def get_test_fixture_bucket(self, replica: Replica) -> str:
        if replica == Replica.aws:
            bucket = get_env("DSS_S3_BUCKET_TEST_FIXTURES")
        elif replica == Replica.gcp:
            bucket = get_env("DSS_GS_BUCKET_TEST_FIXTURES")
        return bucket
Example #20
import os
import sys
import datetime
import tempfile

from getm import checksum, default_chunk_size
from google.cloud import storage

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                        '..'))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from terra_notebook_utils.blobstore import BlobStore, BlobNotFoundError
from terra_notebook_utils.blobstore.gs import GSBlobStore
from terra_notebook_utils.blobstore.local import LocalBlobStore
from terra_notebook_utils.blobstore.url import URLBlobStore

from tests import infra

gs_blobstore = GSBlobStore(infra.get_env("TNU_BLOBSTORE_TEST_GS_BUCKET"))
local_test_tempdir = tempfile.TemporaryDirectory()
local_test_bucket = local_test_tempdir.name
local_blobstore = LocalBlobStore(local_test_tempdir.name)
url_blobstore = URLBlobStore()

gs_client = storage.Client.from_service_account_json(
    infra.get_env("TNU_GOOGLE_APPLICATION_CREDENTIALS"))


def _gen_gs_signed_url(bucket_name: str, key: str) -> str:
    blob = gs_client.bucket(bucket_name).blob(key)
    return blob.generate_signed_url(datetime.timedelta(days=1), version="v4")


def _put_blob(bs: BlobStore, data: bytes) -> str:
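    # The body of this helper is cut off in the source listing. A minimal
    # sketch of what it plausibly does, assuming the BlobStore interface
    # exposes blob(key) and Blob.put(bytes); the key naming is illustrative.
    key = f"test-copy-client/{os.urandom(4).hex()}"
    bs.blob(key).put(data)
    return key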
Example #21
def get_test_fixture_bucket(replica: str) -> str:
    return get_env(
        "DSS_S3_BUCKET_TEST_FIXTURES") if replica == 'aws' else get_env(
            "DSS_GS_BUCKET_TEST_FIXTURES")
Example #22
    def setUp(self):
        self.test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
        self.test_fixtures_bucket = infra.get_env(
            "DSS_S3_BUCKET_TEST_FIXTURES")
        self.blobhandle = S3BlobStore.from_environment()
        self.hcahandle = S3HCABlobStore(self.blobhandle)