def testUploadFileHandle(self):
        with self.subTest("without optional parameters"):
            fobj = io.BytesIO(b"abcabcabc")
            dst_blob_name = infra.generate_test_key()

            self.handle.upload_file_handle(self.test_bucket, dst_blob_name,
                                           fobj)

            # no user metadata was supplied, so none should be present.
            self.assertFalse(
                self.handle.get_user_metadata(self.test_bucket, dst_blob_name))

        with self.subTest("with optional parameters"):
            fobj = io.BytesIO(b"abcabcabc")
            dst_blob_name = infra.generate_test_key()

            content_type = "test/content-type"
            metadata = {"stuff": "things"}
            self.handle.upload_file_handle(
                self.test_bucket,
                dst_blob_name,
                fobj,
                content_type=content_type,
                metadata=metadata,
            )

            # the user metadata and content type should round-trip.
            self.assertEqual(
                self.handle.get_user_metadata(self.test_bucket, dst_blob_name),
                metadata)
            self.assertEqual(
                self.handle.get_content_type(self.test_bucket, dst_blob_name),
                content_type)
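
The tests in this listing assume a fixture that supplies self.handle and self.test_bucket (and, for the copy tests, a fixtures bucket). A minimal sketch of such a setUp, modeled on the one in Example 14 below; the choice of replica and bucket environment variable here is an assumption, and the snippet relies on the same module-level names (Config, BucketConfig, Replica, infra) that the examples use:

    # Sketch of the assumed fixture (S3 flavor shown; swap in Replica.gcp and
    # DSS_GS_BUCKET_TEST for the GCS variant).
    def setUp(self):
        Config.set_config(BucketConfig.TEST)
        self.test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
        self.handle = Config.get_blobstore_handle(Replica.aws)
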
Example 2
    def _test_gs_cache(self, src_data, content_type, checkout_bucket):
        replica = Replica.gcp
        checkout_bucket = checkout_bucket if checkout_bucket else replica.checkout_bucket
        test_src_key = infra.generate_test_key()
        gs_blobstore = Config.get_blobstore_handle(Replica.gcp)
        client = storage.Client()
        # upload
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()
            fh.seek(0)
            gs_blobstore.upload_file_handle(replica.bucket, test_src_key, fh,
                                            content_type)
        # checkout
        test_dst_key = infra.generate_test_key()
        event = gscopyclient.copy_sfn_event(replica.bucket, test_src_key,
                                            checkout_bucket, test_dst_key)
        event = gscopyclient.implementation.setup_copy_task(event, None)
        spoof_context = self.SpoofContext()
        # parameters of copy_worker are arbitrary, only passed because required.
        event = gscopyclient.implementation.copy_worker(event, spoof_context)
        # verify: retry briefly until the destination blob is visible.
        for retry in [1, 1, 1]:
            try:
                bucket = client.get_bucket(checkout_bucket)
                blob_class = bucket.get_blob(test_dst_key).storage_class
            except AttributeError:
                time.sleep(retry)
            else:
                break
        else:
            # final unguarded attempt, so a blob that never appeared surfaces as
            # an error instead of an unbound blob_class.
            blob_class = client.get_bucket(checkout_bucket).get_blob(test_dst_key).storage_class
        # cleanup
        gs_blobstore.delete(replica.bucket, test_src_key)
        gs_blobstore.delete(checkout_bucket, test_dst_key)
        return blob_class
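
Both cache helpers construct self.SpoofContext(), which is not reproduced in this listing; it stands in for the AWS Lambda context object that copy_worker expects. A hypothetical minimal version (get_remaining_time_in_millis is the real Lambda context method name; the fixed return value is an arbitrary assumption):

    # Hypothetical stand-in for the Lambda context required by copy_worker.
    class SpoofContext:
        @staticmethod
        def get_remaining_time_in_millis() -> int:
            return 300 * 1000  # pretend there is always a five-minute budget left
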
Example 3
    def test_large_copy(self, num_parts=LAMBDA_PARALLELIZATION_FACTOR + 1):
        test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
        test_src_key = infra.generate_test_key()
        s3_client = boto3.client("s3")
        mpu = s3_client.create_multipart_upload(Bucket=test_bucket,
                                                Key=test_src_key)

        with ThreadPoolExecutor(max_workers=8) as tpe:
            parts_futures = tpe.map(
                lambda part_id: TestS3ParallelCopy.upload_part(
                    test_bucket, test_src_key, mpu['UploadId'], part_id),
                range(1, num_parts + 1))

        parts = [
            dict(ETag=part_etag, PartNumber=part_id)
            for part_id, part_etag in parts_futures
        ]

        src_etag = s3_client.complete_multipart_upload(
            Bucket=test_bucket,
            Key=test_src_key,
            MultipartUpload=dict(Parts=parts),
            UploadId=mpu['UploadId'],
        )['ETag'].strip('"')

        test_dst_key = infra.generate_test_key()
        state = s3copyclient.copy_sfn_event(test_bucket, test_src_key,
                                            test_bucket, test_dst_key)
        execution_id = str(uuid.uuid4())
        stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}",
                                            execution_id, state)

        self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
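
test_large_copy delegates to a TestS3ParallelCopy.upload_part helper that is not reproduced in this listing. A plausible sketch, assuming each part is a fixed-size random payload and reusing the module-level boto3/os imports the examples already depend on (the part size is an assumption; upload_part and its ETag response field are standard boto3):

    # Hypothetical upload_part helper matching the call site above; it returns
    # (part_number, etag) so the caller can build the completion manifest.
    @staticmethod
    def upload_part(bucket: str, key: str, upload_id: str, part_id: int):
        s3_client = boto3.client("s3")
        resp = s3_client.upload_part(
            Bucket=bucket,
            Key=key,
            PartNumber=part_id,
            UploadId=upload_id,
            Body=os.urandom(5 * 1024 * 1024),  # 5 MiB: S3's minimum size for non-final parts
        )
        return part_id, resp['ETag'].strip('"')
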
Example 4
    def _test_aws_cache(self, src_data, content_type, checkout_bucket):
        replica = Replica.aws
        checkout_bucket = checkout_bucket if checkout_bucket else replica.checkout_bucket
        test_src_key = infra.generate_test_key()
        s3_blobstore = Config.get_blobstore_handle(Replica.aws)
        # upload
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()
            fh.seek(0)
            s3_blobstore.upload_file_handle(replica.bucket, test_src_key, fh,
                                            content_type)
        # checkout
        test_dst_key = infra.generate_test_key()
        event = s3copyclient.copy_sfn_event(replica.bucket, test_src_key,
                                            checkout_bucket, test_dst_key)
        event = s3copyclient.implementation.setup_copy_task(event, None)
        spoof_context = self.SpoofContext()
        # parameters of copy_worker are arbitrary, only passed because required.
        event = s3copyclient.implementation.copy_worker(event, spoof_context, 10)
        # verify
        tagging = s3_blobstore.get_user_metadata(checkout_bucket, test_dst_key)
        # cleanup
        s3_blobstore.delete(replica.bucket, test_src_key)
        s3_blobstore.delete(checkout_bucket, test_dst_key)
        return tagging
Example 5
    def upload_file(self, contents):
        s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
        src_key = generate_test_key()
        s3 = boto3.resource('s3')
        with io.BytesIO(json.dumps(
                contents).encode()) as fh, ChecksummingSink() as sink:
            sink.write(fh.read())
            sums = sink.get_checksums()
            metadata = {
                'hca-dss-crc32c': sums['crc32c'].lower(),
                'hca-dss-s3_etag': sums['s3_etag'].lower(),
                'hca-dss-sha1': sums['sha1'].lower(),
                'hca-dss-sha256': sums['sha256'].lower()
            }
            fh.seek(0)
            # TODO: consider switching to unmanaged uploader (putobject w/blob)
            s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(
                fh, ExtraArgs={"Metadata": metadata})
        source_url = f"s3://{s3_test_bucket}/{src_key}"
        file_uuid = str(uuid4())
        version = datetime_to_version_format(datetime.utcnow())
        urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
        urlbuilder.add_query("version", version)

        resp_obj = self.assertPutResponse(str(urlbuilder),
                                          requests.codes.created,
                                          json_request_body=dict(
                                              creator_uid=0,
                                              source_url=source_url))
        return file_uuid, resp_obj.json["version"]
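
ChecksummingSink produces all four checksums in a single pass over the written bytes. For the two plain digests the result should be equivalent to the hashlib sketch below; crc32c and the multipart-aware s3_etag genuinely need the sink (or dedicated libraries), so they are omitted here:

# Sketch: hashlib equivalent of the sha1/sha256 metadata entries only.
import hashlib
import json

def sha_metadata(contents) -> dict:
    encoded = json.dumps(contents).encode()
    return {
        'hca-dss-sha1': hashlib.sha1(encoded).hexdigest(),      # hexdigest() is already lower-case
        'hca-dss-sha256': hashlib.sha256(encoded).hexdigest(),
    }
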
Example 6
    def _test_file_put(self, replica: Replica, scheme: str, test_bucket: str, uploader: Uploader):
        src_key = generate_test_key()
        src_data = os.urandom(1024)
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()

            uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

        source_url = f"{scheme}://{test_bucket}/{src_key}"

        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime_to_version_format(datetime.datetime.utcnow())

        # should be able to do this twice (i.e., same payload, different UUIDs)
        self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)
        self.upload_file(source_url, str(uuid.uuid4()))

        # should be able to do this twice (i.e., same payload, same UUIDs)
        self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid,
                         version=version, expected_code=requests.codes.ok)

        # should *NOT* be able to do this twice (i.e., different payload, same UUIDs)
        self.upload_file(source_url, file_uuid, version=version, expected_code=requests.codes.conflict)
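
The put tests above call a self.upload_file helper whose definition is not included in this listing. A hypothetical reconstruction, inferred from its call sites and from the assertPutResponse pattern in Example 19; the default expected code, the handling of the 'missing' sentinel, and the request body fields are assumptions:

    # Hypothetical sketch of the upload_file helper used by the put tests.
    def upload_file(self, source_url, file_uuid, bundle_uuid=None, version=None,
                    expected_code=requests.codes.created):
        if version is None:
            version = datetime_to_version_format(datetime.datetime.utcnow())
        urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
        if version != 'missing':   # 'missing' sentinel: omit the version query entirely
            urlbuilder.add_query("version", version)
        return self.assertPutResponse(str(urlbuilder),
                                      expected_code,
                                      json_request_body=dict(
                                          bundle_uuid=bundle_uuid,
                                          creator_uid=0,
                                          source_url=source_url),
                                      headers=get_auth_header())
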
Example 7
    def _test_file_get_invalid_token(self, replica: Replica, scheme: str,
                                     test_bucket: str, uploader: Uploader):
        src_key = generate_test_key()
        src_data = os.urandom(1024)
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()

            uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

        source_url = f"{scheme}://{test_bucket}/{src_key}"

        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime_to_version_format(datetime.datetime.utcnow())

        # should be able to do this twice (i.e., same payload, different UUIDs)
        self.upload_file(source_url,
                         file_uuid,
                         bundle_uuid=bundle_uuid,
                         version=version)
        url = str(UrlBuilder().set(path="/v1/files/" + file_uuid).add_query(
            "replica",
            replica.name).add_query("version",
                                    version).add_query("token", "{}"))

        @eventually(30, 0.1)
        def try_get():
            self.assertGetResponse(url,
                                   requests.codes.bad_request,
                                   headers=get_auth_header())

        try_get()
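
Several examples poll with an @eventually(timeout, interval) decorator until an assertion stops failing. A minimal sketch of such a decorator; the real helper in the test suite may differ, for instance in which exception types it retries by default:

# Sketch: retry the wrapped callable until it stops raising, or until timeout.
import functools
import time

def eventually(timeout: float, interval: float, errors=(AssertionError,)):
    def decorate(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            deadline = time.time() + timeout
            while True:
                try:
                    return func(*args, **kwargs)
                except tuple(errors):
                    if time.time() >= deadline:
                        raise
                    time.sleep(interval)
        return wrapper
    return decorate
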
Example 8
    def testCopyTokenNotMatching(self):
        intermediate_blob_name = infra.generate_test_key()

        self.handle.copy(
            self.test_fixtures_bucket,
            "test_good_source_data/0",
            self.test_bucket,
            intermediate_blob_name,
        )

        cloud_checksum = self.handle.get_cloud_checksum(
            self.test_bucket, intermediate_blob_name)
        copy_token = self.handle.get_copy_token(self.test_bucket,
                                                intermediate_blob_name,
                                                cloud_checksum)

        self.handle.copy(
            self.test_fixtures_bucket,
            "test_good_source_data/1",
            self.test_bucket,
            intermediate_blob_name,
        )

        dst_blob_name = infra.generate_test_key()

        try:
            self.handle.copy(
                self.test_bucket,
                intermediate_blob_name,
                self.test_bucket,
                dst_blob_name,
                copy_token,
            )
        except BlobNotFoundError:
            return

        # either the file should be copied from the _previous_ contents, or it should not be present.
        try:
            dst_cloud_checksum = self.handle.get_cloud_checksum(
                self.test_bucket, dst_blob_name)
            self.assertEqual(dst_cloud_checksum, cloud_checksum)
        except BlobNotFoundError:
            pass
Example 9
    def test_zero_copy(self):
        test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
        test_src_key = infra.generate_test_key()
        s3_blobstore = Config.get_blobstore_handle(Replica.aws)

        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.seek(0)
            s3_blobstore.upload_file_handle(test_bucket, test_src_key, fh)

        src_etag = s3_blobstore.get_cloud_checksum(test_bucket, test_src_key)

        test_dst_key = infra.generate_test_key()
        state = s3copyclient.copy_sfn_event(test_bucket, test_src_key,
                                            test_bucket, test_dst_key)
        execution_id = str(uuid.uuid4())
        stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}",
                                            execution_id, state)

        self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
Example 10
    def testCopy(self):
        dst_blob_name = infra.generate_test_key()

        self.handle.copy(
            self.test_fixtures_bucket,
            "test_good_source_data/0",
            self.test_bucket,
            dst_blob_name,
        )

        # should be able to get metadata for the file.
        self.handle.get_user_metadata(self.test_bucket, dst_blob_name)
Example 11
    def testDelete(self):
        fobj = io.BytesIO(b"abcabcabc")
        dst_blob_name = infra.generate_test_key()

        self.handle.upload_file_handle(self.test_bucket, dst_blob_name, fobj)

        # should be able to get metadata for the file.
        self.handle.get_user_metadata(self.test_bucket, dst_blob_name)

        self.handle.delete(self.test_bucket, dst_blob_name)

        with self.assertRaises(BlobNotFoundError):
            self.handle.get_user_metadata(self.test_bucket, dst_blob_name)
Example 12
    def testUploadFileHandle(self):
        fobj = io.BytesIO(b"abcabcabc")
        dst_blob_name = infra.generate_test_key()

        self.handle.upload_file_handle(
            self.test_bucket,
            dst_blob_name,
            fobj
        )

        # should be able to get metadata for the file.
        self.handle.get_user_metadata(
            self.test_bucket, dst_blob_name)
Example 13
    def _test_file_put_cached(self, replica: Replica, scheme: str,
                              test_bucket: str, test_checkout_bucket: str,
                              uploader: Uploader):
        stored_cache_criteria = os.environ.get('CHECKOUT_CACHE_CRITERIA')
        try:
            os.environ[
                'CHECKOUT_CACHE_CRITERIA'] = '[{"type":"application/json","max_size":12314}]'
            handle = Config.get_blobstore_handle(replica)
            src_key = generate_test_key()
            src_data = b'{"status":"valid"}'
            source_url = f"{scheme}://{test_bucket}/{src_key}"
            file_uuid = str(uuid.uuid4())
            bundle_uuid = str(uuid.uuid4())
            version = datetime_to_version_format(datetime.datetime.utcnow())

            # write dummy file and upload to upload area
            with tempfile.NamedTemporaryFile(delete=True) as fh:
                fh.write(src_data)
                fh.flush()

                uploader.checksum_and_upload_file(fh.name, src_key,
                                                  "application/json")

            # upload file to DSS
            self.upload_file(source_url,
                             file_uuid,
                             bundle_uuid=bundle_uuid,
                             version=version)

            metadata = handle.get_user_metadata(test_bucket, src_key)
            dst_key = ("blobs/" + ".".join([
                metadata['hca-dss-sha256'], metadata['hca-dss-sha1'],
                metadata['hca-dss-s3_etag'], metadata['hca-dss-crc32c']
            ])).lower()

            for wait_to_upload_into_checkout_bucket in range(30):
                try:
                    # get uploaded blob key from the checkout bucket
                    file_metadata = json.loads(
                        handle.get(test_checkout_bucket,
                                   dst_key).decode("utf-8"))
                    break
                except BlobNotFoundError:
                    time.sleep(1)
            else:
                file_metadata = json.loads(
                    handle.get(test_checkout_bucket, dst_key).decode("utf-8"))
            assert file_metadata[
                "status"] == "valid"  # the file exists in the checkout bucket
        finally:
            # restore the previous value; if the variable was unset, remove it
            # rather than assigning None (which os.environ rejects).
            if stored_cache_criteria is None:
                os.environ.pop('CHECKOUT_CACHE_CRITERIA', None)
            else:
                os.environ['CHECKOUT_CACHE_CRITERIA'] = stored_cache_criteria
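
The save-and-restore dance around CHECKOUT_CACHE_CRITERIA can also be written with unittest.mock.patch.dict, which restores os.environ automatically (including the unset case). A sketch of the equivalent scoping, not the suite's actual code:

# Sketch: scoped environment override equivalent to the manual try/finally above.
import os
from unittest import mock

CACHE_CRITERIA = '[{"type":"application/json","max_size":12314}]'

with mock.patch.dict(os.environ, {'CHECKOUT_CACHE_CRITERIA': CACHE_CRITERIA}):
    ...  # test body runs with the override; os.environ is restored on exit
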
Example 14
    def setUp(self, rounds=3):
        Config.set_config(BucketConfig.TEST)

        self.test_bucket = infra.get_env("DSS_GS_BUCKET_TEST")
        self.gs_blobstore = Config.get_blobstore_handle(Replica.gcp)
        test_src_keys = [infra.generate_test_key() for _ in range(rounds)]
        final_key = infra.generate_test_key()

        bucket_obj = self.gs_blobstore.gcp_client.bucket(self.test_bucket)

        self.gs_blobstore.upload_file_handle(
            self.test_bucket, test_src_keys[0],
            io.BytesIO(os.urandom(1024 * 1024)))

        for ix in range(len(test_src_keys) - 1):
            src_blob_obj = bucket_obj.get_blob(test_src_keys[ix])
            blobs = [src_blob_obj for _ in range(16)]
            dst_blob_obj = bucket_obj.blob(test_src_keys[ix + 1])

            dst_blob_obj.content_type = "application/octet-stream"
            dst_blob_obj.compose(blobs)

        # set the storage class to nearline.
        # NOTE: compose(…) does not seem to support setting a storage class.  The canonical way of changing storage
        # class is to call update_storage_class(…), but Google's libraries do not seem to handle
        # update_storage_class(…) calls for large objects.
        final_blob_obj = bucket_obj.blob(final_key)
        final_blob_obj.storage_class = "NEARLINE"
        final_blob_src = bucket_obj.get_blob(test_src_keys[-1])
        token = None
        while True:
            result = final_blob_obj.rewrite(final_blob_src, token=token)
            if result[0] is None:
                # done!
                break
            token = result[0]

        self.src_key = final_key
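
The loop at the end of setUp is the standard rewrite-until-done pattern in google-cloud-storage: Blob.rewrite returns a continuation token until the server-side rewrite completes. Extracted as a small reusable sketch:

# Sketch of the rewrite loop used above.
def rewrite_until_done(dst_blob, src_blob):
    token = None
    while True:
        token, bytes_rewritten, total_bytes = dst_blob.rewrite(src_blob, token=token)
        if token is None:
            return total_bytes
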
Example 15
    def _test_put_auth_errors(self, scheme, test_bucket):
        src_key = generate_test_key()
        source_url = f"{scheme}://{test_bucket}/{src_key}"

        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        timestamp = datetime.datetime.utcnow()
        version = timestamp.strftime("%Y-%m-%dT%H%M%S.%fZ")

        urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
        urlbuilder.add_query("version", version)
        self._test_auth_errors('put',
                               str(urlbuilder),
                               json_request_body=dict(bundle_uuid=bundle_uuid,
                                                      creator_uid=0,
                                                      source_url=source_url))
Example 16
    def upload_file(app, contents, replica):
        src_key = generate_test_key()
        encoded = json.dumps(contents).encode()
        chunk_size = get_s3_multipart_chunk_size(len(encoded))
        with io.BytesIO(encoded) as fh, ChecksummingSink(
                write_chunk_size=chunk_size) as sink:
            sink.write(fh.read())
            sums = sink.get_checksums()
            metadata = {
                'hca-dss-crc32c': sums['crc32c'].lower(),
                'hca-dss-s3_etag': sums['s3_etag'].lower(),
                'hca-dss-sha1': sums['sha1'].lower(),
                'hca-dss-sha256': sums['sha256'].lower()
            }
            fh.seek(0)

            if replica == 'gcp':
                gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")
                gcp_client = gs_storage.Client.from_service_account_json(
                    os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))
                gs_bucket = gcp_client.bucket(gs_test_bucket)
                blob = gs_bucket.blob(src_key)
                blob.upload_from_file(fh, content_type="application/json")
                blob.metadata = metadata
                blob.patch()
                source_url = f"gs://{gs_test_bucket}/{src_key}"

            elif replica == 'aws':
                # TODO: consider switching to unmanaged uploader (putobject w/blob)
                s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
                s3 = boto3.resource('s3')
                s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(
                    fh, ExtraArgs={"Metadata": metadata})
                source_url = f"s3://{s3_test_bucket}/{src_key}"
            else:
                raise ValueError(f"unsupported replica: {replica}")

        file_uuid = str(uuid4())
        version = datetime_to_version_format(datetime.utcnow())
        urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
        urlbuilder.add_query("version", version)

        resp_obj = app.put(str(urlbuilder),
                           json=dict(creator_uid=0, source_url=source_url),
                           headers=get_auth_header())
        resp_obj.raise_for_status()
        return file_uuid, resp_obj.json()["version"]
Example 17
    def _test_file_put_large(self, src_data: bytes) -> None:
        replicas: typing.Sequence[typing.Tuple[Replica, typing.Type[Uploader],
                                               str]] = [
                                                   (Replica.aws, S3Uploader,
                                                    self.s3_test_bucket),
                                                   (Replica.gcp, GSUploader,
                                                    self.gs_test_bucket)
                                               ]
        src_key = generate_test_key()
        for replica, uploader_class, bucket in replicas:
            self._upload_file_to_mock_ingest(uploader_class, bucket, src_key,
                                             src_data)

            expect_async_results: typing.Tuple[typing.Optional[bool], ...]
            if len(src_data) > ASYNC_COPY_THRESHOLD:
                if replica == Replica.aws:
                    # We should be able to do this twice (i.e., same payload, different UUIDs).
                    # First time should be asynchronous since it's new data.  Second time should be
                    # synchronous since the data is present, but because S3 does not make
                    # consistency guarantees, a second client might not see that the data is already
                    # there.  Therefore, we do not mandate that it is done synchronously.
                    expect_async_results = (True, None)
                else:
                    # We should be able to do this twice (i.e., same payload, different UUIDs).
                    # First time should be asynchronous since it's new data.  Second time should be
                    # synchronous since the data is present.
                    expect_async_results = (True, False)
            else:
                # We should be able to do this twice (i.e., same payload, different UUIDs).  Neither
                # time should be asynchronous.
                expect_async_results = (False, False)

            for ix, expect_async in enumerate(expect_async_results):
                with self.subTest(
                        f"replica: {replica.name} size: {len(src_data)} round: {ix}"
                ):
                    resp_obj = self.upload_file_wait(
                        f"{replica.storage_schema}://{bucket}/{src_key}",
                        replica,
                        expect_async=expect_async)
                    self.assertHeaders(resp_obj.response, {
                        'content-type': "application/json",
                    })
                    self.assertIn('version', resp_obj.json)
Example 18
    def testCopyTokenMatching(self):
        cloud_checksum = self.handle.get_cloud_checksum(
            self.test_fixtures_bucket, "test_good_source_data/0")
        copy_token = self.handle.get_copy_token(self.test_fixtures_bucket,
                                                "test_good_source_data/0",
                                                cloud_checksum)

        dst_blob_name = infra.generate_test_key()

        self.handle.copy(
            self.test_fixtures_bucket,
            "test_good_source_data/0",
            self.test_bucket,
            dst_blob_name,
            copy_token,
        )

        # should be able to get metadata for the file.
        self.handle.get_user_metadata(self.test_bucket, dst_blob_name)
Example 19
    def test_file_put_large_incorrect_s3_etag(self) -> None:
        bucket = self.s3_test_bucket
        src_key = generate_test_key()
        src_data = os.urandom(ASYNC_COPY_THRESHOLD + 1)

        # upload file with incompatible s3 part size
        self._upload_file_to_mock_ingest(S3Uploader,
                                         bucket,
                                         src_key,
                                         src_data,
                                         s3_part_size=6 * 1024 * 1024)

        file_uuid = str(uuid.uuid4())
        timestamp = datetime.datetime.utcnow()
        file_version = datetime_to_version_format(timestamp)
        url = UrlBuilder().set(path=f"/v1/files/{file_uuid}")
        url.add_query("version", file_version)
        source_url = f"s3://{bucket}/{src_key}"

        # put file into DSS, starting an async copy which will fail
        expected_codes = requests.codes.accepted,
        self.assertPutResponse(str(url),
                               expected_codes,
                               json_request_body=dict(
                                   file_uuid=file_uuid,
                                   creator_uid=0,
                                   source_url=source_url,
                               ),
                               headers=get_auth_header())

        # should eventually get unprocessable after async copy fails
        @eventually(120, 1)
        def tryHead():
            self.assertHeadResponse(
                f"/v1/files/{file_uuid}?replica=aws&version={file_version}",
                requests.codes.unprocessable)

        tryHead()

        # should get unprocessable on GCP too
        self.assertHeadResponse(
            f"/v1/files/{file_uuid}?replica=gcp&version={file_version}",
            requests.codes.unprocessable)
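
The test above works because an S3 multipart ETag depends on the part size: it is commonly derived as the md5 of the concatenated per-part md5 digests, suffixed with the part count (observed S3 behavior, not an official contract). Uploading with an incompatible 6 MiB part size therefore yields an s3_etag that the later copy cannot reproduce. A sketch of that derivation:

# Sketch: the commonly observed S3 multipart ETag derivation.
import hashlib

def s3_multipart_etag(data: bytes, part_size: int) -> str:
    parts = [data[i:i + part_size] for i in range(0, len(data), part_size)]
    if len(parts) <= 1:
        return hashlib.md5(data).hexdigest()
    combined = b"".join(hashlib.md5(p).digest() for p in parts)
    return f"{hashlib.md5(combined).hexdigest()}-{len(parts)}"
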
Example 20
    def _test_file_put_large(self, replica: Replica, test_bucket: str, upload_func: typing.Callable[[str, str], None]):
        src_key = generate_test_key()
        upload_func(test_bucket, src_key)

        # We should be able to do this twice (i.e., same payload, different UUIDs).  First time should be asynchronous
        # since it's new data.  Second time should be synchronous since the data is present, but because S3 does not
        # make consistency guarantees, a second client might not see that the data is already there.  Therefore, we do
        # not mandate that it is done synchronously.
        for expect_async in [True, None]:
            resp_obj = self.upload_file_wait(
                f"{replica.storage_schema}://{test_bucket}/{src_key}",
                replica,
                expect_async=expect_async)
            self.assertHeaders(
                resp_obj.response,
                {
                    'content-type': "application/json",
                }
            )
            self.assertIn('version', resp_obj.json)
Example 21
    def test_simple_copy(self):
        dest_key = infra.generate_test_key()

        state = copy_sfn_event(self.test_bucket, self.src_key,
                               self.test_bucket, dest_key)
        execution_id = str(uuid.uuid4())
        stepfunctions.step_functions_invoke("dss-gs-copy-sfn-{stage}",
                                            execution_id, state)

        # verify that the destination has the same checksum.
        src_checksum = self.gs_blobstore.get_cloud_checksum(
            self.test_bucket, self.src_key)

        @eventually(30.0, 1.0, {BlobNotFoundError, AssertionError})
        def test_output():
            dst_checksum = self.gs_blobstore.get_cloud_checksum(
                self.test_bucket, dest_key)
            self.assertEqual(src_checksum, dst_checksum)

        test_output()
Example 22
    def _test_file_size(self, replica: Replica, scheme: str, test_bucket: str, uploader: Uploader):
        src_key = generate_test_key()
        src_size = 1024 + int.from_bytes(os.urandom(1), byteorder='little')
        src_data = os.urandom(src_size)
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()

            uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

        source_url = f"{scheme}://{test_bucket}/{src_key}"

        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%S.%fZ")

        self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)

        url = str(UrlBuilder()
                  .set(path="/v1/files/" + file_uuid)
                  .add_query("replica", replica.name))

        for i in range(FILE_GET_RETRY_COUNT):
            with override_bucket_config(BucketConfig.TEST):
                resp_obj = self.assertGetResponse(
                    url,
                    [requests.codes.found, requests.codes.moved]
                )
                if resp_obj.response.status_code == requests.codes.found:
                    url = resp_obj.response.headers['Location']
                    data = requests.get(url)
                    self.assertEqual(len(data.content), src_size)
                    self.assertEqual(resp_obj.response.headers['X-DSS-SIZE'], str(src_size))
                    return
                elif resp_obj.response.status_code == requests.codes.moved:
                    retryAfter = int(resp_obj.response.headers['Retry-After'])
                    self.assertEqual(retryAfter, RETRY_AFTER_INTERVAL)
                    self.assertIn(url, resp_obj.response.headers['Location'])
        self.fail(f"Failed after {FILE_GET_RETRY_COUNT} retries.")
Example 23
    def test_file_put_bad_checksum(self):
        src_key = generate_test_key()
        uploader = S3Uploader(tempfile.gettempdir(), self.s3_test_bucket)
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(os.urandom(1024))
            fh.flush()
            uploader.upload_file(fh.name,
                                 src_key,
                                 'text/plain',
                                 metadata_keys=self.bad_checksums)

        source_url = f's3://{self.s3_test_bucket}/{src_key}'
        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime_to_version_format(datetime.datetime.utcnow())
        # catch AssertionError raised when upload returns 422 instead of 201
        with self.assertRaises(AssertionError):
            r = self.upload_file(source_url,
                                 file_uuid,
                                 bundle_uuid=bundle_uuid,
                                 version=version)
            self.assertEqual(r.json['code'], 'invalid_checksum')
Example 24
    def _test_file_size(self, replica: Replica, scheme: str, test_bucket: str,
                        uploader: Uploader):
        src_key = generate_test_key()
        src_size = 1024 + int.from_bytes(os.urandom(1), byteorder='little')
        src_data = os.urandom(src_size)
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()

            uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

        source_url = f"{scheme}://{test_bucket}/{src_key}"

        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%S.%fZ")

        self.upload_file(source_url,
                         file_uuid,
                         bundle_uuid=bundle_uuid,
                         version=version)

        url = str(UrlBuilder().set(path="/v1/files/" + file_uuid).add_query(
            "replica", replica.name))

        with override_bucket_config(BucketConfig.TEST):
            resp_obj = self.assertGetResponse(
                url,
                requests.codes.found,
                headers=get_auth_header(),
                redirect_follow_retries=FILE_GET_RETRY_COUNT,
                min_retry_interval_header=RETRY_AFTER_INTERVAL,
                override_retry_interval=1,
            )
            url = resp_obj.response.headers['Location']
            data = requests.get(url)
            self.assertEqual(len(data.content), src_size)
            self.assertEqual(resp_obj.response.headers['X-DSS-SIZE'],
                             str(src_size))
Example 25
    def _test_file_put(self, replica: Replica, scheme: str, test_bucket: str,
                       uploader: Uploader):
        src_key = generate_test_key()
        src_data = os.urandom(1024)
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()

            uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

        source_url = f"{scheme}://{test_bucket}/{src_key}"

        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime_to_version_format(datetime.datetime.utcnow())

        self._test_put_auth_errors(scheme, test_bucket)

        with self.subTest(
                f"{replica}: Created returned when uploading a file with a unique payload, and FQID"
        ):
            self.upload_file(source_url,
                             file_uuid,
                             bundle_uuid=bundle_uuid,
                             version=version)

        with self.subTest(
                f"{replica}: Created returned when uploading a file with same payload, and different FQID"
        ):
            self.upload_file(source_url, str(uuid.uuid4()))

        with self.subTest(
                f"{replica}: OK returned when uploading a file with the same payload, UUID,  version"
        ):
            self.upload_file(source_url,
                             file_uuid,
                             bundle_uuid=bundle_uuid,
                             version=version,
                             expected_code=requests.codes.ok)

        with self.subTest(
                f"{replica}: Conflict returned when uploading a file with a different payload and same FQID"
        ):
            src_key_temp = generate_test_key()
            src_data_temp = os.urandom(128)
            with tempfile.NamedTemporaryFile(delete=True) as fh:
                fh.write(src_data_temp)
                fh.flush()

                uploader.checksum_and_upload_file(fh.name, src_key_temp,
                                                  "text/plain")

            source_url_temp = f"{scheme}://{test_bucket}/{src_key_temp}"
            self.upload_file(source_url_temp,
                             file_uuid,
                             version=version,
                             expected_code=requests.codes.conflict)

        with self.subTest(
                f"{replica}: Bad returned when uploading a file with an invalid version"
        ):
            self.upload_file(source_url,
                             file_uuid,
                             version='',
                             expected_code=requests.codes.bad_request)

        invalid_version = 'ABCD'
        with self.subTest(
                f"{replica}: bad_request returned "
                f"when uploading a file with invalid version {invalid_version}"
        ):
            self.upload_file(source_url,
                             file_uuid,
                             version=invalid_version,
                             expected_code=requests.codes.bad_request)

        with self.subTest(
                f"{replica}: Bad returned when uploading a file without a version"
        ):
            self.upload_file(source_url,
                             file_uuid,
                             version='missing',
                             expected_code=requests.codes.bad_request)

        invalid_uuids = ['ABCD', '1234']
        for invalid_uuid in invalid_uuids:
            with self.subTest(
                    f"{replica}: Bad returned "
                    f"when uploading a file with invalid UUID {invalid_uuid}"):
                self.upload_file(source_url,
                                 invalid_uuid,
                                 expected_code=requests.codes.bad_request)

        with self.subTest(
                f"{replica}: forbidden returned "
                "when uploading a file without a UUID"):
            self.upload_file(source_url,
                             '',
                             expected_code=requests.codes.forbidden)
Example 26
    def _test_file_get_checkout(self, replica: Replica, scheme: str,
                                test_bucket: str, uploader: Uploader):
        handle = Config.get_blobstore_handle(replica)
        src_key = generate_test_key()
        src_data = os.urandom(1024)
        source_url = f"{scheme}://{test_bucket}/{src_key}"
        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime_to_version_format(datetime.datetime.utcnow())

        # write dummy file and upload to upload area
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()

            uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

        # upload file to DSS
        self.upload_file(source_url,
                         file_uuid,
                         bundle_uuid=bundle_uuid,
                         version=version)
        url = str(UrlBuilder().set(path="/v1/files/" + file_uuid).add_query(
            "replica", replica.name).add_query("version", version))

        # get uploaded blob key
        file_metadata = json.loads(
            handle.get(test_bucket,
                       f"files/{file_uuid}.{version}").decode("utf-8"))
        file_key = compose_blob_key(file_metadata)

        @eventually(20, 1)
        def test_checkout():
            # assert 302 and verify checksum on checkout completion
            api_get = self.assertGetResponse(url,
                                             requests.codes.found,
                                             headers=get_auth_header(),
                                             redirect_follow_retries=0)
            file_get = requests.get(api_get.response.headers['Location'])
            self.assertTrue(file_get.ok)
            self.assertEqual(file_get.content, src_data)

        with self.subTest(
                f"{replica}: Initiates checkout and returns 301 for GET on 'uncheckedout' file."
        ):
            # assert 301 redirect on first GET
            self.assertGetResponse(url,
                                   requests.codes.moved,
                                   headers=get_auth_header(),
                                   redirect_follow_retries=0)
            test_checkout()

        with self.subTest(
                f"{replica}: Initiates checkout and returns 301 for GET on nearly expired checkout file."
        ):
            now = datetime.datetime.now(datetime.timezone.utc)
            creation_date_fn = (
                "cloud_blobstore.s3.S3BlobStore.get_creation_date"
                if replica.name == "aws" else
                "cloud_blobstore.gs.GSBlobStore.get_creation_date")
            with mock.patch(creation_date_fn) as mock_creation_date:
                blob_ttl_days = int(os.environ['DSS_BLOB_TTL_DAYS'])
                mock_creation_date.return_value = now - datetime.timedelta(
                    days=blob_ttl_days, hours=1, minutes=5)
                self.assertGetResponse(url,
                                       requests.codes.moved,
                                       headers=get_auth_header(),
                                       redirect_follow_retries=0)
            test_checkout()

        with self.subTest(
                f"{replica}: Initiates checkout and returns 302 immediately for GET on stale checkout file."
        ):
            now = datetime.datetime.now(datetime.timezone.utc)
            creation_date = handle.get_creation_date(replica.checkout_bucket,
                                                     file_key)
            creation_date_fn = (
                "cloud_blobstore.s3.S3BlobStore.get_creation_date"
                if replica.name == "aws" else
                "cloud_blobstore.gs.GSBlobStore.get_creation_date")
            with mock.patch(creation_date_fn) as mock_creation_date:
                # assert 302 found on stale file and that last modified refreshes
                blob_ttl_days = int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS'])
                mock_creation_date.return_value = now - datetime.timedelta(
                    days=blob_ttl_days + 1)
                self.assertGetResponse(url,
                                       requests.codes.found,
                                       headers=get_auth_header(),
                                       redirect_follow_retries=0)
                self.assertTrue(
                    creation_date > handle.get_creation_date(
                        replica.checkout_bucket, file_key),
                    f'\ncurr_creation_date: {creation_date}'
                    f'\nprev_creation_date: {handle.get_creation_date(replica.checkout_bucket, file_key)}'
                )

        handle.delete(test_bucket, f"files/{file_uuid}.{version}")
        handle.delete(replica.checkout_bucket, file_key)