def test_multipart_upload(self):
    name = 'very_large_file.bin'
    path = pathlib.Path(name)

    size = 30 * 1024 * 1024
    chunksize = 8 * 1024 * 1024
    chunks = -(-size // chunksize)

    # Create an empty 30MB file; shouldn't take up any actual space on any reasonable filesystem.
    with open(path, 'wb') as fd:
        fd.seek(size - 1)
        fd.write(b'!')

    self.s3_stubber.add_client_error(
        method='head_object',
        http_status_code=404,
        expected_params={
            'Bucket': 'example',
            'Key': name,
        }
    )

    self.s3_stubber.add_response(
        method='create_multipart_upload',
        service_response={
            'UploadId': '123'
        },
        expected_params={
            'Bucket': 'example',
            'Key': name,
        }
    )

    for part_num in range(1, chunks + 1):
        self.s3_stubber.add_response(
            method='upload_part',
            service_response={
                'ETag': 'etag%d' % part_num
            },
            expected_params={
                'Bucket': 'example',
                'Key': name,
                'UploadId': '123',
                'Body': ANY,
                'PartNumber': part_num
            }
        )

    self.s3_stubber.add_response(
        method='complete_multipart_upload',
        service_response={},
        expected_params={
            'Bucket': 'example',
            'Key': name,
            'UploadId': '123',
            'MultipartUpload': {
                'Parts': [{
                    'ETag': 'etag%d' % i,
                    'PartNumber': i
                } for i in range(1, chunks + 1)]
            }
        }
    )

    with mock.patch('quilt3.data_transfer.MAX_CONCURRENCY', 1):
        data_transfer.copy_file_list([
            (PhysicalKey.from_path(path), PhysicalKey.from_url(f's3://example/{name}'), path.stat().st_size),
        ])

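# Aside (not part of the original suite): the part count above uses the
# -(-a // b) ceiling-division idiom, which rounds up without math.ceil.
# A quick, self-contained check of that arithmetic:
def test_ceiling_division_idiom(self):
    chunksize = 8 * 1024 * 1024
    assert -(-(30 * 1024 * 1024) // chunksize) == 4  # partial last chunk rounds up
    assert -(-(32 * 1024 * 1024) // chunksize) == 4  # exact multiple adds no extra chunk
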
def _test_download(self, *, threshold, chunksize, parts=data, devnull=False):
    dst = PhysicalKey(None, os.devnull, None) if devnull else self.dst

    with self.s3_test_multi_thread_download(
        self.bucket, self.key, parts, threshold=threshold, chunksize=chunksize
    ):
        data_transfer.copy_file_list([(self.src, dst, self.size)])

    if not devnull:
        with open(self.filename, 'rb') as f:
            assert f.read() == self.data

def test_multipart_upload(self):
    path = DATA_DIR / 'large_file.npy'

    self.s3_stubber.add_client_error(
        method='head_object',
        http_status_code=404,
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    self.s3_stubber.add_response(
        method='create_multipart_upload',
        service_response={
            'UploadId': '123'
        },
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
            'Metadata': {
                'helium': '{}'
            }
        }
    )

    with open(path, 'rb') as fd:
        for part_num in range(1, 6):
            self.s3_stubber.add_response(
                method='upload_part',
                service_response={
                    'ETag': 'etag%d' % part_num
                },
                expected_params={
                    'Bucket': 'example',
                    'Key': 'large_file.npy',
                    'UploadId': '123',
                    'Body': fd.read(2048),
                    'PartNumber': part_num
                }
            )

    self.s3_stubber.add_response(
        method='complete_multipart_upload',
        service_response={},
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
            'UploadId': '123',
            'MultipartUpload': {
                'Parts': [{
                    'ETag': 'etag%d' % i,
                    'PartNumber': i
                } for i in range(1, 6)]
            }
        }
    )

    with mock.patch.object(data_transfer.s3_transfer_config, 'multipart_threshold', 4096), \
         mock.patch.object(data_transfer.s3_transfer_config, 'multipart_chunksize', 2048), \
         mock.patch('quilt3.data_transfer.s3_threads', 1):
        data_transfer.copy_file_list([
            (path.as_uri(), 's3://example/large_file.npy', path.stat().st_size, None),
        ])

def test_copy_file_list_retry(self):
    bucket = 'test-bucket'
    other_bucket = f'{bucket}-other'
    key = 'dir/a'
    vid = None

    src = PhysicalKey(bucket, key, vid)
    dst = PhysicalKey(other_bucket, key, vid)

    with mock.patch(
        'botocore.client.BaseClient._make_api_call',
        side_effect=ClientError({}, 'CopyObject')
    ) as mocked_api_call:
        with pytest.raises(ClientError):
            data_transfer.copy_file_list([(src, dst, 1)])
        self.assertEqual(mocked_api_call.call_count, data_transfer.MAX_COPY_FILE_LIST_RETRIES)

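# For reference, the shape of the retry loop this test exercises is roughly
# the following (illustrative pseudocode; names other than
# MAX_COPY_FILE_LIST_RETRIES are assumed, not the actual quilt3 internals):
#
#     for attempt in range(1, MAX_COPY_FILE_LIST_RETRIES + 1):
#         try:
#             return client.copy_object(**params)
#         except ClientError:
#             if attempt == MAX_COPY_FILE_LIST_RETRIES:
#                 raise
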
def test_upload_large_file_etag_mismatch(self):
    path = DATA_DIR / 'large_file.npy'

    self.s3_stubber.add_response(
        method='head_object',
        service_response={
            'ContentLength': path.stat().st_size,
            'ETag': '"123"',
            'VersionId': 'v1',
        },
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    self.s3_stubber.add_response(
        method='put_object',
        service_response={
            'VersionId': 'v2'
        },
        expected_params={
            'Body': ANY,
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    urls = data_transfer.copy_file_list([
        (PhysicalKey.from_path(path), PhysicalKey.from_url('s3://example/large_file.npy'), path.stat().st_size),
    ])
    assert urls[0] == PhysicalKey.from_url('s3://example/large_file.npy?versionId=v2')

def test_upload_large_file(self):
    path = DATA_DIR / 'large_file.npy'

    self.s3_stubber.add_client_error(
        method='head_object',
        http_status_code=404,
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    self.s3_stubber.add_response(
        method='put_object',
        service_response={
            'VersionId': 'v1'
        },
        expected_params={
            'Body': ANY,
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    urls = data_transfer.copy_file_list([
        (PhysicalKey.from_path(path), PhysicalKey.from_url('s3://example/large_file.npy'), path.stat().st_size),
    ])
    assert urls[0] == PhysicalKey.from_url('s3://example/large_file.npy?versionId=v1')

def test_upload_large_file_etag_match_metadata_match(self):
    path = DATA_DIR / 'large_file.npy'
    etag = data_transfer._calculate_etag(path)

    self.s3_stubber.add_response(
        method='head_object',
        service_response={
            'ContentLength': path.stat().st_size,
            'ETag': etag,
            'VersionId': 'v1',
            'Metadata': {
                'helium': '{"foo": "bar"}'
            }
        },
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    urls = data_transfer.copy_file_list([
        (path.as_uri(), 's3://example/large_file.npy', path.stat().st_size, {'foo': 'bar'}),
    ])
    assert urls[0] == 's3://example/large_file.npy?versionId=v1'

def test_upload_large_file_etag_mismatch(self):
    path = DATA_DIR / 'large_file.npy'

    self.s3_stubber.add_response(
        method='head_object',
        service_response={
            'ContentLength': path.stat().st_size,
            'ETag': '"123"',
            'VersionId': 'v1',
            'Metadata': {}
        },
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    self.s3_stubber.add_response(
        method='put_object',
        service_response={
            'VersionId': 'v2'
        },
        expected_params={
            'Body': ANY,
            'Bucket': 'example',
            'Key': 'large_file.npy',
            'Metadata': {
                'helium': '{}'
            }
        }
    )

    urls = data_transfer.copy_file_list([
        (path.as_uri(), 's3://example/large_file.npy', path.stat().st_size, None),
    ])
    assert urls[0] == 's3://example/large_file.npy?versionId=v2'

def test_upload_large_file(self):
    path = DATA_DIR / 'large_file.npy'

    self.s3_stubber.add_client_error(
        method='head_object',
        http_status_code=404,
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    self.s3_stubber.add_response(
        method='put_object',
        service_response={
            'VersionId': 'v1'
        },
        expected_params={
            'Body': ANY,
            'Bucket': 'example',
            'Key': 'large_file.npy',
            'Metadata': {
                'helium': '{}'
            }
        }
    )

    urls = data_transfer.copy_file_list([
        (path.as_uri(), 's3://example/large_file.npy', path.stat().st_size, None),
    ])
    assert urls[0] == 's3://example/large_file.npy?versionId=v1'

def test_multi_upload(self):
    path1 = DATA_DIR / 'small_file.csv'
    path2 = DATA_DIR / 'dir/foo.txt'

    # Unversioned bucket
    self.s3_stubber.add_response(
        method='put_object',
        service_response={
            'VersionId': 'null'
        },
        expected_params={
            'Body': ANY,
            'Bucket': 'example1',
            'Key': 'foo.csv',
        }
    )

    # Versioned bucket
    self.s3_stubber.add_response(
        method='put_object',
        service_response={
            'VersionId': 'v123'
        },
        expected_params={
            'Body': ANY,
            'Bucket': 'example2',
            'Key': 'foo.txt',
        }
    )

    # stubber expects responses in order, so disable multi-threading.
    with mock.patch('quilt3.data_transfer.s3_transfer_config.max_request_concurrency', 1):
        urls = data_transfer.copy_file_list([
            (path1.as_uri(), 's3://example1/foo.csv', path1.stat().st_size),
            (path2.as_uri(), 's3://example2/foo.txt', path2.stat().st_size),
        ])

        assert urls[0] == 's3://example1/foo.csv'
        assert urls[1] == 's3://example2/foo.txt?versionId=v123'

def test_copy_file_list_retry_non_client_error(self):
    """
    copy_file_list() does not retry on arbitrary (non-ClientError) exceptions.
    """
    bucket = 'test-bucket'
    other_bucket = f'{bucket}-other'
    key = 'dir/a'
    vid = None

    src = PhysicalKey(bucket, key, vid)
    dst = PhysicalKey(other_bucket, key, vid)

    with mock.patch(
        'botocore.client.BaseClient._make_api_call',
        side_effect=Exception('test exception')
    ) as mocked_api_call:
        with pytest.raises(Exception, match='test exception'):
            data_transfer.copy_file_list([(src, dst, 1)])
        assert mocked_api_call.call_count == 1

def _test_download(self, *, threshold, chunksize, parts=None, devnull=False):
    num_parts = 1 if parts is None else len(parts)
    barrier = threading.Barrier(num_parts, timeout=2)

    def side_effect(*args, **kwargs):
        barrier.wait()  # This ensures that we have concurrent calls to get_object().
        return {
            'VersionId': 'v1',
            'Body': io.BytesIO(self.data if parts is None else parts[kwargs['Range']]),
        }

    dst = PhysicalKey(None, os.devnull, None) if devnull else self.dst

    with mock.patch('quilt3.data_transfer.s3_transfer_config.max_request_concurrency', num_parts), \
         mock.patch('quilt3.data_transfer.s3_transfer_config.multipart_threshold', threshold), \
         mock.patch('quilt3.data_transfer.s3_transfer_config.multipart_chunksize', chunksize), \
         mock.patch.object(self.s3_client, 'get_object', side_effect=side_effect) as get_object_mock:
        data_transfer.copy_file_list([(self.src, dst, self.size)])

    expected_params = {
        'Bucket': self.bucket,
        'Key': self.key,
    }
    if parts is None:
        get_object_mock.assert_called_once_with(**expected_params)
    else:
        get_object_mock.assert_has_calls(
            [mock.call(**expected_params, Range=r) for r in parts],
            any_order=True,
        )
        assert len(get_object_mock.call_args_list) == num_parts

    if not devnull:
        with open(self.filename, 'rb') as f:
            assert f.read() == self.data

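# The Barrier above doubles as a concurrency assertion: barrier.wait() only
# returns once num_parts threads are waiting, so serial get_object() calls
# would hit the 2-second timeout and raise BrokenBarrierError. A standalone
# sketch of the same trick (test name and body are illustrative, not from
# the original suite):
def test_barrier_proves_concurrency(self):
    barrier = threading.Barrier(2, timeout=2)
    seen = []

    def worker():
        barrier.wait()  # releases only when both workers have arrived
        seen.append(threading.get_ident())

    workers = [threading.Thread(target=worker) for _ in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    assert len(seen) == 2
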
def test_copy_file_list_multipart_retry(self):
    bucket = 'test-bucket'
    other_bucket = f'{bucket}-other'
    key = 'dir/a'
    vid = None

    src = PhysicalKey(bucket, key, vid)
    dst = PhysicalKey(other_bucket, key, vid)

    parts = 2 * data_transfer.s3_transfer_config.max_request_concurrency
    size = parts * data_transfer.s3_transfer_config.multipart_threshold

    def side_effect(operation_name, *args, **kwargs):
        if operation_name == 'CreateMultipartUpload':
            return {'UploadId': '123'}
        time.sleep(0.1)
        raise ClientError({}, 'CopyObject')

    with mock.patch('botocore.client.BaseClient._make_api_call', side_effect=side_effect):
        with pytest.raises(ClientError):
            data_transfer.copy_file_list([(src, dst, size)])

def test_multi_upload(self):
    path1 = DATA_DIR / 'small_file.csv'
    path2 = DATA_DIR / 'dir/foo.txt'

    # Unversioned bucket
    self.s3_stubber.add_response(
        method='put_object',
        service_response={},
        expected_params={
            'Body': ANY,
            'Bucket': 'example1',
            'Key': 'foo.csv',
        }
    )

    # Versioned bucket
    self.s3_stubber.add_response(
        method='put_object',
        service_response={
            'VersionId': 'v123'
        },
        expected_params={
            'Body': ANY,
            'Bucket': 'example2',
            'Key': 'foo.txt',
        }
    )

    # stubber expects responses in order, so disable multi-threading.
    with mock.patch('quilt3.data_transfer.MAX_CONCURRENCY', 1):
        urls = data_transfer.copy_file_list([
            (PhysicalKey.from_path(path1), PhysicalKey.from_url('s3://example1/foo.csv'), path1.stat().st_size),
            (PhysicalKey.from_path(path2), PhysicalKey.from_url('s3://example2/foo.txt'), path2.stat().st_size),
        ])

        assert urls[0] == PhysicalKey.from_url('s3://example1/foo.csv')
        assert urls[1] == PhysicalKey.from_url('s3://example2/foo.txt?versionId=v123')

def test_multipart_copy(self):
    size = 100 * 1024 * 1024 * 1024

    # size / 8MB would give us 12800 chunks - but the maximum allowed is 10000,
    # so we should end up with 16MB chunks instead.
    chunksize = 8 * 1024 * 1024
    assert size / chunksize > 10000
    chunksize *= 2

    chunks = -(-size // chunksize)
    assert chunks <= 10000

    self.s3_stubber.add_response(
        method='create_multipart_upload',
        service_response={
            'UploadId': '123'
        },
        expected_params={
            'Bucket': 'example2',
            'Key': 'large_file2.npy',
        }
    )

    for part_num in range(1, chunks + 1):
        self.s3_stubber.add_response(
            method='upload_part_copy',
            service_response={
                'CopyPartResult': {
                    'ETag': 'etag%d' % part_num
                }
            },
            expected_params={
                'Bucket': 'example2',
                'Key': 'large_file2.npy',
                'UploadId': '123',
                'PartNumber': part_num,
                'CopySource': {
                    'Bucket': 'example1',
                    'Key': 'large_file1.npy'
                },
                'CopySourceRange': 'bytes=%d-%d' % (
                    (part_num - 1) * chunksize,
                    min(part_num * chunksize, size) - 1
                )
            }
        )

    self.s3_stubber.add_response(
        method='complete_multipart_upload',
        service_response={},
        expected_params={
            'Bucket': 'example2',
            'Key': 'large_file2.npy',
            'UploadId': '123',
            'MultipartUpload': {
                'Parts': [{
                    'ETag': 'etag%d' % i,
                    'PartNumber': i
                } for i in range(1, chunks + 1)]
            }
        }
    )

    with mock.patch('quilt3.data_transfer.s3_transfer_config.max_request_concurrency', 1):
        stderr = io.StringIO()
        with redirect_stderr(stderr), mock.patch('quilt3.data_transfer.DISABLE_TQDM', False):
            data_transfer.copy_file_list([
                (
                    PhysicalKey.from_url('s3://example1/large_file1.npy'),
                    PhysicalKey.from_url('s3://example2/large_file2.npy'),
                    size,
                ),
            ])
        assert stderr.getvalue()

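# S3 caps a multipart upload at 10,000 parts, which is why the test above
# doubles the 8MB chunk size for a 100GB copy. The adjustment being tested
# is, in sketch form (the real implementation may differ):
#
#     chunksize = 8 * 1024 * 1024
#     while -(-size // chunksize) > 10000:
#         chunksize *= 2
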
def test_multipart_copy(self):
    file_size = 5000

    self.s3_stubber.add_response(
        method='head_object',
        service_response={
            'Metadata': {
                'helium': '{"foo": "bar"}'
            }
        },
        expected_params={
            'Bucket': 'example1',
            'Key': 'large_file1.npy',
        }
    )

    self.s3_stubber.add_response(
        method='create_multipart_upload',
        service_response={
            'UploadId': '123'
        },
        expected_params={
            'Bucket': 'example2',
            'Key': 'large_file2.npy',
            'Metadata': {
                'helium': '{"foo": "bar"}'
            }
        }
    )

    for part_num in range(1, 4):
        self.s3_stubber.add_response(
            method='upload_part_copy',
            service_response={
                'CopyPartResult': {
                    'ETag': 'etag%d' % part_num
                }
            },
            expected_params={
                'Bucket': 'example2',
                'Key': 'large_file2.npy',
                'UploadId': '123',
                'PartNumber': part_num,
                'CopySource': {
                    'Bucket': 'example1',
                    'Key': 'large_file1.npy'
                },
                'CopySourceRange': 'bytes=%d-%d' % (
                    (part_num - 1) * 2048,
                    min(part_num * 2048, file_size) - 1
                )
            }
        )

    self.s3_stubber.add_response(
        method='complete_multipart_upload',
        service_response={},
        expected_params={
            'Bucket': 'example2',
            'Key': 'large_file2.npy',
            'UploadId': '123',
            'MultipartUpload': {
                'Parts': [{
                    'ETag': 'etag%d' % i,
                    'PartNumber': i
                } for i in range(1, 4)]
            }
        }
    )

    with mock.patch.object(data_transfer.s3_transfer_config, 'multipart_threshold', 4096), \
         mock.patch.object(data_transfer.s3_transfer_config, 'multipart_chunksize', 2048), \
         mock.patch('quilt3.data_transfer.s3_threads', 1):
        data_transfer.copy_file_list([
            ('s3://example1/large_file1.npy', 's3://example2/large_file2.npy', file_size, None),
        ])