Code example #1
File: test_data_transfer.py Project: zkan/quilt
    def test_multipart_upload(self):
        name = 'very_large_file.bin'
        path = pathlib.Path(name)

        size = 30 * 1024 * 1024
        chunksize = 8 * 1024 * 1024

        chunks = -(-size // chunksize)  # ceiling division: 30 MiB / 8 MiB -> 4 parts

        # Create an empty 30MB file; shouldn't take up any actual space on any reasonable filesystem.
        with open(path, 'wb') as fd:
            fd.seek(size - 1)
            fd.write(b'!')

        self.s3_stubber.add_client_error(method='head_object',
                                         http_status_code=404,
                                         expected_params={
                                             'Bucket': 'example',
                                             'Key': name,
                                         })

        self.s3_stubber.add_response(method='create_multipart_upload',
                                     service_response={'UploadId': '123'},
                                     expected_params={
                                         'Bucket': 'example',
                                         'Key': name,
                                     })

        for part_num in range(1, chunks + 1):
            self.s3_stubber.add_response(
                method='upload_part',
                service_response={'ETag': 'etag%d' % part_num},
                expected_params={
                    'Bucket': 'example',
                    'Key': name,
                    'UploadId': '123',
                    'Body': ANY,
                    'PartNumber': part_num
                })

        self.s3_stubber.add_response(method='complete_multipart_upload',
                                     service_response={},
                                     expected_params={
                                         'Bucket': 'example',
                                         'Key': name,
                                         'UploadId': '123',
                                         'MultipartUpload': {
                                             'Parts': [{
                                                 'ETag': 'etag%d' % i,
                                                 'PartNumber': i
                                             } for i in range(1, chunks + 1)]
                                         }
                                     })

        with mock.patch('quilt3.data_transfer.MAX_CONCURRENCY', 1):
            data_transfer.copy_file_list([
                (PhysicalKey.from_path(path),
                 PhysicalKey.from_url(f's3://example/{name}'),
                 path.stat().st_size),
            ])
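
Note: every test in this listing drives quilt3 against botocore's Stubber, which queues canned responses and verifies the exact request parameters, in FIFO order. For readers unfamiliar with the pattern, here is a minimal standalone sketch (the bucket, key, and body are placeholders, not values from the tests):

    import boto3
    from botocore.stub import ANY, Stubber

    s3 = boto3.client('s3', region_name='us-east-1')
    stubber = Stubber(s3)

    # Queue one canned response; the stubber also asserts that the client
    # is invoked with exactly these parameters.
    stubber.add_response(
        method='put_object',
        service_response={'VersionId': 'v1'},
        expected_params={'Bucket': 'example', 'Key': 'some.key', 'Body': ANY},
    )

    with stubber:
        resp = s3.put_object(Bucket='example', Key='some.key', Body=b'data')
        assert resp['VersionId'] == 'v1'

    stubber.assert_no_pending_responses()

This FIFO matching is also why several tests below pin concurrency to 1: out-of-order requests from worker threads would fail parameter verification.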
Code example #2
    # Note: the parts=data default only resolves if a name called data is
    # visible where this method is defined (e.g. a class attribute evaluated
    # at class-body time); compare the parts=None variant in code example #12.
    def _test_download(self, *, threshold, chunksize, parts=data, devnull=False):
        dst = PhysicalKey(None, os.devnull, None) if devnull else self.dst

        with self.s3_test_multi_thread_download(
            self.bucket, self.key, parts, threshold=threshold, chunksize=chunksize
        ):
            data_transfer.copy_file_list([(self.src, dst, self.size)])

        if not devnull:
            with open(self.filename, 'rb') as f:
                assert f.read() == self.data
Code example #3
File: test_data_transfer.py Project: ellisonbg/quilt
    def test_multipart_upload(self):
        path = DATA_DIR / 'large_file.npy'

        self.s3_stubber.add_client_error(method='head_object',
                                         http_status_code=404,
                                         expected_params={
                                             'Bucket': 'example',
                                             'Key': 'large_file.npy',
                                         })

        self.s3_stubber.add_response(method='create_multipart_upload',
                                     service_response={'UploadId': '123'},
                                     expected_params={
                                         'Bucket': 'example',
                                         'Key': 'large_file.npy',
                                         'Metadata': {
                                             'helium': '{}'
                                         }
                                     })

        with open(path, 'rb') as fd:
            for part_num in range(1, 6):
                self.s3_stubber.add_response(
                    method='upload_part',
                    service_response={'ETag': 'etag%d' % part_num},
                    expected_params={
                        'Bucket': 'example',
                        'Key': 'large_file.npy',
                        'UploadId': '123',
                        'Body': fd.read(2048),
                        'PartNumber': part_num
                    })

        self.s3_stubber.add_response(method='complete_multipart_upload',
                                     service_response={},
                                     expected_params={
                                         'Bucket': 'example',
                                         'Key': 'large_file.npy',
                                         'UploadId': '123',
                                         'MultipartUpload': {
                                             'Parts': [{
                                                 'ETag': 'etag%d' % i,
                                                 'PartNumber': i
                                             } for i in range(1, 6)]
                                         }
                                     })

        with mock.patch.object(data_transfer.s3_transfer_config, 'multipart_threshold', 4096), \
             mock.patch.object(data_transfer.s3_transfer_config, 'multipart_chunksize', 2048), \
             mock.patch('quilt3.data_transfer.s3_threads', 1):
            data_transfer.copy_file_list([
                (path.as_uri(), 's3://example/large_file.npy',
                 path.stat().st_size, None),
            ])
Code example #4
    def test_copy_file_list_retry(self):
        bucket = 'test-bucket'
        other_bucket = f'{bucket}-other'
        key = 'dir/a'
        vid = None

        src = PhysicalKey(bucket, key, vid)
        dst = PhysicalKey(other_bucket, key, vid)

        with mock.patch('botocore.client.BaseClient._make_api_call',
                        side_effect=ClientError({}, 'CopyObject')) as mocked_api_call:
            with pytest.raises(ClientError):
                data_transfer.copy_file_list([(src, dst, 1)])
            self.assertEqual(mocked_api_call.call_count, data_transfer.MAX_COPY_FILE_LIST_RETRIES)
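
The test pins the call count to MAX_COPY_FILE_LIST_RETRIES without showing the loop itself. As a rough mental model only (a sketch of the behaviour under test, not quilt3's actual implementation), the retry policy it implies looks like this; compare code example #11 below, where a non-ClientError exception propagates after a single call:

    from botocore.exceptions import ClientError

    MAX_COPY_FILE_LIST_RETRIES = 3  # illustrative value, not quilt3's constant

    def copy_with_retries(do_copy):
        # Retry the whole copy on ClientError up to the limit; any other
        # exception type escapes on the first attempt.
        for attempt in range(1, MAX_COPY_FILE_LIST_RETRIES + 1):
            try:
                return do_copy()
            except ClientError:
                if attempt == MAX_COPY_FILE_LIST_RETRIES:
                    raise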
Code example #5
    def test_upload_large_file_etag_mismatch(self):
        path = DATA_DIR / 'large_file.npy'

        self.s3_stubber.add_response(method='head_object',
                                     service_response={
                                         'ContentLength': path.stat().st_size,
                                         'ETag': '"123"',
                                         'VersionId': 'v1',
                                     },
                                     expected_params={
                                         'Bucket': 'example',
                                         'Key': 'large_file.npy',
                                     })

        self.s3_stubber.add_response(method='put_object',
                                     service_response={'VersionId': 'v2'},
                                     expected_params={
                                         'Body': ANY,
                                         'Bucket': 'example',
                                         'Key': 'large_file.npy',
                                     })

        urls = data_transfer.copy_file_list([
            (PhysicalKey.from_path(path),
             PhysicalKey.from_url('s3://example/large_file.npy'),
             path.stat().st_size),
        ])
        assert urls[0] == PhysicalKey.from_url(
            's3://example/large_file.npy?versionId=v2')
Code example #6
    def test_upload_large_file(self):
        path = DATA_DIR / 'large_file.npy'

        self.s3_stubber.add_client_error(method='head_object',
                                         http_status_code=404,
                                         expected_params={
                                             'Bucket': 'example',
                                             'Key': 'large_file.npy',
                                         })

        self.s3_stubber.add_response(method='put_object',
                                     service_response={'VersionId': 'v1'},
                                     expected_params={
                                         'Body': ANY,
                                         'Bucket': 'example',
                                         'Key': 'large_file.npy',
                                     })

        urls = data_transfer.copy_file_list([
            (PhysicalKey.from_path(path),
             PhysicalKey.from_url('s3://example/large_file.npy'),
             path.stat().st_size),
        ])
        assert urls[0] == PhysicalKey.from_url(
            's3://example/large_file.npy?versionId=v1')
Code example #7
File: test_data_transfer.py Project: ellisonbg/quilt
    def test_upload_large_file_etag_match_metadata_match(self):
        path = DATA_DIR / 'large_file.npy'
        etag = data_transfer._calculate_etag(path)

        self.s3_stubber.add_response(method='head_object',
                                     service_response={
                                         'ContentLength': path.stat().st_size,
                                         'ETag': etag,
                                         'VersionId': 'v1',
                                         'Metadata': {
                                             'helium': '{"foo": "bar"}'
                                         }
                                     },
                                     expected_params={
                                         'Bucket': 'example',
                                         'Key': 'large_file.npy',
                                     })

        urls = data_transfer.copy_file_list([
            (path.as_uri(), 's3://example/large_file.npy', path.stat().st_size,
             {
                 'foo': 'bar'
             }),
        ])
        assert urls[0] == 's3://example/large_file.npy?versionId=v1'
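
For the head_object response to count as a match, _calculate_etag has to reproduce the ETag S3 itself would report. For multipart uploads that is conventionally the MD5 of the concatenated per-part MD5 digests, suffixed with the part count; the sketch below reconstructs that convention and is an assumption about, not a copy of, quilt3's helper:

    import hashlib

    def s3_style_etag(data: bytes, chunksize: int) -> str:
        # Single-part objects: plain quoted MD5 of the content.
        if len(data) <= chunksize:
            return '"%s"' % hashlib.md5(data).hexdigest()
        # Multipart objects: md5(md5(part1) + md5(part2) + ...) plus "-<parts>".
        digests = [hashlib.md5(data[i:i + chunksize]).digest()
                   for i in range(0, len(data), chunksize)]
        return '"%s-%d"' % (hashlib.md5(b''.join(digests)).hexdigest(),
                            len(digests))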
Code example #8
File: test_data_transfer.py Project: ellisonbg/quilt
    def test_upload_large_file_etag_mismatch(self):
        path = DATA_DIR / 'large_file.npy'

        self.s3_stubber.add_response(method='head_object',
                                     service_response={
                                         'ContentLength': path.stat().st_size,
                                         'ETag': '"123"',
                                         'VersionId': 'v1',
                                         'Metadata': {}
                                     },
                                     expected_params={
                                         'Bucket': 'example',
                                         'Key': 'large_file.npy',
                                     })

        self.s3_stubber.add_response(method='put_object',
                                     service_response={'VersionId': 'v2'},
                                     expected_params={
                                         'Body': ANY,
                                         'Bucket': 'example',
                                         'Key': 'large_file.npy',
                                         'Metadata': {
                                             'helium': '{}'
                                         }
                                     })

        urls = data_transfer.copy_file_list([
            (path.as_uri(), 's3://example/large_file.npy', path.stat().st_size,
             None),
        ])
        assert urls[0] == 's3://example/large_file.npy?versionId=v2'
Code example #9
File: test_data_transfer.py Project: ellisonbg/quilt
    def test_upload_large_file(self):
        path = DATA_DIR / 'large_file.npy'

        self.s3_stubber.add_client_error(method='head_object',
                                         http_status_code=404,
                                         expected_params={
                                             'Bucket': 'example',
                                             'Key': 'large_file.npy',
                                         })

        self.s3_stubber.add_response(method='put_object',
                                     service_response={'VersionId': 'v1'},
                                     expected_params={
                                         'Body': ANY,
                                         'Bucket': 'example',
                                         'Key': 'large_file.npy',
                                         'Metadata': {
                                             'helium': '{}'
                                         }
                                     })

        urls = data_transfer.copy_file_list([
            (path.as_uri(), 's3://example/large_file.npy', path.stat().st_size,
             None),
        ])
        assert urls[0] == 's3://example/large_file.npy?versionId=v1'
Code example #10
    def test_multi_upload(self):
        path1 = DATA_DIR / 'small_file.csv'
        path2 = DATA_DIR / 'dir/foo.txt'

        # Unversioned bucket
        self.s3_stubber.add_response(method='put_object',
                                     service_response={'VersionId': 'null'},
                                     expected_params={
                                         'Body': ANY,
                                         'Bucket': 'example1',
                                         'Key': 'foo.csv',
                                     })

        # Versioned bucket
        self.s3_stubber.add_response(method='put_object',
                                     service_response={'VersionId': 'v123'},
                                     expected_params={
                                         'Body': ANY,
                                         'Bucket': 'example2',
                                         'Key': 'foo.txt',
                                     })

        # stubber expects responses in order, so disable multi-threading.
        with mock.patch(
                'quilt3.data_transfer.s3_transfer_config.max_request_concurrency',
                1):
            urls = data_transfer.copy_file_list([
                (path1.as_uri(), 's3://example1/foo.csv',
                 path1.stat().st_size),
                (path2.as_uri(), 's3://example2/foo.txt',
                 path2.stat().st_size),
            ])

            assert urls[0] == 's3://example1/foo.csv'
            assert urls[1] == 's3://example2/foo.txt?versionId=v123'
Code example #11
    def test_copy_file_list_retry_non_client_error(self):
        """
        copy_file_list() does not retry on arbitrary (non-ClientError) exceptions.
        """
        bucket = 'test-bucket'
        other_bucket = f'{bucket}-other'
        key = 'dir/a'
        vid = None

        src = PhysicalKey(bucket, key, vid)
        dst = PhysicalKey(other_bucket, key, vid)

        with mock.patch('botocore.client.BaseClient._make_api_call',
                        side_effect=Exception('test exception')) as mocked_api_call:
            with pytest.raises(Exception, match='test exception'):
                data_transfer.copy_file_list([(src, dst, 1)])
            assert mocked_api_call.call_count == 1
Code example #12
    def _test_download(self,
                       *,
                       threshold,
                       chunksize,
                       parts=None,
                       devnull=False):
        num_parts = 1 if parts is None else len(parts)
        barrier = threading.Barrier(num_parts, timeout=2)

        def side_effect(*args, **kwargs):
            # Block here until all workers arrive; this ensures that we have
            # concurrent calls to get_object().
            barrier.wait()
            return {
                'VersionId': 'v1',
                'Body': io.BytesIO(
                    self.data if parts is None else parts[kwargs['Range']]),
            }

        dst = PhysicalKey(None, os.devnull, None) if devnull else self.dst
        with mock.patch('quilt3.data_transfer.s3_transfer_config.max_request_concurrency', num_parts), \
             mock.patch('quilt3.data_transfer.s3_transfer_config.multipart_threshold', threshold), \
             mock.patch('quilt3.data_transfer.s3_transfer_config.multipart_chunksize', chunksize), \
             mock.patch.object(self.s3_client, 'get_object', side_effect=side_effect) as get_object_mock:
            data_transfer.copy_file_list([(self.src, dst, self.size)])

            expected_params = {
                'Bucket': self.bucket,
                'Key': self.key,
            }

            if parts is None:
                get_object_mock.assert_called_once_with(**expected_params)
            else:
                get_object_mock.assert_has_calls(
                    [mock.call(**expected_params, Range=r) for r in parts],
                    any_order=True)
                assert len(get_object_mock.call_args_list) == num_parts

        if not devnull:
            with open(self.filename, 'rb') as f:
                assert f.read() == self.data
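
The Barrier trick is worth noting on its own: each worker blocks in barrier.wait() until all of them have arrived, so the test deadlocks (and the 2-second timeout raises BrokenBarrierError) unless the get_object() calls really are concurrent. Stripped of the S3 details, the technique reduces to this standalone sketch:

    import threading
    from concurrent.futures import ThreadPoolExecutor

    num_workers = 4
    # Releases only once all workers have arrived; times out otherwise.
    barrier = threading.Barrier(num_workers, timeout=2)

    def worker(i):
        barrier.wait()  # proves num_workers calls are in flight at once
        return i

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        assert sorted(pool.map(worker, range(num_workers))) == [0, 1, 2, 3]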
Code example #13
    def test_copy_file_list_multipart_retry(self):
        bucket = 'test-bucket'
        other_bucket = f'{bucket}-other'
        key = 'dir/a'
        vid = None

        src = PhysicalKey(bucket, key, vid)
        dst = PhysicalKey(other_bucket, key, vid)
        parts = 2 * data_transfer.s3_transfer_config.max_request_concurrency
        size = parts * data_transfer.s3_transfer_config.multipart_threshold

        def side_effect(operation_name, *args, **kwargs):
            if operation_name == 'CreateMultipartUpload':
                return {'UploadId': '123'}
            time.sleep(0.1)
            raise ClientError({}, 'CopyObject')

        with mock.patch('botocore.client.BaseClient._make_api_call', side_effect=side_effect):
            with pytest.raises(ClientError):
                data_transfer.copy_file_list([(src, dst, size)])
Code example #14
File: test_data_transfer.py Project: zkan/quilt
    def test_multi_upload(self):
        path1 = DATA_DIR / 'small_file.csv'
        path2 = DATA_DIR / 'dir/foo.txt'

        # Unversioned bucket
        self.s3_stubber.add_response(method='put_object',
                                     service_response={},
                                     expected_params={
                                         'Body': ANY,
                                         'Bucket': 'example1',
                                         'Key': 'foo.csv',
                                     })

        # Versioned bucket
        self.s3_stubber.add_response(method='put_object',
                                     service_response={'VersionId': 'v123'},
                                     expected_params={
                                         'Body': ANY,
                                         'Bucket': 'example2',
                                         'Key': 'foo.txt',
                                     })

        # stubber expects responses in order, so disable multi-threading.
        with mock.patch('quilt3.data_transfer.MAX_CONCURRENCY', 1):
            urls = data_transfer.copy_file_list([
                (PhysicalKey.from_path(path1),
                 PhysicalKey.from_url('s3://example1/foo.csv'),
                 path1.stat().st_size),
                (PhysicalKey.from_path(path2),
                 PhysicalKey.from_url('s3://example2/foo.txt'),
                 path2.stat().st_size),
            ])

            assert urls[0] == PhysicalKey.from_url('s3://example1/foo.csv')
            assert urls[1] == PhysicalKey.from_url(
                's3://example2/foo.txt?versionId=v123')
Code example #15
    def test_multipart_copy(self):
        size = 100 * 1024 * 1024 * 1024

        # size / 8MB would give us 12800 chunks - but the maximum allowed is 10000,
        # so we should end up with 16MB chunks instead.
        chunksize = 8 * 1024 * 1024
        assert size / chunksize > 10000
        chunksize *= 2

        chunks = -(-size // chunksize)
        assert chunks <= 10000

        self.s3_stubber.add_response(method='create_multipart_upload',
                                     service_response={'UploadId': '123'},
                                     expected_params={
                                         'Bucket': 'example2',
                                         'Key': 'large_file2.npy',
                                     })

        for part_num in range(1, chunks + 1):
            self.s3_stubber.add_response(
                method='upload_part_copy',
                service_response={
                    'CopyPartResult': {
                        'ETag': 'etag%d' % part_num
                    }
                },
                expected_params={
                    'Bucket': 'example2',
                    'Key': 'large_file2.npy',
                    'UploadId': '123',
                    'PartNumber': part_num,
                    'CopySource': {
                        'Bucket': 'example1',
                        'Key': 'large_file1.npy'
                    },
                    'CopySourceRange': 'bytes=%d-%d' % (
                        (part_num - 1) * chunksize,
                        min(part_num * chunksize, size) - 1),
                })

        self.s3_stubber.add_response(method='complete_multipart_upload',
                                     service_response={},
                                     expected_params={
                                         'Bucket': 'example2',
                                         'Key': 'large_file2.npy',
                                         'UploadId': '123',
                                         'MultipartUpload': {
                                             'Parts': [{
                                                 'ETag': 'etag%d' % i,
                                                 'PartNumber': i
                                             } for i in range(1, chunks + 1)]
                                         }
                                     })

        with mock.patch(
                'quilt3.data_transfer.s3_transfer_config.max_request_concurrency',
                1):
            stderr = io.StringIO()

            with redirect_stderr(stderr), mock.patch(
                    'quilt3.data_transfer.DISABLE_TQDM', False):
                data_transfer.copy_file_list([
                    (PhysicalKey.from_url('s3://example1/large_file1.npy'),
                     PhysicalKey.from_url('s3://example2/large_file2.npy'),
                     size),
                ])
            assert stderr.getvalue()
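
The chunk-size doubling at the top of this test mirrors S3's documented 10,000-part cap on multipart uploads. A quick check of the arithmetic with the same numbers (the loop here generalizes the single doubling in the test):

    MAX_PARTS = 10000  # S3's limit on parts per multipart upload

    size = 100 * 1024 * 1024 * 1024   # 100 GiB
    chunksize = 8 * 1024 * 1024       # 8 MiB starting chunk size

    # Grow the chunk size until the part count fits under the cap.
    while -(-size // chunksize) > MAX_PARTS:
        chunksize *= 2

    assert chunksize == 16 * 1024 * 1024    # one doubling suffices
    assert -(-size // chunksize) == 6400    # 100 GiB / 16 MiB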
Code example #16
File: test_data_transfer.py Project: ellisonbg/quilt
    def test_multipart_copy(self):
        file_size = 5000

        self.s3_stubber.add_response(
            method='head_object',
            service_response={'Metadata': {
                'helium': '{"foo": "bar"}'
            }},
            expected_params={
                'Bucket': 'example1',
                'Key': 'large_file1.npy',
            })

        self.s3_stubber.add_response(method='create_multipart_upload',
                                     service_response={'UploadId': '123'},
                                     expected_params={
                                         'Bucket': 'example2',
                                         'Key': 'large_file2.npy',
                                         'Metadata': {
                                             'helium': '{"foo": "bar"}'
                                         }
                                     })

        for part_num in range(1, 4):
            self.s3_stubber.add_response(
                method='upload_part_copy',
                service_response={
                    'CopyPartResult': {
                        'ETag': 'etag%d' % part_num
                    }
                },
                expected_params={
                    'Bucket': 'example2',
                    'Key': 'large_file2.npy',
                    'UploadId': '123',
                    'PartNumber': part_num,
                    'CopySource': {
                        'Bucket': 'example1',
                        'Key': 'large_file1.npy'
                    },
                    'CopySourceRange': 'bytes=%d-%d' % (
                        (part_num - 1) * 2048,
                        min(part_num * 2048, file_size) - 1),
                })

        self.s3_stubber.add_response(method='complete_multipart_upload',
                                     service_response={},
                                     expected_params={
                                         'Bucket': 'example2',
                                         'Key': 'large_file2.npy',
                                         'UploadId': '123',
                                         'MultipartUpload': {
                                             'Parts': [{
                                                 'ETag': 'etag%d' % i,
                                                 'PartNumber': i
                                             } for i in range(1, 4)]
                                         }
                                     })

        with mock.patch.object(data_transfer.s3_transfer_config, 'multipart_threshold', 4096), \
             mock.patch.object(data_transfer.s3_transfer_config, 'multipart_chunksize', 2048), \
             mock.patch('quilt3.data_transfer.s3_threads', 1):
            data_transfer.copy_file_list([
                ('s3://example1/large_file1.npy',
                 's3://example2/large_file2.npy', file_size, None),
            ])
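
Note that CopySourceRange takes inclusive byte offsets, which is why both multipart-copy tests subtract 1 from the end of each range. With file_size = 5000 and 2048-byte chunks, the three expected ranges work out as follows:

    file_size, chunksize = 5000, 2048

    ranges = [
        'bytes=%d-%d' % ((part_num - 1) * chunksize,
                         min(part_num * chunksize, file_size) - 1)
        for part_num in range(1, 4)
    ]
    # Inclusive ranges covering all 5000 bytes, with a short final part.
    assert ranges == ['bytes=0-2047', 'bytes=2048-4095', 'bytes=4096-4999']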