Example #1
def _calculate_etag(file_path):
    """
    Attempts to calculate a local file's ETag the way S3 does:
    - Normal uploads: MD5 of the file
    - Multi-part uploads: MD5 of the (binary) MD5s of the parts, dash, number of parts
    We can't know how the file was actually uploaded - but we're assuming it was done using
    the default settings, which we get from `s3_transfer_config`.
    """
    size = pathlib.Path(file_path).stat().st_size
    with open(file_path, 'rb') as fd:
        if size <= s3_transfer_config.multipart_threshold:
            contents = fd.read()
            etag = hashlib.md5(contents).hexdigest()
        else:
            adjuster = ChunksizeAdjuster()
            chunksize = adjuster.adjust_chunksize(
                s3_transfer_config.multipart_chunksize, size)

            hashes = []
            while True:
                contents = fd.read(chunksize)
                if not contents:
                    break
                hashes.append(hashlib.md5(contents).digest())
            etag = '%s-%d' % (hashlib.md5(
                b''.join(hashes)).hexdigest(), len(hashes))
    return '"%s"' % etag
Example #2
def _upload_file(ctx, size, src_path, dest_bucket, dest_key):
    s3_client = ctx.s3_client_provider.standard_client

    if size < s3_transfer_config.multipart_threshold:
        with OSUtils().open_file_chunk_reader(src_path, 0, size,
                                              [ctx.progress]) as fd:
            resp = s3_client.put_object(
                Body=fd,
                Bucket=dest_bucket,
                Key=dest_key,
            )

        version_id = resp.get('VersionId')  # Absent in unversioned buckets.
        ctx.done(PhysicalKey(dest_bucket, dest_key, version_id))
    else:
        resp = s3_client.create_multipart_upload(
            Bucket=dest_bucket,
            Key=dest_key,
        )
        upload_id = resp['UploadId']

        adjuster = ChunksizeAdjuster()
        chunksize = adjuster.adjust_chunksize(
            s3_transfer_config.multipart_chunksize, size)

        chunk_offsets = list(range(0, size, chunksize))

        lock = Lock()
        remaining = len(chunk_offsets)
        parts = [None] * remaining

        def upload_part(i, start, end):
            nonlocal remaining
            part_id = i + 1
            with OSUtils().open_file_chunk_reader(src_path, start, end - start,
                                                  [ctx.progress]) as fd:
                part = s3_client.upload_part(Body=fd,
                                             Bucket=dest_bucket,
                                             Key=dest_key,
                                             UploadId=upload_id,
                                             PartNumber=part_id)
            with lock:
                parts[i] = {"PartNumber": part_id, "ETag": part["ETag"]}
                remaining -= 1
                done = remaining == 0

            if done:
                resp = s3_client.complete_multipart_upload(
                    Bucket=dest_bucket,
                    Key=dest_key,
                    UploadId=upload_id,
                    MultipartUpload={"Parts": parts})
                version_id = resp.get(
                    'VersionId')  # Absent in unversioned buckets.
                ctx.done(PhysicalKey(dest_bucket, dest_key, version_id))

        for i, start in enumerate(chunk_offsets):
            end = min(start + chunksize, size)
            ctx.run(upload_part, i, start, end)
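The loop above schedules one part per chunksize-sized slice, so the part count is just ceil(size / chunksize). A quick sketch of that arithmetic, assuming s3transfer is installed (ChunksizeAdjuster lives in s3transfer.utils) and using an 8 MiB starting chunk size and a 100 MiB file as example inputs:

import math

from s3transfer.utils import ChunksizeAdjuster

size = 100 * 1024 * 1024  # assumed file size
chunksize = ChunksizeAdjuster().adjust_chunksize(8 * 1024 * 1024, size)
chunk_offsets = list(range(0, size, chunksize))

# Both expressions count the parts the upload loop will schedule.
assert len(chunk_offsets) == math.ceil(size / chunksize)
print(len(chunk_offsets), 'parts of up to', chunksize, 'bytes')  # 13 parts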
Example #3
class TestAdjustChunksize(unittest.TestCase):
    def setUp(self):
        self.adjuster = ChunksizeAdjuster()

    def test_valid_chunksize(self):
        chunksize = 7 * (1024**2)
        file_size = 8 * (1024**2)
        new_size = self.adjuster.adjust_chunksize(chunksize, file_size)
        self.assertEqual(new_size, chunksize)

    def test_chunksize_below_minimum(self):
        chunksize = MIN_UPLOAD_CHUNKSIZE - 1
        file_size = 3 * MIN_UPLOAD_CHUNKSIZE
        new_size = self.adjuster.adjust_chunksize(chunksize, file_size)
        self.assertEqual(new_size, MIN_UPLOAD_CHUNKSIZE)

    def test_chunksize_above_maximum(self):
        chunksize = MAX_SINGLE_UPLOAD_SIZE + 1
        file_size = MAX_SINGLE_UPLOAD_SIZE * 2
        new_size = self.adjuster.adjust_chunksize(chunksize, file_size)
        self.assertEqual(new_size, MAX_SINGLE_UPLOAD_SIZE)

    def test_chunksize_too_small(self):
        chunksize = 7 * (1024**2)
        file_size = 5 * (1024**4)
        # If we try to upload a 5TB file, we'll need to use 896MB part
        # sizes.
        new_size = self.adjuster.adjust_chunksize(chunksize, file_size)
        self.assertEqual(new_size, 896 * (1024**2))
        num_parts = file_size / new_size
        self.assertLessEqual(num_parts, MAX_PARTS)

    def test_unknown_file_size_with_valid_chunksize(self):
        chunksize = 7 * (1024**2)
        new_size = self.adjuster.adjust_chunksize(chunksize)
        self.assertEqual(new_size, chunksize)

    def test_unknown_file_size_below_minimum(self):
        chunksize = MIN_UPLOAD_CHUNKSIZE - 1
        new_size = self.adjuster.adjust_chunksize(chunksize)
        self.assertEqual(new_size, MIN_UPLOAD_CHUNKSIZE)

    def test_unknown_file_size_above_maximum(self):
        chunksize = MAX_SINGLE_UPLOAD_SIZE + 1
        new_size = self.adjuster.adjust_chunksize(chunksize)
        self.assertEqual(new_size, MAX_SINGLE_UPLOAD_SIZE)
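The 896MB expectation in test_chunksize_too_small falls out of the adjuster doubling the chunk size until the part count fits under MAX_PARTS (10,000). A sketch of just that step (an illustration, not the actual s3transfer source):

import math

MAX_PARTS = 10000  # S3's per-upload part limit

def adjust_for_max_parts(chunksize, file_size):
    # Keep doubling until ceil(file_size / chunksize) fits within MAX_PARTS.
    while math.ceil(file_size / chunksize) > MAX_PARTS:
        chunksize *= 2
    return chunksize

# 7 MiB doubles through 14, 28, 56, 112, 224, 448 to 896 MiB for a 5 TiB file.
print(adjust_for_max_parts(7 * 1024**2, 5 * 1024**4) == 896 * 1024**2)  # True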
Example #4
    def setUp(self):
        super(BaseUploadTest, self).setUp()
        # TODO: We do not want to use the real MIN_UPLOAD_CHUNKSIZE
        # when we're adjusting parts.
        # This is really wasteful and fails CI builds because self.contents
        # would normally use 10MB+ of memory.
        # Until there's an API to configure this, we're patching this with
        # a min size of 1.  We can't patch MIN_UPLOAD_CHUNKSIZE directly
        # because it's already bound to a default value in the
        # chunksize adjuster.  Instead we need to patch out the
        # chunksize adjuster class.
        self.adjuster_patch = mock.patch('s3transfer.upload.ChunksizeAdjuster',
                                         lambda: ChunksizeAdjuster(min_size=1))
        self.adjuster_patch.start()
        self.config = TransferConfig(max_request_concurrency=1)
        self._manager = TransferManager(self.client, self.config)

        # Create a temporary directory with files to read from
        self.tempdir = tempfile.mkdtemp()
        self.filename = os.path.join(self.tempdir, 'myfile')
        self.content = b'my content'

        with open(self.filename, 'wb') as f:
            f.write(self.content)

        # Initialize some default arguments
        self.bucket = 'mybucket'
        self.key = 'mykey'
        self.extra_args = {}
        self.subscribers = []

        # A list to keep track of all of the bodies sent over the wire
        # and their order.
        self.sent_bodies = []
        self.client.meta.events.register('before-parameter-build.s3.*',
                                         self.collect_body)
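The long comment in this setUp describes a classic Python gotcha: a default argument is evaluated once, when the def statement runs, so re-binding the module constant afterwards never reaches a default that was already captured. A minimal sketch of the effect, with hypothetical names:

MIN_SIZE = 5 * 1024 * 1024

class Adjuster:
    def __init__(self, min_size=MIN_SIZE):  # MIN_SIZE is captured here
        self.min_size = min_size

MIN_SIZE = 1                          # too late: the default is already bound
print(Adjuster().min_size)            # 5242880, not 1
print(Adjuster(min_size=1).min_size)  # 1, which is why the test patches the class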
Example #5
def _copy_remote_file(ctx, size, src_bucket, src_key, src_version,
                      dest_bucket, dest_key, extra_args=None):
    src_params = dict(
        Bucket=src_bucket,
        Key=src_key
    )
    if src_version is not None:
        src_params.update(
            VersionId=src_version
        )

    s3_client = ctx.s3_client_provider.standard_client

    if size < s3_transfer_config.multipart_threshold:
        params = dict(
            CopySource=src_params,
            Bucket=dest_bucket,
            Key=dest_key,
        )

        if extra_args:
            params.update(extra_args)

        resp = s3_client.copy_object(**params)
        ctx.progress(size)
        version_id = resp.get('VersionId')  # Absent in unversioned buckets.
        ctx.done(PhysicalKey(dest_bucket, dest_key, version_id))
    else:
        resp = s3_client.create_multipart_upload(
            Bucket=dest_bucket,
            Key=dest_key,
        )
        upload_id = resp['UploadId']

        adjuster = ChunksizeAdjuster()
        chunksize = adjuster.adjust_chunksize(s3_transfer_config.multipart_chunksize, size)

        chunk_offsets = list(range(0, size, chunksize))

        lock = Lock()
        remaining = len(chunk_offsets)
        parts = [None] * remaining

        def upload_part(i, start, end):
            nonlocal remaining
            part_id = i + 1
            part = s3_client.upload_part_copy(
                CopySource=src_params,
                CopySourceRange=f'bytes={start}-{end-1}',
                Bucket=dest_bucket,
                Key=dest_key,
                UploadId=upload_id,
                PartNumber=part_id
            )
            with lock:
                parts[i] = {"PartNumber": part_id, "ETag": part["CopyPartResult"]["ETag"]}
                remaining -= 1
                done = remaining == 0

            ctx.progress(end - start)

            if done:
                resp = s3_client.complete_multipart_upload(
                    Bucket=dest_bucket,
                    Key=dest_key,
                    UploadId=upload_id,
                    MultipartUpload={"Parts": parts}
                )
                version_id = resp.get('VersionId')  # Absent in unversioned buckets.
                ctx.done(PhysicalKey(dest_bucket, dest_key, version_id))

        for i, start in enumerate(chunk_offsets):
            end = min(start + chunksize, size)
            ctx.run(upload_part, i, start, end)
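Note that CopySourceRange takes an HTTP-style inclusive byte range, which is why the code uses end - 1 above. A tiny example to make the format concrete (8 MiB part size assumed):

chunksize = 8 * 1024 * 1024        # assumed part size
start, end = 0, chunksize          # first chunk covers [0, chunksize)
print(f'bytes={start}-{end - 1}')  # bytes=0-8388607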
Example #6
    def _submit_multipart_request(self, client, config, osutil,
                                  request_executor, transfer_future):
        call_args = transfer_future.meta.call_args

        # Submit the request to create a multipart upload and make sure it
        # does not include any of the arguments used for copy part.
        create_multipart_extra_args = {}
        for param, val in call_args.extra_args.items():
            if param not in self.CREATE_MULTIPART_ARGS_BLACKLIST:
                create_multipart_extra_args[param] = val

        create_multipart_future = self._transfer_coordinator.submit(
            request_executor,
            CreateMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': create_multipart_extra_args,
                }
            )
        )

        # Determine how many parts are needed based on filesize and
        # desired chunksize.
        part_size = config.multipart_chunksize
        adjuster = ChunksizeAdjuster()
        part_size = adjuster.adjust_chunksize(
            part_size, transfer_future.meta.size)
        num_parts = int(
            math.ceil(transfer_future.meta.size / float(part_size)))

        # Submit requests to upload the parts of the file.
        part_futures = []
        progress_callbacks = get_callbacks(transfer_future, 'progress')

        for part_number in range(1, num_parts + 1):
            extra_part_args = self._extra_upload_part_args(
                call_args.extra_args)
            # The part number for upload_part starts at 1, while the
            # range parameter starts at zero, so subtract 1 from the
            # part number.
            extra_part_args['CopySourceRange'] = calculate_range_parameter(
                part_size, part_number-1, num_parts, transfer_future.meta.size)
            # Get the size of the part copy as well for the progress
            # callbacks.
            size = self._get_transfer_size(
                part_size, part_number-1, num_parts, transfer_future.meta.size
            )
            part_futures.append(
                self._transfer_coordinator.submit(
                    request_executor,
                    CopyPartTask(
                        transfer_coordinator=self._transfer_coordinator,
                        main_kwargs={
                            'client': client,
                            'copy_source': call_args.copy_source,
                            'bucket': call_args.bucket,
                            'key': call_args.key,
                            'part_number': part_number,
                            'extra_args': extra_part_args,
                            'callbacks': progress_callbacks,
                            'size': size
                        },
                        pending_main_kwargs={
                            'upload_id': create_multipart_future
                        }
                    )
                )
            )

        complete_multipart_extra_args = self._extra_complete_multipart_args(
            call_args.extra_args)
        # Submit the request to complete the multipart upload.
        self._transfer_coordinator.submit(
            request_executor,
            CompleteMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': complete_multipart_extra_args,
                },
                pending_main_kwargs={
                    'upload_id': create_multipart_future,
                    'parts': part_futures
                },
                is_final=True
            )
        )
Example #7
    def _submit_multipart_request(self, client, config, osutil,
                                  request_executor, transfer_future,
                                  upload_input_manager):
        call_args = transfer_future.meta.call_args

        # Submit the request to create a multipart upload.
        create_multipart_future = self._transfer_coordinator.submit(
            request_executor,
            CreateMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': call_args.extra_args,
                }
            )
        )

        # Submit requests to upload the parts of the file.
        part_futures = []
        extra_part_args = self._extra_upload_part_args(call_args.extra_args)

        # Get any tags that need to be associated with the submitted task
        # for uploading the data.
        upload_part_tag = self._get_upload_task_tag(
            upload_input_manager, 'upload_part')

        size = transfer_future.meta.size
        adjuster = ChunksizeAdjuster()
        chunksize = adjuster.adjust_chunksize(config.multipart_chunksize, size)
        part_iterator = upload_input_manager.yield_upload_part_bodies(
            transfer_future, chunksize)

        for part_number, fileobj in part_iterator:
            part_futures.append(
                self._transfer_coordinator.submit(
                    request_executor,
                    UploadPartTask(
                        transfer_coordinator=self._transfer_coordinator,
                        main_kwargs={
                            'client': client,
                            'fileobj': fileobj,
                            'bucket': call_args.bucket,
                            'key': call_args.key,
                            'part_number': part_number,
                            'extra_args': extra_part_args
                        },
                        pending_main_kwargs={
                            'upload_id': create_multipart_future
                        }
                    ),
                    tag=upload_part_tag
                )
            )

        complete_multipart_extra_args = self._extra_complete_multipart_args(
            call_args.extra_args)
        # Submit the request to complete the multipart upload.
        self._transfer_coordinator.submit(
            request_executor,
            CompleteMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': complete_multipart_extra_args,
                },
                pending_main_kwargs={
                    'upload_id': create_multipart_future,
                    'parts': part_futures
                },
                is_final=True
            )
        )
Example #8
    def setUp(self):
        self.adjuster = ChunksizeAdjuster()
Example #9
    def _submit_multipart_request(self, client, config, osutil,
                                  request_executor, transfer_future):
        call_args = transfer_future.meta.call_args

        # Submit the request to create a multipart upload and make sure it
        # does not include any of the arguments used for copy part.
        create_multipart_extra_args = {}
        for param, val in call_args.extra_args.items():
            if param not in self.CREATE_MULTIPART_ARGS_BLACKLIST:
                create_multipart_extra_args[param] = val

        create_multipart_future = self._transfer_coordinator.submit(
            request_executor,
            CreateMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': create_multipart_extra_args,
                }
            )
        )

        # Determine how many parts are needed based on filesize and
        # desired chunksize.
        part_size = config.multipart_chunksize
        adjuster = ChunksizeAdjuster()
        part_size = adjuster.adjust_chunksize(
            part_size, transfer_future.meta.size)
        num_parts = int(
            math.ceil(transfer_future.meta.size / float(part_size)))

        # Submit requests to upload the parts of the file.
        part_futures = []
        progress_callbacks = get_callbacks(transfer_future, 'progress')

        for part_number in range(1, num_parts + 1):
            extra_part_args = self._extra_upload_part_args(
                call_args.extra_args)
            # The part number for upload_part starts at 1, while the
            # range parameter starts at zero, so subtract 1 from the
            # part number.
            extra_part_args['CopySourceRange'] = calculate_range_parameter(
                part_size, part_number-1, num_parts, transfer_future.meta.size)
            # Get the size of the part copy as well for the progress
            # callbacks.
            size = self._get_transfer_size(
                part_size, part_number-1, num_parts, transfer_future.meta.size
            )
            part_futures.append(
                self._transfer_coordinator.submit(
                    request_executor,
                    CopyPartTask(
                        transfer_coordinator=self._transfer_coordinator,
                        main_kwargs={
                            'client': client,
                            'copy_source': call_args.copy_source,
                            'bucket': call_args.bucket,
                            'key': call_args.key,
                            'part_number': part_number,
                            'extra_args': extra_part_args,
                            'callbacks': progress_callbacks,
                            'size': size
                        },
                        pending_main_kwargs={
                            'upload_id': create_multipart_future
                        }
                    )
                )
            )

        # Submit the request to complete the multipart upload.
        self._transfer_coordinator.submit(
            request_executor,
            CompleteMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key
                },
                pending_main_kwargs={
                    'upload_id': create_multipart_future,
                    'parts': part_futures
                },
                is_final=True
            )
        )
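For reference, calculate_range_parameter has to map a zero-based part index to an inclusive byte range, clamping the final part to the object size. A hedged sketch of that computation (an illustration, not the s3transfer source):

def range_parameter(part_size, part_index, num_parts, total_size):
    # Zero-based part_index, inclusive HTTP range; the last part may be short.
    start = part_index * part_size
    if part_index == num_parts - 1:
        end = total_size - 1
    else:
        end = start + part_size - 1
    return f'bytes={start}-{end}'

# A 20 MiB object copied in 8 MiB parts -> 3 parts, the last only 4 MiB.
for i in range(3):
    print(range_parameter(8 * 1024**2, i, 3, 20 * 1024**2))
# bytes=0-8388607, bytes=8388608-16777215, bytes=16777216-20971519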