def _calculate_etag(file_path):
    """
    Attempts to calculate a local file's ETag the way S3 does:
    - Normal uploads: MD5 of the file
    - Multi-part uploads: MD5 of the (binary) MD5s of the parts, dash, number of parts

    We can't know how the file was actually uploaded, but we assume it was done
    using the default settings, which we get from `s3_transfer_config`.
    """
    size = pathlib.Path(file_path).stat().st_size
    with open(file_path, 'rb') as fd:
        if size <= s3_transfer_config.multipart_threshold:
            contents = fd.read()
            etag = hashlib.md5(contents).hexdigest()
        else:
            adjuster = ChunksizeAdjuster()
            chunksize = adjuster.adjust_chunksize(
                s3_transfer_config.multipart_chunksize, size)

            hashes = []
            while True:
                contents = fd.read(chunksize)
                if not contents:
                    break
                hashes.append(hashlib.md5(contents).digest())
            etag = '%s-%d' % (hashlib.md5(b''.join(hashes)).hexdigest(),
                              len(hashes))
    return '"%s"' % etag
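# A minimal usage sketch for _calculate_etag (the helper below and its name are
# hypothetical, not part of the module above). It compares the locally computed
# ETag against the one S3 reports via HeadObject; both values are wrapped in
# double quotes, so they compare directly. Per the docstring above, a match is
# only meaningful if the object was uploaded with the default chunk settings.
import boto3

def etag_matches(file_path, bucket, key):
    s3 = boto3.client('s3')
    head = s3.head_object(Bucket=bucket, Key=key)
    return head['ETag'] == _calculate_etag(file_path)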
def _upload_file(ctx, size, src_path, dest_bucket, dest_key):
    s3_client = ctx.s3_client_provider.standard_client

    if size < s3_transfer_config.multipart_threshold:
        # Small files: a single PutObject call.
        with OSUtils().open_file_chunk_reader(src_path, 0, size, [ctx.progress]) as fd:
            resp = s3_client.put_object(
                Body=fd,
                Bucket=dest_bucket,
                Key=dest_key,
            )

        version_id = resp.get('VersionId')  # Absent in unversioned buckets.
        ctx.done(PhysicalKey(dest_bucket, dest_key, version_id))
    else:
        resp = s3_client.create_multipart_upload(
            Bucket=dest_bucket,
            Key=dest_key,
        )
        upload_id = resp['UploadId']

        adjuster = ChunksizeAdjuster()
        chunksize = adjuster.adjust_chunksize(
            s3_transfer_config.multipart_chunksize, size)

        chunk_offsets = list(range(0, size, chunksize))

        # `remaining` counts parts still in flight; `parts` collects the
        # results in order, guarded by `lock`.
        lock = Lock()
        remaining = len(chunk_offsets)
        parts = [None] * remaining

        def upload_part(i, start, end):
            # Runs on the context's worker pool; whichever part finishes
            # last completes the multipart upload.
            nonlocal remaining
            part_id = i + 1
            with OSUtils().open_file_chunk_reader(src_path, start, end - start, [ctx.progress]) as fd:
                part = s3_client.upload_part(
                    Body=fd,
                    Bucket=dest_bucket,
                    Key=dest_key,
                    UploadId=upload_id,
                    PartNumber=part_id,
                )
            with lock:
                parts[i] = {"PartNumber": part_id, "ETag": part["ETag"]}
                remaining -= 1
                done = remaining == 0
            if done:
                resp = s3_client.complete_multipart_upload(
                    Bucket=dest_bucket,
                    Key=dest_key,
                    UploadId=upload_id,
                    MultipartUpload={"Parts": parts},
                )
                version_id = resp.get('VersionId')  # Absent in unversioned buckets.
                ctx.done(PhysicalKey(dest_bucket, dest_key, version_id))

        for i, start in enumerate(chunk_offsets):
            end = min(start + chunksize, size)
            ctx.run(upload_part, i, start, end)
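# _upload_file depends on a `ctx` object rather than plain arguments. A minimal
# single-threaded stand-in, to make the expected interface concrete
# (hypothetical sketch; the real context schedules `run` callables on a worker
# pool and aggregates progress across concurrent transfers):
class SerialContext:
    def __init__(self, s3_client_provider):
        self.s3_client_provider = s3_client_provider
        self.result = None

    def progress(self, num_bytes):
        pass  # hook for byte-level progress reporting

    def done(self, physical_key):
        self.result = physical_key  # final PhysicalKey of the uploaded object

    def run(self, fn, *args):
        fn(*args)  # execute inline instead of scheduling on a thread pool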
class TestAdjustChunksize(unittest.TestCase):
    def setUp(self):
        self.adjuster = ChunksizeAdjuster()

    def test_valid_chunksize(self):
        chunksize = 7 * (1024 ** 2)
        file_size = 8 * (1024 ** 2)
        new_size = self.adjuster.adjust_chunksize(chunksize, file_size)
        self.assertEqual(new_size, chunksize)

    def test_chunksize_below_minimum(self):
        chunksize = MIN_UPLOAD_CHUNKSIZE - 1
        file_size = 3 * MIN_UPLOAD_CHUNKSIZE
        new_size = self.adjuster.adjust_chunksize(chunksize, file_size)
        self.assertEqual(new_size, MIN_UPLOAD_CHUNKSIZE)

    def test_chunksize_above_maximum(self):
        chunksize = MAX_SINGLE_UPLOAD_SIZE + 1
        file_size = MAX_SINGLE_UPLOAD_SIZE * 2
        new_size = self.adjuster.adjust_chunksize(chunksize, file_size)
        self.assertEqual(new_size, MAX_SINGLE_UPLOAD_SIZE)

    def test_chunksize_too_small(self):
        chunksize = 7 * (1024 ** 2)
        file_size = 5 * (1024 ** 4)
        # If we try to upload a 5TB file, we'll need to use 896MB part
        # sizes.
        new_size = self.adjuster.adjust_chunksize(chunksize, file_size)
        self.assertEqual(new_size, 896 * (1024 ** 2))
        num_parts = file_size / new_size
        self.assertLessEqual(num_parts, MAX_PARTS)

    def test_unknown_file_size_with_valid_chunksize(self):
        chunksize = 7 * (1024 ** 2)
        new_size = self.adjuster.adjust_chunksize(chunksize)
        self.assertEqual(new_size, chunksize)

    def test_unknown_file_size_below_minimum(self):
        chunksize = MIN_UPLOAD_CHUNKSIZE - 1
        new_size = self.adjuster.adjust_chunksize(chunksize)
        self.assertEqual(new_size, MIN_UPLOAD_CHUNKSIZE)

    def test_unknown_file_size_above_maximum(self):
        chunksize = MAX_SINGLE_UPLOAD_SIZE + 1
        new_size = self.adjuster.adjust_chunksize(chunksize)
        self.assertEqual(new_size, MAX_SINGLE_UPLOAD_SIZE)
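# Why test_chunksize_too_small expects 896 MiB: the adjuster repeatedly doubles
# the chunksize until the part count fits under MAX_PARTS (10,000). A sketch of
# that arithmetic, assuming the doubling strategy (7 MiB -> 14 -> ... -> 448 ->
# 896 MiB; at 448 MiB a 5 TiB file would still need 11,704 parts):
import math

chunksize, file_size, max_parts = 7 * 1024 ** 2, 5 * 1024 ** 4, 10000
while math.ceil(file_size / chunksize) > max_parts:
    chunksize *= 2
assert chunksize == 896 * 1024 ** 2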
def _copy_remote_file(ctx, size, src_bucket, src_key, src_version,
                      dest_bucket, dest_key, extra_args=None):
    src_params = dict(
        Bucket=src_bucket,
        Key=src_key,
    )
    if src_version is not None:
        src_params.update(
            VersionId=src_version
        )

    s3_client = ctx.s3_client_provider.standard_client

    if size < s3_transfer_config.multipart_threshold:
        # Small objects: a single server-side CopyObject call.
        params = dict(
            CopySource=src_params,
            Bucket=dest_bucket,
            Key=dest_key,
        )
        if extra_args:
            params.update(extra_args)

        resp = s3_client.copy_object(**params)
        ctx.progress(size)
        version_id = resp.get('VersionId')  # Absent in unversioned buckets.
        ctx.done(PhysicalKey(dest_bucket, dest_key, version_id))
    else:
        resp = s3_client.create_multipart_upload(
            Bucket=dest_bucket,
            Key=dest_key,
        )
        upload_id = resp['UploadId']

        adjuster = ChunksizeAdjuster()
        chunksize = adjuster.adjust_chunksize(
            s3_transfer_config.multipart_chunksize, size)

        chunk_offsets = list(range(0, size, chunksize))

        lock = Lock()
        remaining = len(chunk_offsets)
        parts = [None] * remaining

        def upload_part(i, start, end):
            nonlocal remaining
            part_id = i + 1
            # Server-side copy of one part; no object data flows through
            # this host. S3 byte ranges are inclusive, hence `end - 1`.
            part = s3_client.upload_part_copy(
                CopySource=src_params,
                CopySourceRange=f'bytes={start}-{end-1}',
                Bucket=dest_bucket,
                Key=dest_key,
                UploadId=upload_id,
                PartNumber=part_id,
            )
            with lock:
                parts[i] = {"PartNumber": part_id, "ETag": part["CopyPartResult"]["ETag"]}
                remaining -= 1
                done = remaining == 0
            ctx.progress(end - start)
            if done:
                resp = s3_client.complete_multipart_upload(
                    Bucket=dest_bucket,
                    Key=dest_key,
                    UploadId=upload_id,
                    MultipartUpload={"Parts": parts},
                )
                version_id = resp.get('VersionId')  # Absent in unversioned buckets.
                ctx.done(PhysicalKey(dest_bucket, dest_key, version_id))

        for i, start in enumerate(chunk_offsets):
            end = min(start + chunksize, size)
            ctx.run(upload_part, i, start, end)
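# A quick sanity check of the inclusive CopySourceRange arithmetic used in
# _copy_remote_file (values hypothetical): a 20-byte object with a chunksize
# of 8 yields three ranges covering every byte exactly once.
size, chunksize = 20, 8
ranges = [f'bytes={start}-{min(start + chunksize, size) - 1}'
          for start in range(0, size, chunksize)]
assert ranges == ['bytes=0-7', 'bytes=8-15', 'bytes=16-19']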
def _submit_multipart_request(self, client, config, osutil,
                              request_executor, transfer_future):
    call_args = transfer_future.meta.call_args

    # Submit the request to create a multipart upload and make sure it
    # does not include any of the arguments used for copy part.
    create_multipart_extra_args = {}
    for param, val in call_args.extra_args.items():
        if param not in self.CREATE_MULTIPART_ARGS_BLACKLIST:
            create_multipart_extra_args[param] = val

    create_multipart_future = self._transfer_coordinator.submit(
        request_executor,
        CreateMultipartUploadTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs={
                'client': client,
                'bucket': call_args.bucket,
                'key': call_args.key,
                'extra_args': create_multipart_extra_args,
            }
        )
    )

    # Determine how many parts are needed based on file size and
    # desired chunksize.
    part_size = config.multipart_chunksize
    adjuster = ChunksizeAdjuster()
    part_size = adjuster.adjust_chunksize(
        part_size, transfer_future.meta.size)
    num_parts = int(
        math.ceil(transfer_future.meta.size / float(part_size)))

    # Submit requests to upload the parts of the file.
    part_futures = []
    progress_callbacks = get_callbacks(transfer_future, 'progress')
    for part_number in range(1, num_parts + 1):
        extra_part_args = self._extra_upload_part_args(
            call_args.extra_args)
        # The part number for upload part starts at 1, while the
        # range parameter starts at zero, so subtract 1 from the
        # part number.
        extra_part_args['CopySourceRange'] = calculate_range_parameter(
            part_size, part_number - 1, num_parts,
            transfer_future.meta.size)
        # Get the size of the part copy as well for the progress
        # callbacks.
        size = self._get_transfer_size(
            part_size, part_number - 1, num_parts,
            transfer_future.meta.size,
        )
        part_futures.append(
            self._transfer_coordinator.submit(
                request_executor,
                CopyPartTask(
                    transfer_coordinator=self._transfer_coordinator,
                    main_kwargs={
                        'client': client,
                        'copy_source': call_args.copy_source,
                        'bucket': call_args.bucket,
                        'key': call_args.key,
                        'part_number': part_number,
                        'extra_args': extra_part_args,
                        'callbacks': progress_callbacks,
                        'size': size,
                    },
                    pending_main_kwargs={
                        'upload_id': create_multipart_future
                    }
                )
            )
        )

    complete_multipart_extra_args = self._extra_complete_multipart_args(
        call_args.extra_args)
    # Submit the request to complete the multipart upload.
    self._transfer_coordinator.submit(
        request_executor,
        CompleteMultipartUploadTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs={
                'client': client,
                'bucket': call_args.bucket,
                'key': call_args.key,
                'extra_args': complete_multipart_extra_args,
            },
            pending_main_kwargs={
                'upload_id': create_multipart_future,
                'parts': part_futures,
            },
            is_final=True
        )
    )
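# calculate_range_parameter (a helper in s3transfer.utils) produces the
# inclusive ranges assigned to CopySourceRange above. For example, part
# indices 0 and 2 of a 20-byte object split into 3 parts of up to 8 bytes
# (values hypothetical):
from s3transfer.utils import calculate_range_parameter

assert calculate_range_parameter(8, 0, 3) == 'bytes=0-7'
assert calculate_range_parameter(8, 2, 3, total_size=20) == 'bytes=16-19'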
def _submit_multipart_request(self, client, config, osutil,
                              request_executor, transfer_future,
                              upload_input_manager):
    call_args = transfer_future.meta.call_args

    # Submit the request to create a multipart upload.
    create_multipart_future = self._transfer_coordinator.submit(
        request_executor,
        CreateMultipartUploadTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs={
                'client': client,
                'bucket': call_args.bucket,
                'key': call_args.key,
                'extra_args': call_args.extra_args,
            }
        )
    )

    # Submit requests to upload the parts of the file.
    part_futures = []
    extra_part_args = self._extra_upload_part_args(call_args.extra_args)

    # Get any tags that need to be associated with the submitted task
    # for uploading the data.
    upload_part_tag = self._get_upload_task_tag(
        upload_input_manager, 'upload_part')

    size = transfer_future.meta.size
    adjuster = ChunksizeAdjuster()
    chunksize = adjuster.adjust_chunksize(config.multipart_chunksize, size)
    part_iterator = upload_input_manager.yield_upload_part_bodies(
        transfer_future, chunksize)

    for part_number, fileobj in part_iterator:
        part_futures.append(
            self._transfer_coordinator.submit(
                request_executor,
                UploadPartTask(
                    transfer_coordinator=self._transfer_coordinator,
                    main_kwargs={
                        'client': client,
                        'fileobj': fileobj,
                        'bucket': call_args.bucket,
                        'key': call_args.key,
                        'part_number': part_number,
                        'extra_args': extra_part_args,
                    },
                    pending_main_kwargs={
                        'upload_id': create_multipart_future
                    }
                ),
                tag=upload_part_tag
            )
        )

    complete_multipart_extra_args = self._extra_complete_multipart_args(
        call_args.extra_args)
    # Submit the request to complete the multipart upload.
    self._transfer_coordinator.submit(
        request_executor,
        CompleteMultipartUploadTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs={
                'client': client,
                'bucket': call_args.bucket,
                'key': call_args.key,
                'extra_args': complete_multipart_extra_args,
            },
            pending_main_kwargs={
                'upload_id': create_multipart_future,
                'parts': part_futures,
            },
            is_final=True
        )
    )
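# In normal use this submitter is never invoked directly; it sits behind
# boto3's managed transfer API and is tuned through TransferConfig (real
# boto3 API; the bucket and file names below are hypothetical):
import boto3
from boto3.s3.transfer import TransferConfig

config = TransferConfig(multipart_threshold=8 * 1024 ** 2,
                        multipart_chunksize=8 * 1024 ** 2)
s3 = boto3.client('s3')
# Files larger than multipart_threshold are routed to the multipart submitter.
s3.upload_file('large.bin', 'my-bucket', 'large.bin', Config=config)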
def _submit_multipart_request(self, client, config, osutil,
                              request_executor, transfer_future):
    call_args = transfer_future.meta.call_args

    # Submit the request to create a multipart upload and make sure it
    # does not include any of the arguments used for copy part.
    create_multipart_extra_args = {}
    for param, val in call_args.extra_args.items():
        if param not in self.CREATE_MULTIPART_ARGS_BLACKLIST:
            create_multipart_extra_args[param] = val

    create_multipart_future = self._transfer_coordinator.submit(
        request_executor,
        CreateMultipartUploadTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs={
                'client': client,
                'bucket': call_args.bucket,
                'key': call_args.key,
                'extra_args': create_multipart_extra_args,
            }
        )
    )

    # Determine how many parts are needed based on file size and
    # desired chunksize.
    part_size = config.multipart_chunksize
    adjuster = ChunksizeAdjuster()
    part_size = adjuster.adjust_chunksize(
        part_size, transfer_future.meta.size)
    num_parts = int(
        math.ceil(transfer_future.meta.size / float(part_size)))

    # Submit requests to upload the parts of the file.
    part_futures = []
    progress_callbacks = get_callbacks(transfer_future, 'progress')
    for part_number in range(1, num_parts + 1):
        extra_part_args = self._extra_upload_part_args(
            call_args.extra_args)
        # The part number for upload part starts at 1, while the
        # range parameter starts at zero, so subtract 1 from the
        # part number.
        extra_part_args['CopySourceRange'] = calculate_range_parameter(
            part_size, part_number - 1, num_parts,
            transfer_future.meta.size)
        # Get the size of the part copy as well for the progress
        # callbacks.
        size = self._get_transfer_size(
            part_size, part_number - 1, num_parts,
            transfer_future.meta.size,
        )
        part_futures.append(
            self._transfer_coordinator.submit(
                request_executor,
                CopyPartTask(
                    transfer_coordinator=self._transfer_coordinator,
                    main_kwargs={
                        'client': client,
                        'copy_source': call_args.copy_source,
                        'bucket': call_args.bucket,
                        'key': call_args.key,
                        'part_number': part_number,
                        'extra_args': extra_part_args,
                        'callbacks': progress_callbacks,
                        'size': size,
                    },
                    pending_main_kwargs={
                        'upload_id': create_multipart_future
                    }
                )
            )
        )

    # Submit the request to complete the multipart upload.
    self._transfer_coordinator.submit(
        request_executor,
        CompleteMultipartUploadTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs={
                'client': client,
                'bucket': call_args.bucket,
                'key': call_args.key,
            },
            pending_main_kwargs={
                'upload_id': create_multipart_future,
                'parts': part_futures,
            },
            is_final=True
        )
    )
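# The copy-side equivalent on the caller side is boto3's managed copy, which
# dispatches to this submitter for objects above the multipart threshold
# (bucket and key names hypothetical):
import boto3

s3 = boto3.client('s3')
s3.copy({'Bucket': 'src-bucket', 'Key': 'big.bin'}, 'dest-bucket', 'big.bin')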