class BaseSubmissionTaskTest(BaseTaskTest):
    def setUp(self):
        super(BaseSubmissionTaskTest, self).setUp()
        self.config = TransferConfig()
        self.osutil = OSUtils()
        self.executor = BoundedExecutor(
            1000, 1,
            {
                IN_MEMORY_UPLOAD_TAG: TaskSemaphore(10),
                IN_MEMORY_DOWNLOAD_TAG: SlidingWindowSemaphore(10)
            })

    def tearDown(self):
        super(BaseSubmissionTaskTest, self).tearDown()
        self.executor.shutdown()
    def test_association_and_disassociation_on_submit(self):
        self.transfer_coordinator = RecordingTransferCoordinator()

        # Submit a callable to the transfer coordinator.
        executor = BoundedExecutor(1, 1)
        task = ReturnFooTask(self.transfer_coordinator)
        future = self.transfer_coordinator.submit(executor, task)
        executor.shutdown()

        # Make sure the future that got submitted was associated to the
        # transfer future at some point.
        self.assertEqual(
            self.transfer_coordinator.all_transfer_futures_ever_associated,
            set([future]))

        # Make sure the future got disassociated once it completed by
        # looking at the currently associated futures.
        self.assertEqual(
            self.transfer_coordinator.associated_futures, set([]))
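    # The recording coordinator used above is a thin wrapper from the test
    # utilities; based on the assertions in this test, it can be thought of
    # as a TransferCoordinator that also remembers every future ever
    # associated with it (a sketch of the idea, not the actual
    # implementation):
    #
    #     class RecordingTransferCoordinator(TransferCoordinator):
    #         def __init__(self):
    #             self.all_transfer_futures_ever_associated = set()
    #             super(RecordingTransferCoordinator, self).__init__()
    #
    #         def add_associated_future(self, future):
    #             # Record the future, then defer to the base class, which
    #             # handles disassociating it once the future is done.
    #             self.all_transfer_futures_ever_associated.add(future)
    #             super(RecordingTransferCoordinator,
    #                   self).add_associated_future(future)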
    def test_submit_writes_from_internal_queue(self):
        class FakeQueue(object):
            def request_writes(self, offset, data):
                return [
                    {'offset': 0, 'data': 'foo'},
                    {'offset': 3, 'data': 'bar'},
                ]

        q = FakeQueue()
        io_executor = BoundedExecutor(1000, 1)
        manager = DownloadNonSeekableOutputManager(
            self.osutil, self.transfer_coordinator, io_executor=io_executor,
            defer_queue=q)
        fileobj = WriteCollector()
        manager.queue_file_io_task(fileobj=fileobj, data='foo', offset=1)
        io_executor.shutdown()
        self.assertEqual(fileobj.writes, [(0, 'foo'), (3, 'bar')])
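# The assertions above compare fileobj.writes against a list of
# (offset, data) tuples. WriteCollector comes from the shared test
# utilities; a minimal sketch consistent with how it is used in these tests
# (an assumption, not the canonical implementation) could look like this:
class ExampleWriteCollector(object):
    """Record the writes and seeks made against a file-like object."""

    def __init__(self):
        self._pos = 0
        self.writes = []

    def seek(self, pos, whence=0):
        # The download output managers seek to the target offset before
        # writing each chunk.
        self._pos = pos

    def write(self, data):
        # Record the offset the data was written at along with the data.
        self.writes.append((self._pos, data))
        self._pos += len(data)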
class TransferManager(object):
    ALLOWED_DOWNLOAD_ARGS = ALLOWED_DOWNLOAD_ARGS

    ALLOWED_UPLOAD_ARGS = [
        'ACL', 'CacheControl', 'ContentDisposition', 'ContentEncoding',
        'ContentLanguage', 'ContentType', 'Expires', 'GrantFullControl',
        'GrantRead', 'GrantReadACP', 'GrantWriteACP', 'Metadata',
        'RequestPayer', 'ServerSideEncryption', 'StorageClass',
        'SSECustomerAlgorithm', 'SSECustomerKey', 'SSECustomerKeyMD5',
        'SSEKMSKeyId', 'WebsiteRedirectLocation', 'RetentionExpirationDate',
        'RetentionLegalHoldId', 'RetentionPeriod',
    ]

    ALLOWED_COPY_ARGS = ALLOWED_UPLOAD_ARGS + [
        'CopySourceIfMatch', 'CopySourceIfModifiedSince',
        'CopySourceIfNoneMatch', 'CopySourceIfUnmodifiedSince',
        'CopySourceSSECustomerAlgorithm', 'CopySourceSSECustomerKey',
        'CopySourceSSECustomerKeyMD5', 'MetadataDirective'
    ]

    ALLOWED_DELETE_ARGS = [
        'MFA', 'VersionId', 'RequestPayer',
    ]

    def __init__(self, client, config=None, osutil=None, executor_cls=None):
        """A transfer manager interface for Amazon S3

        :param client: Client to be used by the manager
        :param config: TransferConfig to associate specific configurations
        :param osutil: OSUtils object to use for OS-related behavior when
            using the transfer manager.

        :type executor_cls: ibm_s3transfer.futures.BaseExecutor
        :param executor_cls: The class of executor to use with the transfer
            manager. By default, concurrent.futures.ThreadPoolExecutor is
            used.
        """
        self._client = client
        self._config = config
        if config is None:
            self._config = TransferConfig()
        self._osutil = osutil
        if osutil is None:
            self._osutil = OSUtils()
        self._coordinator_controller = TransferCoordinatorController()
        # A counter to create unique id's for each transfer submitted.
        self._id_counter = 0

        # The executor responsible for making S3 API transfer requests
        self._request_executor = BoundedExecutor(
            max_size=self._config.max_request_queue_size,
            max_num_threads=self._config.max_request_concurrency,
            tag_semaphores={
                IN_MEMORY_UPLOAD_TAG: TaskSemaphore(
                    self._config.max_in_memory_upload_chunks),
                IN_MEMORY_DOWNLOAD_TAG: SlidingWindowSemaphore(
                    self._config.max_in_memory_download_chunks)
            },
            executor_cls=executor_cls)

        # The executor responsible for submitting the necessary tasks to
        # perform the desired transfer
        self._submission_executor = BoundedExecutor(
            max_size=self._config.max_submission_queue_size,
            max_num_threads=self._config.max_submission_concurrency,
            executor_cls=executor_cls)

        # There is one thread available for writing to disk. It will handle
        # downloads for all files.
        self._io_executor = BoundedExecutor(
            max_size=self._config.max_io_queue_size,
            max_num_threads=1,
            executor_cls=executor_cls)

        # The component responsible for limiting bandwidth usage if it
        # is configured.
        self._bandwidth_limiter = None
        if self._config.max_bandwidth is not None:
            logger.debug(
                'Setting max_bandwidth to %s', self._config.max_bandwidth)
            leaky_bucket = LeakyBucket(self._config.max_bandwidth)
            self._bandwidth_limiter = BandwidthLimiter(leaky_bucket)

        self._register_handlers()

    def upload(self, fileobj, bucket, key, extra_args=None, subscribers=None):
        """Uploads a file to S3

        :type fileobj: str or seekable file-like object
        :param fileobj: The name of a file to upload or a seekable file-like
            object to upload. It is recommended to use a filename because
            file-like objects may result in higher memory usage.
        :type bucket: str
        :param bucket: The name of the bucket to upload to

        :type key: str
        :param key: The name of the key to upload to

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: list(ibm_s3transfer.subscribers.BaseSubscriber)
        :param subscribers: The list of subscribers to be invoked, in the
            order provided, based on the events emitted during the transfer
            process.

        :rtype: ibm_s3transfer.futures.TransferFuture
        :returns: Transfer future representing the upload
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_UPLOAD_ARGS)
        call_args = CallArgs(
            fileobj=fileobj, bucket=bucket, key=key, extra_args=extra_args,
            subscribers=subscribers)
        extra_main_kwargs = {}
        if self._bandwidth_limiter:
            extra_main_kwargs['bandwidth_limiter'] = self._bandwidth_limiter
        return self._submit_transfer(
            call_args, UploadSubmissionTask, extra_main_kwargs)

    def download(self, bucket, key, fileobj, extra_args=None,
                 subscribers=None):
        """Downloads a file from S3

        :type bucket: str
        :param bucket: The name of the bucket to download from

        :type key: str
        :param key: The name of the key to download from

        :type fileobj: str or seekable file-like object
        :param fileobj: The name of a file to download to or a seekable
            file-like object to download into. It is recommended to use a
            filename because file-like objects may result in higher memory
            usage.

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: list(ibm_s3transfer.subscribers.BaseSubscriber)
        :param subscribers: The list of subscribers to be invoked, in the
            order provided, based on the events emitted during the transfer
            process.

        :rtype: ibm_s3transfer.futures.TransferFuture
        :returns: Transfer future representing the download
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_DOWNLOAD_ARGS)
        call_args = CallArgs(
            bucket=bucket, key=key, fileobj=fileobj, extra_args=extra_args,
            subscribers=subscribers)
        extra_main_kwargs = {'io_executor': self._io_executor}
        if self._bandwidth_limiter:
            extra_main_kwargs['bandwidth_limiter'] = self._bandwidth_limiter
        return self._submit_transfer(
            call_args, DownloadSubmissionTask, extra_main_kwargs)

    def copy(self, copy_source, bucket, key, extra_args=None,
             subscribers=None, source_client=None):
        """Copies a file in S3

        :type copy_source: dict
        :param copy_source: The name of the source bucket, key name of the
            source object, and optional version ID of the source object. The
            dictionary format is:
            ``{'Bucket': 'bucket', 'Key': 'key', 'VersionId': 'id'}``. Note
            that the ``VersionId`` key is optional and may be omitted.

        :type bucket: str
        :param bucket: The name of the bucket to copy to

        :type key: str
        :param key: The name of the key to copy to

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: list(ibm_s3transfer.subscribers.BaseSubscriber)
        :param subscribers: The list of subscribers to be invoked, in the
            order provided, based on the events emitted during the transfer
            process.

        :type source_client: ibm_botocore or ibm_boto3 Client
        :param source_client: The client to be used for operations that may
            happen on the source object. For example, this client is used
            for the head_object call that determines the size of the copy.
            If no client is provided, the transfer manager's client is used
            as the client for the source object.

        :rtype: ibm_s3transfer.futures.TransferFuture
        :returns: Transfer future representing the copy
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        if source_client is None:
            source_client = self._client
        self._validate_all_known_args(extra_args, self.ALLOWED_COPY_ARGS)
        call_args = CallArgs(
            copy_source=copy_source, bucket=bucket, key=key,
            extra_args=extra_args, subscribers=subscribers,
            source_client=source_client)
        return self._submit_transfer(call_args, CopySubmissionTask)

    def delete(self, bucket, key, extra_args=None, subscribers=None):
        """Delete an S3 object.

        :type bucket: str
        :param bucket: The name of the bucket.

        :type key: str
        :param key: The name of the S3 object to delete.

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            DeleteObject call.

        :type subscribers: list
        :param subscribers: A list of subscribers to be invoked during the
            process of the transfer request. Note that the ``on_progress``
            callback is not invoked during object deletion.

        :rtype: ibm_s3transfer.futures.TransferFuture
        :return: Transfer future representing the deletion.
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_DELETE_ARGS)
        call_args = CallArgs(
            bucket=bucket, key=key, extra_args=extra_args,
            subscribers=subscribers)
        return self._submit_transfer(call_args, DeleteSubmissionTask)

    def _validate_all_known_args(self, actual, allowed):
        for kwarg in actual:
            if kwarg not in allowed:
                raise ValueError(
                    "Invalid extra_args key '%s', "
                    "must be one of: %s" % (kwarg, ', '.join(allowed)))

    def _submit_transfer(self, call_args, submission_task_cls,
                         extra_main_kwargs=None):
        if not extra_main_kwargs:
            extra_main_kwargs = {}

        # Create a TransferFuture to return to the user
        transfer_future, components = self._get_future_with_components(
            call_args)

        # Add any provided done callbacks to the created transfer future so
        # they are invoked when the transfer future completes.
        for callback in get_callbacks(transfer_future, 'done'):
            components['coordinator'].add_done_callback(callback)

        # Get the main kwargs needed to instantiate the submission task
        main_kwargs = self._get_submission_task_main_kwargs(
            transfer_future, extra_main_kwargs)

        # Submit a SubmissionTask that will submit all of the necessary
        # tasks needed to complete the S3 transfer.
        self._submission_executor.submit(
            submission_task_cls(
                transfer_coordinator=components['coordinator'],
                main_kwargs=main_kwargs))

        # Increment the unique id counter for future transfer requests
        self._id_counter += 1

        return transfer_future

    def _get_future_with_components(self, call_args):
        transfer_id = self._id_counter
        # Creates a new transfer future along with its components
        transfer_coordinator = TransferCoordinator(transfer_id=transfer_id)
        # Track the transfer coordinator for transfers to manage.
        self._coordinator_controller.add_transfer_coordinator(
            transfer_coordinator)
        # Also make sure that the transfer coordinator is removed once
        # the transfer completes so it does not stick around in memory.
        transfer_coordinator.add_done_callback(
            self._coordinator_controller.remove_transfer_coordinator,
            transfer_coordinator)
        components = {
            'meta': TransferMeta(call_args, transfer_id=transfer_id),
            'coordinator': transfer_coordinator
        }
        transfer_future = TransferFuture(**components)
        return transfer_future, components

    def _get_submission_task_main_kwargs(
            self, transfer_future, extra_main_kwargs):
        main_kwargs = {
            'client': self._client,
            'config': self._config,
            'osutil': self._osutil,
            'request_executor': self._request_executor,
            'transfer_future': transfer_future
        }
        main_kwargs.update(extra_main_kwargs)
        return main_kwargs

    def _register_handlers(self):
        # Register handlers to enable/disable callbacks on uploads.
        event_name = 'request-created.s3'
        self._client.meta.events.register_first(
            event_name, signal_not_transferring,
            unique_id='s3upload-not-transferring')
        self._client.meta.events.register_last(
            event_name, signal_transferring,
            unique_id='s3upload-transferring')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, *args):
        cancel = False
        cancel_msg = ''
        cancel_exc_type = FatalError
        # If an exception was raised inside the context manager, signal to
        # cancel all of the in-progress futures in the shutdown.
        if exc_type:
            cancel = True
            cancel_msg = six.text_type(exc_value)
            if not cancel_msg:
                cancel_msg = repr(exc_value)
            # If it was a KeyboardInterrupt, the cancellation was initiated
            # by the user.
            if isinstance(exc_value, KeyboardInterrupt):
                cancel_exc_type = CancelledError
        self._shutdown(cancel, cancel_msg, cancel_exc_type)

    def shutdown(self, cancel=False, cancel_msg=''):
        """Shutdown the TransferManager

        It will wait until all transfers complete before it completely shuts
        down.

        :type cancel: boolean
        :param cancel: If True, calls TransferFuture.cancel() for all
            in-progress transfers. This is useful if you want the shutdown
            to happen more quickly.

        :type cancel_msg: str
        :param cancel_msg: The message to specify if canceling all
            in-progress transfers.
        """
        self._shutdown(cancel, cancel_msg)

    def _shutdown(self, cancel, cancel_msg, exc_type=CancelledError):
        if cancel:
            # Cancel all in-flight transfers if requested, before waiting
            # for them to complete.
            self._coordinator_controller.cancel(cancel_msg, exc_type)
        try:
            # Wait until there are no more in-progress transfers. This is
            # wrapped in a try statement because this can be interrupted
            # with a KeyboardInterrupt that needs to be caught.
            self._coordinator_controller.wait()
        except KeyboardInterrupt:
            # If no errors were raised in the try block, the cancel should
            # have no coordinators it needs to run cancel on. If there was
            # an error raised in the try statement, we want to cancel all of
            # the in-flight transfers before shutting down to speed that
            # process up.
            self._coordinator_controller.cancel('KeyboardInterrupt()')
            raise
        finally:
            # Shutdown all of the executors.
            self._submission_executor.shutdown()
            self._request_executor.shutdown()
            self._io_executor.shutdown()
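# A minimal usage sketch (not part of the library): it assumes `client` is
# an S3 client compatible with ibm_botocore, and the bucket, key, and local
# paths are placeholders.
def _example_transfer(client):
    # The context manager waits for in-progress transfers on exit and
    # cancels them if an exception (or KeyboardInterrupt) is raised.
    with TransferManager(client) as manager:
        upload_future = manager.upload(
            '/tmp/report.csv', 'example-bucket', 'reports/report.csv',
            extra_args={'ContentType': 'text/csv'})
        upload_future.result()  # block until the upload finishes or raises

        download_future = manager.download(
            'example-bucket', 'reports/report.csv', '/tmp/report-copy.csv')
        download_future.result()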
class TestGetObjectTask(BaseTaskTest):
    def setUp(self):
        super(TestGetObjectTask, self).setUp()
        self.bucket = 'mybucket'
        self.key = 'mykey'
        self.extra_args = {}
        self.callbacks = []
        self.max_attempts = 5
        self.io_executor = BoundedExecutor(1000, 1)
        self.content = b'my content'
        self.stream = six.BytesIO(self.content)
        self.fileobj = WriteCollector()
        self.osutil = OSUtils()
        self.io_chunksize = 64 * (1024 ** 2)
        self.task_cls = GetObjectTask
        self.download_output_manager = DownloadSeekableOutputManager(
            self.osutil, self.transfer_coordinator, self.io_executor)

    def get_download_task(self, **kwargs):
        default_kwargs = {
            'client': self.client,
            'bucket': self.bucket,
            'key': self.key,
            'fileobj': self.fileobj,
            'extra_args': self.extra_args,
            'callbacks': self.callbacks,
            'max_attempts': self.max_attempts,
            'download_output_manager': self.download_output_manager,
            'io_chunksize': self.io_chunksize,
        }
        default_kwargs.update(kwargs)
        self.transfer_coordinator.set_status_to_queued()
        return self.get_task(self.task_cls, main_kwargs=default_kwargs)

    def assert_io_writes(self, expected_writes):
        # Let the io executor process all of the writes before checking
        # what writes were sent to it.
        self.io_executor.shutdown()
        self.assertEqual(self.fileobj.writes, expected_writes)

    def test_main(self):
        self.stubber.add_response(
            'get_object',
            service_response={'Body': self.stream},
            expected_params={'Bucket': self.bucket, 'Key': self.key})
        task = self.get_download_task()
        task()

        self.stubber.assert_no_pending_responses()
        self.assert_io_writes([(0, self.content)])

    def test_extra_args(self):
        self.stubber.add_response(
            'get_object',
            service_response={'Body': self.stream},
            expected_params={
                'Bucket': self.bucket,
                'Key': self.key,
                'Range': 'bytes=0-'
            })
        self.extra_args['Range'] = 'bytes=0-'
        task = self.get_download_task()
        task()

        self.stubber.assert_no_pending_responses()
        self.assert_io_writes([(0, self.content)])

    def test_control_chunk_size(self):
        self.stubber.add_response(
            'get_object',
            service_response={'Body': self.stream},
            expected_params={'Bucket': self.bucket, 'Key': self.key})
        task = self.get_download_task(io_chunksize=1)
        task()

        self.stubber.assert_no_pending_responses()
        expected_contents = []
        for i in range(len(self.content)):
            expected_contents.append((i, bytes(self.content[i:i + 1])))
        self.assert_io_writes(expected_contents)

    def test_start_index(self):
        self.stubber.add_response(
            'get_object',
            service_response={'Body': self.stream},
            expected_params={'Bucket': self.bucket, 'Key': self.key})
        task = self.get_download_task(start_index=5)
        task()

        self.stubber.assert_no_pending_responses()
        self.assert_io_writes([(5, self.content)])

    def test_uses_bandwidth_limiter(self):
        bandwidth_limiter = mock.Mock(BandwidthLimiter)

        self.stubber.add_response(
            'get_object',
            service_response={'Body': self.stream},
            expected_params={'Bucket': self.bucket, 'Key': self.key})
        task = self.get_download_task(bandwidth_limiter=bandwidth_limiter)
        task()

        self.stubber.assert_no_pending_responses()
        self.assertEqual(
            bandwidth_limiter.get_bandwith_limited_stream.call_args_list,
            [mock.call(mock.ANY, self.transfer_coordinator)])

    def test_retries_succeeds(self):
        self.stubber.add_response(
            'get_object',
            service_response={
                'Body': StreamWithError(self.stream, SOCKET_ERROR)
            },
            expected_params={'Bucket': self.bucket, 'Key': self.key})
        self.stubber.add_response(
            'get_object',
            service_response={'Body': self.stream},
            expected_params={'Bucket': self.bucket, 'Key': self.key})
        task = self.get_download_task()
        task()

        # The retryable error should not have affected the bytes placed into
        # the io queue.
        self.stubber.assert_no_pending_responses()
        self.assert_io_writes([(0, self.content)])

    def test_retries_failure(self):
        for _ in range(self.max_attempts):
            self.stubber.add_response(
                'get_object',
                service_response={
                    'Body': StreamWithError(self.stream, SOCKET_ERROR)
                },
                expected_params={'Bucket': self.bucket, 'Key': self.key})

        task = self.get_download_task()
        task()
        self.transfer_coordinator.announce_done()

        # Should have failed out with a RetriesExceededError
        with self.assertRaises(RetriesExceededError):
            self.transfer_coordinator.result()
        self.stubber.assert_no_pending_responses()

    def test_retries_in_middle_of_streaming(self):
        # After the first read, a retryable error will be thrown.
        self.stubber.add_response(
            'get_object',
            service_response={
                'Body': StreamWithError(
                    copy.deepcopy(self.stream), SOCKET_ERROR, 1)
            },
            expected_params={'Bucket': self.bucket, 'Key': self.key})
        self.stubber.add_response(
            'get_object',
            service_response={'Body': self.stream},
            expected_params={'Bucket': self.bucket, 'Key': self.key})
        task = self.get_download_task(io_chunksize=1)
        task()

        self.stubber.assert_no_pending_responses()
        expected_contents = []
        # This is the content initially read in before the retry hit on the
        # second read().
        expected_contents.append((0, bytes(self.content[0:1])))

        # The rest of the content should be the entire set of data
        # partitioned out based on the one-byte stream chunk size. Note the
        # second element in the list should be a copy of the first element
        # since a retryable exception happened in between.
        for i in range(len(self.content)):
            expected_contents.append((i, bytes(self.content[i:i + 1])))
        self.assert_io_writes(expected_contents)

    def test_cancels_out_of_queueing(self):
        self.stubber.add_response(
            'get_object',
            service_response={
                'Body': CancelledStreamWrapper(
                    self.stream, self.transfer_coordinator)
            },
            expected_params={'Bucket': self.bucket, 'Key': self.key})
        task = self.get_download_task()
        task()

        self.stubber.assert_no_pending_responses()
        # Make sure that no contents were added to the queue because the task
        # should have been canceled before trying to add the contents to the
        # io queue.
        self.assert_io_writes([])
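# The retry tests above wrap the response body in StreamWithError, a test
# helper that raises a retryable error after a configurable number of reads.
# A minimal sketch consistent with how it is called here (an assumption, not
# the canonical implementation) could look like this:
class ExampleStreamWithError(object):
    """Raise ``exception_type`` after ``num_reads`` successful reads."""

    def __init__(self, stream, exception_type, num_reads=0):
        self._stream = stream
        self._exception_type = exception_type
        self._num_reads = num_reads
        self._count = 0

    def read(self, n=-1):
        # Raise once ``num_reads`` successful reads have happened; with the
        # default of 0 the very first read fails, which is how the tests
        # simulate an error before any data reaches the io queue.
        if self._count == self._num_reads:
            raise self._exception_type()
        self._count += 1
        return self._stream.read(n)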