def _stream_part( self, upload_id, part_num ):
    """
    Download a part from the source URL, buffer it in memory and then upload it to S3.
    """
    with self._propagate_worker_exception( ):
        try:
            log.info( 'part %i: downloading', part_num )
            buf = self._download_part( part_num )
        finally:
            download_slots_semaphore.release( )
        download_size = buf.tell( )
        self._log_progress( part_num, 'downloaded', download_size )
        if download_size > self.part_size:
            assert False
        elif download_size < self.part_size:
            # a short part means we've reached the end of the source
            done_event.set( )
        if error_event.is_set( ):
            raise BailoutException( )
        # upload a zero-byte part 0 so that empty sources still yield a valid upload
        if download_size > 0 or part_num == 0:
            log.info( 'part %i: uploading', part_num )
            buf.seek( 0 )
            part = self._upload_part( upload_id, part_num, buf )
            upload_size = buf.tell( )
            assert download_size == upload_size == part.size
            self._log_progress( part_num, 'uploaded', upload_size )
        else:
            part = None
        return part_num, part
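# A minimal sketch of the _download_part( ) helper used above, which is not part of
# this excerpt. It assumes an HTTP(S) source that honors Range requests and derives
# the byte range from self.part_size; most error handling is omitted.
import urllib2
from cStringIO import StringIO

def _download_part( self, part_num ):
    start = part_num * self.part_size
    end = start + self.part_size - 1  # Range headers use inclusive bounds
    request = urllib2.Request( self.url )
    request.add_header( 'Range', 'bytes=%d-%d' % ( start, end ) )
    buf = StringIO( )
    try:
        buf.write( urllib2.urlopen( request ).read( ) )
    except urllib2.HTTPError as e:
        if e.code != 416:  # 416 means the range starts past EOF, i.e. an empty part
            raise
    return buf  # the caller reads buf.tell( ) to detect a short or empty part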
def _copy_part( self, upload_id, part_num, src_bucket_name, src_key_name, size ):
    """
    Server-side copy of one part from another S3 key into this upload.
    """
    with self._propagate_worker_exception( ):
        try:
            if error_event.is_set( ):
                raise BailoutException( )
            log.info( 'part %i: copying', part_num )
            start, end = self._get_part_range( part_num )
            end = min( end, size )
            part_size = end - start
            # copy a zero-byte part 0 so that empty sources still yield a valid upload
            if part_size > 0 or part_num == 0:
                url = urlparse( self.url )
                assert url.scheme == 's3'
                assert url.path.startswith( '/' )
                with closing( boto.s3.connect_to_region( self.bucket_location ) ) as s3:
                    bucket = s3.get_bucket( self.bucket_name )
                    upload = self._get_upload( bucket, upload_id )
                    headers = { }
                    if self.sse_key:
                        self._add_encryption_headers( self.sse_key, headers )
                    if part_size == 0:
                        # Since copy_part_from_key doesn't allow empty ranges, we handle
                        # that case by uploading an empty part.
                        assert part_num == 0
                        # noinspection PyTypeChecker
                        key = upload.upload_part_from_file( StringIO( ), part_num + 1,
                                                            headers=headers )
                    else:
                        if self.src_sse_key:
                            self._add_encryption_headers( self.src_sse_key, headers,
                                                          for_copy=True )
                        key = upload.copy_part_from_key( src_bucket_name=src_bucket_name,
                                                         src_key_name=src_key_name,
                                                         part_num=part_num + 1,
                                                         start=start,
                                                         end=end - 1,
                                                         headers=headers )
                        # somehow copy_part_from_key doesn't set the key size
                        key.size = part_size
                    assert key.size == part_size
                    self._log_progress( part_num, 'copied', part_size )
                    return part_num, self._part_for_key( bucket, part_num, key )
            else:
                done_event.set( )
                return part_num, None
        finally:
            download_slots_semaphore.release( )
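# A plausible sketch of the _get_part_range( ) helper used above, which is not shown
# in this excerpt. It assumes half-open [start, end) byte ranges of self.part_size
# bytes each, consistent with the end = min( end, size ) clamp and the inclusive
# end - 1 passed to copy_part_from_key above.
def _get_part_range( self, part_num ):
    start = part_num * self.part_size
    return start, start + self.part_size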
def _log_progress( self, part_num, task, size ):
    # 'size' rather than 'download_size': callers pass download, upload and copy sizes
    log.info( 'part %i: %s %sB (%i bytes)',
              part_num, task, bytes2human( size ), size )
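# A minimal sketch of a bytes2human( ) helper matching the '%sB (%i bytes)' format
# above, i.e. returning e.g. '5.0 Mi' so the log line reads '5.0 MiB (5242880 bytes)'.
# The actual helper is not part of this excerpt.
def bytes2human( n ):
    for prefix in ( '', 'Ki', 'Mi', 'Gi', 'Ti' ):
        if n < 1024:
            return '%.1f %s' % ( n, prefix )
        n /= 1024.0
    return '%.1f Pi' % n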
def run( self ):
    """
    Stream a URL to a key in an S3 bucket using a parallelized multi-part upload.
    """
    global download_slots_semaphore, done_event, error_event
    download_slots_semaphore = multiprocessing.Semaphore( self.download_slots )
    done_event = multiprocessing.Event( )
    error_event = multiprocessing.Event( )
    upload_id, completed_parts = self._prepare_upload( )
    part_nums = itertools.count( )
    num_workers = self.download_slots + self.upload_slots
    workers = multiprocessing.Pool( num_workers, _init_worker )

    # apply_async callback, invoked in the parent process as each worker finishes
    def complete_part( ( part_num_, part ) ):
        if part is not None:
            assert part_num_ not in completed_parts
            completed_parts[ part_num_ ] = part

    if self.url.startswith( 's3:' ):
        log.info( 'Copying %s' % self.url )
        url = urlparse( self.url )
        assert url.scheme == 's3'
        assert url.path.startswith( '/' )
        with closing( boto.s3.connect_to_region( self.bucket_location ) ) as s3:
            src_bucket = s3.get_bucket( url.netloc )
            headers = { }
            if self.src_sse_key:
                self._add_encryption_headers( self.src_sse_key, headers )
            src_key = src_bucket.get_key( url.path[ 1: ], headers=headers )
            worker_func = self._copy_part
            kwargs = dict( src_bucket_name=src_bucket.name,
                           src_key_name=src_key.name,
                           size=src_key.size )
    else:
        log.info( 'Streaming %s' % self.url )
        kwargs = { }
        worker_func = self._stream_part
    try:
        while not done_event.is_set( ):
            if error_event.is_set( ):
                raise WorkerException( )
            part_num = next( part_nums )
            if part_num in completed_parts:
                assert self.resume
                log.info( 'part %i: exists', part_num )
            else:
                download_slots_semaphore.acquire( )
                log.info( 'part %i: dispatching', part_num )
                workers.apply_async( func=worker_func,
                                     args=[ upload_id, part_num ],
                                     kwds=kwargs,
                                     callback=complete_part )
        workers.close( )
        workers.join( )
        if error_event.is_set( ):
            raise WorkerException( )
        self._sanity_check( completed_parts )
        with closing( boto.s3.connect_to_region( self.bucket_location ) ) as s3:
            bucket = s3.get_bucket( self.bucket_name )
            upload = self._get_upload( bucket, upload_id,
                                       parts=completed_parts.values( ) )
            upload.complete_upload( )
        log.info( 'Completed %s' % self.url )
    except WorkerException:
        workers.close( )
        workers.join( )
        raise
    except ( Exception, KeyboardInterrupt ):
        workers.close( )
        workers.terminate( )
        raise
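# A plausible sketch of the _init_worker initializer passed to the Pool in run( )
# above; the actual initializer is not part of this excerpt. Having child processes
# ignore SIGINT is a common multiprocessing idiom: it ensures that Ctrl-C raises
# KeyboardInterrupt only in the parent, whose except clause then terminates the pool.
import signal

def _init_worker( ):
    signal.signal( signal.SIGINT, signal.SIG_IGN )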