Example #1
 def _stream_part( self, upload_id, part_num ):
     """
     Download a part from the source URL, buffer it in memory and then upload it to S3.
     """
     with self._propagate_worker_exception( ):
         try:
             log.info( 'part %i: downloading', part_num )
             buf = self._download_part( part_num )
         finally:
             download_slots_semaphore.release( )
         download_size = buf.tell( )
         self._log_progress( part_num, 'downloaded', download_size )
         # A part smaller than the configured part size means the source is
         # exhausted; a larger one should be impossible.
         assert download_size <= self.part_size
         if download_size < self.part_size:
             done_event.set( )
         if error_event.is_set( ): raise BailoutException( )
         # S3 requires at least one part per multipart upload, so an empty
         # part 0 is still uploaded when the source turns out to be empty.
         if download_size > 0 or part_num == 0:
             log.info( 'part %i: uploading', part_num )
             buf.seek( 0 )
             part = self._upload_part( upload_id, part_num, buf )
             upload_size = buf.tell( )
             assert download_size == upload_size == part.size
             self._log_progress( part_num, 'uploaded', upload_size )
         else:
             part = None
         return part_num, part
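The `_download_part` helper that fills the buffer is not part of this example. As a rough sketch of the contract it has to satisfy, a hypothetical implementation could fetch the part's byte range over HTTP and leave the buffer's position at its end, so that `buf.tell( )` above reports the downloaded size; the range arithmetic, the use of urllib2 and the reliance on HTTP Range support are assumptions, not the project's actual code.

 import urllib2
 from cStringIO import StringIO

 def _download_part( self, part_num ):
     # Hypothetical sketch only: fetch one part's byte range from self.url
     # into an in-memory buffer, assuming the server honours Range requests
     # and that parts are laid out contiguously in units of self.part_size.
     start = part_num * self.part_size
     request = urllib2.Request( self.url )
     request.add_header( 'Range', 'bytes=%i-%i' % ( start, start + self.part_size - 1 ) )
     buf = StringIO( )
     buf.write( urllib2.urlopen( request ).read( ) )
     # The write leaves the position at the end of the data, so the caller's
     # buf.tell( ) yields the number of bytes that were downloaded.
     return buf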
Example #2
 def _copy_part( self, upload_id, part_num, src_bucket_name, src_key_name, size ):
     with self._propagate_worker_exception( ):
         try:
             if error_event.is_set( ): raise BailoutException( )
             log.info( 'part %i: copying', part_num )
             start, end = self._get_part_range( part_num )
             end = min( end, size )
             part_size = end - start
             if part_size > 0 or part_num == 0:
                 url = urlparse( self.url )
                 assert url.scheme == 's3'
                 assert url.path.startswith( '/' )
                 with closing( boto.s3.connect_to_region( self.bucket_location ) ) as s3:
                     bucket = s3.get_bucket( self.bucket_name )
                     upload = self._get_upload( bucket, upload_id )
                     headers = { }
                     if self.sse_key:
                         self._add_encryption_headers( self.sse_key, headers )
                     if part_size == 0:
                         # Since copy_part_from_key doesn't allow empty ranges, we handle that
                         # case by uploading an empty part.
                         assert part_num == 0
                         # noinspection PyTypeChecker
                         key = upload.upload_part_from_file(
                             StringIO( ), part_num + 1,
                             headers=headers )
                     else:
                         if self.src_sse_key:
                             self._add_encryption_headers( self.src_sse_key, headers,
                                                           for_copy=True )
                         key = upload.copy_part_from_key(
                             src_bucket_name=src_bucket_name,
                             src_key_name=src_key_name,
                             part_num=part_num + 1,
                             start=start,
                             end=end - 1,
                             headers=headers )
                         # somehow copy_part_from_key doesn't set the key size
                         key.size = part_size
                 assert key.size == part_size
                 self._log_progress( part_num, 'copied', part_size )
                 return part_num, self._part_for_key( bucket, part_num, key )
             else:
                 done_event.set( )
                 return part_num, None
         finally:
             download_slots_semaphore.release( )
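`_get_part_range` is not shown either; assuming parts are contiguous with a fixed `self.part_size`, it reduces to the arithmetic sketched below. The half-open result matches how `_copy_part` above clamps `end` to the object size and passes the inclusive `end - 1` to `copy_part_from_key`.

 def _get_part_range( self, part_num ):
     # Hypothetical sketch: map a zero-based part number to the half-open
     # [ start, end ) byte range it covers, given a fixed self.part_size.
     start = part_num * self.part_size
     return start, start + self.part_size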
Example #3
 def _log_progress( self, part_num, task, download_size ):
     log.info( 'part %i: %s %sB (%i bytes)',
               part_num, task, bytes2human( download_size ), download_size )
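`bytes2human` comes from elsewhere; a minimal stand-in that fits the `'%sB (%i bytes)'` format string could look like the following, with binary (1024-based) units as an assumption.

 def bytes2human( n ):
     # Hypothetical stand-in: scale a byte count to a human-readable figure
     # with a binary suffix, e.g. 5242880 -> '5.0Mi', which _log_progress
     # turns into '5.0MiB' by appending the literal 'B'.
     n = float( n )
     for suffix in ( '', 'Ki', 'Mi', 'Gi', 'Ti' ):
         if n < 1024 or suffix == 'Ti':
             return '%.1f%s' % ( n, suffix )
         n /= 1024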
Example #4
    def run( self ):
        """
        Stream a URL to a key in an S3 bucket using a parallelized multi-part upload.
        """
        global download_slots_semaphore, done_event, error_event

        download_slots_semaphore = multiprocessing.Semaphore( self.download_slots )
        done_event = multiprocessing.Event( )
        error_event = multiprocessing.Event( )

        upload_id, completed_parts = self._prepare_upload( )
        part_nums = itertools.count( )
        num_workers = self.download_slots + self.upload_slots
        workers = multiprocessing.Pool( num_workers, _init_worker )

        # Each worker returns ( part_num, part ); the Python 2 tuple parameter
        # unpacks that pair when the pool invokes this callback.
        def complete_part( ( part_num_, part ) ):
            if part is not None:
                assert part_num_ not in completed_parts
                completed_parts[ part_num_ ] = part

        if self.url.startswith( 's3:' ):
            log.info( 'Copying %s', self.url )
            url = urlparse( self.url )
            assert url.scheme == 's3'
            assert url.path.startswith( '/' )
            with closing( boto.s3.connect_to_region( self.bucket_location ) ) as s3:
                src_bucket = s3.get_bucket( url.netloc )
                headers = { }
                if self.src_sse_key:
                    self._add_encryption_headers( self.src_sse_key, headers )
                src_key = src_bucket.get_key( url.path[ 1: ], headers=headers )
            worker_func = self._copy_part
            kwargs = dict(
                src_bucket_name=src_bucket.name,
                src_key_name=src_key.name,
                size=src_key.size )
        else:
            log.info( 'Streaming %s', self.url )
            kwargs = { }
            worker_func = self._stream_part

        try:
            while not done_event.is_set( ):
                if error_event.is_set( ):
                    raise WorkerException( )
                part_num = next( part_nums )
                if part_num in completed_parts:
                    assert self.resume
                    log.info( 'part %i: exists', part_num )
                else:
                    # Wait for a free download slot; the worker releases the
                    # semaphore as soon as its download or copy is finished.
                    download_slots_semaphore.acquire( )
                    log.info( 'part %i: dispatching', part_num )
                    workers.apply_async( func=worker_func,
                                         args=[ upload_id, part_num ],
                                         kwds=kwargs,
                                         callback=complete_part )
            workers.close( )
            workers.join( )
            if error_event.is_set( ):
                raise WorkerException( )
            self._sanity_check( completed_parts )
            with closing( boto.s3.connect_to_region( self.bucket_location ) ) as s3:
                bucket = s3.get_bucket( self.bucket_name )
                upload = self._get_upload( bucket, upload_id, parts=completed_parts.values( ) )
                upload.complete_upload( )
            log.info( 'Completed %s', self.url )
        except WorkerException:
            # A worker already signalled failure; let the remaining tasks
            # drain before re-raising.
            workers.close( )
            workers.join( )
            raise
        except (Exception, KeyboardInterrupt):
            # Unexpected error or Ctrl-C in the parent process: shut the pool
            # down hard instead of waiting for pending parts.
            workers.close( )
            workers.terminate( )
            raise
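`_init_worker`, passed as the pool initializer, is not included in the example. A common pattern that fits the error handling above is to make every worker ignore SIGINT, so that Ctrl-C raises `KeyboardInterrupt` only in the parent, which then falls through to `workers.terminate( )` in the final `except` clause; the body below is a sketch of that pattern, not necessarily the project's implementation.

    import signal

    def _init_worker( ):
        # Sketch under the assumption above: pool workers ignore SIGINT so
        # that KeyboardInterrupt surfaces only in the parent process.
        signal.signal( signal.SIGINT, signal.SIG_IGN )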