Example #1
 def multi_upload(self):
     """
     Performs multipart uploads.  It initiates the multipart upload.
     It creates a queue ``part_queue`` which is directly responsible
     with controlling the progress of the multipart upload.  It then
     creates ``UploadPartTasks`` for threads to run via the
     ``executer``.  This fucntion waits for all of the parts in the
     multipart upload to finish, and then it completes the multipart
     upload.  This method waits on its parts to finish.  So, threads
     are required to process the parts for this function to complete.
     """
     part_queue = NoBlockQueue(self.interrupt)
     complete_upload_queue = Queue.PriorityQueue()
     part_counter = MultiCounter()
     counter_lock = threading.Lock()
     bucket, key = find_bucket_key(self.dest)
     params = {'endpoint': self.endpoint, 'bucket': bucket, 'key': key}
     if self.parameters['acl']:
         params['acl'] = self.parameters['acl'][0]
     if self.parameters['guess_mime_type']:
         self._inject_content_type(params, self.src)
     response_data, http = operate(self.service, 'CreateMultipartUpload',
                                   params)
     upload_id = response_data['UploadId']
     size_uploads = self.chunksize
     num_uploads = int(math.ceil(self.size/float(size_uploads)))
     for i in range(1, (num_uploads + 1)):
         part_info = (self, upload_id, i, size_uploads)
         part_queue.put(part_info)
         task = UploadPartTask(session=self.session, executer=self.executer,
                               part_queue=part_queue,
                               dest_queue=complete_upload_queue,
                               region=self.region,
                               printQueue=self.printQueue,
                               interrupt=self.interrupt,
                               part_counter=part_counter,
                               counter_lock=counter_lock)
         self.executer.submit(task)
     part_queue.join()
     # The following ensures that if the multipart upload is in progress,
     # all part uploads finish before aborting or completing.  This
     # really only applies when an interrupt signal is sent because the
     # ``part_queue.join()`` ensures this if the process is not
     # interrupted.
     while part_counter.count:
         time.sleep(0.1)
     parts_list = []
     while not complete_upload_queue.empty():
         part = complete_upload_queue.get()
         parts_list.append(part[1])
     if len(parts_list) == num_uploads:
         parts = {'Parts': parts_list}
         params = {'endpoint': self.endpoint, 'bucket': bucket, 'key': key,
                   'upload_id': upload_id, 'multipart_upload': parts}
         operate(self.service, 'CompleteMultipartUpload', params)
     else:
         abort_params = {'endpoint': self.endpoint, 'bucket': bucket,
                         'key': key, 'upload_id': upload_id}
         operate(self.service, 'AbortMultipartUpload', abort_params)
         raise Exception("Not all parts were uploaded; the multipart "
                         "upload was aborted.")
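
The part loop in this example numbers parts from 1 and sizes each part off ``self.chunksize``; the part count is the ceiling of the object size divided by the chunk size, so the final part may be short. A self-contained sketch of that arithmetic (``enumerate_parts`` and its arguments are illustrative names, not taken from the snippet):

import math

def enumerate_parts(total_size, chunksize):
    """Yield (part_number, start, end) tuples for a multipart transfer.

    Part numbers are 1-based, matching the loop in ``multi_upload``;
    the final part may be smaller than ``chunksize``.
    """
    num_parts = int(math.ceil(total_size / float(chunksize)))
    for part_number in range(1, num_parts + 1):
        start = (part_number - 1) * chunksize
        end = min(start + chunksize, total_size)
        yield part_number, start, end

# Example: a 25 MB object with 10 MB parts yields 3 parts,
# the last covering only the remaining 5 MB.
parts = list(enumerate_parts(25 * 1024 * 1024, 10 * 1024 * 1024))
assert len(parts) == 3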
Example #2
 def multi_download(self):
     """
     This performs the multipart download.  It assigns ranges to get from
     s3 of particular object to a task.It creates a queue ``part_queue``
     which is directly responsible with controlling the progress of the
     multipart download.  It then creates ``DownloadPartTasks`` for
     threads to run via the ``executer``. This fucntion waits
     for all of the parts in the multipart download to finish, and then
     the last modification time is changed to the last modified time
     of the s3 object.  This method waits on its parts to finish.
     So, threads are required to process the parts for this function
     to complete.
     """
     part_queue = NoBlockQueue(self.interrupt)
     dest_queue = NoBlockQueue(self.interrupt)
     part_counter = MultiCounter()
     write_lock = threading.Lock()
     counter_lock = threading.Lock()
     d = os.path.dirname(self.dest)
     try:
         if not os.path.exists(d):
             os.makedirs(d)
     except Exception:
         # Ignore failures here (e.g., the directory was created by
         # another thread in the meantime); opening the file below will
         # surface real problems.
         pass
     size_uploads = self.chunksize
     num_uploads = int(self.size / size_uploads)
     with open(self.dest, 'wb') as f:
         for i in range(num_uploads):
             part = (self, i, size_uploads)
             part_queue.put(part)
             task = DownloadPartTask(session=self.session,
                                     executer=self.executer,
                                     part_queue=part_queue,
                                     dest_queue=dest_queue,
                                     f=f,
                                     region=self.region,
                                     printQueue=self.printQueue,
                                     write_lock=write_lock,
                                     part_counter=part_counter,
                                     counter_lock=counter_lock)
             self.executer.submit(task)
         part_queue.join()
         # The following ensures that if the multipart download is
         # in progress, all part downloads finish before releasing
         # the file handle.  This really only applies when an interrupt
         # signal is sent because the ``part_queue.join()`` ensures this
         # if the process is not interrupted.
         while part_counter.count:
             time.sleep(0.1)
     part_list = []
     while not dest_queue.empty():
         part = dest_queue.get()
         part_list.append(part)
     if len(part_list) != num_uploads:
         raise Exception("Not all parts were downloaded; the local file "
                         "is incomplete.")
     last_update_tuple = self.last_update.timetuple()
     mod_timestamp = time.mktime(last_update_tuple)
     os.utime(self.dest, (int(mod_timestamp), int(mod_timestamp)))
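
The final three lines stamp the S3 object's last-modified time onto the local file. A standalone sketch of that conversion, using a throwaway file and a made-up timestamp (note that ``time.mktime`` interprets the time tuple as local time, exactly as the snippet does):

import datetime
import os
import time

def set_mtime_from_datetime(path, last_update):
    """Set a file's access and modification times from a datetime,
    mirroring the tail of ``multi_download``."""
    mod_timestamp = time.mktime(last_update.timetuple())
    os.utime(path, (int(mod_timestamp), int(mod_timestamp)))

# Example with a temporary file and a hypothetical timestamp:
with open('example.tmp', 'w') as f:
    f.write('data')
set_mtime_from_datetime('example.tmp', datetime.datetime(2013, 6, 1, 12, 0))
print(os.path.getmtime('example.tmp'))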
Example #3
class Executer(object):
    """
    This class is in charge of all of the threads.  It starts up the threads
    and cleans up the threads when done.  The two type of threads the
    ``Executer``runs is a worker and a print thread.
    """
    def __init__(self, done, num_threads, timeout,
                 print_queue, quiet, interrupt, max_multi, max_queue_size):
        self.queue = None
        self.done = done
        self.num_threads = num_threads
        self.timeout = timeout
        self.print_queue = print_queue
        self.quiet = quiet
        self.interrupt = interrupt
        self.threads_list = []
        self.max_multi = max_multi
        self.multi_lock = threading.Lock()
        self.multi_counter = MultiCounter()
        self._max_queue_size = max_queue_size

    def start(self):
        self.queue = NoBlockQueue(self.interrupt, maxsize=self._max_queue_size)
        self.multi_counter.count = 0
        self.print_thread = PrintThread(self.print_queue, self.done,
                                        self.quiet, self.interrupt,
                                        self.timeout)
        self.print_thread.setDaemon(True)
        self.threads_list.append(self.print_thread)
        self.print_thread.start()
        for i in range(self.num_threads):
            worker = Worker(queue=self.queue, done=self.done,
                            timeout=self.timeout, multi_lock=self.multi_lock,
                            multi_counter=self.multi_counter,
                            max_multi=self.max_multi)
            worker.setDaemon(True)
            self.threads_list.append(worker)
            worker.start()

    def submit(self, task):
        """
        This is the function used to submit a task to the ``Executer``.
        """
        self.queue.put(task)

    def wait(self):
        """
        This is the function used to wait on all of the tasks to finish
        in the ``Executer``.
        """
        self.queue.join()

    def join(self):
        """
        This is used to clean up the ``Executer``.
        """
        for thread in self.threads_list:
            thread.join()
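
The lifecycle this class exposes is: ``start()`` spins up daemon threads, ``submit()`` enqueues work, ``wait()`` blocks until the queue drains, and ``join()`` reaps the threads. A simplified, self-contained analog using only the standard library (``MiniExecuter`` is illustrative; it stands in for the snippet's ``NoBlockQueue``, ``Worker``, and ``PrintThread`` machinery, which is defined elsewhere):

import threading

try:
    import queue  # Python 3
except ImportError:
    import Queue as queue  # Python 2

class MiniExecuter(object):
    """A stripped-down sketch of the start/submit/wait/join lifecycle."""
    def __init__(self, num_threads):
        self.num_threads = num_threads
        self.queue = queue.Queue()
        self.done = threading.Event()
        self.threads_list = []

    def start(self):
        for _ in range(self.num_threads):
            worker = threading.Thread(target=self._run)
            worker.daemon = True
            self.threads_list.append(worker)
            worker.start()

    def _run(self):
        # Timeout-based polling, like the snippet's ``timeout`` parameter:
        # wake up periodically to notice that ``done`` has been set.
        while not self.done.is_set():
            try:
                task = self.queue.get(timeout=0.2)
            except queue.Empty:
                continue
            try:
                task()
            finally:
                self.queue.task_done()

    def submit(self, task):
        self.queue.put(task)

    def wait(self):
        self.queue.join()

    def join(self):
        self.done.set()
        for thread in self.threads_list:
            thread.join()

executer = MiniExecuter(num_threads=2)
executer.start()
executer.submit(lambda: None)  # tasks are just callables here
executer.wait()
executer.join()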
Example #4
class Executer(object):
    """
    This class is in charge of all of the threads.  It starts up the threads
    and cleans up the threads when done.  The two type of threads the
    ``Executer``runs is a worker and a print thread.
    """
    def __init__(self, done, num_threads, timeout, printQueue, quiet,
                 interrupt, max_multi):
        self.queue = None
        self.done = done
        self.num_threads = num_threads
        self.timeout = timeout
        self.printQueue = printQueue
        self.quiet = quiet
        self.interrupt = interrupt
        self.threads_list = []
        self.max_multi = max_multi
        self.multi_lock = threading.Lock()
        self.multi_counter = MultiCounter()

    def start(self):
        self.queue = NoBlockQueue(self.interrupt)
        self.multi_counter.count = 0
        self.print_thread = PrintThread(self.printQueue, self.done, self.quiet,
                                        self.interrupt, self.timeout)
        self.print_thread.setDaemon(True)
        self.threads_list.append(self.print_thread)
        self.print_thread.start()
        for i in range(self.num_threads):
            worker = Worker(queue=self.queue,
                            done=self.done,
                            timeout=self.timeout,
                            multi_lock=self.multi_lock,
                            multi_counter=self.multi_counter,
                            max_multi=self.max_multi)
            worker.setDaemon(True)
            self.threads_list.append(worker)
            worker.start()

    def submit(self, task):
        """
        This is the function used to submit a task to the ``Executer``.
        """
        self.queue.put(task)

    def wait(self):
        """
        This is the function used to wait on all of the tasks to finish
        in the ``Executer``.
        """
        self.queue.join()

    def join(self):
        """
        This is used to clean up the ``Executer``.
        """
        for thread in self.threads_list:
            thread.join()
Example #5
 def multi_download(self):
     """
     This performs the multipart download.  It assigns ranges to get from
     s3 of particular object to a task.It creates a queue ``part_queue``
     which is directly responsible with controlling the progress of the
     multipart download.  It then creates ``DownloadPartTasks`` for
     threads to run via the ``executer``. This fucntion waits
     for all of the parts in the multipart download to finish, and then
     the last modification time is changed to the last modified time
     of the s3 object.  This method waits on its parts to finish.
     So, threads are required to process the parts for this function
     to complete.
     """
     part_queue = NoBlockQueue(self.interrupt)
     dest_queue = NoBlockQueue(self.interrupt)
     part_counter = MultiCounter()
     write_lock = threading.Lock()
     counter_lock = threading.Lock()
     d = os.path.dirname(self.dest)
     try:
         if not os.path.exists(d):
             os.makedirs(d)
     except Exception:
         # Ignore failures here (e.g., the directory was created by
         # another thread in the meantime); opening the file below will
         # surface real problems.
         pass
     size_uploads = self.chunksize
     num_uploads = int(self.size/size_uploads)
     with open(self.dest, 'wb') as f:
         for i in range(num_uploads):
             part = (self, i, size_uploads)
             part_queue.put(part)
             task = DownloadPartTask(session=self.session,
                                     executer=self.executer,
                                     part_queue=part_queue,
                                     dest_queue=dest_queue,
                                     f=f, region=self.region,
                                     printQueue=self.printQueue,
                                     write_lock=write_lock,
                                     part_counter=part_counter,
                                     counter_lock=counter_lock)
             self.executer.submit(task)
         part_queue.join()
         # The following ensures that if the multipart download is
         # in progress, all part downloads finish before releasing
         # the file handle.  This really only applies when an interrupt
         # signal is sent because the ``part_queue.join()`` ensures this
         # if the process is not interrupted.
         while part_counter.count:
             time.sleep(0.1)
     part_list = []
     while not dest_queue.empty():
         part = dest_queue.get()
         part_list.append(part)
     if len(part_list) != num_uploads:
         raise Exception("Not all parts were downloaded; the local file "
                         "is incomplete.")
     last_update_tuple = self.last_update.timetuple()
     mod_timestamp = time.mktime(last_update_tuple)
     os.utime(self.dest, (int(mod_timestamp), int(mod_timestamp)))
Example #6
class Executer(object):
    """
    This class is in charge of all of the threads.  It starts up the threads
    and cleans up the threads when done.  The two type of threads the
    ``Executer``runs is a worker and a print thread.
    """
    def __init__(self, done, num_threads, result_queue,
                 quiet, interrupt, max_queue_size):
        self.queue = None
        self.done = done
        self.num_threads = num_threads
        self.result_queue = result_queue
        self.quiet = quiet
        self.interrupt = interrupt
        self.threads_list = []
        self._max_queue_size = max_queue_size
        self.print_thread = None

    @property
    def num_tasks_failed(self):
        tasks_failed = 0
        if self.print_thread is not None:
            tasks_failed = self.print_thread.num_errors_seen
        return tasks_failed

    def start(self):
        self.print_thread = PrintThread(self.result_queue, self.done,
                                        self.quiet, self.interrupt)
        self.print_thread.daemon = True
        self.queue = NoBlockQueue(self.interrupt, maxsize=self._max_queue_size)
        self.threads_list.append(self.print_thread)
        self.print_thread.start()
        for i in range(self.num_threads):
            worker = Worker(queue=self.queue, done=self.done)
            worker.setDaemon(True)
            self.threads_list.append(worker)
            worker.start()

    def submit(self, task):
        """
        This is the function used to submit a task to the ``Executer``.
        """
        LOGGER.debug("Submitting task: %s", task)
        self.queue.put(task)

    def wait(self):
        """
        This is the function used to wait on all of the tasks to finish
        in the ``Executer``.
        """
        self.queue.join()

    def join(self):
        """
        This is used to clean up the ``Executer``.
        """
        self.result_queue.put(QUEUE_END_SENTINEL)
        for i in range(self.num_threads):
            self.queue.put(QUEUE_END_SENTINEL)

        for thread in self.threads_list:
            thread.join()
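
Unlike the timeout-based variants above, this ``join`` shuts down by enqueuing one ``QUEUE_END_SENTINEL`` per worker, so each worker dequeues exactly one sentinel and exits cleanly. A minimal, self-contained sketch of that pattern (the sentinel object and worker function here are illustrative):

import threading

try:
    import queue  # Python 3
except ImportError:
    import Queue as queue  # Python 2

QUEUE_END_SENTINEL = object()

def worker(task_queue):
    while True:
        task = task_queue.get()
        try:
            if task is QUEUE_END_SENTINEL:
                return  # one sentinel stops exactly one worker
            task()
        finally:
            task_queue.task_done()

num_threads = 3
task_queue = queue.Queue()
threads = [threading.Thread(target=worker, args=(task_queue,))
           for _ in range(num_threads)]
for thread in threads:
    thread.start()
task_queue.put(lambda: None)
task_queue.join()                      # wait for real work to finish
for _ in range(num_threads):           # then push one sentinel per worker
    task_queue.put(QUEUE_END_SENTINEL)
for thread in threads:
    thread.join()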
Example #7
class S3Handler(object):
    """
    This class sets up the process to perform the tasks sent to it.  It
    sources the ``self.executer`` from which threads inside the
    class pull tasks from to complete.
    """
    def __init__(self, session, params, multi_threshold=MULTI_THRESHOLD,
                 chunksize=CHUNKSIZE):
        self.session = session
        self.done = threading.Event()
        self.interrupt = threading.Event()
        self.result_queue = NoBlockQueue()
        self.params = {'dryrun': False, 'quiet': False, 'acl': None,
                       'guess_mime_type': True, 'sse': False,
                       'storage_class': None, 'website_redirect': None,
                       'content_type': None, 'cache_control': None,
                       'content_disposition': None, 'content_encoding': None,
                       'content_language': None, 'expires': None,
                       'grants': None}
        self.params['region'] = params['region']
        for key in self.params.keys():
            if key in params:
                self.params[key] = params[key]
        self.multi_threshold = multi_threshold
        self.chunksize = chunksize
        self.executer = Executer(
            done=self.done, num_threads=NUM_THREADS,
            timeout=QUEUE_TIMEOUT_GET, result_queue=self.result_queue,
            quiet=self.params['quiet'], interrupt=self.interrupt,
            max_queue_size=MAX_QUEUE_SIZE,
        )
        self._multipart_uploads = []
        self._multipart_downloads = []

    def call(self, files):
        """
        This function pulls a ``FileInfo`` or ``TaskInfo`` object from
        a list ``files``.  Each object is then deemed if it will be a
        multipart operation and add the necessary attributes if so.  Each
        object is then wrapped with a ``BasicTask`` object which is
        essentially a thread of execution for a thread to follow.  These
        tasks are then submitted to the main executer.
        """
        self.done.clear()
        self.interrupt.clear()
        try:
            self.executer.start()
            total_files, total_parts = self._enqueue_tasks(files)
            self.executer.print_thread.set_total_files(total_files)
            self.executer.print_thread.set_total_parts(total_parts)
            self.executer.wait()
            self.result_queue.join()

        except Exception as e:
            LOGGER.debug('Exception caught during task execution: %s',
                         str(e), exc_info=True)
            self.result_queue.put({'message': str(e), 'error': True})
        except KeyboardInterrupt:
            self.interrupt.set()
            self.result_queue.put({'message': "Cleaning up. Please wait...",
                                   'error': False})
        self._shutdown()
        return self.executer.num_tasks_failed

    def _shutdown(self):
        # self.done will tell threads to shut down.
        self.done.set()
        # This will wait until all the threads are joined.
        self.executer.join()
        # And finally we need to make a pass through all the existing
        # multipart uploads and abort any pending multipart uploads.
        self._abort_pending_multipart_uploads()
        self._remove_pending_downloads()

    def _abort_pending_multipart_uploads(self):
        # For the purpose of aborting uploads, we consider any
        # upload context with an upload id.
        for upload, filename in self._multipart_uploads:
            if upload.is_cancelled():
                try:
                    upload_id = upload.wait_for_upload_id()
                except tasks.UploadCancelledError:
                    pass
                else:
                    # This means that the upload went from STARTED -> CANCELLED.
                    # This could happen if a part thread decided to cancel the
                    # upload.  We need to explicitly abort the upload here.
                    self._cancel_upload(upload_id, filename)
            upload.cancel_upload(self._cancel_upload, args=(filename,))

    def _remove_pending_downloads(self):
        # The downloads case is easier than the uploads case because we don't
        # need to make any service calls.  To clean up properly we just need
        # to go through the multipart downloads that were in progress but
        # cancelled and remove the local file.
        for context, local_filename in self._multipart_downloads:
            if (context.is_cancelled() or context.is_started()) and \
                    os.path.exists(local_filename):
                # The file is in an inconsistent state (not all the parts
                # were written to the file) so we should remove the
                # local file rather than leave it in a bad state.  We don't
                # want to remove the files if the download has *not* been
                # started because we haven't touched the file yet, so it's
                # better to leave the old version of the file rather than
                # deleting the file entirely.
                os.remove(local_filename)

    def _cancel_upload(self, upload_id, filename):
        bucket, key = find_bucket_key(filename.dest)
        params = {
            'bucket': bucket,
            'key': key,
            'upload_id': upload_id,
            'endpoint': filename.endpoint,
        }
        LOGGER.debug("Aborting multipart upload for: %s", key)
        response_data, http = operate(
            filename.service, 'AbortMultipartUpload', params)

    def _enqueue_tasks(self, files):
        total_files = 0
        total_parts = 0
        for filename in files:
            num_uploads = 1
            is_multipart_task = self._is_multipart_task(filename)
            too_large = False
            if hasattr(filename, 'size'):
                too_large = filename.size > MAX_UPLOAD_SIZE
            if too_large and filename.operation_name == 'upload':
                warning = "Warning %s exceeds 5 TB and upload is " \
                            "being skipped" % relative_path(filename.src)
                self.result_queue.put({'message': warning, 'error': True})
            elif is_multipart_task and not self.params['dryrun']:
                # If we're in dryrun mode, then we don't need the
                # real multipart tasks.  We can just use a BasicTask
                # in the else clause below, which will print out the
                # fact that it's transferring a file rather than
                # the specific part tasks required to perform the
                # transfer.
                num_uploads = self._enqueue_multipart_tasks(filename)
            else:
                task = tasks.BasicTask(
                    session=self.session, filename=filename,
                    parameters=self.params,
                    result_queue=self.result_queue)
                self.executer.submit(task)
            total_files += 1
            total_parts += num_uploads
        return total_files, total_parts

    def _is_multipart_task(self, filename):
        # First we need to determine if it's an operation that even
        # qualifies for multipart upload.
        if hasattr(filename, 'size'):
            above_multipart_threshold = filename.size > self.multi_threshold
            if above_multipart_threshold:
                return filename.operation_name in ('upload', 'download',
                                                   'move', 'copy')
        return False

    def _enqueue_multipart_tasks(self, filename):
        num_uploads = 1
        if filename.operation_name == 'upload':
            num_uploads = self._enqueue_multipart_upload_tasks(filename)
        elif filename.operation_name == 'move':
            if filename.src_type == 'local' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_upload_tasks(
                    filename, remove_local_file=True)
            elif filename.src_type == 's3' and filename.dest_type == 'local':
                num_uploads = self._enqueue_range_download_tasks(
                    filename, remove_remote_file=True)
            elif filename.src_type == 's3' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_copy_tasks(
                    filename, remove_remote_file=True)
            else:
                raise ValueError("Unknown transfer type of %s -> %s" %
                                 (filename.src_type, filename.dest_type))
        elif filename.operation_name == 'copy':
            num_uploads = self._enqueue_multipart_copy_tasks(
                filename, remove_remote_file=False)
        elif filename.operation_name == 'download':
            num_uploads = self._enqueue_range_download_tasks(filename)
        return num_uploads

    def _enqueue_range_download_tasks(self, filename, remove_remote_file=False):
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_downloads = int(filename.size / chunksize)
        context = tasks.MultipartDownloadContext(num_downloads)
        create_file_task = tasks.CreateLocalFileTask(context=context,
                                                     filename=filename)
        self.executer.submit(create_file_task)
        for i in range(num_downloads):
            task = tasks.DownloadPartTask(
                part_number=i, chunk_size=chunksize,
                result_queue=self.result_queue, service=filename.service,
                filename=filename, context=context)
            self.executer.submit(task)
        complete_file_task = tasks.CompleteDownloadTask(
            context=context, filename=filename, result_queue=self.result_queue,
            params=self.params)
        self.executer.submit(complete_file_task)
        self._multipart_downloads.append((context, filename.dest))
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(
                filename=filename, context=context)
            self.executer.submit(remove_task)
        return num_downloads

    def _enqueue_multipart_upload_tasks(self, filename,
                                        remove_local_file=False):
        # First we need to create a CreateMultipartUpload task,
        # then create UploadTask objects for each of the parts.
        # And finally enqueue a CompleteMultipartUploadTask.
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size /
                                    float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(
            num_uploads, chunksize, upload_context, filename, tasks.UploadPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_local_file:
            remove_task = tasks.RemoveFileTask(local_filename=filename.src,
                                               upload_context=upload_context)
            self.executer.submit(remove_task)
        return num_uploads

    def _enqueue_multipart_copy_tasks(self, filename,
                                      remove_remote_file=False):
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size / float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(
            num_uploads, chunksize, upload_context, filename, tasks.CopyPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(
                filename=filename, context=upload_context)
            self.executer.submit(remove_task)
        return num_uploads

    def _enqueue_upload_start_task(self, chunksize, num_uploads, filename):
        upload_context = tasks.MultipartUploadContext(
            expected_parts=num_uploads)
        create_multipart_upload_task = tasks.CreateMultipartUploadTask(
            session=self.session, filename=filename,
            parameters=self.params,
            result_queue=self.result_queue, upload_context=upload_context)
        self.executer.submit(create_multipart_upload_task)
        return upload_context

    def _enqueue_upload_tasks(self, num_uploads, chunksize, upload_context, filename,
                              task_class):
        for i in range(1, (num_uploads + 1)):
            task = task_class(
                part_number=i, chunk_size=chunksize,
                result_queue=self.result_queue, upload_context=upload_context,
                filename=filename)
            self.executer.submit(task)

    def _enqueue_upload_end_task(self, filename, upload_context):
        complete_multipart_upload_task = tasks.CompleteMultipartUploadTask(
            session=self.session, filename=filename, parameters=self.params,
            result_queue=self.result_queue, upload_context=upload_context)
        self.executer.submit(complete_multipart_upload_task)
        self._multipart_uploads.append((upload_context, filename))
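
``find_chunksize`` is consulted before every multipart transfer because S3 caps multipart uploads at 10,000 parts. Its body is not shown in the snippet; the following is only a plausible sketch of the idea, with the function name, doubling strategy, and ``MAX_PARTS`` constant assumed rather than taken from the source:

MAX_PARTS = 10000  # S3's documented limit on parts per multipart upload

def find_chunksize_sketch(size, current_chunksize):
    """Grow the chunk size until the transfer fits within MAX_PARTS."""
    chunksize = current_chunksize
    while size / chunksize > MAX_PARTS:
        chunksize *= 2
    return chunksize

# A 100 GB file with an 8 MB starting chunk would need ~12,800 parts,
# so the chunk size doubles once to 16 MB (~6,400 parts).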
Example #8
class S3Handler(object):
    """
    This class sets up the process to perform the tasks sent to it.  It
    sources the ``self.executer`` from which threads inside the
    class pull tasks from to complete.
    """
    def __init__(self, session, params, multi_threshold=MULTI_THRESHOLD,
                 chunksize=CHUNKSIZE):
        self.session = session
        self.done = threading.Event()
        self.interrupt = threading.Event()
        self.printQueue = NoBlockQueue()
        self.params = {'dryrun': False, 'quiet': False, 'acl': None}
        self.params['region'] = params['region']
        for key in self.params.keys():
            if key in params:
                self.params[key] = params[key]
        self.multi_threshold = multi_threshold
        self.chunksize = chunksize
        self.executer = Executer(done=self.done,
                                 num_threads=NUM_THREADS,
                                 timeout=QUEUE_TIMEOUT_GET,
                                 printQueue=self.printQueue,
                                 quiet=self.params['quiet'],
                                 interrupt=self.interrupt,
                                 max_multi=NUM_MULTI_THREADS)

    def call(self, files):
        """
        This function pulls a ``FileInfo`` or ``TaskInfo`` object from
        a list ``files``.  Each object is then deemed if it will be a
        multipart operation and add the necessary attributes if so.  Each
        object is then wrapped with a ``BasicTask`` object which is
        essentially a thread of execution for a thread to follow.  These
        tasks are then submitted to the main executer.
        """
        self.done.clear()
        self.interrupt.clear()
        try:
            self.executer.start()
            tot_files = 0
            tot_parts = 0
            for filename in files:
                num_uploads = 1
                is_larger = False
                chunksize = self.chunksize
                too_large = False
                if hasattr(filename, 'size'):
                    is_larger = filename.size > self.multi_threshold
                    too_large = filename.size > MAX_UPLOAD_SIZE
                if is_larger:
                    if filename.operation == 'upload':
                        num_uploads = int(math.ceil(filename.size /
                                                    float(chunksize)))
                        chunksize = find_chunksize(filename.size, chunksize)
                        filename.set_multi(executer=self.executer,
                                           printQueue=self.printQueue,
                                           interrupt=self.interrupt,
                                           chunksize=chunksize)
                    elif filename.operation == 'download':
                        num_uploads = int(filename.size / chunksize)
                        filename.set_multi(executer=self.executer,
                                           printQueue=self.printQueue,
                                           interrupt=self.interrupt,
                                           chunksize=chunksize)
                task = BasicTask(session=self.session, filename=filename,
                                 executer=self.executer, done=self.done,
                                 parameters=self.params,
                                 multi_threshold=self.multi_threshold,
                                 chunksize=chunksize,
                                 printQueue=self.printQueue,
                                 interrupt=self.interrupt)
                if too_large and filename.operation == 'upload':
                    warning = "Warning %s exceeds 5 TB and upload is " \
                              "being skipped" % os.path.relpath(filename.src)
                    self.printQueue.put({'result': warning})
                else:
                    self.executer.submit(task)
                tot_files += 1
                tot_parts += num_uploads
            self.executer.print_thread.totalFiles = tot_files
            self.executer.print_thread.totalParts = tot_parts
            self.executer.wait()
            self.printQueue.join()

        except Exception as e:
            LOGGER.debug('%s', str(e))
        except KeyboardInterrupt:
            self.interrupt.set()
            self.printQueue.put({'result': "Cleaning up. Please wait..."})

        self.done.set()
        self.executer.join()
Example #9
class Executor(object):
    """
    This class is in charge of all of the threads.  It starts up the threads
    and cleans up the threads when done.  The two type of threads the
    ``Executor``runs is a worker and a print thread.
    """
    def __init__(self, done, num_threads, result_queue, quiet, interrupt,
                 max_queue_size, write_queue):
        self.queue = None
        self.done = done
        self.num_threads = num_threads
        self.result_queue = result_queue
        self.quiet = quiet
        self.interrupt = interrupt
        self.threads_list = []
        self._max_queue_size = max_queue_size
        self.write_queue = write_queue
        self.print_thread = None
        self.io_thread = None

    @property
    def num_tasks_failed(self):
        tasks_failed = 0
        if self.print_thread is not None:
            tasks_failed = self.print_thread.num_errors_seen
        return tasks_failed

    def start(self):
        self.print_thread = PrintThread(self.result_queue, self.done,
                                        self.quiet, self.interrupt)
        self.print_thread.daemon = True
        self.io_thread = IOWriterThread(self.write_queue, self.done)
        self.io_thread.start()
        self.threads_list.append(self.io_thread)
        self.queue = NoBlockQueue(self.interrupt, maxsize=self._max_queue_size)
        self.threads_list.append(self.print_thread)
        self.print_thread.start()
        for i in range(self.num_threads):
            worker = Worker(queue=self.queue, done=self.done)
            worker.setDaemon(True)
            self.threads_list.append(worker)
            worker.start()

    def submit(self, task):
        """
        This is the function used to submit a task to the ``Executor``.
        """
        LOGGER.debug("Submitting task: %s", task)
        self.queue.put(task)

    def wait(self):
        """
        This is the function used to wait on all of the tasks to finish
        in the ``Executor``.
        """
        self.queue.join()

    def join(self):
        """
        This is used to clean up the ``Executor``.
        """
        self.write_queue.put(QUEUE_END_SENTINEL)
        self.result_queue.put(QUEUE_END_SENTINEL)
        for i in range(self.num_threads):
            self.queue.put(QUEUE_END_SENTINEL)

        for thread in self.threads_list:
            thread.join()
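
The new ``io_thread`` serializes all file writes through ``write_queue``: workers enqueue write items and a single thread performs them, replacing the per-file write lock of the earlier versions. A self-contained sketch of the pattern, assuming ``(offset, data)`` work items (the item layout is an assumption; the snippet does not show ``IOWriterThread``'s internals):

import threading

try:
    import queue  # Python 3
except ImportError:
    import Queue as queue  # Python 2

QUEUE_END_SENTINEL = object()

def io_writer(write_queue, fileobj):
    """Drain (offset, data) items and write them sequentially,
    stopping at the sentinel."""
    while True:
        item = write_queue.get()
        try:
            if item is QUEUE_END_SENTINEL:
                return
            offset, data = item
            fileobj.seek(offset)
            fileobj.write(data)
        finally:
            write_queue.task_done()

write_queue = queue.Queue(maxsize=20)
with open('example.part', 'wb') as f:
    thread = threading.Thread(target=io_writer, args=(write_queue, f))
    thread.start()
    write_queue.put((0, b'hello '))
    write_queue.put((6, b'world'))
    write_queue.put(QUEUE_END_SENTINEL)
    thread.join()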
Example #10
 def multi_upload(self):
     """
     Performs multipart uploads.  It initiates the multipart upload.
     It creates a queue ``part_queue`` which is directly responsible
     with controlling the progress of the multipart upload.  It then
     creates ``UploadPartTasks`` for threads to run via the
     ``executer``.  This fucntion waits for all of the parts in the
     multipart upload to finish, and then it completes the multipart
     upload.  This method waits on its parts to finish.  So, threads
     are required to process the parts for this function to complete.
     """
     part_queue = NoBlockQueue(self.interrupt)
     complete_upload_queue = Queue.PriorityQueue()
     part_counter = MultiCounter()
     counter_lock = threading.Lock()
     bucket, key = find_bucket_key(self.dest)
     params = {'endpoint': self.endpoint, 'bucket': bucket, 'key': key}
     if self.parameters['acl']:
         params['acl'] = self.parameters['acl'][0]
     if self.parameters['guess_mime_type']:
         self._inject_content_type(params, self.src)
     response_data, http = operate(self.service, 'CreateMultipartUpload',
                                   params)
     upload_id = response_data['UploadId']
     size_uploads = self.chunksize
     num_uploads = int(math.ceil(self.size / float(size_uploads)))
     for i in range(1, (num_uploads + 1)):
         part_info = (self, upload_id, i, size_uploads)
         part_queue.put(part_info)
         task = UploadPartTask(session=self.session,
                               executer=self.executer,
                               part_queue=part_queue,
                               dest_queue=complete_upload_queue,
                               region=self.region,
                               printQueue=self.printQueue,
                               interrupt=self.interrupt,
                               part_counter=part_counter,
                               counter_lock=counter_lock)
         self.executer.submit(task)
     part_queue.join()
     # The following ensures that if the multipart upload is in progress,
     # all part uploads finish before aborting or completing.  This
     # really only applies when an interrupt signal is sent because the
     # ``part_queue.join()`` ensures this if the process is not
     # interrupted.
     while part_counter.count:
         time.sleep(0.1)
     parts_list = []
     while not complete_upload_queue.empty():
         part = complete_upload_queue.get()
         parts_list.append(part[1])
     if len(parts_list) == num_uploads:
         parts = {'Parts': parts_list}
         params = {
             'endpoint': self.endpoint,
             'bucket': bucket,
             'key': key,
             'upload_id': upload_id,
             'multipart_upload': parts
         }
         operate(self.service, 'CompleteMultipartUpload', params)
     else:
         abort_params = {
             'endpoint': self.endpoint,
             'bucket': bucket,
             'key': key,
             'upload_id': upload_id
         }
         operate(self.service, 'AbortMultipartUpload', abort_params)
         raise Exception("Not all parts were uploaded; the multipart "
                         "upload was aborted.")
Example #11
class S3Handler(object):
    """
    This class sets up the process to perform the tasks sent to it.  It
    sources the ``self.executor`` from which threads inside the
    class pull tasks from to complete.
    """
    MAX_IO_QUEUE_SIZE = 20

    def __init__(self,
                 session,
                 params,
                 multi_threshold=MULTI_THRESHOLD,
                 chunksize=CHUNKSIZE):
        self.session = session
        self.done = threading.Event()
        self.interrupt = threading.Event()
        self.result_queue = NoBlockQueue()
        # The write_queue has potential for optimizations, so the constant
        # for maxsize is scoped to this class (as opposed to constants.py)
        # so we have the ability to change this value later.
        self.write_queue = NoBlockQueue(self.interrupt,
                                        maxsize=self.MAX_IO_QUEUE_SIZE)
        self.params = {
            'dryrun': False,
            'quiet': False,
            'acl': None,
            'guess_mime_type': True,
            'sse': False,
            'storage_class': None,
            'website_redirect': None,
            'content_type': None,
            'cache_control': None,
            'content_disposition': None,
            'content_encoding': None,
            'content_language': None,
            'expires': None,
            'grants': None
        }
        self.params['region'] = params['region']
        for key in self.params.keys():
            if key in params:
                self.params[key] = params[key]
        self.multi_threshold = multi_threshold
        self.chunksize = chunksize
        self.executor = Executor(done=self.done,
                                 num_threads=NUM_THREADS,
                                 result_queue=self.result_queue,
                                 quiet=self.params['quiet'],
                                 interrupt=self.interrupt,
                                 max_queue_size=MAX_QUEUE_SIZE,
                                 write_queue=self.write_queue)
        self._multipart_uploads = []
        self._multipart_downloads = []

    def call(self, files):
        """
        This function pulls a ``FileInfo`` or ``TaskInfo`` object from
        a list ``files``.  Each object is then deemed if it will be a
        multipart operation and add the necessary attributes if so.  Each
        object is then wrapped with a ``BasicTask`` object which is
        essentially a thread of execution for a thread to follow.  These
        tasks are then submitted to the main executor.
        """
        self.done.clear()
        self.interrupt.clear()
        try:
            self.executor.start()
            total_files, total_parts = self._enqueue_tasks(files)
            self.executor.print_thread.set_total_files(total_files)
            self.executor.print_thread.set_total_parts(total_parts)
            self.executor.wait()
            self.result_queue.join()

        except Exception as e:
            LOGGER.debug('Exception caught during task execution: %s',
                         str(e),
                         exc_info=True)
            self.result_queue.put({'message': str(e), 'error': True})
        except KeyboardInterrupt:
            self.interrupt.set()
            self.result_queue.put({
                'message': "Cleaning up. Please wait...",
                'error': False
            })
        self._shutdown()
        return self.executor.num_tasks_failed

    def _shutdown(self):
        # self.done will tell threads to shut down.
        self.done.set()
        # This will wait until all the threads are joined.
        self.executor.join()
        # And finally we need to make a pass through all the existing
        # multipart uploads and abort any pending multipart uploads.
        self._abort_pending_multipart_uploads()
        self._remove_pending_downloads()

    def _abort_pending_multipart_uploads(self):
        # For the purpose of aborting uploads, we consider any
        # upload context with an upload id.
        for upload, filename in self._multipart_uploads:
            if upload.is_cancelled():
                try:
                    upload_id = upload.wait_for_upload_id()
                except tasks.UploadCancelledError:
                    pass
                else:
                    # This means that the upload went from STARTED -> CANCELLED.
                    # This could happen if a part thread decided to cancel the
                    # upload.  We need to explicitly abort the upload here.
                    self._cancel_upload(upload_id, filename)
            upload.cancel_upload(self._cancel_upload, args=(filename, ))

    def _remove_pending_downloads(self):
        # The downloads case is easier than the uploads case because we don't
        # need to make any service calls.  To clean up properly we just need
        # to go through the multipart downloads that were in progress but
        # cancelled and remove the local file.
        for context, local_filename in self._multipart_downloads:
            if (context.is_cancelled() or context.is_started()) and \
                    os.path.exists(local_filename):
                # The file is in an inconsistent state (not all the parts
                # were written to the file) so we should remove the
                # local file rather than leave it in a bad state.  We don't
                # want to remove the files if the download has *not* been
                # started because we haven't touched the file yet, so it's
                # better to leave the old version of the file rather than
                # deleting the file entirely.
                os.remove(local_filename)

    def _cancel_upload(self, upload_id, filename):
        bucket, key = find_bucket_key(filename.dest)
        params = {
            'bucket': bucket,
            'key': key,
            'upload_id': upload_id,
            'endpoint': filename.endpoint,
        }
        LOGGER.debug("Aborting multipart upload for: %s", key)
        response_data, http = operate(filename.service, 'AbortMultipartUpload',
                                      params)

    def _enqueue_tasks(self, files):
        total_files = 0
        total_parts = 0
        for filename in files:
            num_uploads = 1
            is_multipart_task = self._is_multipart_task(filename)
            too_large = False
            if hasattr(filename, 'size'):
                too_large = filename.size > MAX_UPLOAD_SIZE
            if too_large and filename.operation_name == 'upload':
                warning = "Warning %s exceeds 5 TB and upload is " \
                            "being skipped" % relative_path(filename.src)
                self.result_queue.put({'message': warning, 'error': True})
            elif is_multipart_task and not self.params['dryrun']:
                # If we're in dryrun mode, then we don't need the
                # real multipart tasks.  We can just use a BasicTask
                # in the else clause below, which will print out the
                # fact that it's transferring a file rather than
                # the specific part tasks required to perform the
                # transfer.
                num_uploads = self._enqueue_multipart_tasks(filename)
            else:
                task = tasks.BasicTask(session=self.session,
                                       filename=filename,
                                       parameters=self.params,
                                       result_queue=self.result_queue)
                self.executor.submit(task)
            total_files += 1
            total_parts += num_uploads
        return total_files, total_parts

    def _is_multipart_task(self, filename):
        # First we need to determine if it's an operation that even
        # qualifies for multipart upload.
        if hasattr(filename, 'size'):
            above_multipart_threshold = filename.size > self.multi_threshold
            if above_multipart_threshold:
                return filename.operation_name in ('upload', 'download',
                                                   'move', 'copy')
        return False

    def _enqueue_multipart_tasks(self, filename):
        num_uploads = 1
        if filename.operation_name == 'upload':
            num_uploads = self._enqueue_multipart_upload_tasks(filename)
        elif filename.operation_name == 'move':
            if filename.src_type == 'local' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_upload_tasks(
                    filename, remove_local_file=True)
            elif filename.src_type == 's3' and filename.dest_type == 'local':
                num_uploads = self._enqueue_range_download_tasks(
                    filename, remove_remote_file=True)
            elif filename.src_type == 's3' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_copy_tasks(
                    filename, remove_remote_file=True)
            else:
                raise ValueError("Unknown transfer type of %s -> %s" %
                                 (filename.src_type, filename.dest_type))
        elif filename.operation_name == 'copy':
            num_uploads = self._enqueue_multipart_copy_tasks(
                filename, remove_remote_file=False)
        elif filename.operation_name == 'download':
            num_uploads = self._enqueue_range_download_tasks(filename)
        return num_uploads

    def _enqueue_range_download_tasks(self,
                                      filename,
                                      remove_remote_file=False):
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_downloads = int(filename.size / chunksize)
        context = tasks.MultipartDownloadContext(num_downloads)
        create_file_task = tasks.CreateLocalFileTask(context=context,
                                                     filename=filename)
        self.executor.submit(create_file_task)
        for i in range(num_downloads):
            task = tasks.DownloadPartTask(part_number=i,
                                          chunk_size=chunksize,
                                          result_queue=self.result_queue,
                                          service=filename.service,
                                          filename=filename,
                                          context=context,
                                          io_queue=self.write_queue)
            self.executor.submit(task)
        complete_file_task = tasks.CompleteDownloadTask(
            context=context,
            filename=filename,
            result_queue=self.result_queue,
            params=self.params,
            io_queue=self.write_queue)
        self.executor.submit(complete_file_task)
        self._multipart_downloads.append((context, filename.dest))
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(filename=filename,
                                                       context=context)
            self.executor.submit(remove_task)
        return num_downloads

    def _enqueue_multipart_upload_tasks(self,
                                        filename,
                                        remove_local_file=False):
        # First we need to create a CreateMultipartUpload task,
        # then create UploadTask objects for each of the parts.
        # And finally enqueue a CompleteMultipartUploadTask.
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size / float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(num_uploads, chunksize, upload_context,
                                   filename, tasks.UploadPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_local_file:
            remove_task = tasks.RemoveFileTask(local_filename=filename.src,
                                               upload_context=upload_context)
            self.executor.submit(remove_task)
        return num_uploads

    def _enqueue_multipart_copy_tasks(self,
                                      filename,
                                      remove_remote_file=False):
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size / float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(num_uploads, chunksize, upload_context,
                                   filename, tasks.CopyPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(filename=filename,
                                                       context=upload_context)
            self.executor.submit(remove_task)
        return num_uploads

    def _enqueue_upload_start_task(self, chunksize, num_uploads, filename):
        upload_context = tasks.MultipartUploadContext(
            expected_parts=num_uploads)
        create_multipart_upload_task = tasks.CreateMultipartUploadTask(
            session=self.session,
            filename=filename,
            parameters=self.params,
            result_queue=self.result_queue,
            upload_context=upload_context)
        self.executor.submit(create_multipart_upload_task)
        return upload_context

    def _enqueue_upload_tasks(self, num_uploads, chunksize, upload_context,
                              filename, task_class):
        for i in range(1, (num_uploads + 1)):
            task = task_class(part_number=i,
                              chunk_size=chunksize,
                              result_queue=self.result_queue,
                              upload_context=upload_context,
                              filename=filename)
            self.executor.submit(task)

    def _enqueue_upload_end_task(self, filename, upload_context):
        complete_multipart_upload_task = tasks.CompleteMultipartUploadTask(
            session=self.session,
            filename=filename,
            parameters=self.params,
            result_queue=self.result_queue,
            upload_context=upload_context)
        self.executor.submit(complete_multipart_upload_task)
        self._multipart_uploads.append((upload_context, filename))
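
Each ``DownloadPartTask`` receives only a 0-based ``part_number`` and a ``chunk_size``, so the byte range it requests must be derived from those values. A small illustrative helper showing one plausible mapping, assuming the final part's range runs open-ended to cover the remainder left by the truncating division in ``_enqueue_range_download_tasks`` (the helper is not the task's actual code):

def range_for_part(part_number, chunk_size, num_parts):
    """Return the HTTP Range header value for a 0-based part number."""
    start = part_number * chunk_size
    if part_number == num_parts - 1:
        # Let the final range run to the end of the object to cover any
        # remainder left by ``int(filename.size / chunksize)``.
        return 'bytes=%s-' % start
    return 'bytes=%s-%s' % (start, start + chunk_size - 1)

# Example: a 25 MB object with a 10 MB chunk size gives 2 parts:
#   part 0 -> 'bytes=0-10485759'
#   part 1 -> 'bytes=10485760-'   (covers the remaining 15 MB)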
Example #12
class S3Handler(object):
    """
    This class sets up the process to perform the tasks sent to it.  It
    sources the ``self.executer`` from which threads inside the
    class pull tasks from to complete.
    """
    def __init__(self, session, params, multi_threshold=MULTI_THRESHOLD,
                 chunksize=CHUNKSIZE):
        self.session = session
        self.done = threading.Event()
        self.interrupt = threading.Event()
        self.print_queue = NoBlockQueue()
        self.params = {'dryrun': False, 'quiet': False, 'acl': None,
                       'guess_mime_type': True, 'sse': False,
                       'storage_class': None, 'website_redirect': None,
                       'content_type': None, 'cache_control': None,
                       'content_disposition': None, 'content_encoding': None,
                       'content_language': None, 'expires': None,
                       'grants': None}
        self.params['region'] = params['region']
        for key in self.params.keys():
            if key in params:
                self.params[key] = params[key]
        self.multi_threshold = multi_threshold
        self.chunksize = chunksize
        self.executer = Executer(
            done=self.done, num_threads=NUM_THREADS,
            timeout=QUEUE_TIMEOUT_GET, print_queue=self.print_queue,
            quiet=self.params['quiet'], interrupt=self.interrupt,
            max_multi=NUM_MULTI_THREADS, max_queue_size=MAX_QUEUE_SIZE,
        )
        self._multipart_uploads = []

    def call(self, files):
        """
        This function pulls a ``FileInfo`` or ``TaskInfo`` object from
        a list ``files``.  Each object is then deemed if it will be a
        multipart operation and add the necessary attributes if so.  Each
        object is then wrapped with a ``BasicTask`` object which is
        essentially a thread of execution for a thread to follow.  These
        tasks are then submitted to the main executer.
        """
        self.done.clear()
        self.interrupt.clear()
        try:
            self.executer.start()
            total_files, total_parts = self._enqueue_tasks(files)
            self.executer.print_thread.set_total_files(total_files)
            self.executer.print_thread.set_total_parts(total_parts)
            self.executer.wait()
            self.print_queue.join()

        except Exception as e:
            LOGGER.debug('Exception caught during task execution: %s',
                         str(e), exc_info=True)
        except KeyboardInterrupt:
            self.interrupt.set()
            self.print_queue.put({'result': "Cleaning up. Please wait..."})
        self._shutdown()


    def _shutdown(self):
        # self.done will tell threads to shut down.
        self.done.set()
        # This will wait until all the threads are joined.
        self.executer.join()
        # And finally we need to make a pass through all the existing
        # multipart uploads and abort any pending multipart uploads.
        self._abort_pending_multipart_uploads()

    def _abort_pending_multipart_uploads(self):
        # For the purpose of aborting uploads, we consider any
        # upload context with an upload id.
        for upload, filename in self._multipart_uploads:
            if upload.is_cancelled():
                try:
                    upload_id = upload.wait_for_upload_id()
                except tasks.UploadCancelledError:
                    pass
                else:
                    # This means that the upload went from STARTED -> CANCELLED.
                    # This could happen if a part thread decided to cancel the
                    # upload.  We need to explicitly abort the upload here.
                    self._cancel_upload(upload_id, filename)
            upload.cancel_upload(self._cancel_upload, args=(filename,))

    def _cancel_upload(self, upload_id, filename):
        bucket, key = find_bucket_key(filename.dest)
        params = {
            'bucket': bucket,
            'key': key,
            'upload_id': upload_id,
            'endpoint': filename.endpoint,
        }
        LOGGER.debug("Aborting multipart upload for: %s", key)
        response_data, http = operate(
            filename.service, 'AbortMultipartUpload', params)

    def _enqueue_tasks(self, files):
        total_files = 0
        total_parts = 0
        for filename in files:
            filename.set_session(self.session, self.params['region'])
            num_uploads = 1
            is_multipart_task = False
            too_large = False
            if hasattr(filename, 'size'):
                is_multipart_task = (
                    filename.size > self.multi_threshold and
                    filename.operation == 'upload')
                too_large = filename.size > MAX_UPLOAD_SIZE
            if too_large and filename.operation == 'upload':
                warning = "Warning %s exceeds 5 TB and upload is " \
                            "being skipped" % os.path.relpath(filename.src)
                self.print_queue.put({'result': warning})
            elif is_multipart_task:
                num_uploads = self._enqueue_multipart_tasks(filename)
            else:
                task = tasks.BasicTask(
                    session=self.session, filename=filename,
                    parameters=self.params,
                    print_queue=self.print_queue)
                self.executer.submit(task)
            total_files += 1
            total_parts += num_uploads
        return total_files, total_parts

    def _enqueue_multipart_tasks(self, filename):
        num_uploads = 1
        chunksize = self.chunksize
        if filename.operation == 'upload':
            num_uploads = self._enqueue_multipart_upload_tasks(filename)
        elif filename.operation == 'download':
            num_uploads = int(filename.size / chunksize)
            filename.set_multi(executer=self.executer,
                               print_queue=self.print_queue,
                               interrupt=self.interrupt,
                               chunksize=chunksize)
        return num_uploads

    def _enqueue_multipart_upload_tasks(self, filename):
        # First we need to create a CreateMultipartUpload task,
        # then create UploadTask objects for each of the parts.
        # And finally enqueue a CompleteMultipartUploadTask.
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size /
                                    float(chunksize)))
        upload_context = tasks.MultipartUploadContext(
            expected_parts=num_uploads)
        create_multipart_upload_task = tasks.CreateMultipartUploadTask(
            session=self.session, filename=filename,
            parameters=self.params,
            print_queue=self.print_queue, upload_context=upload_context)
        self.executer.submit(create_multipart_upload_task)

        for i in range(1, (num_uploads + 1)):
            task = tasks.UploadPartTask(
                part_number=i, chunk_size=chunksize,
                print_queue=self.print_queue, upload_context=upload_context,
                filename=filename)
            self.executer.submit(task)

        complete_multipart_upload_task = tasks.CompleteMultipartUploadTask(
            session=self.session, filename=filename, parameters=self.params,
            print_queue=self.print_queue, upload_context=upload_context)
        self.executer.submit(complete_multipart_upload_task)
        self._multipart_uploads.append((upload_context, filename))
        return num_uploads
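
The enqueue order here (create, then parts, then complete) is not what sequences the work; the tasks coordinate through ``MultipartUploadContext``, on which part tasks block until the create task announces the upload id. A minimal sketch of such a context built on ``threading.Event`` (the class and method bodies are assumptions; the real ``MultipartUploadContext`` lives in ``tasks`` and has a richer interface):

import threading

class UploadCancelledError(Exception):
    pass

class MiniUploadContext(object):
    """Sketch of a context that part tasks can block on until the
    CreateMultipartUpload task announces the upload id."""
    def __init__(self, expected_parts):
        self.expected_parts = expected_parts
        self._upload_id = None
        self._cancelled = False
        self._id_available = threading.Event()

    def announce_upload_id(self, upload_id):
        # Called by the create task once the service returns an id.
        self._upload_id = upload_id
        self._id_available.set()

    def cancel(self):
        # Unblock any waiting part tasks so they can observe the cancel.
        self._cancelled = True
        self._id_available.set()

    def is_cancelled(self):
        return self._cancelled

    def wait_for_upload_id(self):
        # Part tasks call this before issuing their UploadPart request.
        self._id_available.wait()
        if self._cancelled:
            raise UploadCancelledError("multipart upload was cancelled")
        return self._upload_id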