Beispiel #1
0
    def test_empty_input(self):
        """
        An empty input iterable must result in a chunker that yields nothing at all
        """
        # Sized input: the number of tasks can be derived from the list itself
        with self.subTest('list input'):
            produced = list(chunk_tasks([], n_splits=5))
            self.assertEqual(len(produced), 0)

        # Iterator input: the number of tasks is passed explicitly through iterable_len
        with self.subTest('generator/iterator input'):
            empty_iterator = iter([])
            produced = list(chunk_tasks(empty_iterator, iterable_len=0, n_splits=5))
            self.assertEqual(len(produced), 0)
Beispiel #2
0
 def test_chunk_size_has_priority_over_n_splits(self):
     """
     When both chunk_size and n_splits are supplied, chunk_size must be the one that is used
     """
     tasks = range(4)
     chunks = list(chunk_tasks(tasks, chunk_size=4, n_splits=4))

     # With chunk_size=4 all four tasks are expected to end up in one single chunk
     self.assertEqual(len(chunks), 1)
     only_chunk = chunks[0]
     self.assertEqual(len(only_chunk), 4)

     # Flattening the chunks has to give back the original tasks, in order
     flattened = list(chain.from_iterable(chunks))
     self.assertEqual(list(tasks), flattened)
Beispiel #3
0
    def test_n_splits(self):
        """
        Exercise n_splits with fewer, equally many, and more arguments than the requested number of splits
        """
        n_splits = 5
        for offset in (-1, 0, 1):
            num_args = n_splits + offset
            expected_args = list(range(num_args))
            expected_n_chunks = min(n_splits, num_args)

            # Two flavours of input: a range (treated as a normal list since it implements __len__ and such) and an
            # actual iterator (range does not really behave like one), which needs iterable_len to be passed along
            cases = (
                ('list', range(num_args), {}),
                ('generator/iterator', iter(range(num_args)), {'iterable_len': num_args}),
            )
            for input_kind, tasks, extra_kwargs in cases:
                with self.subTest(num_args=num_args, input=input_kind):
                    chunks = list(chunk_tasks(tasks, n_splits=n_splits, **extra_kwargs))
                    self.assertEqual(len(chunks), expected_n_chunks)

                    # Flattening the chunks has to give back all arguments in their original order
                    flattened = list(chain.from_iterable(chunks))
                    self.assertEqual(expected_args, flattened)
Beispiel #4
0
 def test_generator_without_iterable_len(self):
     """
     Passing an iterator without an accompanying iterable_len must result in a ValueError
     """
     lengthless_input = iter([])
     with self.assertRaises(ValueError):
         # next() drives the chunker one step, in case its checks only run once it is consumed
         chunker = chunk_tasks(lengthless_input, n_splits=1)
         next(chunker)
Beispiel #5
0
 def test_no_chunk_size_no_n_splits_provided(self):
     """
     Omitting both chunk_size and n_splits must result in a ValueError
     """
     no_tasks = []
     with self.assertRaises(ValueError):
         # next() drives the chunker one step, in case its checks only run once it is consumed
         chunker = chunk_tasks(no_tasks)
         next(chunker)
Beispiel #6
0
    def imap_unordered(
            self,
            func_pointer: Callable,
            iterable_of_args: Union[Iterable, np.ndarray],
            iterable_len: Optional[int] = None,
            max_tasks_active: Optional[int] = None,
            chunk_size: Optional[int] = None,
            n_splits: Optional[int] = None,
            worker_lifespan: Optional[int] = None,
            progress_bar: bool = False,
            progress_bar_position: int = 0) -> Generator[Any, None, None]:
        """
        Same as ``multiprocessing.imap_unordered()``. Also allows a user to set the maximum number of tasks available in
        the queue.

        This is a generator function: no work is done until the returned generator is iterated. Workers are started,
        chunks of arguments are handed to them, and results are yielded as they become available.

        :param func_pointer: Function pointer to call each time new task arguments become available. When passing on the
            worker ID the function should receive the worker ID as its first argument. If shared objects are provided
            the function should receive those as the next argument. If the worker state has been enabled it should
            receive a state variable as the next argument
        :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
            passes it to the function pointer
        :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
            to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
            iterable object, or by specifying the number of tasks
        :param max_tasks_active: Maximum number of active tasks in the queue. Use ``None`` to not limit the queue; in
            that case all chunks are submitted before the first result is yielded. The string ``'n_jobs*2'`` is also
            recognized and is replaced by ``n_jobs * 2``
        :param chunk_size: Number of simultaneous tasks to give to a worker. If ``None`` and ``n_splits`` is not set
            either, it will generate ``n_jobs * 4`` number of chunks
        :param n_splits: Number of splits to use when ``chunk_size`` is ``None``
        :param worker_lifespan: Number of chunks a worker can handle before it is restarted. If ``None``, workers will
            stay alive the entire time. Use this when workers use up too much memory over the course of time
        :param progress_bar: When ``True`` it will display a progress bar
        :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
            multiple progress bars at the same time
        :return: Generator yielding unordered results
        """
        # If we're dealing with numpy arrays, we have to chunk them here already
        if isinstance(iterable_of_args, np.ndarray):
            iterator_of_chunked_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
                iterable_of_args, iterable_len, chunk_size, n_splits,
                self.n_jobs)

        # Check parameters and thereby obtain the number of tasks. The chunk_size and progress bar parameters could be
        # modified as well
        n_tasks, chunk_size, progress_bar = self._check_map_parameters(
            iterable_of_args, iterable_len, max_tasks_active, chunk_size,
            n_splits, worker_lifespan, progress_bar, progress_bar_position)

        # Chunk the function arguments for non-numpy input; numpy arrays have already been chunked above. When no
        # n_splits is given, n_jobs * 4 is passed on as the number of splits
        if not isinstance(iterable_of_args, np.ndarray):
            iterator_of_chunked_args = chunk_tasks(iterable_of_args, n_tasks,
                                                   chunk_size, n_splits
                                                   or self.n_jobs * 4)

        # Start workers
        self.start_workers(func_pointer, worker_lifespan)

        # Create exception and progress bar handlers. The exception handler will receive any exceptions thrown by the
        # workers, terminates everything and re-raise an exception in the main process. The progress bar handler will
        # receive updates from the workers and updates the progress bar accordingly
        # NOTE(review): ``progress_bar is not None`` is always True when progress_bar is a bool (as annotated) --
        # confirm whether ``bool(progress_bar)`` was intended here
        with ExceptionHandler(self.terminate, self._exception_queue, self.exception_caught, self._keep_order,
                              progress_bar is not None) as exception_handler, \
             ProgressBarHandler(func_pointer, progress_bar, n_tasks, progress_bar_position, self._task_completed_queue,
                                self._exception_queue, self.exception_caught):

            # Process all args in the iterable. If maximum number of active tasks is None, we avoid all the if and
            # try-except clauses to speed up the process.
            # n_active counts the chunks handed to the workers for which no result has been fetched yet
            n_active = 0
            if max_tasks_active == 'n_jobs*2':
                max_tasks_active = self.n_jobs * 2

            if max_tasks_active is None:
                # Unbounded: submit every chunk up front; results are only collected in the loop further below
                for chunked_args in iterator_of_chunked_args:
                    # Stop given tasks when an exception was caught
                    if self.exception_caught.is_set():
                        break

                    # Add task
                    self.add_task(chunked_args)
                    n_active += 1

                    # Restart workers if necessary
                    self._restart_workers()

            elif isinstance(max_tasks_active, int):
                # Bounded: interleave submitting chunks with polling for results, so that no more than
                # max_tasks_active chunks are outstanding at any time
                while not self.exception_caught.is_set():
                    # Add task, only if allowed and if there are any
                    if n_active < max_tasks_active:
                        try:
                            self.add_task(next(iterator_of_chunked_args))
                            n_active += 1
                        except StopIteration:
                            # All chunks have been submitted; remaining results are collected below
                            break

                    # Check if new results are available, but don't wait for it
                    try:
                        yield from self.get_result(block=False)
                        n_active -= 1
                    except queue.Empty:
                        pass

                    # Restart workers if necessary
                    self._restart_workers()

            # Obtain the results not yet obtained. The short timeout makes sure the exception flag and the workers
            # are re-checked regularly while waiting
            while not self.exception_caught.is_set() and n_active != 0:
                try:
                    yield from self.get_result(block=True, timeout=0.1)
                    n_active -= 1
                except queue.Empty:
                    pass

                # Restart workers if necessary
                self._restart_workers()

            # Clean up time
            exception_handler.raise_on_exception()
            self.stop_and_join()
Beispiel #7
0
    def imap_unordered(
            self,
            func: Callable,
            iterable_of_args: Union[Sized, Iterable, np.ndarray],
            iterable_len: Optional[int] = None,
            max_tasks_active: Optional[int] = None,
            chunk_size: Optional[int] = None,
            n_splits: Optional[int] = None,
            worker_lifespan: Optional[int] = None,
            progress_bar: bool = False,
            progress_bar_position: int = 0,
            enable_insights: bool = False,
            worker_init: Optional[Callable] = None,
            worker_exit: Optional[Callable] = None
    ) -> Generator[Any, None, None]:
        """
        Same as ``multiprocessing.imap_unordered()``. Also allows a user to set the maximum number of tasks available in
        the queue.

        This is a generator function: no work is done until the returned generator is iterated. Existing workers are
        reused unless they need a restart, chunks of arguments are handed to them, and results are yielded as they
        become available.

        :param func: Function to call each time new task arguments become available. When passing on the worker ID the
            function should receive the worker ID as its first argument. If shared objects are provided the function
            should receive those as the next argument. If the worker state has been enabled it should receive a state
            variable as the next argument
        :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
            passes it to the function ``func``
        :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
            to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
            iterable object, or by specifying the number of tasks
        :param max_tasks_active: Maximum number of active tasks in the queue. If ``None`` it will be converted to
            ``n_jobs * 2``
        :param chunk_size: Number of simultaneous tasks to give to a worker. When ``None`` it will use ``n_splits``.
        :param n_splits: Number of splits to use when ``chunk_size`` is ``None``. When both ``chunk_size`` and
            ``n_splits`` are ``None``, it will use ``n_splits = n_jobs * 64``.
        :param worker_lifespan: Number of tasks a worker can handle before it is restarted. If ``None``, workers will
            stay alive the entire time. Use this when workers use up too much memory over the course of time
        :param progress_bar: When ``True`` it will display a progress bar
        :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
            multiple progress bars at the same time
        :param enable_insights: Whether to enable worker insights. Might come at a small performance penalty (often
            negligible)
        :param worker_init: Function to call each time a new worker starts. When passing on the worker ID the function
            should receive the worker ID as its first argument. If shared objects are provided the function should
            receive those as the next argument. If the worker state has been enabled it should receive a state variable
            as the next argument
        :param worker_exit: Function to call each time a worker exits. Return values will be fetched and made available
            through :obj:`mpire.WorkerPool.get_exit_results`. When passing on the worker ID the function should receive
            the worker ID as its first argument. If shared objects are provided the function should receive those as the
            next argument. If the worker state has been enabled it should receive a state variable as the next argument
        :return: Generator yielding unordered results
        """
        # If we're dealing with numpy arrays, we have to chunk them here already
        # The empty list is only a placeholder so the name is always bound; it is overwritten by exactly one of the
        # two isinstance branches below
        iterator_of_chunked_args = []
        if isinstance(iterable_of_args, np.ndarray):
            iterator_of_chunked_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
                iterable_of_args, iterable_len, chunk_size, n_splits,
                self.params.n_jobs)

        # Check parameters and thereby obtain the number of tasks. The max_tasks_active, chunk_size and progress bar
        # parameters could be modified as well
        n_tasks, max_tasks_active, chunk_size, progress_bar = self.params.check_map_parameters(
            iterable_of_args, iterable_len, max_tasks_active, chunk_size,
            n_splits, worker_lifespan, progress_bar, progress_bar_position)

        # Chunk the function arguments for non-numpy input; numpy arrays have already been chunked above. When no
        # n_splits is given, n_jobs * 64 is passed on as the number of splits
        if not isinstance(iterable_of_args, np.ndarray):
            iterator_of_chunked_args = chunk_tasks(
                iterable_of_args, n_tasks, chunk_size, n_splits
                or self.params.n_jobs * 64)

        # Reset profiling stats
        self._worker_insights.reset_insights(enable_insights)

        # Start workers if there aren't any. If they already exist they must be restarted when either the function to
        # execute or the worker lifespan changes
        if self._workers and self.params.workers_need_restart(
                func, worker_init, worker_exit, worker_lifespan):
            self.stop_and_join()
        if not self._workers:
            logger.debug("Spinning up workers")
            self._start_workers(func, worker_init, worker_exit,
                                worker_lifespan, bool(progress_bar))

        # Create exception, exit results, and progress bar handlers. The exception handler receives any exceptions
        # thrown by the workers, terminates everything and re-raise an exception in the main process. The exit results
        # handler fetches results from the exit function, if provided. The progress bar handler receives progress
        # updates from the workers and updates the progress bar accordingly
        with ExceptionHandler(self.terminate, self._worker_comms, bool(progress_bar)) as exception_handler, \
             ProgressBarHandler(func, self.params.n_jobs, progress_bar, n_tasks, progress_bar_position,
                                self._worker_comms, self._worker_insights):

            # Process all args in the iterable. Submitting chunks is interleaved with polling for results, so that no
            # more than max_tasks_active chunks are outstanding at any time.
            # n_active counts the chunks handed to the workers for which no result has been fetched yet
            n_active = 0
            while not self._worker_comms.exception_caught():
                # Add task, only if allowed and if there are any
                if n_active < max_tasks_active:
                    try:
                        self._worker_comms.add_task(
                            next(iterator_of_chunked_args))
                        n_active += 1
                    except StopIteration:
                        # All chunks have been submitted; remaining results are collected below
                        break

                # Check if new results are available, but don't wait for it
                try:
                    yield from self._worker_comms.get_results(block=False)
                    n_active -= 1
                except queue.Empty:
                    pass

                # Restart workers if necessary
                self._restart_workers()

            # Obtain the results not yet obtained. The short timeout makes sure the exception flag and the workers
            # are re-checked regularly while waiting
            while not self._worker_comms.exception_caught() and n_active != 0:
                try:
                    yield from self._worker_comms.get_results(block=True,
                                                              timeout=0.1)
                    n_active -= 1
                except queue.Empty:
                    pass

                # Restart workers if necessary
                self._restart_workers()

            # Clean up time. When keep_alive is set to True we won't join the workers. During the stop_and_join call an
            # error can occur as well, so we have to check once again whether an exception occurred and raise if it did
            exception_handler.raise_on_exception()
            if not self.params.keep_alive:
                self.stop_and_join()
                exception_handler.raise_on_exception()

        # Log insights
        if enable_insights:
            logger.debug(self._worker_insights.get_insights_string())