Esempio n. 1
0
    def test_n_splits(self):
        """
        Check the chunking produced by ``apply_numpy_chunking`` when only ``n_splits`` is supplied.

        For every case the number of chunks is verified, the chunks are verified to reproduce the input rows in their
        original order, and the remaining return values are checked. Finally, passing ``n_splits=None`` without a chunk
        size is expected to raise a ``ValueError``.
        """
        scenarios = ((1, 1), (3, 3), (150, 100))
        for n_splits, expected_n_chunks in scenarios:
            with self.subTest(n_splits=n_splits):
                chunk_gen, returned_len, returned_chunk_size, returned_n_splits = apply_numpy_chunking(
                    self.test_data_numpy, n_splits=n_splits)

                # Consume the generator so the individual chunks can be counted and inspected
                chunks = list(chunk_gen)
                self.assertEqual(len(chunks), expected_n_chunks)

                # Walk through the input with a running cursor: each chunk must equal the next slice of the input
                start = 0
                for chunk in chunks:
                    rows = chunk[0]
                    self.assertIsInstance(rows, np.ndarray)
                    stop = start + len(rows)
                    np.testing.assert_array_equal(rows, self.test_data_numpy[start:stop])
                    start = stop

                # All rows must be covered (presumably the fixture holds 100 rows)
                self.assertEqual(start, 100)

                # Remaining return values
                self.assertEqual(returned_len, expected_n_chunks)
                self.assertEqual(returned_chunk_size, 1)
                self.assertIsNone(returned_n_splits)

        # Leaving both chunk_size and n_splits unset is invalid. The result is materialized inside the assertion
        # context, so the error is caught whether it is raised on the call or while consuming the chunks
        with self.subTest(n_splits=None):
            with self.assertRaises(ValueError):
                chunk_gen, *_ = apply_numpy_chunking(self.test_data_numpy, n_splits=None)
                list(chunk_gen)
Esempio n. 2
0
    def test_n_jobs(self):
        """
        Check the chunking produced by ``apply_numpy_chunking`` when only ``n_jobs`` is supplied. With neither
        ``chunk_size`` nor ``n_splits`` given, ``n_jobs * 4`` is used as the number of splits.
        """
        scenarios = ((1, 4), (3, 12), (40, 100), (150, 100))
        for n_jobs, expected_n_chunks in scenarios:
            with self.subTest(n_jobs=n_jobs):
                chunk_gen, returned_len, returned_chunk_size, returned_n_splits = apply_numpy_chunking(
                    self.test_data_numpy, n_jobs=n_jobs)

                # Consume the generator so the individual chunks can be counted and inspected
                chunks = list(chunk_gen)
                self.assertEqual(len(chunks), expected_n_chunks)

                # Walk through the input with a running cursor: each chunk must equal the next slice of the input
                start = 0
                for chunk in chunks:
                    rows = chunk[0]
                    self.assertIsInstance(rows, np.ndarray)
                    stop = start + len(rows)
                    np.testing.assert_array_equal(rows, self.test_data_numpy[start:stop])
                    start = stop

                # All rows must be covered (presumably the fixture holds 100 rows)
                self.assertEqual(start, 100)

                # Remaining return values
                self.assertEqual(returned_len, expected_n_chunks)
                self.assertEqual(returned_chunk_size, 1)
                self.assertIsNone(returned_n_splits)
Esempio n. 3
0
    def test_chunk_size(self):
        """
        Check the chunking produced by ``apply_numpy_chunking`` for several ``chunk_size`` values. ``n_splits=1`` is
        always passed along as well, so this also verifies that ``chunk_size`` takes precedence over ``n_splits``.
        """
        scenarios = ((1, 100), (3, 34), (200, 1), (None, 1))
        for chunk_size, expected_n_chunks in scenarios:
            with self.subTest(chunk_size=chunk_size):
                chunk_gen, returned_len, returned_chunk_size, returned_n_splits = apply_numpy_chunking(
                    self.test_data_numpy, chunk_size=chunk_size, n_splits=1)

                # Consume the generator. Every chunk should hold chunk_size rows, except possibly the last one which
                # can be smaller
                chunks = list(chunk_gen)
                self.assertEqual(len(chunks), expected_n_chunks)

                # Without an explicit chunk size a single chunk is expected; a stride of 100 then spans that chunk
                stride = chunk_size if chunk_size else 100
                for chunk_idx, chunk in enumerate(chunks):
                    rows = chunk[0]
                    self.assertIsInstance(rows, np.ndarray)
                    start = chunk_idx * stride
                    np.testing.assert_array_equal(rows, self.test_data_numpy[start:start + stride])

                # Remaining return values
                self.assertEqual(returned_len, expected_n_chunks)
                self.assertEqual(returned_chunk_size, 1)
                self.assertIsNone(returned_n_splits)
Esempio n. 4
0
    def test_iterable_len(self):
        """
        Check that ``apply_numpy_chunking`` honours ``iterable_len``. A value smaller than the input truncates the
        input to that many rows; a larger value or ``None`` leaves the input untouched.
        """
        scenarios = ((5, 5), (150, 100), (None, 100))
        for iterable_len, expected_size in scenarios:
            with self.subTest(iterable_len=iterable_len):
                chunk_gen, returned_len, returned_chunk_size, returned_n_splits = apply_numpy_chunking(
                    self.test_data_numpy, iterable_len=iterable_len, n_splits=1)

                # With a single split there should be exactly one chunk holding the (possibly truncated) input
                chunks = list(chunk_gen)
                self.assertEqual(len(chunks), 1)
                only_chunk_rows = chunks[0][0]
                self.assertIsInstance(only_chunk_rows, np.ndarray)
                np.testing.assert_array_equal(only_chunk_rows, self.test_data_numpy[:expected_size])

                # Remaining return values
                self.assertEqual(returned_len, 1)
                self.assertEqual(returned_chunk_size, 1)
                self.assertIsNone(returned_n_splits)
Esempio n. 5
0
    def imap_unordered(
            self,
            func_pointer: Callable,
            iterable_of_args: Union[Iterable, np.ndarray],
            iterable_len: Optional[int] = None,
            max_tasks_active: Optional[int] = None,
            chunk_size: Optional[int] = None,
            n_splits: Optional[int] = None,
            worker_lifespan: Optional[int] = None,
            progress_bar: bool = False,
            progress_bar_position: int = 0) -> Generator[Any, None, None]:
        """
        Same as ``multiprocessing.imap_unordered()``. Also allows a user to set the maximum number of tasks available in
        the queue.

        This is a generator: the arguments are chunked, the workers are started, the chunks are handed to the workers
        and results are yielded as they are obtained. The workers are stopped and joined once all results have been
        yielded.

        :param func_pointer: Function pointer to call each time new task arguments become available. When passing on the
            worker ID the function should receive the worker ID as its first argument. If shared objects are provided
            the function should receive those as the next argument. If the worker state has been enabled it should
            receive a state variable as the next argument
        :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
            passes it to the function pointer
        :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
            to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
            iterable object, or by specifying the number of tasks
        :param max_tasks_active: Maximum number of active tasks in the queue. Use ``None`` to not limit the queue; in
            that case all tasks are submitted before the first result is fetched. The string ``'n_jobs*2'`` is also
            accepted and is converted to ``n_jobs * 2``
        :param chunk_size: Number of simultaneous tasks to give to a worker. If ``None`` it will generate ``n_jobs * 4``
            number of chunks, unless ``n_splits`` is provided
        :param n_splits: Number of splits to use when ``chunk_size`` is ``None``
        :param worker_lifespan: Number of chunks a worker can handle before it is restarted. If ``None``, workers will
            stay alive the entire time. Use this when workers use up too much memory over the course of time
        :param progress_bar: When ``True`` it will display a progress bar
        :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
            multiple progress bars at the same time
        :return: Generator yielding unordered results
        """
        # If we're dealing with numpy arrays, we have to chunk them here already
        if isinstance(iterable_of_args, np.ndarray):
            iterator_of_chunked_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
                iterable_of_args, iterable_len, chunk_size, n_splits,
                self.n_jobs)

        # Check parameters and thereby obtain the number of tasks. The chunk_size and progress bar parameters could be
        # modified as well
        n_tasks, chunk_size, progress_bar = self._check_map_parameters(
            iterable_of_args, iterable_len, max_tasks_active, chunk_size,
            n_splits, worker_lifespan, progress_bar, progress_bar_position)

        # Chunk the function arguments. Numpy input has already been chunked above, so this only applies to other
        # iterables. When n_splits isn't provided, n_jobs * 4 is used instead
        if not isinstance(iterable_of_args, np.ndarray):
            iterator_of_chunked_args = chunk_tasks(iterable_of_args, n_tasks,
                                                   chunk_size, n_splits
                                                   or self.n_jobs * 4)

        # Start workers
        self.start_workers(func_pointer, worker_lifespan)

        # Create exception and progress bar handlers. The exception handler will receive any exceptions thrown by the
        # workers, terminates everything and re-raise an exception in the main process. The progress bar handler will
        # receive updates from the workers and updates the progress bar accordingly
        with ExceptionHandler(self.terminate, self._exception_queue, self.exception_caught, self._keep_order,
                              progress_bar is not None) as exception_handler, \
             ProgressBarHandler(func_pointer, progress_bar, n_tasks, progress_bar_position, self._task_completed_queue,
                                self._exception_queue, self.exception_caught):

            # Process all args in the iterable. If maximum number of active tasks is None, we avoid all the if and
            # try-except clauses to speed up the process. n_active counts the tasks that have been submitted but for
            # which no result has been obtained yet
            n_active = 0
            if max_tasks_active == 'n_jobs*2':
                max_tasks_active = self.n_jobs * 2

            if max_tasks_active is None:
                # Unbounded queue: submit everything up front; results are only collected in the loop further below
                for chunked_args in iterator_of_chunked_args:
                    # Stop given tasks when an exception was caught
                    if self.exception_caught.is_set():
                        break

                    # Add task
                    self.add_task(chunked_args)
                    n_active += 1

                    # Restart workers if necessary
                    self._restart_workers()

            elif isinstance(max_tasks_active, int):
                # Bounded queue: alternate between submitting tasks (while below the limit) and polling for results.
                # The loop ends as soon as there are no more tasks to submit
                while not self.exception_caught.is_set():
                    # Add task, only if allowed and if there are any
                    if n_active < max_tasks_active:
                        try:
                            self.add_task(next(iterator_of_chunked_args))
                            n_active += 1
                        except StopIteration:
                            break

                    # Check if new results are available, but don't wait for it
                    try:
                        yield from self.get_result(block=False)
                        n_active -= 1
                    except queue.Empty:
                        pass

                    # Restart workers if necessary
                    self._restart_workers()

            # Obtain the results not yet obtained. The short timeout ensures the exception flag is re-checked and
            # workers get restarted regularly while waiting
            while not self.exception_caught.is_set() and n_active != 0:
                try:
                    yield from self.get_result(block=True, timeout=0.1)
                    n_active -= 1
                except queue.Empty:
                    pass

                # Restart workers if necessary
                self._restart_workers()

            # Clean up time
            exception_handler.raise_on_exception()
            self.stop_and_join()
Esempio n. 6
0
    def imap(self,
             func_pointer: Callable,
             iterable_of_args: Union[Iterable, np.ndarray],
             iterable_len: Optional[int] = None,
             max_tasks_active: Optional[int] = None,
             chunk_size: Optional[int] = None,
             n_splits: Optional[int] = None,
             worker_lifespan: Optional[int] = None,
             progress_bar: bool = False,
             progress_bar_position: int = 0) -> Generator[Any, None, None]:
        """
        Same as ``multiprocessing.imap()``. Also allows a user to set the maximum number of tasks available in the
        queue.

        Every argument is tagged with its index and handed to :meth:`imap_unordered`. Results that arrive out of order
        are buffered and yielded as soon as all preceding results have been yielded.

        :param func_pointer: Function pointer to call each time new task arguments become available. When passing on the
            worker ID the function should receive the worker ID as its first argument. If shared objects are provided
            the function should receive those as the next argument. If the worker state has been enabled it should
            receive a state variable as the next argument
        :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
            passes it to the function pointer
        :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
            to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
            iterable object, or by specifying the number of tasks
        :param max_tasks_active: Maximum number of active tasks in the queue. Use ``None`` to not limit the queue
        :param chunk_size: Number of simultaneous tasks to give to a worker. If ``None`` it will generate ``n_jobs * 4``
            number of chunks
        :param n_splits: Number of splits to use when ``chunk_size`` is ``None``
        :param worker_lifespan: Number of chunks a worker can handle before it is restarted. If ``None``, workers will
            stay alive the entire time. Use this when workers use up too much memory over the course of time
        :param progress_bar: When ``True`` it will display a progress bar
        :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
            multiple progress bars at the same time
        :return: Generator yielding ordered results
        """
        # Notify workers to keep order in mind
        self._keep_order.set()

        # The try-finally guarantees the order flag is cleared again, also when an exception propagates or when the
        # consumer closes this generator before it is exhausted
        try:
            # If we're dealing with numpy arrays, we have to chunk them here already
            if isinstance(iterable_of_args, np.ndarray):
                iterable_of_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
                    iterable_of_args, iterable_len, chunk_size, n_splits,
                    self.n_jobs)

            # The arguments are wrapped in a generator below, which has no length. Determine the length up front
            if iterable_len is None and hasattr(iterable_of_args, '__len__'):
                iterable_len = len(iterable_of_args)

            # Yield results in order
            next_result_idx = 0
            tmp_results = {}
            for result_idx, result in self.imap_unordered(
                    func_pointer,
                    ((args_idx, args)
                     for args_idx, args in enumerate(iterable_of_args)),
                    iterable_len, max_tasks_active, chunk_size, n_splits,
                    worker_lifespan, progress_bar, progress_bar_position):

                # A result that arrives too early is temporarily stored until its predecessors have been yielded
                if result_idx != next_result_idx:
                    tmp_results[result_idx] = result
                    continue

                # The current result is the next one to return
                yield result
                next_result_idx += 1

                # Stored results can only become due after the index advanced, so flush them right away instead of
                # waiting for the next result to arrive. dict.pop() keeps the temporary store as small as possible
                while next_result_idx in tmp_results:
                    yield tmp_results.pop(next_result_idx)
                    next_result_idx += 1

            # Yield all remaining results (safety net for when indices are missing)
            for result_idx in sorted(tmp_results):
                yield tmp_results.pop(result_idx)
        finally:
            # Notify workers to forget about order
            self._keep_order.clear()
Esempio n. 7
0
    def map(self,
            func_pointer: Callable,
            iterable_of_args: Union[Iterable, np.ndarray],
            iterable_len: Optional[int] = None,
            max_tasks_active: Optional[int] = None,
            chunk_size: Optional[int] = None,
            n_splits: Optional[int] = None,
            worker_lifespan: Optional[int] = None,
            progress_bar: bool = False,
            progress_bar_position: int = 0,
            concatenate_numpy_output: bool = True
            ) -> Union[List[Any], np.ndarray]:
        """
        Same as ``multiprocessing.map()``. Also allows a user to set the maximum number of tasks available in the queue.
        Note that this function can be slower than the unordered version.

        Every argument is tagged with its index and handed to :meth:`map_unordered`; the results are sorted on that
        index afterwards.

        :param func_pointer: Function pointer to call each time new task arguments become available. When passing on the
            worker ID the function should receive the worker ID as its first argument. If shared objects are provided
            the function should receive those as the next argument. If the worker state has been enabled it should
            receive a state variable as the next argument
        :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
            passes it to the function pointer
        :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
            to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
            iterable object, or by specifying the number of tasks
        :param max_tasks_active: Maximum number of active tasks in the queue. Use ``None`` to not limit the queue
        :param chunk_size: Number of simultaneous tasks to give to a worker. If ``None`` it will generate ``n_jobs * 4``
            number of chunks
        :param n_splits: Number of splits to use when ``chunk_size`` is ``None``
        :param worker_lifespan: Number of chunks a worker can handle before it is restarted. If ``None``, workers will
            stay alive the entire time. Use this when workers use up too much memory over the course of time
        :param progress_bar: When ``True`` it will display a progress bar
        :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
            multiple progress bars at the same time
        :param concatenate_numpy_output: When ``True`` it will concatenate numpy output to a single numpy array
        :return: List with ordered results. When ``concatenate_numpy_output`` is ``True`` and the first result is a
            numpy array, the results are concatenated and a single numpy array is returned instead
        """
        # Notify workers to keep order in mind
        self._keep_order.set()

        # The try-finally guarantees the order flag is cleared again, also when chunking or processing raises
        try:
            # If we're dealing with numpy arrays, we have to chunk them here already
            if isinstance(iterable_of_args, np.ndarray):
                iterable_of_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
                    iterable_of_args, iterable_len, chunk_size, n_splits,
                    self.n_jobs)

            # The arguments are wrapped in a generator below, which has no length. Determine the length up front
            if iterable_len is None and hasattr(iterable_of_args, '__len__'):
                iterable_len = len(iterable_of_args)

            # Process all args
            results = self.map_unordered(
                func_pointer,
                ((args_idx, args)
                 for args_idx, args in enumerate(iterable_of_args)),
                iterable_len, max_tasks_active, chunk_size, n_splits,
                worker_lifespan, progress_bar, progress_bar_position)
        finally:
            # Notify workers to forget about order
            self._keep_order.clear()

        # Rearrange on the argument index and strip that index off again
        sorted_results = [
            indexed_result[1]
            for indexed_result in sorted(results, key=lambda indexed_result: indexed_result[0])
        ]

        # Convert back to numpy if necessary. Only the first result is inspected to decide whether to concatenate
        if sorted_results and concatenate_numpy_output and isinstance(sorted_results[0], np.ndarray):
            return np.concatenate(sorted_results)
        return sorted_results
Esempio n. 8
0
    def imap_unordered(
            self,
            func: Callable,
            iterable_of_args: Union[Sized, Iterable, np.ndarray],
            iterable_len: Optional[int] = None,
            max_tasks_active: Optional[int] = None,
            chunk_size: Optional[int] = None,
            n_splits: Optional[int] = None,
            worker_lifespan: Optional[int] = None,
            progress_bar: bool = False,
            progress_bar_position: int = 0,
            enable_insights: bool = False,
            worker_init: Optional[Callable] = None,
            worker_exit: Optional[Callable] = None
    ) -> Generator[Any, None, None]:
        """
        Same as ``multiprocessing.imap_unordered()``. Also allows a user to set the maximum number of tasks available in
        the queue.

        This is a generator: the arguments are chunked, workers are (re)started when needed, the chunks are handed to
        the workers and results are yielded as they are obtained. Unless ``keep_alive`` is set in the pool parameters,
        the workers are stopped and joined once all results have been yielded.

        :param func: Function to call each time new task arguments become available. When passing on the worker ID the
            function should receive the worker ID as its first argument. If shared objects are provided the function
            should receive those as the next argument. If the worker state has been enabled it should receive a state
            variable as the next argument
        :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
            passes it to the function ``func``
        :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
            to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
            iterable object, or by specifying the number of tasks
        :param max_tasks_active: Maximum number of active tasks in the queue. If ``None`` it will be converted to
            ``n_jobs * 2``
        :param chunk_size: Number of simultaneous tasks to give to a worker. When ``None`` it will use ``n_splits``.
        :param n_splits: Number of splits to use when ``chunk_size`` is ``None``. When both ``chunk_size`` and
            ``n_splits`` are ``None``, it will use ``n_splits = n_jobs * 64``.
        :param worker_lifespan: Number of tasks a worker can handle before it is restarted. If ``None``, workers will
            stay alive the entire time. Use this when workers use up too much memory over the course of time
        :param progress_bar: When ``True`` it will display a progress bar
        :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
            multiple progress bars at the same time
        :param enable_insights: Whether to enable worker insights. Might come at a small performance penalty (often
            negligible)
        :param worker_init: Function to call each time a new worker starts. When passing on the worker ID the function
            should receive the worker ID as its first argument. If shared objects are provided the function should
            receive those as the next argument. If the worker state has been enabled it should receive a state variable
            as the next argument
        :param worker_exit: Function to call each time a worker exits. Return values will be fetched and made available
            through :obj:`mpire.WorkerPool.get_exit_results`. When passing on the worker ID the function should receive
            the worker ID as its first argument. If shared objects are provided the function should receive those as the
            next argument. If the worker state has been enabled it should receive a state variable as the next argument
        :return: Generator yielding unordered results
        """
        # If we're dealing with numpy arrays, we have to chunk them here already. The empty list is only a placeholder:
        # it is always overwritten, either here (numpy input) or by chunk_tasks below (any other input)
        iterator_of_chunked_args = []
        if isinstance(iterable_of_args, np.ndarray):
            iterator_of_chunked_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
                iterable_of_args, iterable_len, chunk_size, n_splits,
                self.params.n_jobs)

        # Check parameters and thereby obtain the number of tasks. The max_tasks_active, chunk_size and progress bar
        # parameters could be modified as well
        n_tasks, max_tasks_active, chunk_size, progress_bar = self.params.check_map_parameters(
            iterable_of_args, iterable_len, max_tasks_active, chunk_size,
            n_splits, worker_lifespan, progress_bar, progress_bar_position)

        # Chunk the function arguments. Numpy input has already been chunked above, so this only applies to other
        # iterables. When n_splits isn't provided, n_jobs * 64 is used instead
        if not isinstance(iterable_of_args, np.ndarray):
            iterator_of_chunked_args = chunk_tasks(
                iterable_of_args, n_tasks, chunk_size, n_splits
                or self.params.n_jobs * 64)

        # Reset profiling stats
        self._worker_insights.reset_insights(enable_insights)

        # Start workers if there aren't any. If they already exist they must be restarted when either the function to
        # execute or the worker lifespan changes
        if self._workers and self.params.workers_need_restart(
                func, worker_init, worker_exit, worker_lifespan):
            self.stop_and_join()
        if not self._workers:
            logger.debug("Spinning up workers")
            self._start_workers(func, worker_init, worker_exit,
                                worker_lifespan, bool(progress_bar))

        # Create exception, exit results, and progress bar handlers. The exception handler receives any exceptions
        # thrown by the workers, terminates everything and re-raise an exception in the main process. The exit results
        # handler fetches results from the exit function, if provided. The progress bar handler receives progress
        # updates from the workers and updates the progress bar accordingly
        with ExceptionHandler(self.terminate, self._worker_comms, bool(progress_bar)) as exception_handler, \
             ProgressBarHandler(func, self.params.n_jobs, progress_bar, n_tasks, progress_bar_position,
                                self._worker_comms, self._worker_insights):

            # Process all args in the iterable. n_active counts the tasks that have been submitted but for which no
            # result has been obtained yet. The loop alternates between submitting tasks (while below the limit) and
            # polling for results, and ends as soon as there are no more tasks to submit
            n_active = 0
            while not self._worker_comms.exception_caught():
                # Add task, only if allowed and if there are any
                if n_active < max_tasks_active:
                    try:
                        self._worker_comms.add_task(
                            next(iterator_of_chunked_args))
                        n_active += 1
                    except StopIteration:
                        break

                # Check if new results are available, but don't wait for it
                try:
                    yield from self._worker_comms.get_results(block=False)
                    n_active -= 1
                except queue.Empty:
                    pass

                # Restart workers if necessary
                self._restart_workers()

            # Obtain the results not yet obtained. The short timeout ensures the exception flag is re-checked and
            # workers get restarted regularly while waiting
            while not self._worker_comms.exception_caught() and n_active != 0:
                try:
                    yield from self._worker_comms.get_results(block=True,
                                                              timeout=0.1)
                    n_active -= 1
                except queue.Empty:
                    pass

                # Restart workers if necessary
                self._restart_workers()

            # Clean up time. When keep_alive is set to True we won't join the workers. During the stop_and_join call an
            # error can occur as well, so we have to check once again whether an exception occurred and raise if it did
            exception_handler.raise_on_exception()
            if not self.params.keep_alive:
                self.stop_and_join()
                exception_handler.raise_on_exception()

        # Log insights
        if enable_insights:
            logger.debug(self._worker_insights.get_insights_string())