def test_empty_input(self):
    """
    An empty input iterable must result in a chunker that yields no chunks at all, both for sized containers and for
    plain iterators
    """
    # Each case is a sub-test label plus a factory that builds the chunker. The factory is only invoked inside the
    # sub-test, so a failure while building the chunker is attributed to the right case
    cases = (
        ('list input', lambda: chunk_tasks([], n_splits=5)),
        ('generator/iterator input', lambda: chunk_tasks(iter([]), iterable_len=0, n_splits=5)),
    )
    for label, make_chunker in cases:
        with self.subTest(label):
            produced = list(make_chunker())
            self.assertEqual(len(produced), 0)
def test_chunk_size_has_priority_over_n_splits(self):
    """
    When both chunk_size and n_splits are passed, chunk_size must be the one that determines the chunking
    """
    n_tasks = 4
    produced = list(chunk_tasks(range(n_tasks), chunk_size=4, n_splits=4))

    # A chunk size equal to the number of tasks has to give exactly one chunk that holds every task
    self.assertEqual(len(produced), 1)
    only_chunk = produced[0]
    self.assertEqual(len(only_chunk), 4)

    # Flattening the chunks must give back the original tasks, in order
    flattened = list(chain.from_iterable(produced))
    self.assertEqual(list(range(n_tasks)), flattened)
def test_n_splits(self):
    """
    Exercise n_splits with inputs that hold fewer tasks than, exactly as many tasks as, and more tasks than the
    requested number of splits
    """
    n_splits = 5
    for offset in (-1, 0, 1):
        num_args = n_splits + offset
        expected_tasks = list(range(num_args))

        # The number of chunks is expected to be capped by the number of available tasks
        expected_n_chunks = min(n_splits, num_args)

        # Sized input. A range object implements __len__ and friends, so it is handled like a regular list and no
        # explicit iterable_len is passed
        with self.subTest(num_args=num_args, input='list'):
            chunks = list(chunk_tasks(range(num_args), n_splits=n_splits))
            self.assertEqual(len(chunks), expected_n_chunks)
            self.assertEqual(expected_tasks, list(chain.from_iterable(chunks)))

        # Real iterator input. A range object does not behave like one, so it is wrapped in iter() and the length
        # is passed explicitly
        with self.subTest(num_args=num_args, input='generator/iterator'):
            task_iterator = iter(range(num_args))
            chunks = list(chunk_tasks(task_iterator, iterable_len=num_args, n_splits=n_splits))
            self.assertEqual(len(chunks), expected_n_chunks)
            self.assertEqual(expected_tasks, list(chain.from_iterable(chunks)))
def test_generator_without_iterable_len(self):
    """
    Passing an iterator without also passing iterable_len must raise a ValueError
    """
    # Both creating the chunker and requesting its first chunk happen inside the assertion context, so the test
    # passes regardless of which of the two steps raises
    with self.assertRaises(ValueError):
        chunker = chunk_tasks(iter([]), n_splits=1)
        next(chunker)
def test_no_chunk_size_no_n_splits_provided(self):
    """
    Leaving out both chunk_size and n_splits must raise a ValueError
    """
    # Both creating the chunker and requesting its first chunk happen inside the assertion context, so the test
    # passes regardless of which of the two steps raises
    with self.assertRaises(ValueError):
        chunker = chunk_tasks([])
        next(chunker)
def imap_unordered(
    self,
    func_pointer: Callable,
    iterable_of_args: Union[Iterable, np.ndarray],
    iterable_len: Optional[int] = None,
    max_tasks_active: Optional[int] = None,
    chunk_size: Optional[int] = None,
    n_splits: Optional[int] = None,
    worker_lifespan: Optional[int] = None,
    progress_bar: bool = False,
    progress_bar_position: int = 0
) -> Generator[Any, None, None]:
    """
    Lazy, unordered map over the workers, comparable to ``multiprocessing.imap_unordered()``. On top of that, the
    number of tasks that may be queued at the same time can be limited.

    :param func_pointer: Function pointer to call each time new task arguments become available. When passing on
        the worker ID the function should receive the worker ID as its first argument. If shared objects are
        provided the function should receive those as the next argument. If the worker state has been enabled it
        should receive a state variable as the next argument
    :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
        passes it to the function pointer
    :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it
        needs to know the number of tasks. This can either be provided by implementing the ``__len__`` function on
        the iterable object, or by specifying the number of tasks
    :param max_tasks_active: Maximum number of active tasks in the queue. Use ``None`` to not limit the queue. The
        string ``'n_jobs*2'`` is accepted as well and is replaced by twice the number of jobs
    :param chunk_size: Number of simultaneous tasks to give to a worker. If ``None`` it will generate
        ``n_jobs * 4`` number of chunks
    :param n_splits: Number of splits to use when ``chunk_size`` is ``None``
    :param worker_lifespan: Number of chunks a worker can handle before it is restarted. If ``None``, workers will
        stay alive the entire time. Use this when workers use up too much memory over the course of time
    :param progress_bar: When ``True`` it will display a progress bar
    :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
        multiple progress bars at the same time
    :return: Generator yielding unordered results
    """
    numpy_input = isinstance(iterable_of_args, np.ndarray)

    # Numpy arrays have to be chunked before the parameters are checked, because the chunking can change the
    # length, chunk size and number of splits that are validated below
    if numpy_input:
        (iterator_of_chunked_args, iterable_len,
         chunk_size, n_splits) = apply_numpy_chunking(iterable_of_args, iterable_len, chunk_size, n_splits,
                                                      self.n_jobs)

    # Validate the parameters, which also gives us the number of tasks. The chunk size and the progress bar
    # parameter may come back modified
    n_tasks, chunk_size, progress_bar = self._check_map_parameters(
        iterable_of_args, iterable_len, max_tasks_active, chunk_size, n_splits, worker_lifespan, progress_bar,
        progress_bar_position
    )

    # Every other kind of iterable is chunked here, now that the number of tasks is known
    if not numpy_input:
        iterator_of_chunked_args = chunk_tasks(iterable_of_args, n_tasks, chunk_size,
                                               n_splits or self.n_jobs * 4)

    self.start_workers(func_pointer, worker_lifespan)

    # The exception handler receives exceptions thrown by the workers, terminates everything and re-raises an
    # exception in the main process. The progress bar handler receives updates from the workers and updates the
    # progress bar accordingly.
    # NOTE(review): ``progress_bar is not None`` only distinguishes anything if ``_check_map_parameters`` returns
    # ``None`` when no progress bar is wanted; should it return a plain bool this is always ``True`` -- confirm
    with ExceptionHandler(self.terminate, self._exception_queue, self.exception_caught, self._keep_order,
                          progress_bar is not None) as exception_handler, \
            ProgressBarHandler(func_pointer, progress_bar, n_tasks, progress_bar_position,
                               self._task_completed_queue, self._exception_queue, self.exception_caught):

        # Number of chunks that were handed out and for which no result has been received yet
        in_flight = 0

        if max_tasks_active == 'n_jobs*2':
            max_tasks_active = self.n_jobs * 2

        if max_tasks_active is None:
            # Unlimited queue: submit every chunk right away, without the bookkeeping needed for a bounded queue.
            # Results are only collected once everything has been submitted
            for chunked_args in iterator_of_chunked_args:
                # Stop handing out tasks as soon as a worker reported an exception
                if self.exception_caught.is_set():
                    break

                self.add_task(chunked_args)
                in_flight += 1
                self._restart_workers()

        elif isinstance(max_tasks_active, int):
            # Bounded queue: alternate between topping up the queue and polling for finished chunks
            while not self.exception_caught.is_set():
                if in_flight < max_tasks_active:
                    try:
                        self.add_task(next(iterator_of_chunked_args))
                    except StopIteration:
                        # No more chunks to hand out
                        break
                    in_flight += 1

                # Yield whatever is available right now, without waiting for it
                try:
                    yield from self.get_result(block=False)
                except queue.Empty:
                    pass
                else:
                    in_flight -= 1

                self._restart_workers()

        # Collect the results of the chunks that are still outstanding. The short timeout makes sure a caught
        # exception is noticed in between
        while not (self.exception_caught.is_set() or in_flight == 0):
            try:
                yield from self.get_result(block=True, timeout=0.1)
            except queue.Empty:
                pass
            else:
                in_flight -= 1

            self._restart_workers()

        # Clean up time
        exception_handler.raise_on_exception()
        self.stop_and_join()
def imap_unordered(
    self,
    func: Callable,
    iterable_of_args: Union[Sized, Iterable, np.ndarray],
    iterable_len: Optional[int] = None,
    max_tasks_active: Optional[int] = None,
    chunk_size: Optional[int] = None,
    n_splits: Optional[int] = None,
    worker_lifespan: Optional[int] = None,
    progress_bar: bool = False,
    progress_bar_position: int = 0,
    enable_insights: bool = False,
    worker_init: Optional[Callable] = None,
    worker_exit: Optional[Callable] = None
) -> Generator[Any, None, None]:
    """
    Lazy, unordered map over the workers, comparable to ``multiprocessing.imap_unordered()``. On top of that, the
    number of tasks that may be queued at the same time can be limited.

    :param func: Function to call each time new task arguments become available. When passing on the worker ID the
        function should receive the worker ID as its first argument. If shared objects are provided the function
        should receive those as the next argument. If the worker state has been enabled it should receive a state
        variable as the next argument
    :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
        passes it to the function ``func``
    :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it
        needs to know the number of tasks. This can either be provided by implementing the ``__len__`` function on
        the iterable object, or by specifying the number of tasks
    :param max_tasks_active: Maximum number of active tasks in the queue. If ``None`` it will be converted to
        ``n_jobs * 2``
    :param chunk_size: Number of simultaneous tasks to give to a worker. When ``None`` it will use ``n_splits``.
    :param n_splits: Number of splits to use when ``chunk_size`` is ``None``. When both ``chunk_size`` and
        ``n_splits`` are ``None``, it will use ``n_splits = n_jobs * 64``.
    :param worker_lifespan: Number of tasks a worker can handle before it is restarted. If ``None``, workers will
        stay alive the entire time. Use this when workers use up too much memory over the course of time
    :param progress_bar: When ``True`` it will display a progress bar
    :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
        multiple progress bars at the same time
    :param enable_insights: Whether to enable worker insights. Might come at a small performance penalty (often
        negligible)
    :param worker_init: Function to call each time a new worker starts. When passing on the worker ID the function
        should receive the worker ID as its first argument. If shared objects are provided the function should
        receive those as the next argument. If the worker state has been enabled it should receive a state variable
        as the next argument
    :param worker_exit: Function to call each time a worker exits. Return values will be fetched and made available
        through :obj:`mpire.WorkerPool.get_exit_results`. When passing on the worker ID the function should receive
        the worker ID as its first argument. If shared objects are provided the function should receive those as
        the next argument. If the worker state has been enabled it should receive a state variable as the next
        argument
    :return: Generator yielding unordered results
    """
    numpy_input = isinstance(iterable_of_args, np.ndarray)

    # Numpy arrays have to be chunked before the parameters are checked, because the chunking can change the
    # length, chunk size and number of splits that are validated below
    iterator_of_chunked_args = []
    if numpy_input:
        (iterator_of_chunked_args, iterable_len,
         chunk_size, n_splits) = apply_numpy_chunking(iterable_of_args, iterable_len, chunk_size, n_splits,
                                                      self.params.n_jobs)

    # Validate the parameters, which also gives us the number of tasks. The maximum number of active tasks, the
    # chunk size and the progress bar parameter may come back modified
    n_tasks, max_tasks_active, chunk_size, progress_bar = self.params.check_map_parameters(
        iterable_of_args, iterable_len, max_tasks_active, chunk_size, n_splits, worker_lifespan, progress_bar,
        progress_bar_position
    )

    # Every other kind of iterable is chunked here, now that the number of tasks is known
    if not numpy_input:
        iterator_of_chunked_args = chunk_tasks(iterable_of_args, n_tasks, chunk_size,
                                               n_splits or self.params.n_jobs * 64)

    # Reset profiling stats
    self._worker_insights.reset_insights(enable_insights)

    # Existing workers are reused, unless the function to execute, the init/exit functions or the worker lifespan
    # changed, in which case they are stopped first. Workers are started when there are none (anymore)
    if self._workers and self.params.workers_need_restart(func, worker_init, worker_exit, worker_lifespan):
        self.stop_and_join()
    if not self._workers:
        logger.debug("Spinning up workers")
        self._start_workers(func, worker_init, worker_exit, worker_lifespan, bool(progress_bar))

    # The exception handler receives any exceptions thrown by the workers, terminates everything and re-raises an
    # exception in the main process. The progress bar handler receives progress updates from the workers and
    # updates the progress bar accordingly
    with ExceptionHandler(self.terminate, self._worker_comms, bool(progress_bar)) as exception_handler, \
            ProgressBarHandler(func, self.params.n_jobs, progress_bar, n_tasks, progress_bar_position,
                               self._worker_comms, self._worker_insights):

        # Number of chunks that were handed out and for which no result has been received yet
        in_flight = 0

        # Alternate between topping up the task queue and polling for finished chunks
        while not self._worker_comms.exception_caught():
            if in_flight < max_tasks_active:
                try:
                    self._worker_comms.add_task(next(iterator_of_chunked_args))
                except StopIteration:
                    # No more chunks to hand out
                    break
                in_flight += 1

            # Yield whatever is available right now, without waiting for it
            try:
                yield from self._worker_comms.get_results(block=False)
            except queue.Empty:
                pass
            else:
                in_flight -= 1

            self._restart_workers()

        # Collect the results of the chunks that are still outstanding. The short timeout makes sure a caught
        # exception is noticed in between
        while not (self._worker_comms.exception_caught() or in_flight == 0):
            try:
                yield from self._worker_comms.get_results(block=True, timeout=0.1)
            except queue.Empty:
                pass
            else:
                in_flight -= 1

            self._restart_workers()

        # Clean up time. With keep_alive enabled the workers are not joined. Joining can produce an error of its
        # own, which is why the exception check is repeated right after it
        exception_handler.raise_on_exception()
        if not self.params.keep_alive:
            self.stop_and_join()
            exception_handler.raise_on_exception()

    # Log insights
    if enable_insights:
        logger.debug(self._worker_insights.get_insights_string())