def test_no_input_data(self):
    """ Test with empty iterable (this failed before) """
    print()
    with WorkerPool() as pool:
        self.assertListEqual(pool.map(square, [], progress_bar=True), [])
def test_dictionary_input(self):
    """ Test map with dictionary input """
    def subtract(x, y):
        return x - y

    with WorkerPool(n_jobs=1) as pool:
        # Should work
        with self.subTest('correct input'):
            results_list = pool.map(subtract, [{'x': 5, 'y': 2}, {'y': 5, 'x': 2}])
            self.assertEqual(results_list, [3, -3])

        # Should throw
        with self.subTest("missing 'y', unknown parameter 'z'"), self.assertRaises(TypeError):
            pool.map(subtract, [{'x': 5, 'z': 2}])

        # Should throw
        with self.subTest("unknown parameter 'z'"), self.assertRaises(TypeError):
            pool.map(subtract, [{'x': 5, 'y': 2, 'z': 2}])
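# Illustrative sketch (not part of the test suite; assumes MPIRE is installed): MPIRE
# unpacks a dict task as keyword arguments and a tuple task as positional arguments, so
# a missing or unknown key raises TypeError just like a regular function call would.
from mpire import WorkerPool

def subtract(x, y):
    return x - y

with WorkerPool(n_jobs=1) as pool:
    print(pool.map(subtract, [{'x': 5, 'y': 2}]))  # subtract(x=5, y=2) -> [3]
    print(pool.map(subtract, [(5, 2)]))            # subtract(5, 2)     -> [3]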
def test_invalid_input(self):
    """ Test that when parameters are invalid, an error is raised """
    for n_jobs, cpu_ids in product([None, 1, 2, 4],
                                   [[0, 1], [0, 1, 2, 3], [[0, 1], [0, 1]]]):
        if len(cpu_ids) != (n_jobs or cpu_count()):
            with self.subTest(n_jobs=n_jobs, cpu_ids=cpu_ids), self.assertRaises(ValueError):
                WorkerPool(n_jobs=n_jobs, cpu_ids=cpu_ids)

    # Should raise when CPU IDs are out of scope
    with self.assertRaises(ValueError):
        WorkerPool(n_jobs=1, cpu_ids=[-1])
    with self.assertRaises(ValueError):
        WorkerPool(n_jobs=1, cpu_ids=[cpu_count()])
def test_daemon_nested_workerpool(self):
    """ Tests nested WorkerPools when daemon==True, which should not work """
    with self.assertRaises(AssertionError), WorkerPool(n_jobs=4, daemon=True) as pool:
        pool.map(self._square_daemon, ((X,) for X in repeat(self.test_data, 4)), chunk_size=1)
def test_invalid_progress_bar_position(self):
    """ Test different values of progress_bar_position, which should be a non-negative integer """
    for progress_bar_position, error in [(-1, ValueError), ('numero uno', TypeError)]:
        with self.subTest(input='regular input', progress_bar_position=progress_bar_position), \
                self.assertRaises(error), WorkerPool(n_jobs=1) as pool:
            pool.map(square, self.test_data, progress_bar=True,
                     progress_bar_position=progress_bar_position)

        with self.subTest(input='numpy input', progress_bar_position=progress_bar_position), \
                self.assertRaises(error), WorkerPool(n_jobs=1) as pool:
            pool.map(square_numpy, self.test_data_numpy, progress_bar=True,
                     progress_bar_position=progress_bar_position)
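# Illustrative sketch (not part of the test suite): progress_bar_position selects the
# tqdm row the pool's progress bar is drawn on, which keeps multiple simultaneous bars
# (e.g. from nested pools) from overwriting each other.
from mpire import WorkerPool

def square(x):
    return x * x

with WorkerPool(n_jobs=2) as pool:
    # Draw this pool's bar on the second row (position 1); position 0 is the default
    pool.map(square, range(100), progress_bar=True, progress_bar_position=1)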
def test_start_method(self):
    """ Test different start methods. All should work just fine """
    for n_jobs, start_method in product([1, 3], ['fork', 'forkserver', 'spawn', 'threading']):
        with self.subTest(n_jobs=n_jobs, start_method=start_method), \
                WorkerPool(n_jobs, start_method=start_method) as pool:
            self.assertListEqual(pool.map(square, self.test_data), self.test_desired_output)
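# Illustrative sketch (not part of the test suite): the start_method constructor argument
# selects how workers are created; 'threading' uses threads instead of processes. 'spawn'
# and 'forkserver' require the task function to be importable, hence the __main__ guard.
from mpire import WorkerPool

def square(x):
    return x * x

if __name__ == '__main__':
    with WorkerPool(n_jobs=2, start_method='spawn') as pool:
        print(pool.map(square, range(5)))  # [0, 1, 4, 9, 16]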
def test_valid_input(self):
    """ Test that when parameters are valid, nothing breaks. We don't actually check if CPU pinning is
    happening """
    for n_jobs, cpu_ids in product([None, 1, 2, 4],
                                   [None, [0], [1337], [0, 1], [0, 1, 2, 3], [[0, 3]], [[0, 1], [0, 1]]]):

        # Things should work fine when cpu_ids is None or the number of cpu_ids given is one or equals
        # the number of jobs
        if cpu_ids is None or len(cpu_ids) == 1 or len(cpu_ids) == (n_jobs or cpu_count()):

            # When CPU IDs exceed the number of CPUs it should raise
            if cpu_ids is not None and np.array(cpu_ids).max() >= cpu_count():
                with self.subTest(n_jobs=n_jobs, cpu_ids=cpu_ids), self.assertRaises(ValueError), \
                        WorkerPool(n_jobs=n_jobs, cpu_ids=cpu_ids) as pool:
                    pool.map(square, self.test_data)

            else:
                with self.subTest(n_jobs=n_jobs, cpu_ids=cpu_ids), patch('subprocess.call') as p, \
                        WorkerPool(n_jobs=n_jobs, cpu_ids=cpu_ids) as pool:

                    # Verify results
                    results_list = pool.map(square, self.test_data)
                    self.assertTrue(isinstance(results_list, list))
                    self.assertEqual(self.test_desired_output, results_list)

                    # Verify that CPU pinning is used, is called as many times as there are jobs and is
                    # called for each worker process ID
                    if cpu_ids is None:
                        self.assertEqual(p.call_args_list, [])
                    else:
                        self.assertEqual(p.call_count, pool.n_jobs)
                        pids = {call[0][0].rsplit(" ", 1)[-1] for call in p.call_args_list}
                        self.assertEqual(len(pids), pool.n_jobs)
def test_exceptions(self):
    """ Tests if MPIRE can handle exceptions well """
    # This print statement is intentional as it will print multiple progress bars
    print()
    for n_jobs, n_tasks_max_active, worker_lifespan, progress_bar in \
            product([1, 20], [None, 1], [None, 1], [False, True]):
        with WorkerPool(n_jobs=n_jobs) as pool:

            # Should work for map like functions
            with self.subTest(n_jobs=n_jobs, n_tasks_max_active=n_tasks_max_active,
                              worker_lifespan=worker_lifespan, progress_bar=progress_bar,
                              function='square_raises', map='map'), \
                    self.assertRaises(ValueError):
                pool.map(self._square_raises, self.test_data, max_tasks_active=n_tasks_max_active,
                         worker_lifespan=worker_lifespan, progress_bar=progress_bar)

            # Should work for imap like functions
            with self.subTest(n_jobs=n_jobs, n_tasks_max_active=n_tasks_max_active,
                              worker_lifespan=worker_lifespan, progress_bar=progress_bar,
                              function='square_raises', map='imap'), \
                    self.assertRaises(ValueError):
                list(pool.imap_unordered(self._square_raises, self.test_data,
                                         max_tasks_active=n_tasks_max_active,
                                         worker_lifespan=worker_lifespan, progress_bar=progress_bar))

            # Should work for map like functions
            with self.subTest(n_jobs=n_jobs, n_tasks_max_active=n_tasks_max_active,
                              worker_lifespan=worker_lifespan, progress_bar=progress_bar,
                              function='square_raises_on_idx', map='map'), \
                    self.assertRaises(ValueError):
                pool.map(self._square_raises_on_idx, self.test_data,
                         max_tasks_active=n_tasks_max_active, worker_lifespan=worker_lifespan,
                         progress_bar=progress_bar)

            # Should work for imap like functions
            with self.subTest(n_jobs=n_jobs, n_tasks_max_active=n_tasks_max_active,
                              worker_lifespan=worker_lifespan, progress_bar=progress_bar,
                              function='square_raises_on_idx', map='imap'), \
                    self.assertRaises(ValueError):
                list(pool.imap_unordered(self._square_raises_on_idx, self.test_data,
                                         max_tasks_active=n_tasks_max_active,
                                         worker_lifespan=worker_lifespan, progress_bar=progress_bar))
def test_by_constructor(self):
    """ Tests passing on the worker ID in the constructor """
    for n_jobs, pass_worker_id in product([1, 2, 4], [True, False]):
        with self.subTest(n_jobs=n_jobs, pass_worker_id=pass_worker_id, config_type='constructor'), \
                WorkerPool(n_jobs=n_jobs, pass_worker_id=pass_worker_id) as pool:

            # Tests should fail when the number of arguments in the function is incorrect, the worker ID
            # is not within range, or when the shared objects are not equal to the given arguments
            f = self._f1 if pass_worker_id else self._f2
            pool.map(f, ((n_jobs,) for _ in range(10)), iterable_len=10)
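# Illustrative sketch (not part of the test suite): with pass_worker_id=True, MPIRE
# prepends the worker ID (an integer in range(n_jobs)) to every function call.
from mpire import WorkerPool

def task(worker_id, x):
    # worker_id is injected by MPIRE; x comes from the iterable
    return worker_id, x * x

with WorkerPool(n_jobs=2, pass_worker_id=True) as pool:
    results = pool.map(task, range(4))  # e.g. [(0, 0), (1, 1), (0, 4), (1, 9)]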
def test_non_daemon_nested_workerpool(self):
    """ Tests nested WorkerPools when daemon==False, which should work """
    with WorkerPool(n_jobs=4, daemon=False) as pool:

        # Obtain results using nested WorkerPools
        results = pool.map(self._square_daemon, ((X,) for X in repeat(self.test_data, 4)), chunk_size=1)

        # Each of the results should match
        for results_list in results:
            self.assertTrue(isinstance(results_list, list))
            self.assertEqual(self.test_desired_output, results_list)
def test_valid_progress_bars_regular_input(self):
    """ Valid progress_bar values are either True or False """
    print()
    for n_jobs, progress_bar in product([None, 1, 2], [True, False]):
        with self.subTest(n_jobs=n_jobs), WorkerPool(n_jobs=n_jobs) as pool:
            results_list = pool.map(square, self.test_data, progress_bar=progress_bar)
            self.assertTrue(isinstance(results_list, list))
            self.assertEqual(self.test_desired_output, results_list)
def test_by_constructor(self):
    """ Tests passing shared objects in the constructor """
    for n_jobs, shared_objects in product([1, 2, 4], [None, (37, 42), ({'1', '2', '3'})]):

        # Pass on arguments using the constructor instead
        with self.subTest(n_jobs=n_jobs, shared_objects=shared_objects, config_type='constructor'), \
                WorkerPool(n_jobs=n_jobs, shared_objects=shared_objects) as pool:

            # Tests should fail when the number of arguments in the function is incorrect, the worker ID
            # is not within range, or when the shared objects are not equal to the given arguments
            f = self._f1 if shared_objects else self._f2
            pool.map(f, ((shared_objects, n_jobs) for _ in range(10)), iterable_len=10)
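# Illustrative sketch (not part of the test suite): shared_objects set in the constructor
# are handed to each worker once and prepended to every function call, avoiding a copy of
# large read-only data per task.
from mpire import WorkerPool

def count_in(shared, x):
    # shared is the object given to the constructor; x comes from the iterable
    return shared.count(x)

with WorkerPool(n_jobs=2, shared_objects=[1, 2, 2, 3]) as pool:
    results = pool.map(count_in, [1, 2, 3])  # [1, 2, 1]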
def predict(self, texts: List[str], pbar: bool = True) -> List[List[Instance]]:
    """ Predict the semtypes for each text.

    :param texts: The texts for which to predict.
    :param pbar: Whether to show a progress bar.
    :return: A list of tuples containing start and end index and label.
    """
    with WorkerPool(n_jobs=self.n_workers) as pool:
        pred = pool.map(self._predict_single, texts, chunk_size=1, progress_bar=pbar)
    return pred
def test_valid_progress_bars_numpy_input(self):
    """ Test with numpy, as that will change the number of tasks """
    print()
    for n_jobs, progress_bar in product([None, 1, 2], [True, False]):

        # Should work just fine
        with self.subTest(n_jobs=n_jobs, progress_bar=progress_bar), WorkerPool(n_jobs=n_jobs) as pool:
            results = pool.map(square_numpy, self.test_data_numpy, progress_bar=progress_bar)
            self.assertTrue(isinstance(results, np.ndarray))
            np.testing.assert_array_equal(results, self.test_desired_output_numpy)
def test_faulty_parameters(self):
    """ Should raise when wrong parameter values are used """
    with WorkerPool(n_jobs=4) as pool:

        # Zero (or a negative number of) active tasks/lifespan should result in a value error
        for n, map_function in product([-3, -1, 0, 3.14],
                                       [pool.map, pool.map_unordered, pool.imap, pool.imap_unordered]):
            # max_tasks_active
            with self.subTest(max_tasks_active=n, map_function=map_function), \
                    self.assertRaises(ValueError if isinstance(n, int) else TypeError):
                list(map_function(square, self.test_data, max_tasks_active=n))

            # worker_lifespan
            with self.subTest(worker_lifespan=n, map_function=map_function), \
                    self.assertRaises(ValueError if isinstance(n, int) else TypeError):
                list(map_function(square, self.test_data, worker_lifespan=n))

        # chunk_size should be an integer or None
        with self.subTest(chunk_size='3'), self.assertRaises(TypeError):
            for _ in pool.imap(square, self.test_data, chunk_size='3'):
                pass

        # chunk_size should be a positive integer
        with self.subTest(chunk_size=-5), self.assertRaises(ValueError):
            for _ in pool.imap(square, self.test_data, chunk_size=-5):
                pass

        # n_splits should be an integer or None
        with self.subTest(n_splits='3'), self.assertRaises(TypeError):
            for _ in pool.imap(square, self.test_data, n_splits='3'):
                pass

        # n_splits should be a positive integer
        with self.subTest(n_splits=-5), self.assertRaises(ValueError):
            for _ in pool.imap(square, self.test_data, n_splits=-5):
                pass
def test_enable_insights(self):
    """ Insight containers are initially set to None values. When enabled they should be changed to
    appropriate containers. When a second task is started it should reset them. If disabled, they should
    remain None """
    with WorkerPool(n_jobs=2) as pool:

        # We run this a few times to see if it resets properly. We only verify this by checking the
        # n_completed_tasks
        for idx in range(3):
            with self.subTest('enabled', idx=idx):
                pool.map(square, range(10), enable_insights=True, worker_init=self._init,
                         worker_exit=self._exit)

                # Basic sanity checks for the values. Some max task args can be empty, in that case the
                # duration should be 0 (= no data)
                self.assertGreater(sum(pool._worker_insights.worker_start_up_time), 0)
                self.assertGreater(sum(pool._worker_insights.worker_init_time), 0)
                self.assertEqual(sum(pool._worker_insights.worker_n_completed_tasks), 10)
                self.assertGreater(sum(pool._worker_insights.worker_waiting_time), 0)
                self.assertGreater(sum(pool._worker_insights.worker_working_time), 0)
                self.assertGreater(sum(pool._worker_insights.worker_exit_time), 0)
                self.assertGreater(max(pool._worker_insights.max_task_duration), 0)
                for duration, args in zip(pool._worker_insights.max_task_duration,
                                          pool._worker_insights.max_task_args):
                    if duration == 0:
                        self.assertEqual(args, '')
                    else:
                        self.assertIn(args, {'Arg 0: 0', 'Arg 0: 1', 'Arg 0: 2', 'Arg 0: 3', 'Arg 0: 4',
                                             'Arg 0: 5', 'Arg 0: 6', 'Arg 0: 7', 'Arg 0: 8', 'Arg 0: 9'})

        # Disabling should set things to None again
        with self.subTest('disable'):
            pool.map(square, range(10), enable_insights=False)
            self.assertIsNone(pool._worker_insights.insights_manager)
            self.assertIsNone(pool._worker_insights.insights_manager_lock)
            self.assertIsNone(pool._worker_insights.worker_start_up_time)
            self.assertIsNone(pool._worker_insights.worker_init_time)
            self.assertIsNone(pool._worker_insights.worker_n_completed_tasks)
            self.assertIsNone(pool._worker_insights.worker_waiting_time)
            self.assertIsNone(pool._worker_insights.worker_working_time)
            self.assertIsNone(pool._worker_insights.worker_exit_time)
            self.assertIsNone(pool._worker_insights.max_task_duration)
            self.assertIsNone(pool._worker_insights.max_task_args)
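# Illustrative sketch (not part of the test suite): outside of tests one would read the
# profiling data through the public API rather than the private _worker_insights
# container. That get_insights() returns a dict of worker timings/statistics is an
# assumption based on the MPIRE documentation.
from mpire import WorkerPool

def square(x):
    return x * x

with WorkerPool(n_jobs=2) as pool:
    pool.map(square, range(100), enable_insights=True)
    print(pool.get_insights())  # assumed: dict with start-up/waiting/working times etc.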
def test_by_constructor(self):
    """ Tests setting worker state in the constructor """
    for n_jobs, use_worker_state, n_tasks in product([1, 2, 4], [False, True], [0, 1, 3, 150]):
        with self.subTest(n_jobs=n_jobs, use_worker_state=use_worker_state, n_tasks=n_tasks), \
                WorkerPool(n_jobs=n_jobs, pass_worker_id=True, use_worker_state=use_worker_state) as pool:

            # When use_worker_state is set, the final (worker_id, n_args) of each worker should add up
            # to the number of given tasks
            f = self._f1 if use_worker_state else self._f2
            results = pool.map(f, range(n_tasks), chunk_size=2)
            if use_worker_state:
                n_processed_per_worker = [0] * n_jobs
                for wid, n_processed in results:
                    n_processed_per_worker[wid] = n_processed
                self.assertEqual(sum(n_processed_per_worker), n_tasks)
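# Illustrative sketch (not part of the test suite): with use_worker_state=True, MPIRE
# passes each task a plain dict that persists for the lifetime of the worker handling it.
from mpire import WorkerPool

def count_tasks(worker_state, x):
    # worker_state survives across tasks processed by the same worker
    worker_state['n'] = worker_state.get('n', 0) + 1
    return worker_state['n']

with WorkerPool(n_jobs=2, use_worker_state=True) as pool:
    results = pool.map(count_tasks, range(10))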
futures = client.map(slow_fn, jobs)
res = client.gather(futures)
print(f'Elapsed time for {name.upper()}: {time.time() - start_time}')
print(f'\n{"=" * 53}\n')

# 7: p_tqdm
start_time = time.time()
name = 'p_tqdm'
print(f'Start {name.upper()} execution...')
res = p_map(slow_fn, jobs)
print(f'Elapsed time for {name.upper()}: {time.time() - start_time}')
print(f'\n{"=" * 53}\n')

# 8: pathos
start_time = time.time()
name = 'pathos'
print(f'Start {name.upper()} execution...')
pool = ProcessPool()
res = pool.map(slow_fn, jobs, chunksize=16)
print(f'Elapsed time for {name.upper()}: {time.time() - start_time}')
print(f'\n{"=" * 53}\n')

# 9: mpire
start_time = time.time()
name = 'mpire'
print(f'Start {name.upper()} execution...')
with WorkerPool(n_jobs=5) as pool:
    res = pool.map(slow_fn, jobs, chunk_size=16)
print(f'Elapsed time for {name.upper()}: {time.time() - start_time}')
print(f'\n{"=" * 53}\n')
def test_numpy_input(self):
    """ Test map with numpy input """
    for n_jobs, n_tasks_max_active, worker_lifespan, chunk_size, n_splits in \
            product([1, 2, None], [None, 2], [None, 2], [None, 3], [None, 3]):
        with WorkerPool(n_jobs=n_jobs) as pool:

            # Test numpy input. map should concatenate chunks of numpy output to a single output array
            # if we instruct it to
            with self.subTest(concatenate_numpy_output=True, map_function='map', n_jobs=n_jobs,
                              n_tasks_max_active=n_tasks_max_active, worker_lifespan=worker_lifespan,
                              chunk_size=chunk_size, n_splits=n_splits):
                results = pool.map(square_numpy, self.test_data_numpy,
                                   max_tasks_active=n_tasks_max_active, worker_lifespan=worker_lifespan,
                                   concatenate_numpy_output=True)
                self.assertTrue(isinstance(results, np.ndarray))
                np.testing.assert_array_equal(results, self.test_desired_output_numpy)

            # If we disable it we should get back chunks of the original array
            with self.subTest(concatenate_numpy_output=False, map_function='map', n_jobs=n_jobs,
                              n_tasks_max_active=n_tasks_max_active, worker_lifespan=worker_lifespan,
                              chunk_size=chunk_size, n_splits=n_splits):
                results = pool.map(square_numpy, self.test_data_numpy,
                                   max_tasks_active=n_tasks_max_active, worker_lifespan=worker_lifespan,
                                   concatenate_numpy_output=False)
                self.assertTrue(isinstance(results, list))
                np.testing.assert_array_equal(np.concatenate(results), self.test_desired_output_numpy)

            # Numpy concatenation doesn't exist for the other functions
            with self.subTest(map_function='imap', n_jobs=n_jobs, n_tasks_max_active=n_tasks_max_active,
                              worker_lifespan=worker_lifespan, chunk_size=chunk_size, n_splits=n_splits):
                results = pool.imap(square_numpy, self.test_data_numpy,
                                    max_tasks_active=n_tasks_max_active, worker_lifespan=worker_lifespan)
                self.assertTrue(isinstance(results, types.GeneratorType))
                np.testing.assert_array_equal(np.concatenate(list(results)),
                                              self.test_desired_output_numpy)

            # map_unordered and imap_unordered cannot be checked for correctness as we don't know the
            # order of the returned results, except when n_jobs=1. In the other cases we could, however,
            # check if all the values (numpy rows) that are returned are present (albeit being in a
            # different order)
            for map_func, result_type in ((pool.map_unordered, list),
                                          (pool.imap_unordered, types.GeneratorType)):
                with self.subTest(map_function=map_func, n_jobs=n_jobs,
                                  n_tasks_max_active=n_tasks_max_active, worker_lifespan=worker_lifespan,
                                  chunk_size=chunk_size, n_splits=n_splits):
                    results = map_func(square_numpy, self.test_data_numpy,
                                       max_tasks_active=n_tasks_max_active,
                                       worker_lifespan=worker_lifespan)
                    self.assertTrue(isinstance(results, result_type))
                    concatenated_results = np.concatenate(list(results))
                    if n_jobs == 1:
                        np.testing.assert_array_equal(concatenated_results,
                                                      self.test_desired_output_numpy)
                    else:
                        # We sort the expected and actual results using lexsort, which sorts using a
                        # sequence of keys. We transpose the array to sort on columns instead of rows.
                        np.testing.assert_array_equal(
                            concatenated_results[np.lexsort(concatenated_results.T)],
                            self.test_desired_output_numpy[np.lexsort(self.test_desired_output_numpy.T)])
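# Illustrative sketch (not part of the test suite): numpy input is chunked along the
# first axis; concatenate_numpy_output (default True) controls whether map() stitches
# the per-chunk results back into a single array.
import numpy as np
from mpire import WorkerPool

def square_numpy(chunk):
    return chunk * chunk

data = np.arange(12).reshape(6, 2)
with WorkerPool(n_jobs=2) as pool:
    as_array = pool.map(square_numpy, data)                                   # np.ndarray
    as_chunks = pool.map(square_numpy, data, concatenate_numpy_output=False)  # list of arrays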
def test_all_maps(self):
    """ Tests the map related functions """

    def get_generator(iterable):
        yield from iterable

    # Test results for different number of jobs to run in parallel and the maximum number of active
    # tasks in the queue
    for n_jobs, n_tasks_max_active, worker_lifespan, chunk_size, n_splits in \
            product([1, 2, None], [None, 2], [None, 2], [None, 3], [None, 3]):
        with WorkerPool(n_jobs=n_jobs) as pool:
            for map_func, sort, result_type in ((pool.map, False, list),
                                                (pool.map_unordered, True, list),
                                                (pool.imap, False, types.GeneratorType),
                                                (pool.imap_unordered, True, types.GeneratorType)):

                with self.subTest(map_func=map_func, input='list', n_jobs=n_jobs,
                                  n_tasks_max_active=n_tasks_max_active,
                                  worker_lifespan=worker_lifespan, chunk_size=chunk_size,
                                  n_splits=n_splits):
                    # Test if parallel map results in the same as ordinary map function. Should work
                    # both for generators and iterators. Also check if an empty list works as desired.
                    results_list = map_func(square, self.test_data,
                                            max_tasks_active=n_tasks_max_active,
                                            worker_lifespan=worker_lifespan)
                    self.assertTrue(isinstance(results_list, result_type))
                    self.assertEqual(self.test_desired_output,
                                     sorted(results_list, key=lambda tup: tup[0]) if sort
                                     else list(results_list))

                with self.subTest(map_func=map_func, input='generator', n_jobs=n_jobs,
                                  n_tasks_max_active=n_tasks_max_active,
                                  worker_lifespan=worker_lifespan, chunk_size=chunk_size,
                                  n_splits=n_splits):
                    results_list = map_func(square, get_generator(self.test_data),
                                            iterable_len=self.test_data_len,
                                            max_tasks_active=n_tasks_max_active,
                                            worker_lifespan=worker_lifespan)
                    self.assertTrue(isinstance(results_list, result_type))
                    self.assertEqual(self.test_desired_output,
                                     sorted(results_list, key=lambda tup: tup[0]) if sort
                                     else list(results_list))

                with self.subTest(map_func=map_func, input='empty list', n_jobs=n_jobs,
                                  n_tasks_max_active=n_tasks_max_active,
                                  worker_lifespan=worker_lifespan, chunk_size=chunk_size,
                                  n_splits=n_splits):
                    results_list = map_func(square, [], max_tasks_active=n_tasks_max_active,
                                            worker_lifespan=worker_lifespan)
                    self.assertTrue(isinstance(results_list, result_type))
                    self.assertEqual([], list(results_list))
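# Illustrative sketch (not part of the test suite): the four map variants differ in
# laziness and ordering. map/map_unordered return lists, imap/imap_unordered return
# generators, and the *_unordered variants yield results in completion order.
from mpire import WorkerPool

def square(x):
    return x * x

with WorkerPool(n_jobs=2) as pool:
    ordered = pool.map(square, range(5))                      # list, in input order
    lazy = list(pool.imap(square, range(5)))                  # generator, in input order
    first_done = list(pool.imap_unordered(square, range(5)))  # completion order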
def _square_daemon(X):
    with WorkerPool(n_jobs=4) as pool:
        return pool.map(square, X, chunk_size=1)