def main():
    """
    Process tasks of batch size 10 with 8 queued workers that have a max
    queue size of 10. Each task does the following: for each data input,
    sleep 0.02 seconds, and multiply by 2.
    """
    # NOTE(review): another `main` appears later in this file; if both live in
    # the same module, the later definition shadows this one — confirm these
    # are separate example files.
    sleep_time = 0.02

    queued = SequentialQueuedPipeline([
        Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
    ], n_workers_per_step=8, max_queue_size=10, batch_size=10)

    start = time.time()
    outputs_streaming = queued.transform(list(range(100)))
    time_queued_pipeline = time.time() - start
    print('SequentialQueuedPipeline')
    print('execution time: {} seconds'.format(time_queued_pipeline))

    # Process data inputs sequentially: for each data input, sleep 0.02
    # seconds, and then multiply by 2.
    vanilla = Pipeline([
        Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
    ])

    start = time.time()
    outputs_vanilla = vanilla.transform(list(range(100)))
    time_vanilla_pipeline = time.time() - start
    print('VanillaPipeline')
    print('execution time: {} seconds'.format(time_vanilla_pipeline))

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
def test_parallel_queued_parallelize_correctly():
    """
    Four queued steps (4 workers, queue size 10 each) should run faster than
    the equivalent sequential pipeline and yield identical outputs.
    """
    # NOTE(review): a later test in this file reuses this exact function name;
    # if both are in one module, pytest only collects the later definition —
    # confirm these come from different test files.
    sleep_time = 0.001

    def make_sleepy_doubler():
        # One step: sleep once per data input, then multiply the batch by 2.
        return Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])

    queued = SequentialQueuedPipeline(
        [(name, 4, 10, make_sleepy_doubler()) for name in ('1', '2', '3', '4')],
        batch_size=10)

    start = time.time()
    outputs_streaming = queued.transform(list(range(100)))
    time_queued_pipeline = time.time() - start

    vanilla = Pipeline([make_sleepy_doubler() for _ in range(4)])

    start = time.time()
    outputs_vanilla = vanilla.transform(list(range(100)))
    time_vanilla_pipeline = time.time() - start

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
def test_queued_pipeline_with_n_workers_step():
    """Steps given as (n_workers, step) tuples should transform correctly."""
    one_worker_steps = [(1, MultiplyByN(2)) for _ in range(4)]
    pipeline = SequentialQueuedPipeline(one_worker_steps, batch_size=10, max_queue_size=5)

    results = pipeline.transform(list(range(100)))

    assert np.array_equal(results, EXPECTED_OUTPUTS)
def test_queued_pipeline_with_step_name_n_worker_max_queue_size():
    """Steps given as (name, n_workers, max_queue_size, step) tuples should
    transform correctly."""
    named_steps = [(name, 1, 5, MultiplyByN(2)) for name in ('1', '2', '3', '4')]
    pipeline = SequentialQueuedPipeline(named_steps, batch_size=10)

    results = pipeline.transform(list(range(100)))

    assert np.array_equal(results, EXPECTED_OUTPUTS)
def test_queued_pipeline_with_excluded_incomplete_batch():
    """
    When the incomplete trailing batch is dropped (the last 5 of 15 items at
    batch_size=10), only the first full batch of 10 items is transformed.
    """
    p = SequentialQueuedPipeline(
        [MultiplyByN(2), MultiplyByN(2), MultiplyByN(2), MultiplyByN(2)],
        batch_size=10,
        # Fixed: was `include_incomplete_batch=False`, but the sibling tests
        # in this file use the `keep_incomplete_batch` keyword — made
        # consistent with that API.
        keep_incomplete_batch=False,
        n_workers_per_step=1,
        max_queue_size=5)

    outputs = p.transform(list(range(15)))

    # Only items 0..9 survive, each multiplied by 2 four times (2**4 = 16).
    assert np.array_equal(outputs, np.array(list(range(10))) * 2 * 2 * 2 * 2)
def test_queued_pipeline_with_included_incomplete_batch():
    """
    With keep_incomplete_batch=True and AbsentValuesNullObject as the padding
    defaults, the trailing partial batch (items 10..14) is processed too, so
    all 15 items come out multiplied by 2 four times.
    """
    pipeline = SequentialQueuedPipeline(
        [MultiplyByN(2) for _ in range(4)],
        batch_size=10,
        keep_incomplete_batch=True,
        default_value_data_inputs=AbsentValuesNullObject(),
        default_value_expected_outputs=AbsentValuesNullObject(),
        n_workers_per_step=1,
        max_queue_size=5)

    outputs = pipeline.transform(list(range(15)))

    expected = np.array(list(range(15))) * 2 * 2 * 2 * 2
    assert np.array_equal(outputs, expected)
def main():
    """
    Compare run times of the same task — sleep 0.02 seconds per data input,
    then multiply by 2 — across a classical Pipeline, a minibatched
    MiniBatchSequentialPipeline, and a parallel SequentialQueuedPipeline.
    """
    sleep_time = 0.02
    preprocessing_and_model_steps = [ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]

    # Classical pipeline - all at once with one big batch:
    p = Pipeline(preprocessing_and_model_steps)
    time_vanilla_pipeline, output_classical = eval_run_time(p)
    print(f"Classical 'Pipeline' execution time: {time_vanilla_pipeline} seconds.")

    # Classical minibatch pipeline - minibatch size 10:
    p = MiniBatchSequentialPipeline(preprocessing_and_model_steps, batch_size=10)
    time_minibatch_pipeline, output_minibatch = eval_run_time(p)
    print(f"Minibatched 'MiniBatchSequentialPipeline' execution time: {time_minibatch_pipeline} seconds.")

    # Parallel pipeline - minibatch size 10 with 16 parallel workers per step that
    # have a max queue size of 10 batches between preprocessing and the model:
    p = SequentialQueuedPipeline(
        preprocessing_and_model_steps, n_workers_per_step=16, max_queue_size=10, batch_size=10)
    time_parallel_pipeline, output_parallel = eval_run_time(p)
    print(f"Parallel 'SequentialQueuedPipeline' execution time: {time_parallel_pipeline} seconds.")

    # Fixed: the failure message previously showed time_vanilla_pipeline,
    # which is not the value this assertion compares against.
    assert time_parallel_pipeline < time_minibatch_pipeline, str(
        (time_parallel_pipeline, time_minibatch_pipeline))
    assert np.array_equal(output_classical, output_minibatch)
    assert np.array_equal(output_classical, output_parallel)
def test_queued_pipeline_with_step_with_threading():
    """With use_processes=False the queued workers run as threads and the
    pipeline should still produce the expected outputs via handle_transform."""
    pipeline = SequentialQueuedPipeline(
        [MultiplyByN(2) for _ in range(4)],
        batch_size=10,
        n_workers_per_step=1,
        max_queue_size=5,
        use_processes=False)

    container = DataContainer(data_inputs=list(range(100)))
    result = pipeline.handle_transform(container, ExecutionContext())

    assert np.array_equal(result.data_inputs, EXPECTED_OUTPUTS)
def test_queued_pipeline_with_included_incomplete_batch_that_raises_an_exception():
    """Keeping the incomplete batch while providing no padding default values
    should surface an AttributeError raised in the worker."""
    with pytest.raises(AttributeError):
        pipeline = SequentialQueuedPipeline(
            [MultiplyByN(2) for _ in range(4)],
            batch_size=10,
            keep_incomplete_batch=True,
            default_value_data_inputs=None,  # this will raise an exception in the worker
            default_value_expected_outputs=None,  # this will raise an exception in the worker
            n_workers_per_step=1,
            max_queue_size=5)
        pipeline.transform(list(range(15)))
def test_sequential_queued_pipeline_should_fit_transform_without_multiprocessing():
    """fit_transform with single-worker steps should bypass the queue joiner
    (no actual multiprocessing) and still yield the expected outputs."""
    batch_size = 10

    def double(di):
        # Each callback step multiplies its batch by 2.
        return np.array(di) * 2

    p = SequentialQueuedPipeline(
        [(1, FitTransformCallbackStep(transform_function=double)) for _ in range(4)],
        batch_size=batch_size,
        max_queue_size=5)

    # Swap the terminal queue joiner for a spying test double, then refresh so
    # the pipeline picks up the mutated step list.
    spy_joiner = QueueJoinerForTest(batch_size=batch_size)
    p.steps[-1] = spy_joiner
    p.steps_as_tuple[-1] = (p.steps_as_tuple[-1][0], spy_joiner)
    p._refresh_steps()

    p, outputs = p.fit_transform(list(range(100)), list(range(100)))

    assert not p[-1].called_queue_joiner
    assert np.array_equal(outputs, EXPECTED_OUTPUTS)
def test_parallel_queued_parallelize_correctly(tmpdir, use_processes, use_savers):
    """
    Four queued steps (2 workers, queue size 10 each) should beat the
    equivalent sequential pipeline on wall time while producing identical
    outputs, for every (use_processes, use_savers) combination.
    """
    sleep_time = 0.01

    def make_sleepy_doubler():
        # One step: sleep once per data input, then multiply the batch by 2.
        return Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])

    queued = SequentialQueuedPipeline(
        [(name, 2, 10, make_sleepy_doubler()) for name in ('1', '2', '3', '4')],
        batch_size=10,
        use_processes=use_processes,
        use_savers=use_savers).with_context(ExecutionContext(tmpdir))

    start = time.time()
    outputs_streaming = queued.transform(list(range(100)))
    time_queued_pipeline = time.time() - start

    vanilla = Pipeline([make_sleepy_doubler() for _ in range(4)])

    start = time.time()
    outputs_vanilla = vanilla.transform(list(range(100)))
    time_vanilla_pipeline = time.time() - start

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
def test_sequential_queued_pipeline_should_fit_without_multiprocessing():
    """fit with single-worker steps should not go through the queue joiner."""
    batch_size = 10
    p = SequentialQueuedPipeline(
        [(1, FitTransformCallbackStep()) for _ in range(4)],
        batch_size=batch_size,
        max_queue_size=5)

    # Swap the terminal queue joiner for a spying test double, then refresh so
    # the pipeline picks up the mutated step list.
    spy_joiner = QueueJoinerForTest(batch_size=batch_size)
    p.steps[-1] = spy_joiner
    p.steps_as_tuple[-1] = (p.steps_as_tuple[-1][0], spy_joiner)
    p._refresh_steps()

    p = p.fit(list(range(100)), list(range(100)))

    assert not p[-1].called_queue_joiner