def main():
    """
    Compare a queued pipeline against a plain sequential pipeline.

    Both pipelines apply the same work to the integers 0..99: sleep 0.02
    seconds for each data input, then multiply by 2. The queued pipeline is
    configured with a batch size of 10, 8 workers per step and a max queue
    size of 10. The function asserts that the queued run is faster and that
    both runs produce equal outputs.
    """
    sleep_time = 0.02

    # Queued variant: batches of 10, 8 workers per step, max queue size of 10.
    queued_pipeline = SequentialQueuedPipeline(
        [
            Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        ],
        n_workers_per_step=8,
        max_queue_size=10,
        batch_size=10,
    )
    started = time.time()
    outputs_streaming = queued_pipeline.transform(list(range(100)))
    finished = time.time()
    time_queued_pipeline = finished - started

    print('SequentialQueuedPipeline')
    print('execution time: {} seconds'.format(time_queued_pipeline))

    # Sequential variant: the same steps, processed one data input at a time.
    sequential_pipeline = Pipeline(
        [
            Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        ]
    )
    started = time.time()
    outputs_vanilla = sequential_pipeline.transform(list(range(100)))
    finished = time.time()
    time_vanilla_pipeline = finished - started

    print('VanillaPipeline')
    print('execution time: {} seconds'.format(time_vanilla_pipeline))

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
def test_parallel_queued_parallelize_correctly():
    """
    Check that a four-step queued pipeline beats the equivalent plain pipeline.

    Each of the four steps sleeps 0.001 seconds per data input and multiplies
    by 2. The queued pipeline declares each step as a
    ``(name, 4, 10, step)`` tuple with a batch size of 10. The test asserts
    that the queued run is faster and that both outputs are equal.
    """
    sleep_time = 0.001
    step_names = ('1', '2', '3', '4')

    def make_step():
        # A fresh step instance per call, exactly one per pipeline slot.
        return Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])

    queued_pipeline = SequentialQueuedPipeline(
        [(step_name, 4, 10, make_step()) for step_name in step_names],
        batch_size=10,
    )
    started = time.time()
    outputs_streaming = queued_pipeline.transform(list(range(100)))
    finished = time.time()
    time_queued_pipeline = finished - started

    vanilla_pipeline = Pipeline([make_step() for _ in step_names])
    started = time.time()
    outputs_vanilla = vanilla_pipeline.transform(list(range(100)))
    finished = time.time()
    time_vanilla_pipeline = finished - started

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
def main():
    """
    Time the same steps run as a classical, a minibatched and a parallel pipeline.

    The task is to sleep 0.02 seconds for each data input and then multiply
    by 2. Each pipeline variant is passed to ``eval_run_time`` (defined
    elsewhere; it is unpacked here as ``(run time, outputs)``), and its run
    time is printed. The function asserts that the parallel pipeline is faster
    than the minibatched one and that all three variants yield equal outputs.
    """
    sleep_time = 0.02
    preprocessing_and_model_steps = [ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]

    # Classical pipeline - all at once with one big batch:
    p = Pipeline(preprocessing_and_model_steps)
    time_vanilla_pipeline, output_classical = eval_run_time(p)
    print(f"Classical 'Pipeline' execution time: {time_vanilla_pipeline} seconds.")

    # Classical minibatch pipeline - minibatch size 10:
    p = MiniBatchSequentialPipeline(preprocessing_and_model_steps, batch_size=10)
    time_minibatch_pipeline, output_minibatch = eval_run_time(p)
    print(f"Minibatched 'MiniBatchSequentialPipeline' execution time: {time_minibatch_pipeline} seconds.")

    # Parallel pipeline - minibatch size 10 with 16 parallel workers per step that
    # have a max queue size of 10 batches between preprocessing and the model:
    p = SequentialQueuedPipeline(
        preprocessing_and_model_steps,
        n_workers_per_step=16,
        max_queue_size=10,
        batch_size=10,
    )
    time_parallel_pipeline, output_parallel = eval_run_time(p)
    print(f"Parallel 'SequentialQueuedPipeline' execution time: {time_parallel_pipeline} seconds.")

    # Report the two values that are actually compared, so that a failure
    # message shows why the assertion did not hold.
    assert time_parallel_pipeline < time_minibatch_pipeline, str(
        (time_parallel_pipeline, time_minibatch_pipeline))
    assert np.array_equal(output_classical, output_minibatch)
    assert np.array_equal(output_classical, output_parallel)
def main():
    """
    Compare a pipeline wrapped in ``PickleValueCachingWrapper`` with a plain one.

    Both variants are built and run 5 times over the integers 0..99, where each
    data input goes through a 0.001 second sleep and a multiplication by 2.
    The wrapped variant is given the local ``value_caching`` folder as its
    cache folder. That folder is created if missing and is always removed
    before the function returns or raises.

    The function asserts that both variants yield equal outputs and that the
    wrapped variant was faster overall.
    """
    value_caching_folder = 'value_caching'
    # exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(value_caching_folder, exist_ok=True)

    data_inputs = list(range(100))
    sleep_time = 0.001

    try:
        a = time.time()
        for _ in range(5):
            p = Pipeline([
                PickleValueCachingWrapper(
                    ForEach(Pipeline([Sleep(sleep_time=sleep_time), MultiplyByN(2)])),
                    cache_folder=value_caching_folder,
                )
            ])
            outputs_value_caching = p.transform(data_inputs)
        b = time.time()
        time_value_caching_pipeline = b - a

        print('Pipeline with ValueCachingWrapper')
        print('execution time: {} seconds'.format(time_value_caching_pipeline))

        a = time.time()
        for _ in range(5):
            p = Pipeline([
                ForEach(Pipeline([Sleep(sleep_time=sleep_time), MultiplyByN(2)])),
            ])
            outputs_vanilla = p.transform(data_inputs)
        b = time.time()
        time_vanilla_pipeline = b - a

        print('Pipeline without value caching')
        print('execution time: {} seconds'.format(time_vanilla_pipeline))
    finally:
        # Clean up even when a transform fails, so no cache folder is left behind.
        shutil.rmtree(value_caching_folder)

    assert np.array_equal(outputs_value_caching, outputs_vanilla)
    assert time_value_caching_pipeline < time_vanilla_pipeline
def test_parallel_queued_parallelize_correctly(tmpdir, use_processes, use_savers):
    """
    Check that a four-step queued pipeline beats the equivalent plain pipeline.

    Each of the four steps sleeps 0.01 seconds per data input and multiplies
    by 2. The queued pipeline declares each step as a ``(name, 2, 10, step)``
    tuple with a batch size of 10, and runs inside an ``ExecutionContext``
    built from ``tmpdir``.

    :param tmpdir: location handed to ``ExecutionContext``
        (presumably the pytest ``tmpdir`` fixture; confirm against the caller).
    :param use_processes: forwarded as-is to ``SequentialQueuedPipeline``.
    :param use_savers: forwarded as-is to ``SequentialQueuedPipeline``.
    """
    sleep_time = 0.01
    step_names = ('1', '2', '3', '4')

    def make_step():
        # A fresh step instance per call, exactly one per pipeline slot.
        return Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])

    queued_pipeline = SequentialQueuedPipeline(
        [(step_name, 2, 10, make_step()) for step_name in step_names],
        batch_size=10,
        use_processes=use_processes,
        use_savers=use_savers,
    )
    queued_pipeline = queued_pipeline.with_context(ExecutionContext(tmpdir))

    started = time.time()
    outputs_streaming = queued_pipeline.transform(list(range(100)))
    finished = time.time()
    time_queued_pipeline = finished - started

    vanilla_pipeline = Pipeline([make_step() for _ in step_names])
    started = time.time()
    outputs_vanilla = vanilla_pipeline.transform(list(range(100)))
    finished = time.time()
    time_vanilla_pipeline = finished - started

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    """
    Run the same AutoML search on a classic and on a resumable pipeline.

    Both pipelines chain three ``MultiplyByN`` steps separated by per-input
    sleeps; the resumable one additionally has two checkpoint steps. For each
    pipeline the function fits ``AutoML``, predicts with the best model, and
    prints the elapsed time, the outputs, the mean squared error and the
    pipeline's hyperparameters.

    :param tmpdir: base directory; the 'classic' and 'resumable' cache folders
        are created as paths underneath it.
    :param sleep_time: value passed to each ``Sleep`` step.
    :param n_iter: passed to ``AutoML`` as ``n_trials``.
    """
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))
    search_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def search_and_predict(candidate, cache_folder):
        # Fit the AutoML loop on the candidate pipeline, then predict with the best model.
        auto_ml = AutoML(
            candidate,
            refit_trial=True,
            n_trials=n_iter,
            cache_folder_when_no_handle=cache_folder,
            validation_splitter=ValidationSplitter(0.2),
            hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
            scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
            callbacks=[
                MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
            ],
        )
        auto_ml = auto_ml.fit(data_inputs, expected_outputs)
        return auto_ml.get_best_model().predict(data_inputs)

    def report(candidate, outputs, elapsed):
        # Print the run summary and sanity-check the type of the score.
        actual_score = mean_squared_error(expected_outputs, outputs)
        print('{0} seconds'.format(elapsed))
        print('output: {0}'.format(outputs))
        print('smallest mse: {0}'.format(actual_score))
        print('best hyperparams: {0}'.format(candidate.get_hyperparams()))
        assert isinstance(actual_score, float)

    print('Classic Pipeline:')
    classic_pipeline_folder = os.path.join(str(tmpdir), 'classic')
    classic_steps = [
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]
    pipeline = Pipeline(classic_steps, cache_folder=classic_pipeline_folder)
    pipeline = pipeline.set_hyperparams_space(search_space)

    started = time.time()
    outputs = search_and_predict(pipeline, classic_pipeline_folder)
    finished = time.time()
    report(pipeline, outputs, finished - started)

    print('Resumable Pipeline:')
    resumable_pipeline_folder = os.path.join(str(tmpdir), 'resumable')
    resumable_steps = [
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN()),
    ]
    pipeline = ResumablePipeline(resumable_steps, cache_folder=resumable_pipeline_folder)
    pipeline = pipeline.set_hyperparams_space(search_space)

    started = time.time()
    outputs = search_and_predict(pipeline, resumable_pipeline_folder)
    finished = time.time()

    # The cache is flushed after timing stops and before the score is computed.
    pipeline.flush_all_cache()
    report(pipeline, outputs, finished - started)
def main(tmpdir, sleep_time: float = 0, n_iter: int = 10):
    """
    Run the same random search on a classic and on a resumable pipeline.

    Both pipelines chain three ``MultiplyByN`` steps separated by per-input
    sleeps; the resumable one additionally has two checkpoint steps and uses
    ``tmpdir`` as its cache folder. For each pipeline the function fits a
    ``RandomSearch``, transforms the inputs with the fitted result, and prints
    the elapsed time, the outputs, the mean squared error and the pipeline's
    hyperparameters.

    :param tmpdir: cache folder given to the resumable pipeline.
    :param sleep_time: value passed to each ``Sleep`` step.
    :param n_iter: passed to ``RandomSearch`` as ``n_iter``.
    """
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))
    search_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def search_and_transform(candidate):
        # Fit the random search on the candidate, then transform with the fitted result.
        random_search = RandomSearch(candidate, n_iter=n_iter, higher_score_is_better=True)
        best_model = random_search.fit(data_inputs, expected_outputs)
        return best_model.transform(data_inputs)

    def report(candidate, outputs, elapsed):
        # Print the run summary and sanity-check the type of the score.
        actual_score = mean_squared_error(expected_outputs, outputs)
        print('{0} seconds'.format(elapsed))
        print('output: {0}'.format(outputs))
        print('smallest mse: {0}'.format(actual_score))
        print('best hyperparams: {0}'.format(candidate.get_hyperparams()))
        assert isinstance(actual_score, float)

    print('Classic Pipeline:')
    classic_steps = [
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]
    pipeline = Pipeline(classic_steps).set_hyperparams_space(search_space)

    started = time.time()
    outputs = search_and_transform(pipeline)
    finished = time.time()
    report(pipeline, outputs, finished - started)

    print('Resumable Pipeline:')
    resumable_steps = [
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN()),
    ]
    pipeline = ResumablePipeline(resumable_steps, cache_folder=tmpdir)
    pipeline = pipeline.set_hyperparams_space(search_space)

    started = time.time()
    outputs = search_and_transform(pipeline)
    finished = time.time()

    # The cache is flushed after timing stops and before the score is computed.
    pipeline.flush_all_cache()
    report(pipeline, outputs, finished - started)