def main(): """ Process tasks of batch size 10 with 8 queued workers that have a max queue size of 10. Each task doest the following: For each data input, sleep 0.02 seconds, and multiply by 2. """ sleep_time = 0.02 p = SequentialQueuedPipeline([ Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]), ], n_workers_per_step=8, max_queue_size=10, batch_size=10) a = time.time() outputs_streaming = p.transform(list(range(100))) b = time.time() time_queued_pipeline = b - a print('SequentialQueuedPipeline') print('execution time: {} seconds'.format(time_queued_pipeline)) """ Process data inputs sequentially. For each data input, sleep 0.02 seconds, and then multiply by 2. """ p = Pipeline([ Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]), ]) a = time.time() outputs_vanilla = p.transform(list(range(100))) b = time.time() time_vanilla_pipeline = b - a print('VanillaPipeline') print('execution time: {} seconds'.format(time_vanilla_pipeline)) assert time_queued_pipeline < time_vanilla_pipeline assert np.array_equal(outputs_streaming, outputs_vanilla)
def main(): """ The task is to sleep 0.02 seconds for each data input and then multiply by 2. """ sleep_time = 0.02 preprocessing_and_model_steps = [ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)] # Classical pipeline - all at once with one big batch: p = Pipeline(preprocessing_and_model_steps) time_vanilla_pipeline, output_classical = eval_run_time(p) print(f"Classical 'Pipeline' execution time: {time_vanilla_pipeline} seconds.") # Classical minibatch pipeline - minibatch size 10: p = MiniBatchSequentialPipeline(preprocessing_and_model_steps, batch_size=10) time_minibatch_pipeline, output_minibatch = eval_run_time(p) print(f"Minibatched 'MiniBatchSequentialPipeline' execution time: {time_minibatch_pipeline} seconds.") # Parallel pipeline - minibatch size 10 with 16 parallel workers per step that # have a max queue size of 10 batches between preprocessing and the model: p = SequentialQueuedPipeline(preprocessing_and_model_steps, n_workers_per_step=16, max_queue_size=10, batch_size=10) time_parallel_pipeline, output_parallel = eval_run_time(p) print(f"Parallel 'SequentialQueuedPipeline' execution time: {time_parallel_pipeline} seconds.") assert time_parallel_pipeline < time_minibatch_pipeline, str((time_parallel_pipeline, time_vanilla_pipeline)) assert np.array_equal(output_classical, output_minibatch) assert np.array_equal(output_classical, output_parallel)
def __init__(self, columns_selection, n_dimension=3):
    assert n_dimension >= 2
    col_selector: ColumnSelector2D = ColumnSelector2D(columns_selection=columns_selection)
    # Wrap the 2D selector in one ForEach per dimension beyond 2, so it is applied
    # to every 2D sub-array of an N-dimensional input. (The original used
    # range(min(0, n_dimension - 2)), which never iterates when n_dimension >= 2.)
    for _ in range(n_dimension - 2):
        col_selector = ForEach(col_selector)
    MetaStep.__init__(self, col_selector)
    self.n_dimension = n_dimension
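# Illustrative usage of the N-dimensional selector defined above. The class name
# 'ColumnsSelectorND' and the exact output shape are assumptions here; behavior
# depends on how ColumnSelector2D handles the given columns_selection in your
# Neuraxle version.
data_3d = [  # 2 samples, each a 2D array of 3 rows x 3 columns
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
]
selector = ColumnsSelectorND(columns_selection=[0, 1], n_dimension=3)
selected = selector.transform(data_3d)  # keeps only columns 0 and 1 of each 2D sub-array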
def main():
    value_caching_folder = 'value_caching'
    if not os.path.exists(value_caching_folder):
        os.makedirs(value_caching_folder)

    data_inputs = list(range(100))
    sleep_time = 0.001

    a = time.time()
    for i in range(5):
        p = Pipeline([
            PickleValueCachingWrapper(
                ForEach(Pipeline([Sleep(sleep_time=sleep_time), MultiplyByN(2)])),
                cache_folder=value_caching_folder)
        ])
        outputs_value_caching = p.transform(data_inputs)
    b = time.time()
    time_value_caching_pipeline = b - a

    print('Pipeline with ValueCachingWrapper')
    print('execution time: {} seconds'.format(time_value_caching_pipeline))

    a = time.time()
    for i in range(5):
        p = Pipeline([
            ForEach(Pipeline([Sleep(sleep_time=sleep_time), MultiplyByN(2)])),
        ])
        outputs_vanilla = p.transform(data_inputs)
    b = time.time()
    time_vanilla_pipeline = b - a

    print('Pipeline without value caching')
    print('execution time: {} seconds'.format(time_vanilla_pipeline))

    shutil.rmtree(value_caching_folder)

    assert np.array_equal(outputs_value_caching, outputs_vanilla)
    assert time_value_caching_pipeline < time_vanilla_pipeline
def test_transform_should_transform_all_steps_for_each_data_inputs_expected_outputs():
    tape = TapeCallbackFunction()
    p = Pipeline([
        ForEach(
            Pipeline([
                TransformCallbackStep(tape.callback, ["1"]),
                TransformCallbackStep(tape.callback, ["2"]),
            ]))
    ])
    data_inputs = [[0, 1], [1, 2]]

    outputs = p.transform(data_inputs)

    assert tape.get_name_tape() == ["1", "2", "1", "2"]
def test_parallel_queued_parallelize_correctly(tmpdir, use_processes, use_savers):
    sleep_time = 0.01
    p = SequentialQueuedPipeline(
        [('1', 2, 10, Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])),
         ('2', 2, 10, Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])),
         ('3', 2, 10, Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])),
         ('4', 2, 10, Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]))],
        batch_size=10,
        use_processes=use_processes,
        use_savers=use_savers).with_context(ExecutionContext(tmpdir))

    a = time.time()
    outputs_streaming = p.transform(list(range(100)))
    b = time.time()
    time_queued_pipeline = b - a

    p = Pipeline([
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])
    ])

    a = time.time()
    outputs_vanilla = p.transform(list(range(100)))
    b = time.time()
    time_vanilla_pipeline = b - a

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
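# Note: the test above expects 'use_processes' and 'use_savers' to be provided by
# pytest; the original decorators are not shown in this excerpt. A plausible
# (assumed) parametrization covering all four combinations would be:
#
#     @pytest.mark.parametrize('use_processes', [False, True])
#     @pytest.mark.parametrize('use_savers', [False, True])
#     def test_parallel_queued_parallelize_correctly(tmpdir, use_processes, use_savers):
#         ...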
def test_fit_for_each_should_fit_all_steps_for_each_data_inputs_expected_outputs():
    tape = TapeCallbackFunction()
    p = Pipeline([
        ForEach(
            Pipeline([
                FitCallbackStep(tape.callback, ["1"]),
                FitCallbackStep(tape.callback, ["2"]),
            ]))
    ])
    data_inputs = [[0, 1], [1, 2]]
    expected_outputs = [[2, 3], [4, 5]]

    p = p.fit(data_inputs, expected_outputs)

    assert isinstance(p, Pipeline)
    assert tape.get_name_tape() == ["1", "2", "1", "2"]
    assert tape.data == [([0, 1], [2, 3]), ([0, 1], [2, 3]),
                         ([1, 2], [4, 5]), ([1, 2], [4, 5])]
def test_parallel_queued_parallelize_correctly():
    sleep_time = 0.001
    p = SequentialQueuedPipeline(
        [('1', 4, 10, Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])),
         ('2', 4, 10, Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])),
         ('3', 4, 10, Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])),
         ('4', 4, 10, Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]))],
        batch_size=10)

    a = time.time()
    outputs_streaming = p.transform(list(range(100)))
    b = time.time()
    time_queued_pipeline = b - a

    p = Pipeline([
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])
    ])

    a = time.time()
    outputs_vanilla = p.transform(list(range(100)))
    b = time.time()
    time_vanilla_pipeline = b - a

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
def test_fit_transform_should_fit_transform_all_steps_for_each_data_inputs_expected_outputs():
    tape = TapeCallbackFunction()
    tape_fit = TapeCallbackFunction()
    p = Pipeline([
        ForEach(
            Pipeline([
                FitTransformCallbackStep(tape.callback, tape_fit, ["1"]),
                FitTransformCallbackStep(tape.callback, tape_fit, ["2"]),
            ]))
    ])
    data_inputs = [[0, 1], [1, 2]]
    expected_outputs = [[2, 3], [4, 5]]

    p, outputs = p.fit_transform(data_inputs, expected_outputs)

    assert tape.get_name_tape() == ["1", "2", "1", "2"]
    assert tape_fit.get_name_tape() == ["1", "2", "1", "2"]
    assert tape_fit.data == [([0, 1], [2, 3]), ([0, 1], [2, 3]),
                             ([1, 2], [4, 5]), ([1, 2], [4, 5])]
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    DATA_INPUTS = np.array(range(100))
    EXPECTED_OUTPUTS = np.array(range(100, 200))

    HYPERPARAMETER_SPACE = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
    })

    print('Classic Pipeline:')
    classic_pipeline_folder = os.path.join(str(tmpdir), 'classic')

    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEach(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
    ], cache_folder=classic_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=classic_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ],
    )
    auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = auto_ml.get_best_model().predict(DATA_INPUTS)
    time_b = time.time()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))
    assert isinstance(actual_score, float)

    print('Resumable Pipeline:')
    resumable_pipeline_folder = os.path.join(str(tmpdir), 'resumable')

    pipeline = ResumablePipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEach(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
    ], cache_folder=resumable_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=resumable_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ]
    )
    auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs2 = auto_ml.get_best_model().predict(DATA_INPUTS)
    time_b = time.time()
    pipeline.flush_all_cache()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs2)
    print('{0} seconds'.format(time_b - time_a))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))
    assert isinstance(actual_score, float)
    assert (outputs == outputs2).all()
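# A minimal way to run the AutoML example above locally (an assumed entry point,
# not part of the original file): any writable directory works as 'tmpdir'.
if __name__ == '__main__':
    import tempfile
    main(tempfile.mkdtemp(), sleep_time=0.001, n_iter=10)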