def test_model_stacking_fit_transform():
    model_stacking = Pipeline([
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])
    expected_outputs_shape = (379, 1)
    data_inputs_shape = (379, 13)
    data_inputs = _create_data(data_inputs_shape)
    expected_outputs = _create_data(expected_outputs_shape)

    model_stacking, outputs = model_stacking.fit_transform(
        data_inputs, expected_outputs)

    assert outputs.shape == expected_outputs_shape
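
# Note: `_create_data` is defined elsewhere in this test module. A minimal
# sketch of what it might look like, assuming the test only needs an array
# of the requested shape (hypothetical helper, not the library's own):
def _create_data(shape):
    return np.random.random(shape)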
def test_logger():
    file_path = "test.log"
    if os.path.exists(file_path):
        os.remove(file_path)

    # Given
    logger = logging.getLogger('test')
    file_handler = logging.FileHandler(file_path)
    file_handler.setLevel('DEBUG')
    logger.addHandler(file_handler)
    logger.setLevel('DEBUG')
    context = ExecutionContext(logger=logger)
    pipeline = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        LoggingStep()
    ])

    # When
    data_container = DataContainer(
        data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
    pipeline.handle_fit(data_container, context)

    # Then
    assert os.path.exists(file_path)
    with open(file_path) as f:
        log_contents = f.read()
    assert len(log_contents) > 0

    # Teardown
    file_handler.close()
    os.remove(file_path)
def __init__(self, json_decoder: JSONDataBodyDecoder, wrapped: BaseStep,
             json_encoder: JSONDataResponseEncoder, route='/'):
    Pipeline.__init__(self, [json_decoder, wrapped, json_encoder])
    self.route: str = route
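
# Hedged usage sketch for the constructor above, assuming it belongs to a
# Neuraxle-style Flask REST API wrapper exposing a `get_app()` method; the
# class and step names below are illustrative assumptions:
#
# app = FlaskRestApiWrapper(
#     json_decoder=MyJSONDecoder(),
#     wrapped=my_pipeline,
#     json_encoder=MyJSONEncoder(),
#     route='/predict',
# ).get_app()
# app.run(debug=False, port=5000)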
def main():
    p = Pipeline([
        ForceAlwaysAlwaysHandleMixinStep(),
    ])

    p = p.fit(np.array([0, 1]), np.array([0, 1]))
    outputs = p.transform(np.array([0, 1]))
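
# `ForceAlwaysAlwaysHandleMixinStep` is defined earlier in this example. A
# minimal sketch of what such a step might look like, assuming it relies on
# Neuraxle's ForceHandleMixin so that fit and transform always go through
# the handler methods (the exact base classes are an assumption):
class ForceAlwaysAlwaysHandleMixinStep(ForceHandleMixin, Identity):
    def __init__(self):
        Identity.__init__(self)
        ForceHandleMixin.__init__(self)

    def _transform_data_container(self, data_container, context):
        # Handler methods receive the full DataContainer and ExecutionContext.
        return data_container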
def main(): """ Process tasks of batch size 10 with 8 queued workers that have a max queue size of 10. Each task doest the following: For each data input, sleep 0.02 seconds, and multiply by 2. """ sleep_time = 0.02 p = SequentialQueuedPipeline([ Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]), ], n_workers_per_step=8, max_queue_size=10, batch_size=10) a = time.time() outputs_streaming = p.transform(list(range(100))) b = time.time() time_queued_pipeline = b - a print('SequentialQueuedPipeline') print('execution time: {} seconds'.format(time_queued_pipeline)) """ Process data inputs sequentially. For each data input, sleep 0.02 seconds, and then multiply by 2. """ p = Pipeline([ Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]), ]) a = time.time() outputs_vanilla = p.transform(list(range(100))) b = time.time() time_vanilla_pipeline = b - a print('VanillaPipeline') print('execution time: {} seconds'.format(time_vanilla_pipeline)) assert time_queued_pipeline < time_vanilla_pipeline assert np.array_equal(outputs_streaming, outputs_vanilla)
def test_load_full_dump_from_path(tmpdir):
    # Given
    tape_fit_callback_function = TapeCallbackFunction()
    tape_transform_callback_function = TapeCallbackFunction()
    pipeline = Pipeline(
        [('step_a', Identity()),
         ('step_b', OutputTransformerWrapper(
             FitTransformCallbackStep(tape_fit_callback_function,
                                      tape_transform_callback_function)))],
        cache_folder=tmpdir).set_name(PIPELINE_NAME)

    # When
    pipeline, outputs = pipeline.fit_transform(DATA_INPUTS, EXPECTED_OUTPUTS)
    pipeline.save(ExecutionContext(tmpdir), full_dump=True)

    # Then
    loaded_pipeline = ExecutionContext(tmpdir).load(
        os.path.join(PIPELINE_NAME, 'step_b'))

    assert isinstance(loaded_pipeline, OutputTransformerWrapper)
    loaded_step_b_wrapped_step = loaded_pipeline.wrapped
    assert np.array_equal(
        loaded_step_b_wrapped_step.transform_callback_function.data[0],
        EXPECTED_OUTPUTS)
    assert np.array_equal(
        loaded_step_b_wrapped_step.fit_callback_function.data[0][0],
        EXPECTED_OUTPUTS)
    assert np.array_equal(
        loaded_step_b_wrapped_step.fit_callback_function.data[0][1],
        [None] * len(EXPECTED_OUTPUTS))
def test_inner_concatenate_data_should_merge_2d_with_3d():
    # Given
    data_inputs_3d, expected_outputs_3d = _create_data_source(SHAPE_3D)
    data_inputs_2d, expected_outputs_2d = _create_data_source(SHAPE_2D)
    data_container_2d = DataContainer(data_inputs=data_inputs_2d,
                                      expected_outputs=expected_outputs_2d)
    data_container_3d = DataContainer(
        data_inputs=data_inputs_3d,
        expected_outputs=expected_outputs_3d
    ).add_sub_data_container('2d', data_container_2d)

    # When
    p = Pipeline(
        [InnerConcatenateDataContainer(sub_data_container_names=['2d'])])
    data_container_3d = p.handle_transform(data_container_3d,
                                           ExecutionContext())

    # Then
    assert data_container_3d.data_inputs.shape == (
        SHAPE_3D[0], SHAPE_3D[1], SHAPE_3D[2] + 1)
    assert data_container_3d.expected_outputs.shape == (
        SHAPE_3D[0], SHAPE_3D[1], SHAPE_3D[2] + 1)
    assert np.array_equal(data_container_3d.data_inputs[..., -1],
                          data_container_2d.data_inputs)
    assert np.array_equal(data_container_3d.expected_outputs[..., -1],
                          data_container_2d.expected_outputs)
def test_inner_concatenate_data_should_merge_1d_with_3d():
    # Given
    data_inputs_3d, expected_outputs_3d = _create_data_source(SHAPE_3D)
    data_inputs_1d, expected_outputs_1d = _create_data_source(SHAPE_1D)
    data_container_1d = DataContainer(data_inputs=data_inputs_1d,
                                      expected_outputs=expected_outputs_1d)
    data_container = DataContainer(
        data_inputs=data_inputs_3d,
        expected_outputs=expected_outputs_3d
    ).add_sub_data_container('1d', data_container_1d)

    # When
    p = Pipeline(
        [InnerConcatenateDataContainer(sub_data_container_names=['1d'])])
    data_container = p.handle_transform(data_container, ExecutionContext())

    # Then
    broadcasted_data_inputs_1d = np.broadcast_to(
        np.expand_dims(data_container_1d.data_inputs, axis=-1),
        shape=(SHAPE_3D[0], SHAPE_3D[1]))
    broadcasted_expected_outputs_1d = np.broadcast_to(
        np.expand_dims(data_container_1d.expected_outputs, axis=-1),
        shape=(SHAPE_3D[0], SHAPE_3D[1]))

    assert np.array_equal(data_container.data_inputs[..., -1],
                          broadcasted_data_inputs_1d)
    assert np.array_equal(data_container.expected_outputs[..., -1],
                          broadcasted_expected_outputs_1d)
    assert data_container.data_inputs.shape == (
        SHAPE_3D[0], SHAPE_3D[1], SHAPE_3D[2] + 1)
    assert data_container.expected_outputs.shape == (
        SHAPE_3D[0], SHAPE_3D[1], SHAPE_3D[2] + 1)
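
# Like `_create_data` above, `_create_data_source` lives outside this excerpt.
# A minimal sketch, assuming it returns a random (data_inputs,
# expected_outputs) pair of the given shape (hypothetical helper):
def _create_data_source(shape):
    data_inputs = np.random.random(shape)
    expected_outputs = np.random.random(shape)
    return data_inputs, expected_outputs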
def test_expectedoutputnull_is_fine_when_null(tmpdir):
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = None

    p = Pipeline([SomeStep()])
    p.fit_transform(data_inputs, expected_outputs)
def test_pipeline_nested_mutate_inverse_transform_without_identities():
    """
    This test guards against a strange bug at the boundary of nested pipelines
    that happened when identity steps were not used.
    """
    expected_tape = ["1", "2", "3", "4", "5", "6", "7",
                     "7", "6", "5", "4", "3", "2", "1"]
    tape = TapeCallbackFunction()

    p = Pipeline([
        TransformCallbackStep(tape.callback, ["1"]),
        TransformCallbackStep(tape.callback, ["2"]),
        Pipeline([
            TransformCallbackStep(tape.callback, ["3"]),
            TransformCallbackStep(tape.callback, ["4"]),
            TransformCallbackStep(tape.callback, ["5"]),
        ]),
        TransformCallbackStep(tape.callback, ["6"]),
        TransformCallbackStep(tape.callback, ["7"]),
    ])
    p, _ = p.fit_transform(np.ones((1, 1)))  # will add "1" through "7" to the tape.

    print("[mutating, inversing, and calling each inverse_transform]")
    reversed(p).transform(np.ones((1, 1)))  # will add "7" down to "1" to the tape, calling the inverse_transforms.

    print(expected_tape)
    print(tape.get_name_tape())
    assert expected_tape == tape.get_name_tape()
def test_pipeline_setup_incrementally():
    class SomeStepThatFits(NonTransformableMixin, BaseStep):
        def __init__(self):
            BaseStep.__init__(self)
            self.has_fitted = False

        def fit(self, data_inputs, expected_outputs=None) -> _FittableStep:
            self.has_fitted = True
            return self

    class StepWithSensitiveSetup(Identity):
        """Asserts that the first step has already fitted, and that the second
        one has not, by the time this step's setup is performed."""

        def __init__(self):
            Identity.__init__(self)

        def setup(self, context: ExecutionContext = None) -> BaseTransformer:
            assert some_step.has_fitted is True
            assert some_step2.has_fitted is False
            return self

    some_step = SomeStepThatFits()
    some_step2 = SomeStepThatFits()

    p = Pipeline([some_step, StepWithSensitiveSetup(), some_step2])
    p.fit_transform(None, None)
def test_apply_on_pipeline_with_positional_argument_should_call_method_on_each_steps():
    pipeline = Pipeline([MultiplyByN(1), MultiplyByN(1)])

    pipeline.apply('set_hyperparams',
                   hyperparams=HyperparameterSamples({'multiply_by': 2}))

    assert pipeline.get_hyperparams()['multiply_by'] == 2
    assert pipeline['MultiplyByN'].get_hyperparams()['multiply_by'] == 2
    assert pipeline['MultiplyByN1'].get_hyperparams()['multiply_by'] == 2
def test_pipeline_set_one_hyperparam_level_one_flat():
    p = Pipeline([("a", SomeStep()), ("b", SomeStep()), ("c", SomeStep())])

    p.set_hyperparams({"a__learning_rate": 7})

    assert p["a"].hyperparams.to_flat_as_dict_primitive()["learning_rate"] == 7
    assert p["b"].hyperparams.to_flat_as_dict_primitive() == dict()
    assert p["c"].hyperparams.to_flat_as_dict_primitive() == dict()
def test_apply_on_pipeline_with_meta_step_and_positional_argument_should_call_method_on_each_steps():
    pipeline = Pipeline([OutputTransformerWrapper(MultiplyByN(1)), MultiplyByN(1)])

    pipeline.apply('set_hyperparams',
                   hyperparams=HyperparameterSamples({'multiply_by': 2}))

    assert pipeline.get_hyperparams()['multiply_by'] == 2
    assert pipeline['OutputTransformerWrapper'].wrapped.get_hyperparams()['multiply_by'] == 2
    assert pipeline['MultiplyByN'].get_hyperparams()['multiply_by'] == 2
def test_pipeline_set_one_hyperparam_level_one_dict():
    p = Pipeline([("a", SomeStep()), ("b", SomeStep()), ("c", SomeStep())])

    p.set_hyperparams({"b": {"learning_rate": 7}})

    assert p["a"].hyperparams == dict()
    assert p["b"].hyperparams["learning_rate"] == 7
    assert p["c"].hyperparams == dict()
def test_pipeline_fit_transform(steps_list, pipeline_runner):
    data_input_ = [AN_INPUT]
    expected_output_ = [AN_EXPECTED_OUTPUT]
    p = Pipeline(steps_list, pipeline_runner=pipeline_runner())

    p, result = p.fit_transform(data_input_, expected_output_)

    assert tuple(result) == tuple(expected_output_)
def test_expectedoutputnull_raise_exception_when_notnull(tmpdir):
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2

    p = Pipeline([AssertExpectedOutputIsNone()])

    with pytest.raises(AssertionError):
        p.fit_transform(data_inputs, expected_outputs)
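
# `AssertExpectedOutputIsNone` comes from the test helpers. A minimal sketch
# of such a step, assuming it only needs to fail whenever non-null expected
# outputs reach it (hypothetical implementation; the real helper may differ):
class AssertExpectedOutputIsNone(Identity):
    def fit_transform(self, data_inputs, expected_outputs=None):
        all_nulls = expected_outputs is None or all(
            eo is None for eo in expected_outputs)
        assert all_nulls, "expected_outputs must be None"
        return self, data_inputs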
def test_apply_method_on_pipeline_should_call_method_on_each_steps():
    pipeline = Pipeline([MultiplyByN(1), MultiplyByN(1)])

    pipeline.apply_method(lambda step: step.set_hyperparams(
        HyperparameterSamples({'multiply_by': 2})))

    assert pipeline.get_hyperparams()['multiply_by'] == 2
    assert pipeline['MultiplyByN'].get_hyperparams()['multiply_by'] == 2
    assert pipeline['MultiplyByN1'].get_hyperparams()['multiply_by'] == 2
def main():
    p = Pipeline([MultiplyByN(multiply_by=2)])

    data_inputs = np.array([1, 2])
    generated_outputs = p.transform(data_inputs)
    regenerated_inputs = p.inverse_transform(generated_outputs)

    assert np.array_equal(regenerated_inputs, data_inputs)
    assert np.array_equal(generated_outputs, 2 * data_inputs)
def test_pipeline_fit_then_transform(steps_list):
    data_input_ = [AN_INPUT]
    expected_output_ = [AN_EXPECTED_OUTPUT]
    p = Pipeline(steps_list)

    p = p.fit(data_input_, expected_output_)
    result = p.transform(data_input_)

    assert tuple(result) == tuple(expected_output_)
def test_apply_method_on_pipeline_with_meta_step_should_call_method_on_each_steps():
    pipeline = Pipeline([OutputTransformerWrapper(MultiplyByN(1)), MultiplyByN(1)])

    pipeline.apply_method(
        lambda step: step.set_hyperparams(HyperparameterSamples({'multiply_by': 2}))
    )

    assert pipeline.get_hyperparams()['multiply_by'] == 2
    assert pipeline['OutputTransformerWrapper'].wrapped.get_hyperparams()['multiply_by'] == 2
    assert pipeline['MultiplyByN'].get_hyperparams()['multiply_by'] == 2
def main():
    p = Pipeline([
        NonFittableStep(),
        NonTransformableStep(),
        Identity()  # Note: Identity does nothing: it inherits from both NonFittableMixin and NonTransformableMixin.
    ])

    p = p.fit(np.array([0, 1]), np.array([0, 1]))
    out = p.transform(np.array([0, 1]))
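
# `NonFittableStep` and `NonTransformableStep` are defined earlier in this
# example. A minimal sketch of how such steps might be written with the same
# mixins that Identity uses (the base-class wiring is an assumption):
class NonFittableStep(NonFittableMixin, BaseStep):
    # The mixin supplies a no-op fit; only transform is implemented.
    def __init__(self):
        BaseStep.__init__(self)

    def transform(self, data_inputs):
        print("NonFittableStep: transforming.")
        return data_inputs


class NonTransformableStep(NonTransformableMixin, BaseStep):
    # The mixin supplies a pass-through transform; only fit is implemented.
    def __init__(self):
        BaseStep.__init__(self)

    def fit(self, data_inputs, expected_outputs=None):
        print("NonTransformableStep: fitting.")
        return self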
def test_add_service_assertions_should_fail_when_services_are_missing(tmpdir):
    context = ExecutionContext(root=tmpdir)
    p = Pipeline([SomeStep().assert_has_services(BaseService)
                  ]).with_context(context=context)
    data_inputs = np.array([0, 1, 2, 3])

    with pytest.raises(AssertionError) as exception_info:
        p.transform(data_inputs=data_inputs)

    assert 'BaseService dependency missing' in exception_info.value.args[0]
def main():
    p = Pipeline([
        ('step1', MultiplyByN()),
        ('step2', MultiplyByN()),
        Pipeline([
            Identity(),
            Identity(),
            PCA(n_components=4)
        ])
    ])

    p.set_hyperparams_space({
        'step1__multiply_by': RandInt(42, 50),
        'step2__multiply_by': RandInt(-10, 0),
        'Pipeline__PCA__n_components': RandInt(2, 3)
    })

    samples = p.get_hyperparams_space().rvs()
    p.set_hyperparams(samples)

    samples = p.get_hyperparams().to_flat_as_dict_primitive()
    assert 42 <= samples['step1__multiply_by'] <= 50
    assert -10 <= samples['step2__multiply_by'] <= 0
    assert samples['Pipeline__PCA__n_components'] in [2, 3]
    assert p['Pipeline']['PCA'].get_wrapped_sklearn_predictor().n_components in [2, 3]
def test_parallel_queued_parallelize_correctly():
    sleep_time = 0.001
    p = SequentialQueuedPipeline([
        ('1', 4, 10, Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])),
        ('2', 4, 10, Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])),
        ('3', 4, 10, Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])),
        ('4', 4, 10, Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]))
    ], batch_size=10)

    a = time.time()
    outputs_streaming = p.transform(list(range(100)))
    b = time.time()
    time_queued_pipeline = b - a

    p = Pipeline([
        Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
        Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])
    ])

    a = time.time()
    outputs_vanilla = p.transform(list(range(100)))
    b = time.time()
    time_vanilla_pipeline = b - a

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
def test_localassert_should_assert_dependencies_properly_at_exec(tmpdir):
    data_inputs = np.array([0, 1, 2, 3])
    context = ExecutionContext(root=tmpdir)
    p = Pipeline([
        RegisterServiceDynamically(),
        SomeStep().assert_has_services_at_execution(SomeBaseService)
    ]).with_context(context=context)

    p.transform(data_inputs=data_inputs)

    service = context.get_service(SomeBaseService)
    assert np.array_equal(service.data, data_inputs)
def test_wrapped_queued_pipeline_with_n_workers_step():
    p = Pipeline([
        SequentialQueuedPipeline([(1, MultiplyByN(2)),
                                  (1, MultiplyByN(2)),
                                  (1, MultiplyByN(2)),
                                  (1, MultiplyByN(2))],
                                 batch_size=10,
                                 max_queue_size=5)
    ])

    outputs = p.transform(list(range(100)))

    assert np.array_equal(outputs, EXPECTED_OUTPUTS)
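
# `EXPECTED_OUTPUTS` is a module-level constant not shown in this excerpt.
# Given four successive MultiplyByN(2) steps over range(100), each input is
# multiplied by 2 ** 4 == 16, so a plausible definition (an assumption) is:
EXPECTED_OUTPUTS = np.array(list(range(100))) * 2 ** 4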
def test_with_context_should_inject_dependencies_properly(tmpdir):
    data_inputs = np.array([0, 1, 2, 3])
    context = ExecutionContext(root=tmpdir)
    service = SomeService()
    context.set_service_locator({BaseService: service})
    p = Pipeline([SomeStep().assert_has_services(BaseService)
                  ]).with_context(context=context)

    p.transform(data_inputs=data_inputs)

    assert np.array_equal(service.data, data_inputs)
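
# `BaseService`, `SomeService`, and `SomeStep` are fixtures defined outside
# this excerpt. A minimal sketch of the two services, assuming the step
# simply forwards its data inputs to the service (hypothetical fixtures):
class BaseService:
    def service_method(self, data):
        raise NotImplementedError()


class SomeService(BaseService):
    # Records the data it receives so `service.data` can be asserted on.
    def __init__(self):
        self.data = None

    def service_method(self, data):
        self.data = data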
def test_output_transformer_should_zip_data_input_and_expected_output_in_the_transformed_output(
        tmpdir: LocalPath):
    pipeline = Pipeline([MultiplyBy2OutputTransformer()])

    pipeline, new_data_container = pipeline.handle_fit_transform(
        DataContainer(data_inputs=[1, 2, 3],
                      current_ids=[0, 1, 2],
                      expected_outputs=[2, 3, 4]),
        ExecutionContext(tmpdir))

    assert new_data_container.data_inputs == [2, 4, 6]
    assert new_data_container.expected_outputs == [4, 6, 8]
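
# `MultiplyBy2OutputTransformer` is a test helper. A minimal sketch that
# would reproduce the assertions above, assuming a Neuraxle step that
# transforms data inputs and expected outputs together as one tuple (the
# mixin name is an assumption based on Neuraxle's output-handler steps):
class MultiplyBy2OutputTransformer(InputAndOutputTransformerMixin, BaseStep):
    def __init__(self):
        BaseStep.__init__(self)
        InputAndOutputTransformerMixin.__init__(self)

    def transform(self, data_inputs):
        di, eo = data_inputs
        return [2 * x for x in di], [2 * y for y in eo]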
def main():
    p = Pipeline([MultiplyByN(2), MultiplyByN(4)])

    outputs = p.transform(list(range(10)))
    print('transform: {}'.format(outputs))

    p = p.mutate(new_method='inverse_transform', method_to_assign_to='transform')

    outputs = p.transform(list(range(10)))
    print('inverse_transform: {}'.format(outputs))
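
# For reference: the forward pass multiplies each input by 2 * 4 == 8, and
# after the mutate call, `transform` applies the inverse (division by 8) to
# the same range(10) inputs. Expected prints, modulo exact array formatting:
#
# transform: [0, 8, 16, 24, 32, 40, 48, 56, 64, 72]
# inverse_transform: [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.125]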