def test_pipeline_nested_mutate_inverse_transform():
    expected_tape = ["1", "2", "3", "4", "5", "6", "7", "7", "6", "5", "4", "3", "2", "1"]
    tape = TapeCallbackFunction()

    p = Pipeline([
        Identity(),
        TransformCallbackStep(tape.callback, ["1"]),
        TransformCallbackStep(tape.callback, ["2"]),
        Pipeline([
            Identity(),
            TransformCallbackStep(tape.callback, ["3"]),
            TransformCallbackStep(tape.callback, ["4"]),
            TransformCallbackStep(tape.callback, ["5"]),
            Identity()
        ]),
        TransformCallbackStep(tape.callback, ["6"]),
        TransformCallbackStep(tape.callback, ["7"]),
        Identity()
    ])

    p, _ = p.fit_transform(np.ones((1, 1)))  # will add range(1, 8) to tape.
    print("[mutating]")
    p = p.mutate(new_method="inverse_transform", method_to_assign_to="transform")
    p.transform(np.ones((1, 1)))  # will add reversed(range(1, 8)) to tape.

    print(expected_tape)
    print(tape.get_name_tape())
    assert expected_tape == tape.get_name_tape()
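# Why the expected tape is palindromic (a plain-Python sketch, independent of
# Neuraxle): transform visits the callback steps in order, and the pipeline
# mutated to inverse_transform visits the same steps in reverse order, even
# across the nested sub-pipeline.
steps = ["1", "2", "3", "4", "5", "6", "7"]
assert steps + list(reversed(steps)) == [
    "1", "2", "3", "4", "5", "6", "7", "7", "6", "5", "4", "3", "2", "1"]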
def test_forcehandleidentity_does_not_crash(tmpdir):
    p = Pipeline([ForceHandleIdentity()])
    data_inputs = np.array([0, 1, 2, 3])
    expected_outputs = data_inputs * 2
    p.fit(data_inputs, expected_outputs)
    p.fit_transform(data_inputs, expected_outputs)
    p.transform(data_inputs=data_inputs)
def test_add_service_assertions_should_fail_when_services_are_missing(tmpdir):
    with pytest.raises(AssertionError) as exception_info:
        context = ExecutionContext(root=tmpdir)
        p = Pipeline([
            SomeStep().assert_has_services(BaseService)
        ]).with_context(context=context)
        data_inputs = np.array([0, 1, 2, 3])
        p.transform(data_inputs=data_inputs)

    assert 'BaseService dependency missing' in exception_info.value.args[0]
def test_with_context_should_inject_dependencies_properly(tmpdir):
    data_inputs = np.array([0, 1, 2, 3])
    context = ExecutionContext(root=tmpdir)
    service = SomeService()
    context.set_service_locator({BaseService: service})
    p = Pipeline([
        SomeStep().assert_has_services(BaseService)
    ]).with_context(context=context)

    p.transform(data_inputs=data_inputs)

    assert np.array_equal(service.data, data_inputs)
def main():
    p = Pipeline([MultiplyByN(2), MultiplyByN(4)])

    outputs = p.transform(list(range(10)))
    print('transform: {}'.format(outputs))

    p = p.mutate(new_method='inverse_transform', method_to_assign_to='transform')

    outputs = p.transform(list(range(10)))
    print('inverse_transform: {}'.format(outputs))
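# A quick numeric sanity check (a sketch, assuming MultiplyByN(n) multiplies
# its input elementwise by n): the two steps compose to x * 8 on the forward
# pass, so the mutated pipeline should divide by 8 and recover the inputs.
import numpy as np

forward = np.array(range(10)) * 2 * 4
backward = forward / 4 / 2
assert np.array_equal(backward, np.array(range(10)))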
def test_localassert_should_assert_dependencies_properly_at_exec(tmpdir):
    data_inputs = np.array([0, 1, 2, 3])
    context = ExecutionContext(root=tmpdir)
    p = Pipeline([
        RegisterServiceDynamically(),
        SomeStep().assert_has_services_at_execution(SomeBaseService)
    ]).with_context(context=context)

    p.transform(data_inputs=data_inputs)

    service = context.get_service(SomeBaseService)
    assert np.array_equal(service.data, data_inputs)
def main():
    p = Pipeline([
        ForceAlwaysAlwaysHandleMixinStep(),
    ])

    p = p.fit(np.array([0, 1]), np.array([0, 1]))
    outputs = p.transform(np.array([0, 1]))  # transform returns outputs, not the pipeline, so don't rebind p here.
def test_model_stacking_transform():
    model_stacking = Pipeline([
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])
    expected_outputs_shape = (379, 1)
    data_inputs_shape = (379, 13)
    data_inputs = _create_data(data_inputs_shape)
    expected_outputs = _create_data(expected_outputs_shape)

    model_stacking = model_stacking.fit(data_inputs, expected_outputs)
    outputs = model_stacking.transform(data_inputs)

    assert outputs.shape == expected_outputs_shape
def main():
    p = Pipeline([MultiplyByN(multiply_by=2)])

    data_inputs = np.array([1, 2])
    generated_outputs = p.transform(data_inputs)
    regenerated_inputs = p.inverse_transform(generated_outputs)

    assert np.array_equal(regenerated_inputs, data_inputs)
    assert np.array_equal(generated_outputs, 2 * data_inputs)
def test_pipeline_fit_then_transform(steps_list, pipeline_runner):
    data_input_ = [AN_INPUT]
    expected_output_ = [AN_EXPECTED_OUTPUT]
    p = Pipeline(steps_list, pipeline_runner=pipeline_runner())

    p = p.fit(data_input_, expected_output_)
    result = p.transform(data_input_)

    assert tuple(result) == tuple(expected_output_)
def main():
    p = Pipeline([
        NonFittableStep(),
        NonTransformableStep(),
        Identity()  # Note: Identity does nothing: it inherits from both NonFittableMixin and NonTransformableMixin.
    ])

    p = p.fit(np.array([0, 1]), np.array([0, 1]))
    out = p.transform(np.array([0, 1]))
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    p = Pipeline([
        NumpyShapePrinter(),
        AddFeatures([
            PCA(n_components=2),
            FastICA(n_components=2),
        ]),
        NumpyShapePrinter(),
        RidgeModelStacking([
            GradientBoostingRegressor(),
            GradientBoostingRegressor(n_estimators=500),
            GradientBoostingRegressor(max_depth=5),
            KMeans(),
        ]),
        NumpyShapePrinter(),
    ])

    print("Fitting on train:")
    p = p.fit(X_train, y_train)
    print("")

    print("Transforming train and test:")
    y_train_predicted = p.transform(X_train)
    y_test_predicted = p.transform(X_test)
    print("")

    print("Evaluating transformed train:")
    score_train = r2_score(y_train, y_train_predicted)  # sklearn's signature is r2_score(y_true, y_pred).
    print('R2 regression score:', score_train)
    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)

    assert y_train_predicted.shape == (379,)
    assert y_test_predicted.shape == (127,)
    assert isinstance(score_train, float)
    assert isinstance(score_test, float)

    return y_train_predicted, y_test_predicted, score_train, score_test
def test_wrapped_queued_pipeline_with_n_workers_step():
    p = Pipeline([
        SequentialQueuedPipeline([
            (1, MultiplyByN(2)),
            (1, MultiplyByN(2)),
            (1, MultiplyByN(2)),
            (1, MultiplyByN(2))
        ], batch_size=10, max_queue_size=5)
    ])

    outputs = p.transform(list(range(100)))

    assert np.array_equal(outputs, EXPECTED_OUTPUTS)
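# EXPECTED_OUTPUTS is defined elsewhere in the original test module; under the
# assumption that each MultiplyByN(2) doubles its input, the four queued steps
# compose to x * 16, so it would be:
EXPECTED_OUTPUTS = [x * 2 ** 4 for x in range(100)]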
def test_pipeline_simple_mutate_inverse_transform():
    expected_tape = ["1", "2", "3", "4", "4", "3", "2", "1"]
    tape = TapeCallbackFunction()

    p = Pipeline([
        Identity(),
        TransformCallbackStep(tape.callback, ["1"]),
        TransformCallbackStep(tape.callback, ["2"]),
        TransformCallbackStep(tape.callback, ["3"]),
        TransformCallbackStep(tape.callback, ["4"]),
        Identity()
    ])

    p, _ = p.fit_transform(np.ones((1, 1)))

    print("[mutating]")
    p = p.mutate(new_method="inverse_transform", method_to_assign_to="transform")
    p.transform(np.ones((1, 1)))

    assert expected_tape == tape.get_name_tape()
def test_feature_union_should_transform_with_numpy_transpose():
    p = Pipeline([
        FeatureUnion([
            Identity(),
            Identity(),
        ], joiner=NumpyTranspose())
    ])
    # Note: np.random.randint((1, 20)) treats (1, 20) as per-element exclusive
    # upper bounds, yielding a 1-D array of two random ints, not a (1, 20) matrix.
    data_inputs = np.random.randint((1, 20))

    outputs = p.transform(data_inputs)

    assert np.array_equal(outputs, np.array([data_inputs, data_inputs]).transpose())
def test_should_transform_each_steps(steps: List[BaseStep], expected_tape: List[str]):
    pipeline = Pipeline(steps=steps)
    pipeline = pipeline.fit(data_inputs)
    tape.data = []
    tape.name_tape = []

    actual_data_inputs = pipeline.transform(data_inputs)

    actual_tape = tape.get_name_tape()
    assert actual_tape == expected_tape
    assert np.array_equal(actual_data_inputs, data_inputs)
def test_feature_union_should_transform_with_zip_features():
    p = Pipeline([
        FeatureUnion([
            Identity(),
            Identity(),
        ], joiner=ZipFeatures())
    ])
    data_inputs = np.random.randint(low=0, high=100, size=(2, 20))

    outputs = p.transform(data_inputs)

    assert np.array_equal(outputs, np.stack([data_inputs, data_inputs], axis=1))
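# What the assertion above encodes (a plain-numpy illustration): stacking two
# (2, 20) branch outputs on axis=1 zips the branches per data input, giving
# shape (2, 2, 20).
import numpy as np

di = np.zeros((2, 20))
assert np.stack([di, di], axis=1).shape == (2, 2, 20)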
def test_feature_union_should_transform_with_concatenate_inner_features():
    p = Pipeline([
        FeatureUnion([
            Identity(),
            Identity(),
        ], joiner=NumpyConcatenateInnerFeatures())
    ])
    # As above, np.random.randint((1, 20)) uses (1, 20) as per-element exclusive
    # upper bounds, yielding a 1-D array of two random ints, not a (1, 20) matrix.
    data_inputs = np.random.randint((1, 20))

    outputs = p.transform(data_inputs)

    assert np.array_equal(outputs, np.concatenate([data_inputs, data_inputs]))
def test_transform_should_transform_all_steps_for_each_data_inputs_expected_outputs():
    tape = TapeCallbackFunction()
    p = Pipeline([
        ForEachDataInput(Pipeline([
            TransformCallbackStep(tape.callback, ["1"]),
            TransformCallbackStep(tape.callback, ["2"]),
        ]))
    ])
    data_inputs = [[0, 1], [1, 2]]

    outputs = p.transform(data_inputs)

    assert tape.get_name_tape() == ["1", "2", "1", "2"]
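# The expected tape, illustrated in plain Python: ForEachDataInput runs the
# wrapped pipeline once per item of data_inputs, so both callbacks fire, in
# order, for each of the two items.
tape = []
for _ in [[0, 1], [1, 2]]:
    tape += ["1", "2"]
assert tape == ["1", "2", "1", "2"]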
def main():
    value_caching_folder = 'value_caching'
    if not os.path.exists(value_caching_folder):
        os.makedirs(value_caching_folder)

    data_inputs = list(range(100))
    sleep_time = 0.001

    a = time.time()
    for i in range(5):
        p = Pipeline([
            PickleValueCachingWrapper(
                ForEach(Pipeline([Sleep(sleep_time=sleep_time), MultiplyByN(2)])),
                cache_folder=value_caching_folder)
        ])
        outputs_value_caching = p.transform(data_inputs)
    b = time.time()
    time_value_caching_pipeline = b - a

    print('Pipeline with ValueCachingWrapper')
    print('execution time: {} seconds'.format(time_value_caching_pipeline))

    a = time.time()
    for i in range(5):
        p = Pipeline([
            ForEach(Pipeline([Sleep(sleep_time=sleep_time), MultiplyByN(2)])),
        ])
        outputs_vanilla = p.transform(data_inputs)
    b = time.time()
    time_vanilla_pipeline = b - a

    print('Pipeline without value caching')
    print('execution time: {} seconds'.format(time_vanilla_pipeline))

    shutil.rmtree(value_caching_folder)

    assert np.array_equal(outputs_value_caching, outputs_vanilla)
    assert time_value_caching_pipeline < time_vanilla_pipeline
def test_predict_should_transform_with_initial_is_train_mode_after_predict():
    tape_fit = TapeCallbackFunction()
    tape_transform = TapeCallbackFunction()
    p = Pipeline([
        TestOnlyWrapper(CallbackWrapper(MultiplyByN(2), tape_transform, tape_fit)),
        TrainOnlyWrapper(CallbackWrapper(MultiplyByN(4), tape_transform, tape_fit))
    ])

    p.predict(np.array([1, 1]))
    outputs = p.transform(np.array([1, 1]))

    # predict() runs in test mode but restores the initial train mode afterwards,
    # so the later transform() applies only the TrainOnlyWrapper's MultiplyByN(4).
    assert np.array_equal(outputs, np.array([4, 4]))
def test_choose_one_step_of_set_hyperparams(method_name, args, kwargs):
    a_callback = TapeCallbackFunction()
    b_callback = TapeCallbackFunction()
    c_callback = TapeCallbackFunction()
    d_callback = TapeCallbackFunction()

    choose_one_step_of = ChooseOneStepOf([
        ('a', FitTransformCallbackStep(
            a_callback, c_callback,
            transform_function=lambda di: di * 2).set_name("step_1")),
        ('b', FitTransformCallbackStep(
            b_callback, d_callback,
            transform_function=lambda di: di * 2).set_name("step_1"))
    ])
    p = Pipeline([choose_one_step_of])

    p.transform(DATA_INPUTS)

    assert len(a_callback.data) == 1
    assert all(a_callback.data[0] == DATA_INPUTS)
    assert len(b_callback.data) == 0
    assert len(c_callback.data) == 0
    assert len(d_callback.data) == 0

    getattr(choose_one_step_of, method_name)(*args, **kwargs)

    p.transform(DATA_INPUTS)

    assert len(a_callback.data) == 1
    assert all(a_callback.data[0] == DATA_INPUTS)
    assert len(b_callback.data) == 1
    assert all(b_callback.data[0] == DATA_INPUTS)
    assert len(c_callback.data) == 0
    assert len(d_callback.data) == 0
def test_choose_one_step_of_update_hyperparams():
    a_callback = TapeCallbackFunction()
    b_callback = TapeCallbackFunction()
    c_callback = TapeCallbackFunction()
    d_callback = TapeCallbackFunction()

    choose_one_step_of = ChooseOneStepOf([
        ('a', FitTransformCallbackStep(
            a_callback, c_callback,
            transform_function=lambda di: di * 2).set_name("step_1")),
        ('b', FitTransformCallbackStep(
            b_callback, d_callback,
            transform_function=lambda di: di * 2).set_name("step_1"))
    ])
    p = Pipeline([choose_one_step_of])

    p.transform(DATA_INPUTS)

    assert len(a_callback.data) == 1
    assert all(a_callback.data[0] == DATA_INPUTS)
    assert len(b_callback.data) == 0
    assert len(c_callback.data) == 0
    assert len(d_callback.data) == 0

    choose_one_step_of.update_hyperparams({'choice': 'b'})

    p.transform(DATA_INPUTS)

    assert len(a_callback.data) == 1
    assert all(a_callback.data[0] == DATA_INPUTS)
    assert len(b_callback.data) == 1
    assert all(b_callback.data[0] == DATA_INPUTS)
    assert len(c_callback.data) == 0
    assert len(d_callback.data) == 0
def test_transform_should_use_cache(tmpdir):
    tape_transform = TapeCallbackFunction()
    tape_fit = TapeCallbackFunction()
    p = Pipeline([
        JoblibValueCachingWrapper(
            LogFitTransformCallbackStep(
                tape_transform,
                tape_fit,
                transform_function=np.log),
            tmpdir)
    ])

    outputs = p.transform([1, 1, 2, 2])

    assert outputs == EXPECTED_OUTPUTS
    assert tape_transform.data == [[1], [2]]  # the wrapped step only sees each distinct value once.
    assert tape_fit.data == []
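# EXPECTED_OUTPUTS is defined elsewhere in the original test module; since the
# wrapped transform_function is np.log and value caching reuses the result for
# repeated input values, the assumption here is:
import numpy as np

EXPECTED_OUTPUTS = [np.log(1), np.log(1), np.log(2), np.log(2)]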
def main():
    p = Pipeline([
        NonFittableStep(),
        NonTransformableStep(),
        Identity()  # Note: Identity does nothing: it inherits from both NonFittableMixin and NonTransformableMixin.
    ])

    some_data = np.array([0, 1])

    p = p.fit(some_data)
    # Out:
    # NonFittableStep: I transformed.
    # NonTransformableStep: I fitted.

    out = p.transform(some_data)
    # Out:
    # NonFittableStep: I transformed.

    assert np.array_equal(out, some_data)
def test_expand_dim_transform():
    handle_fit_callback = TapeCallbackFunction()
    handle_transform_callback = TapeCallbackFunction()
    handle_fit_transform_callback = TapeCallbackFunction()
    p = Pipeline([
        ExpandDim(
            HandleCallbackStep(
                handle_fit_callback,
                handle_transform_callback,
                handle_fit_transform_callback))
    ])
    p['ExpandDim'].hashers = [SomeSummaryHasher(fake_summary_id=SUMMARY_ID)]

    outputs = p.transform(np.array(range(10)))

    assert np.array_equal(outputs, np.array(range(10)))
    assert handle_fit_callback.data == []
    assert handle_transform_callback.data[0][0].current_ids == [SUMMARY_ID]
    assert np.array_equal(
        np.array(handle_transform_callback.data[0][0].data_inputs),
        np.array([np.array(range(10))]))
    assert np.array_equal(
        np.array(handle_transform_callback.data[0][0].expected_outputs),
        np.array([[None] * 10]))
    assert handle_fit_transform_callback.data == []
def test_expand_dim_transform():
    handle_fit_callback = TapeCallbackFunction()
    handle_transform_callback = TapeCallbackFunction()
    handle_fit_transform_callback = TapeCallbackFunction()
    p = Pipeline([
        ExpandDim(
            HandleCallbackStep(
                handle_fit_callback,
                handle_transform_callback,
                handle_fit_transform_callback))
    ])

    outputs = p.transform(np.array(range(10)))

    assert np.array_equal(outputs, np.array(range(10)))
    assert handle_fit_callback.data == []
    assert handle_transform_callback.data[0][0].current_ids == [
        '781e5e245d69b566979b86e28d23f2c7'
    ]
    assert np.array_equal(
        np.array(handle_transform_callback.data[0][0].data_inputs),
        np.array([np.array(range(10))]))
    assert np.array_equal(
        np.array(handle_transform_callback.data[0][0].expected_outputs),
        np.array([[None] * 10]))
    assert handle_fit_transform_callback.data == []
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here during the pipeline definition, but they could also be
    # set within the classes at their definition if using custom classes, or after declaring the pipeline using a
    # flat dict or a nested dict.
    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])

    print("Meta-fitting on train:")
    p = p.meta_fit(X_train, y_train, metastep=RandomSearch(
        n_iter=10,
        higher_score_is_better=True,
        validation_technique=KFoldCrossValidation(scoring_function=r2_score, k_fold=10)))
    # Here is an alternative way to do it, more "pipeliney":
    # p = RandomSearch(
    #     p,
    #     n_iter=15,
    #     higher_score_is_better=True,
    #     validation_technique=KFoldCrossValidation(scoring_function=r2_score, k_fold=3)
    # ).fit(X_train, y_train)
    print("")

    print("Transforming train and test:")
    y_train_predicted = p.transform(X_train)
    y_test_predicted = p.transform(X_test)
    print("")

    print("Evaluating transformed train:")
    score_transform = r2_score(y_train, y_train_predicted)  # sklearn's signature is r2_score(y_true, y_pred).
    print('R2 regression score:', score_transform)
    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)
print("Meta-fitting on train:") p = p.meta_fit(X_train, y_train, metastep=RandomSearch(n_iter=10, higher_score_is_better=True, validation_technique=KFoldCrossValidation( scoring_function=r2_score, k_fold=10))) # Here is an alternative way to do it, more "pipeliney": # p = RandomSearch( # n_iter=15, # higher_score_is_better=True, # validation_technique=KFoldCrossValidation(scoring_function=r2_score, k_fold=3) # ).set_step(p).fit(X_train, y_train).get_best_model() print("") print("Transforming train and test:") y_train_predicted = p.transform(X_train) y_test_predicted = p.transform(X_test) print("") print("Evaluating transformed train:") score = r2_score(y_train_predicted, y_train) print('R2 regression score:', score) print("") print("Evaluating transformed test:") score = r2_score(y_test_predicted, y_test) print('R2 regression score:', score)
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    pipeline = Pipeline([
        AddFeatures([
            PCA(n_components=2),
            FastICA(n_components=2),
        ]),
        RidgeModelStacking([
            GradientBoostingRegressor(),
            KMeans(),
        ]),
    ])

    print("Fitting on train:")
    pipeline = pipeline.fit(X_train, y_train)
    print("")

    print("Transforming train and test:")
    y_train_predicted = pipeline.transform(X_train)
    y_test_predicted = pipeline.transform(X_test)
    print("")

    print("Evaluating transformed train:")
    score = r2_score(y_train, y_train_predicted)  # sklearn's signature is r2_score(y_true, y_pred).
    print('R2 regression score:', score)
    print("")

    print("Evaluating transformed test:")
    score = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score)

    print("Deploying the application by routing data to the transform method:")

    class CustomJSONDecoderFor2DArray(JSONDataBodyDecoder):
        """This is a custom JSON decoder class that precedes the pipeline's transformation."""

        def decode(self, data_inputs):
            """
            Transform a JSON list object into an np.array object.

            :param data_inputs: json object
            :return: np array for data inputs
            """
            return np.array(data_inputs)

    class CustomJSONEncoderOfOutputs(JSONDataResponseEncoder):
        """This is a custom JSON response encoder class for converting the pipeline's transformation outputs."""

        def encode(self, data_inputs) -> dict:
            """
            Convert predictions to a dict for creating a JSON Response object.

            :param data_inputs: transformed data inputs (the predictions)
            :return: dict with a 'predictions' key
            """
            return {'predictions': list(data_inputs)}

    app = FlaskRestApiWrapper(
        json_decoder=CustomJSONDecoderFor2DArray(),
        wrapped=pipeline,
        json_encoder=CustomJSONEncoderOfOutputs()).get_app()

    print("Finally, run the app by uncommenting this next line of code:")
    # app.run(debug=False, port=5000)

    print("You can now call your pipeline over HTTP with a (JSON) REST API.")
    # test_predictions = requests.post(
    #     url='http://127.0.0.1:5000/',
    #     json=X_test.tolist()
    # )
    # print(test_predictions)
    # print(test_predictions.content)

    assert isinstance(app, Flask)

    return app
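# A client-side sketch (an assumption: the server has been started with the
# commented-out app.run(debug=False, port=5000) above; the 'predictions' key
# matches CustomJSONEncoderOfOutputs):
import requests


def predict_over_http(rows):
    """POST a 2D list of feature rows to the running API and return the predictions."""
    response = requests.post(url='http://127.0.0.1:5000/', json=rows)
    return response.json()['predictions']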