def test_build_pipeline_predict_arguments_assertion():
    """build_pipeline must reject predict functions that take *args/**kwargs."""
    data = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]})

    @fp.curry
    def invalid_learner(df):
        # Predict fn with variadic parameters: the pipeline must refuse it.
        def predict(dataset, *extra_pos, **extra_kw):
            return dataset + len(extra_pos) + len(extra_kw)

        return predict, df, {}

    pipeline = build_pipeline(invalid_learner)
    with pytest.raises(AssertionError):
        pipeline(data)
def test_build_pipeline_learner_assertion():
    """build_pipeline must fail while required learner arguments are unbound."""

    @fp.curry
    def learner(df, a, b, c=3):
        return lambda dataset: dataset + a + b + c, df, {}

    # `a` is still missing -> pipeline construction must raise.
    partially_applied = learner(b=2)
    with pytest.raises(AssertionError):
        build_pipeline(partially_applied)

    # Every required argument bound -> construction succeeds.
    fully_applied = learner(a=1, b=2)
    build_pipeline(fully_applied)
def test_build_pipeline_learner_assertion_repeated_learners(has_repeated_learners):
    """With ``has_repeated_learners``, pipeline construction must still reject
    learners whose required arguments are not fully bound (raising ValueError).

    NOTE(review): this test was originally also named
    ``test_build_pipeline_learner_assertion``, shadowing the previous test of
    the same name so pytest collected only one of them; renamed so both run.
    """

    @fp.curry
    def learner(df, a, b, c=3):
        return lambda dataset: dataset + a + b + c, df, {}

    # `a` is still missing -> construction must raise.
    learner_fn = learner(b=2)
    with pytest.raises(ValueError):
        build_pipeline(learner_fn, has_repeated_learners=has_repeated_learners)

    # Fully bound learner builds fine.
    learner_fn = learner(a=1, b=2)
    build_pipeline(learner_fn)
def test_build_pipeline_idempotency():
    """Pipelines must not mutate the caller's frame, and any ordering of the
    learners must yield the same transformed output and a working predict fn.

    Fix: ``pd.util.testing`` was deprecated in pandas 1.0 and removed in 2.0;
    the public ``pd.testing`` module is used instead.
    """
    test_df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]})
    orig_df = test_df.copy()

    mult_constant = 2
    expected_df = pd.DataFrame({
        "x": np.array([1, 2, 3, 4, 5]) * mult_constant,
        "y": [2, 4, 6, 8, 10]
    })

    def kwargs_learner(df):
        # Predict fn takes an explicit keyword argument the pipeline must pass on.
        def p(dataset, mult):
            return dataset.assign(x=dataset.x * mult)

        return p, p(df, mult_constant), {
            "kwargs_learner": {"mult_constant": mult_constant}
        }

    def dummy_learner(df):
        # Identity learner: no transformation, trivial log.
        return lambda dataset: dataset, df, {"dummy_learner": {"dummy": {}}}

    for variation in itertools.permutations(
            [dummy_learner, kwargs_learner, dummy_learner]):
        side_effect_pipeline = build_pipeline(*variation)
        predict_fn, result_df, log = side_effect_pipeline(test_df)
        pd.testing.assert_frame_equal(test_df, orig_df)
        pd.testing.assert_frame_equal(result_df, expected_df)
        pd.testing.assert_frame_equal(
            predict_fn(test_df, mult=mult_constant), expected_df)
def training_pipeline(text_cols, target_column, vectorizer_params, logistic_params):
    """Build a timed NLP logistic-classification training pipeline.

    The single learner is wrapped in ``build_pipeline`` and its training time
    is logged under the "tweet_sentiment_analysis" label.
    """
    classifier = nlp_logistic_classification_learner(
        text_feature_cols=text_cols,
        target=target_column,
        vectorizer_params=vectorizer_params,
        logistic_params=logistic_params,
    )
    pipeline = build_pipeline(classifier)
    return log_learner_time(pipeline, "tweet_sentiment_analysis")
def test_build_pipeline_serialisation():
    """The pipeline log must carry the ``__fkml__`` serialisation structure and
    keep learner-private keys (like "obj") out of the top-level log."""
    df_train = pd.DataFrame({'id': ["id1"], 'x1': [10.0], 'y': [2.3]})

    fn = lambda x: x

    # Learner names and log keys below are part of the expected serialisation,
    # so they must stay exactly as-is.
    @fp.curry
    def dummy_learner(df, fn, call):
        return fn, df, {f"dummy_learner_{call}": {}}

    @fp.curry
    def dummy_learner_2(df, fn, call):
        return dummy_learner(df, fn, call)

    @fp.curry
    def dummy_learner_3(df, fn, call):
        return fn, df, {f"dummy_learner_{call}": {}, "obj": "a"}

    train_fn = build_pipeline(
        dummy_learner(fn=fn, call=1),
        dummy_learner_2(fn=fn, call=2),
        dummy_learner_3(fn=fn, call=3),
    )
    predict_fn, pred_train, log = train_fn(df_train)

    expected_fkml = {
        "pipeline": ["dummy_learner", "dummy_learner_2", "dummy_learner_3"],
        "output_columns": ['id', 'x1', 'y'],
        "features": ['id', 'x1', 'y'],
        "learners": {
            "dummy_learner": {"fn": fn, "log": {"dummy_learner_1": {}}},
            "dummy_learner_2": {"fn": fn, "log": {"dummy_learner_2": {}}},
            "dummy_learner_3": {
                "fn": fn,
                "log": {"dummy_learner_3": {}},
                "obj": "a",
            },
        },
    }

    assert log["__fkml__"] == expected_fkml
    assert "obj" not in log.keys()
def test_build_pipeline(has_repeated_learners):
    """End-to-end pipeline (impute -> count-categorize -> xgb regression):
    predicting with ``apply_shap=True`` must add exactly the SHAP columns and
    leave all other prediction columns identical.

    Fix: ``pd.util.testing`` was deprecated in pandas 1.0 and removed in 2.0;
    the public ``pd.testing`` module is used instead.
    """
    df_train = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, None, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "cat": ["c1", "c1", "c2", None, "c2", "c4"],
        'y': [2.3, 4.0, 100.0, -3.9, 100.0, -3.9]
    })
    df_test = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, None, 0, 1],
        "cat": ["c1", "c2", "c5", None, "c2", "c3"],
        'y': [1.3, -4.0, 0.0, 49, 0.0, 49]
    })

    features = ["x1", "x2", "cat"]
    target = "y"

    train_fn = build_pipeline(
        placeholder_imputer(columns_to_impute=features, placeholder_value=-999),
        count_categorizer(columns_to_categorize=["cat"]),
        xgb_regression_learner(features=features,
                               target=target,
                               num_estimators=20,
                               extra_params={"seed": 42}),
        has_repeated_learners=has_repeated_learners)

    predict_fn, pred_train, log = train_fn(df_train)

    pred_test_with_shap = predict_fn(df_test, apply_shap=True)
    assert set(pred_test_with_shap.columns) - set(pred_train.columns) == {
        "shap_values", "shap_expected_value"
    }

    pred_test_without_shap = predict_fn(df_test)
    assert set(pred_test_without_shap.columns) == set(pred_train.columns)

    # The SHAP run must not change any of the regular prediction columns.
    pd.testing.assert_frame_equal(
        pred_test_with_shap[pred_test_without_shap.columns],
        pred_test_without_shap)
def test_build_pipeline_with_onehotencoder(has_repeated_learners):
    """A pipeline with a one-hot categorizer must expose the encoded
    ``fklearn_feat__`` columns (including the hardcoded NaN level) plus id,
    target and prediction in its output."""
    df_train = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, None, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "cat": ["c1", "c1", "c2", None, "c2", "c4"],
        'y': [2.3, 4.0, 100.0, -3.9, 100.0, -3.9]
    })
    df_test = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, None, 0, 1],
        "cat": ["c1", "c2", "c5", None, "c2", "c3"],
        'y': [1.3, -4.0, 0.0, 49, 0.0, 49]
    })

    features = ["x1", "x2", "cat"]
    target = "y"

    imputer = placeholder_imputer(columns_to_impute=["x1", "x2"],
                                  placeholder_value=-999)
    encoder = onehot_categorizer(columns_to_categorize=["cat"],
                                 hardcode_nans=True)
    regressor = xgb_regression_learner(features=features,
                                       target=target,
                                       num_estimators=20,
                                       extra_params={"seed": 42})

    train_fn = build_pipeline(imputer, encoder, regressor,
                              has_repeated_learners=has_repeated_learners)
    predict_fn, pred_train, log = train_fn(df_train)
    pred_test = predict_fn(df_test)

    encoded_feature_columns = [
        "x1",
        "x2",
        "fklearn_feat__cat==c1",
        "fklearn_feat__cat==c2",
        "fklearn_feat__cat==c4",
        "fklearn_feat__cat==nan",
    ]
    expected_columns = set(encoded_feature_columns + ["id", target, "prediction"])
    assert set(pred_test.columns) == expected_columns
def test_build_pipeline_no_side_effects():
    """Learners that mutate the frame they receive must not leak those
    mutations back into the caller's original frame.

    Fix: ``pd.util.testing`` was deprecated in pandas 1.0 and removed in 2.0;
    the public ``pd.testing`` module is used instead.
    """
    test_df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]})
    orig_df = test_df.copy()

    def side_effect_learner(df):
        # Deliberately mutates its input in place.
        df['side_effect1'] = df['x'] * 2
        return lambda dataset: dataset, df, {}

    def kwargs_learner(df):
        # Also mutates in place before building its predict fn.
        df['side_effect2'] = df['y'] * 2

        def p(dataset, mult=2):
            return dataset.assign(x=dataset.x * mult)

        return p, p(df), {}

    side_effect_pipeline = build_pipeline(side_effect_learner, kwargs_learner)
    side_effect_pipeline(test_df)
    # The caller's frame must be untouched despite the in-place mutations above.
    pd.testing.assert_frame_equal(test_df, orig_df)
# NOTE(review): `df` and `data_bill_amount` come from earlier in this script
# (not visible here) — presumably a raw test frame and a bill-amount series;
# confirm upstream.
df.columns = ["income"]
df["bill_amount"] = data_bill_amount * 10000
df["income"] = df["income"].apply(lambda x: x * 1000)
print(f"turned our test data into an income dataframe...\n {df.head()}")

# ----------------------------------------------------------------------------------------------------------------------
# Get to the actual work.
from fklearn.training.regression import linear_regression_learner
from fklearn.training.transformation import capper, floorer, prediction_ranger

# Initialize several learner functions:
# 1. cap the input data to ignore outliers,
# 2. then a usual linear regression,
# 3. finally clamp (min/max) the regression's output.
capper_fn = capper(columns_to_cap=["income"], precomputed_caps={"income": 500})
regression_fn = linear_regression_learner(features=["income"], target="bill_amount")
ranger_fn = prediction_ranger(prediction_min=0.0, prediction_max=200.0)

# Compose the three learners into a single pipeline by currying them together.
from fklearn.training.pipeline import build_pipeline

learner = build_pipeline(capper_fn, regression_fn, ranger_fn)
# Running the pipeline returns the predict fn, the transformed frame, and a log.
p, df, log = learner(df)
print(
    f" the returned dataframe now contains our capped prediction:\n {df.head(5)}"
)