Exemple #1
0
def test_count_categorizer():
    """Check count encoding on the training frame and on a held-out frame.

    The expectations encode three behaviours: each category is replaced by
    the number of times it occurs in the training frame, NaN entries stay
    NaN, and categories absent from training receive ``replace_unseen``.
    The numeric column is not listed for categorization and is expected
    to come back unchanged.
    """
    encoded_columns = ["feat2_cat", "feat3_cat"]

    train_input = pd.DataFrame({
        "feat1_num": [1, 0.5, nan, 100],
        "feat2_cat": ["a", "a", "a", "b"],
        "feat3_cat": ["c", "c", "c", nan],
    })
    test_input = pd.DataFrame({
        "feat1_num": [2, 20, 200, 2000],
        "feat2_cat": ["a", "b", "b", "d"],
        "feat3_cat": [nan, nan, "c", "c"],
    })

    # "a" occurs three times in training, "b" once, "c" three times.
    train_expected = pd.DataFrame({
        "feat1_num": [1, 0.5, nan, 100],
        "feat2_cat": [3, 3, 3, 1],
        "feat3_cat": [3, 3, 3, nan],
    })
    # "d" never appears in training, so it maps to replace_unseen (1).
    test_expected = pd.DataFrame({
        "feat1_num": [2, 20, 200, 2000],
        "feat2_cat": [3, 1, 1, 1],
        "feat3_cat": [nan, nan, 3, 3],
    })

    learner = count_categorizer(
        columns_to_categorize=encoded_columns,
        replace_unseen=1,
    )
    predict, train_output, _ = learner(train_input)
    test_output = predict(test_input)

    assert train_output.equals(train_expected)
    assert test_output.equals(test_expected)
Exemple #2
0
def test_build_pipeline(has_repeated_learners):
    """Train an imputer -> count categorizer -> XGBoost pipeline and check SHAP output.

    The test verifies that:

    * predicting with ``apply_shap=True`` adds exactly the columns
      ``shap_values`` and ``shap_expected_value`` on top of the columns
      produced at training time;
    * predicting without SHAP yields the same column set as training;
    * the non-SHAP columns are identical between both prediction modes.

    Args:
        has_repeated_learners: forwarded unchanged to ``build_pipeline``.
            Presumably supplied by a pytest parametrization or fixture that
            is outside this view -- confirm against the surrounding file.
    """
    df_train = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, None, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "cat": ["c1", "c1", "c2", None, "c2", "c4"],
        'y': [2.3, 4.0, 100.0, -3.9, 100.0, -3.9]
    })

    df_test = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, None, 0, 1],
        "cat": ["c1", "c2", "c5", None, "c2", "c3"],
        'y': [1.3, -4.0, 0.0, 49, 0.0, 49]
    })

    features = ["x1", "x2", "cat"]
    target = "y"

    train_fn = build_pipeline(placeholder_imputer(columns_to_impute=features,
                                                  placeholder_value=-999),
                              count_categorizer(columns_to_categorize=["cat"]),
                              xgb_regression_learner(features=features,
                                                     target=target,
                                                     num_estimators=20,
                                                     extra_params={"seed":
                                                                   42}),
                              has_repeated_learners=has_repeated_learners)

    # The training log is not inspected by this test.
    predict_fn, pred_train, _ = train_fn(df_train)

    pred_test_with_shap = predict_fn(df_test, apply_shap=True)
    assert set(pred_test_with_shap.columns) - set(pred_train.columns) == {
        "shap_values", "shap_expected_value"
    }

    pred_test_without_shap = predict_fn(df_test)
    assert set(pred_test_without_shap.columns) == set(pred_train.columns)

    # pandas.util.testing was deprecated in pandas 1.0 and removed in 2.0;
    # pandas.testing is the supported public location of the same helper.
    pd.testing.assert_frame_equal(
        pred_test_with_shap[pred_test_without_shap.columns],
        pred_test_without_shap)
def test_count_categorizer():
    """Check count encoding alone and with suffix, prefix and columns_mapping.

    The base expectations encode three behaviours: each category is replaced
    by its number of occurrences in the training frame, NaN entries stay
    NaN, and categories absent from training receive ``replace_unseen``.

    For the ``suffix``, ``prefix`` and ``columns_mapping`` variants the
    expected output is the base expectation followed by copies of the
    original categorical columns under their renamed labels.
    """
    categorized = ["feat2_cat", "feat3_cat"]

    train_input = pd.DataFrame({
        "feat1_num": [1, 0.5, nan, 100],
        "feat2_cat": ["a", "a", "a", "b"],
        "feat3_cat": ["c", "c", "c", nan],
    })
    train_expected = pd.DataFrame({
        "feat1_num": [1, 0.5, nan, 100],
        "feat2_cat": [3, 3, 3, 1],
        "feat3_cat": [3, 3, 3, nan],
    })
    test_input = pd.DataFrame({
        "feat1_num": [2, 20, 200, 2000],
        "feat2_cat": ["a", "b", "b", "d"],
        "feat3_cat": [nan, nan, "c", "c"],
    })
    test_expected = pd.DataFrame({
        "feat1_num": [2, 20, 200, 2000],
        "feat2_cat": [3, 1, 1, 1],  # "d" is unseen in training -> constant (1)
        "feat3_cat": [nan, nan, 3, 3],
    })

    # Each variant pairs the extra learner keyword arguments with the
    # renaming applied to the raw columns in the expected output
    # (None means no raw columns are expected to be kept).
    variants = [
        ({}, None),
        ({"suffix": "_suffix"}, lambda frame: frame.add_suffix("_suffix")),
        ({"prefix": "prefix_"}, lambda frame: frame.add_prefix("prefix_")),
        (
            {
                "columns_mapping": {
                    "feat2_cat": "feat2_cat_raw",
                    "feat3_cat": "feat3_cat_raw",
                },
            },
            lambda frame: frame.add_suffix("_raw"),
        ),
    ]

    def expected_frame(encoded, raw, rename):
        # Encoded expectation, optionally followed by the renamed raw columns.
        if rename is None:
            return encoded
        return pd.concat([encoded, rename(raw[categorized].copy())], axis=1)

    # Build every learner, then fit every learner, before asserting anything.
    learners = [
        count_categorizer(
            columns_to_categorize=list(categorized),
            replace_unseen=1,
            **extra_kwargs,
        )
        for extra_kwargs, _ in variants
    ]
    fitted = [learner(train_input) for learner in learners]

    for (_, rename), (pred_fn, train_output, _log) in zip(variants, fitted):
        assert expected_frame(train_expected, train_input, rename).equals(
            train_output
        )
        assert expected_frame(test_expected, test_input, rename).equals(
            pred_fn(test_input)
        )