def test_onehot_categorizer(
        df_train, df_test, columns_to_categorize, drop_first, hardcode, expected_output_train, expected_output_test
):
    """Check ``onehot_categorizer`` output on a train frame and a test frame.

    The learner is configured from ``columns_to_categorize``, ``hardcode``
    (passed as ``hardcode_nans``) and ``drop_first`` (passed as
    ``drop_first_column``). Calling it on ``df_train`` must yield a frame equal
    to ``expected_output_train``, and the function it returns, applied to
    ``df_test``, must yield a frame equal to ``expected_output_test``.
    """
    learner_options = {
        "columns_to_categorize": columns_to_categorize,
        "hardcode_nans": hardcode,
        "drop_first_column": drop_first,
    }
    learner = onehot_categorizer(**learner_options)

    # The third element of the returned tuple (the log) is not inspected here.
    encode, encoded_train, _ = learner(df_train)
    encoded_test = encode(df_test)

    assert_frame_equal(encoded_test, expected_output_test)
    assert_frame_equal(encoded_train, expected_output_train)
# Example #2
# 0
def test_build_pipeline_with_onehotencoder(has_repeated_learners):
    """Check the output columns of a pipeline containing ``onehot_categorizer``.

    A pipeline of placeholder imputation, one-hot encoding of ``cat`` (with
    ``hardcode_nans=True``) and an XGBoost regression learner is built and
    called on a training frame. The returned prediction function is applied to
    a test frame, and only the set of resulting column names is asserted.
    """
    target = "y"
    features = ["x1", "x2", "cat"]

    df_train = pd.DataFrame({
        "id": ["id1", "id2", "id3", "id4", "id3", "id4"],
        "x1": [10.0, 13.0, 10.0, 13.0, None, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "cat": ["c1", "c1", "c2", None, "c2", "c4"],
        "y": [2.3, 4.0, 100.0, -3.9, 100.0, -3.9],
    })

    # The test frame contains categories ("c5", "c3") absent from training.
    df_test = pd.DataFrame({
        "id": ["id4", "id4", "id5", "id6", "id5", "id6"],
        "x1": [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, None, 0, 1],
        "cat": ["c1", "c2", "c5", None, "c2", "c3"],
        "y": [1.3, -4.0, 0.0, 49, 0.0, 49],
    })

    imputer = placeholder_imputer(
        columns_to_impute=["x1", "x2"],
        placeholder_value=-999,
    )
    encoder = onehot_categorizer(
        columns_to_categorize=["cat"],
        hardcode_nans=True,
    )
    regressor = xgb_regression_learner(
        features=features,
        target=target,
        num_estimators=20,
        extra_params={"seed": 42},
    )
    train_fn = build_pipeline(
        imputer,
        encoder,
        regressor,
        has_repeated_learners=has_repeated_learners,
    )

    # Only the prediction function is needed; the other two results are unused.
    predict_fn, _, _ = train_fn(df_train)
    pred_test = predict_fn(df_test)

    expected_feature_columns_after_encoding = [
        "x1",
        "x2",
        "fklearn_feat__cat==c1",
        "fklearn_feat__cat==c2",
        "fklearn_feat__cat==c4",
        "fklearn_feat__cat==nan",
    ]
    expected_columns = set(expected_feature_columns_after_encoding) | {"id", target, "prediction"}

    assert set(pred_test.columns) == expected_columns
# Example #3
# 0
def test_onehot_categorizer():
    """Check ``onehot_categorizer`` on fixed frames under three configurations.

    The configurations are ``hardcode_nans=False``, ``hardcode_nans=True`` and
    ``hardcode_nans=False`` with ``drop_first_column=True``. For each one the
    learner is called on the training frame and the returned function is
    applied to the test frame; both outputs, restricted to the expected
    columns, must equal the expected frames.
    """

    def ordered_frame(*named_columns):
        # Build a DataFrame whose columns keep the order they are listed in.
        return pd.DataFrame(OrderedDict(named_columns))

    input_df_train = pd.DataFrame({
        "feat1_num": [1, 0.5, nan, 100],
        "sex": ["female", "male", "male", "male"],
        "region": ["SP", "RG", "MG", nan],
    })

    input_df_test = pd.DataFrame({
        "feat1_num": [2, 20, 200, 2000],
        "sex": ["male", "female", "male", "nonbinary"],
        "region": [nan, nan, "SP", "RG"],
    })

    # --- expected training outputs -------------------------------------
    expected_output_train_no_hardcode = ordered_frame(
        ("feat1_num", [1, 0.5, nan, 100]),
        ("fklearn_feat__sex==female", [1, 0, 0, 0]),
        ("fklearn_feat__sex==male", [0, 1, 1, 1]),
        ("fklearn_feat__region==MG", [0, 0, 1, 0]),
        ("fklearn_feat__region==RG", [0, 1, 0, 0]),
        ("fklearn_feat__region==SP", [1, 0, 0, 0]),
    )

    expected_output_train_hardcode = ordered_frame(
        ("feat1_num", [1, 0.5, nan, 100]),
        ("fklearn_feat__sex==female", [1, 0, 0, 0]),
        ("fklearn_feat__sex==male", [0, 1, 1, 1]),
        ("fklearn_feat__sex==nan", [0, 0, 0, 0]),
        ("fklearn_feat__region==MG", [0, 0, 1, 0]),
        ("fklearn_feat__region==RG", [0, 1, 0, 0]),
        ("fklearn_feat__region==SP", [1, 0, 0, 0]),
        ("fklearn_feat__region==nan", [0, 0, 0, 1]),
    )

    expected_output_train_drop_first = ordered_frame(
        ("feat1_num", [1, 0.5, nan, 100]),
        ("fklearn_feat__sex==male", [0, 1, 1, 1]),
        ("fklearn_feat__region==RG", [0, 1, 0, 0]),
        ("fklearn_feat__region==SP", [1, 0, 0, 0]),
    )

    # --- expected test outputs -----------------------------------------
    expected_output_test_no_hardcode = ordered_frame(
        ("feat1_num", [2, 20, 200, 2000]),
        ("fklearn_feat__sex==female", [0, 1, 0, 0]),
        ("fklearn_feat__sex==male", [1, 0, 1, 0]),
        ("fklearn_feat__region==MG", [0, 0, 0, 0]),
        ("fklearn_feat__region==RG", [0, 0, 0, 1]),
        ("fklearn_feat__region==SP", [0, 0, 1, 0]),
    )

    expected_output_test_hardcode = ordered_frame(
        ("feat1_num", [2, 20, 200, 2000]),
        ("fklearn_feat__sex==female", [0, 1, 0, 0]),
        ("fklearn_feat__sex==male", [1, 0, 1, 0]),
        ("fklearn_feat__sex==nan", [0, 0, 0, 1]),
        ("fklearn_feat__region==MG", [0, 0, 0, 0]),
        ("fklearn_feat__region==RG", [0, 0, 0, 1]),
        ("fklearn_feat__region==SP", [0, 0, 1, 0]),
        ("fklearn_feat__region==nan", [1, 1, 0, 0]),
    )

    expected_output_test_drop_first = ordered_frame(
        ("feat1_num", [2, 20, 200, 2000]),
        ("fklearn_feat__sex==male", [1, 0, 1, 0]),
        ("fklearn_feat__region==RG", [0, 0, 0, 1]),
        ("fklearn_feat__region==SP", [0, 0, 1, 0]),
    )

    # Each scenario: learner options, expected train frame, expected test frame.
    scenarios = (
        # Without hardcoding NaNs
        (
            {"hardcode_nans": False},
            expected_output_train_no_hardcode,
            expected_output_test_no_hardcode,
        ),
        # With hardcoding NaNs
        (
            {"hardcode_nans": True},
            expected_output_train_hardcode,
            expected_output_test_hardcode,
        ),
        # Dropping the first column
        (
            {"hardcode_nans": False, "drop_first_column": True},
            expected_output_train_drop_first,
            expected_output_test_drop_first,
        ),
    )

    for options, expected_train, expected_test in scenarios:
        learner = onehot_categorizer(
            columns_to_categorize=["sex", "region"], **options)

        encode, encoded_train, _ = learner(input_df_train)
        encoded_test = encode(input_df_test)

        # Select the expected columns before comparing: we don't care about
        # output order.
        assert encoded_test[expected_test.columns].equals(expected_test)
        assert encoded_train[expected_train.columns].equals(expected_train)