Beispiel #1
0
def test_truncate_categorical():
    """Fit ``truncate_categorical`` on a train frame and check both outputs.

    The learner is built for column ``col`` with ``percentile=0.1``. The
    expected frames encode what this test requires: in the training data the
    single-occurrence labels ``c``, ``d`` and ``f`` become ``-9999`` while
    ``a``, ``b`` and the missing value are left alone; in the test data the
    label ``e`` (absent from training) is expected to become ``-9999`` too.
    The ``y`` column must come through unchanged.
    """
    # Value the expected frames use in place of truncated categories.
    truncated = -9999

    train_target = [1.0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1]
    test_target = [1.0, 0, 1, 1, 1, 0, 1, 1]

    train_frame = pd.DataFrame(
        {"col": ["a"] * 3 + ["b"] * 4 + ["c", "d", "f", nan], "y": train_target}
    )
    test_frame = pd.DataFrame(
        {"col": ["a", "a", "b", "c", "d", "f", "e", nan], "y": test_target}
    )

    learner = truncate_categorical(columns_to_truncate=["col"], percentile=0.1)

    # The learner returns (prediction function, transformed train data, log);
    # the log is not inspected here.
    predict, fitted_train, _ = learner(train_frame)
    predicted_test = predict(test_frame)

    wanted_train = pd.DataFrame(
        {
            "col": ["a"] * 3 + ["b"] * 4 + [truncated] * 3 + [nan],
            "y": train_target,
        }
    )
    wanted_test = pd.DataFrame(
        {"col": ["a", "a", "b"] + [truncated] * 4 + [nan], "y": test_target}
    )

    assert fitted_train.equals(wanted_train)
    assert predicted_test.equals(wanted_test)
def test_truncate_categorical():
    """Check ``truncate_categorical`` with and without output-column renaming.

    Four learners are built for column ``col`` with ``percentile=0.1``:
    a plain one, one with ``suffix="_suffix"``, one with ``prefix="prefix_"``
    and one with ``columns_mapping={"col": "col_raw"}``.

    For the plain learner the test expects the labels ``c``, ``d``, ``f``
    (and ``e`` in the test frame) to be replaced by ``-9999`` in ``col``.
    For the other three it expects that same truncated frame with one extra
    column appended: the untouched input ``col`` under the name
    ``col_suffix``, ``prefix_col`` or ``col_raw`` respectively.
    """
    # Value the expected frames use in place of truncated categories.
    truncated = -9999

    train_target = [1.0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1]
    test_target = [1.0, 0, 1, 1, 1, 0, 1, 1]

    input_df_train = pd.DataFrame(
        {"col": ["a"] * 3 + ["b"] * 4 + ["c", "d", "f", nan], "y": train_target}
    )
    input_df_test = pd.DataFrame(
        {"col": ["a", "a", "b", "c", "d", "f", "e", nan], "y": test_target}
    )

    expected_output_train = pd.DataFrame(
        {
            "col": ["a"] * 3 + ["b"] * 4 + [truncated] * 3 + [nan],
            "y": train_target,
        }
    )
    expected_output_test = pd.DataFrame(
        {"col": ["a", "a", "b"] + [truncated] * 4 + [nan], "y": test_target}
    )

    def expected_with_original(expected, source, rename):
        """Return *expected*, plus the renamed raw ``col`` of *source* if any."""
        if rename is None:
            return expected
        untouched = rename(source[["col"]].copy())
        return pd.concat([expected, untouched], axis=1)

    # Extra keyword arguments per learner, paired position-wise with the
    # renaming applied to the raw column in the corresponding expected frame.
    extra_kwargs = [
        {},
        {"suffix": "_suffix"},
        {"prefix": "prefix_"},
        {"columns_mapping": {"col": "col_raw"}},
    ]
    renamers = [
        None,
        lambda frame: frame.add_suffix("_suffix"),
        lambda frame: frame.add_prefix("prefix_"),
        lambda frame: frame.add_suffix("_raw"),
    ]

    learners = [
        truncate_categorical(columns_to_truncate=["col"], percentile=0.1, **extra)
        for extra in extra_kwargs
    ]

    # Fit every learner before asserting anything, in declaration order.
    fitted = [learner(input_df_train) for learner in learners]

    # Each fit result is (prediction function, transformed train data, log);
    # the log is not inspected here.
    for (predict, transformed_train, _), rename in zip(fitted, renamers):
        wanted_train = expected_with_original(
            expected_output_train, input_df_train, rename
        )
        assert wanted_train.equals(transformed_train)

        wanted_test = expected_with_original(
            expected_output_test, input_df_test, rename
        )
        assert wanted_test.equals(predict(input_df_test))