コード例 #1
0
def test_encode_into_top_categories_plus_drop_binary(df_enc_binary):
    """Check ``top_categories`` encoding together with ``drop_last_binary=True``.

    The encoder is fitted twice on the same fixture, first with
    ``top_categories=1`` and then with ``top_categories=2``. Each transformed
    frame is compared against a hand-written expectation.
    """
    # Expected columns, written as runs of ones and zeros over the 20 rows.
    target = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]
    var_a_is_a = [1] * 6 + [0] * 14
    var_a_is_b = [0] * 6 + [1] * 10 + [0] * 4
    var_b_is_a = [1] * 10 + [0] * 10
    var_b_is_b = [0] * 10 + [1] * 6 + [0] * 4
    var_c_is_a = [1] * 10 + [0] * 10

    # Each scenario pairs a ``top_categories`` value with the expected output
    # columns; dict order is the expected column order.
    scenarios = [
        (
            1,
            {
                "target": target,
                "var_A_B": var_a_is_b,
                "var_B_A": var_b_is_a,
                "var_C_A": var_c_is_a,
            },
        ),
        (
            2,
            {
                "target": target,
                "var_A_B": var_a_is_b,
                "var_A_A": var_a_is_a,
                "var_B_A": var_b_is_a,
                "var_B_B": var_b_is_b,
                "var_C_A": var_c_is_a,
            },
        ),
    ]

    for n_top, expected_columns in scenarios:
        encoder = OneHotEncoder(
            top_categories=n_top,
            variables=None,
            drop_last=False,
            drop_last_binary=True,
        )
        encoded = encoder.fit_transform(df_enc_binary)

        # Cast both sides to the same dtype so only the values are compared.
        expected = pd.DataFrame(expected_columns).astype("int32")
        encoded = pd.DataFrame(encoded).astype("int32")

        # Attributes learned during fit.
        assert encoder.variables_ == ["var_A", "var_B", "var_C"]
        assert encoder.n_features_in_ == 4

        # Transformed output.
        pd.testing.assert_frame_equal(encoded, expected)
        assert "var_C_B" not in encoded.columns
コード例 #2
0
def test_encode_top_categories(df_enc_big):
    """Check that ``top_categories=4`` yields the expected dummy-column sums."""
    encoder = OneHotEncoder(top_categories=4, variables=None, drop_last=False)
    encoded = encoder.fit_transform(df_enc_big)

    # Init parameter is stored unchanged; fit records the input width.
    assert encoder.top_categories == 4
    assert encoder.n_features_in_ == 3

    # Expected sum of every dummy column checked by this test.
    expected_sums = {
        "var_A_D": 10,
        "var_A_B": 10,
        "var_A_A": 6,
        "var_A_G": 6,
        "var_B_A": 10,
        "var_B_D": 10,
        "var_B_G": 6,
        "var_B_B": 6,
        "var_C_D": 10,
        "var_C_C": 10,
        "var_C_G": 6,
        "var_C_B": 6,
    }
    for column, total in expected_sums.items():
        assert encoded[column].sum() == total

    # Neither the original column nor this particular dummy may be present.
    for absent in ("var_B", "var_B_F"):
        assert absent not in encoded.columns
コード例 #3
0
def test_encode_numerical_variables(df_enc_numeric):
    """Check one-hot encoding of ``var_A``/``var_B`` with ``ignore_format=True``."""
    encoder = OneHotEncoder(
        top_categories=None,
        variables=None,
        drop_last=False,
        ignore_format=True,
    )
    subset = df_enc_numeric[["var_A", "var_B"]]
    encoded = pd.DataFrame(encoder.fit_transform(subset)).astype("int32")

    # Expected dummy columns, written as runs of ones and zeros over 20 rows.
    expected = pd.DataFrame(
        {
            "var_A_1": [1] * 6 + [0] * 14,
            "var_A_2": [0] * 6 + [1] * 10 + [0] * 4,
            "var_A_3": [0] * 16 + [1] * 4,
            "var_B_1": [1] * 10 + [0] * 10,
            "var_B_2": [0] * 10 + [1] * 6 + [0] * 4,
            "var_B_3": [0] * 16 + [1] * 4,
        }
    ).astype("int32")

    # Attributes learned during fit.
    assert encoder.variables_ == ["var_A", "var_B"]
    assert encoder.n_features_in_ == 2

    # Transformed output.
    pd.testing.assert_frame_equal(encoded, expected)
コード例 #4
0
def test_encode_categories_in_k_minus_1_binary_plus_list_of_variables(df_enc_big):
    """Check ``drop_last=True`` encoding for an explicit list of variables.

    Only ``var_A`` and ``var_B`` are passed to the encoder; the test expects
    ``var_C`` to remain in the output.
    """
    selected = ["var_A", "var_B"]
    encoder = OneHotEncoder(top_categories=None, variables=selected, drop_last=True)
    encoded = encoder.fit_transform(df_enc_big)

    # Init parameters are stored unchanged.
    assert encoder.top_categories is None
    assert encoder.variables == ["var_A", "var_B"]
    assert encoder.drop_last is True

    # Shape of the frame seen during fit.
    assert encoder.input_shape_ == (40, 3)

    # Expected sum of every dummy column checked by this test.
    expected_sums = {
        "var_A_A": 6,
        "var_A_B": 10,
        "var_A_C": 4,
        "var_A_D": 10,
        "var_A_E": 2,
        "var_A_F": 2,
        "var_B_A": 10,
        "var_B_B": 6,
        "var_B_C": 4,
        "var_B_D": 10,
        "var_B_E": 2,
        "var_B_F": 2,
    }
    for column, total in expected_sums.items():
        assert encoded[column].sum() == total

    output_columns = encoded.columns
    assert "var_B" not in output_columns
    assert "var_B_G" not in output_columns
    assert "var_C" in output_columns
コード例 #5
0
def test_encode_categories_in_k_binary_plus_select_vars_automatically(
        df_enc_big):
    """Check encoding with ``drop_last=False`` and ``variables=None``."""
    encoder = OneHotEncoder(
        top_categories=None,
        variables=None,
        drop_last=False,
    )
    encoded = encoder.fit_transform(df_enc_big)

    # Init parameters are stored unchanged.
    assert encoder.top_categories is None
    assert encoder.variables is None
    assert encoder.drop_last is False

    # Attributes learned during fit.
    all_variables = ["var_A", "var_B", "var_C"]
    categories = ["A", "B", "C", "D", "E", "F", "G"]
    assert encoder.variables_ == all_variables
    assert encoder.variables_binary_ == []
    assert encoder.n_features_in_ == 3
    assert encoder.encoder_dict_ == {name: categories for name in all_variables}

    # Transformed output: the column sums must match this mapping exactly.
    expected_sums = {
        "var_A_A": 6,
        "var_A_B": 10,
        "var_A_C": 4,
        "var_A_D": 10,
        "var_A_E": 2,
        "var_A_F": 2,
        "var_A_G": 6,
        "var_B_A": 10,
        "var_B_B": 6,
        "var_B_C": 4,
        "var_B_D": 10,
        "var_B_E": 2,
        "var_B_F": 2,
        "var_B_G": 6,
        "var_C_A": 4,
        "var_C_B": 6,
        "var_C_C": 10,
        "var_C_D": 10,
        "var_C_E": 2,
        "var_C_F": 2,
        "var_C_G": 6,
    }
    observed_sums = encoded.sum().to_dict()
    assert observed_sums == expected_sums
    assert "var_A" not in encoded.columns
コード例 #6
0
def test_encode_top_categories():
    """Check ``top_categories=4`` on a frame built inside the test.

    Each column holds the categories A-G in consecutive runs with different
    counts per column.
    """

    def _runs(counts):
        # Expand per-category counts into consecutive runs of "A".."G".
        return [cat for cat, n in zip("ABCDEFG", counts) for _ in range(n)]

    df = pd.DataFrame(
        {
            "var_A": _runs((5, 11, 4, 9, 2, 2, 7)),
            "var_B": _runs((11, 7, 4, 9, 2, 2, 5)),
            "var_C": _runs((4, 5, 11, 9, 2, 2, 7)),
        }
    )

    encoder = OneHotEncoder(top_categories=4, variables=None, drop_last=False)
    encoded = encoder.fit_transform(df)

    # Init parameter is stored unchanged.
    assert encoder.top_categories == 4

    # Attributes learned during fit.
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.variables_binary_ == []
    assert encoder.n_features_in_ == 3
    assert encoder.encoder_dict_ == {
        "var_A": ["B", "D", "G", "A"],
        "var_B": ["A", "D", "B", "G"],
        "var_C": ["C", "D", "G", "B"],
    }

    # Transformed output: expected sum of every dummy column checked here.
    expected_sums = {
        "var_A_D": 9,
        "var_A_B": 11,
        "var_A_A": 5,
        "var_A_G": 7,
        "var_B_A": 11,
        "var_B_D": 9,
        "var_B_G": 5,
        "var_B_B": 7,
        "var_C_D": 9,
        "var_C_C": 11,
        "var_C_G": 7,
        "var_C_B": 5,
    }
    for column, total in expected_sums.items():
        assert encoded[column].sum() == total

    for absent in ("var_B", "var_B_F"):
        assert absent not in encoded.columns
コード例 #7
0
def test_encode_into_k_dummy_plus_drop_binary(df_enc_binary):
    """Check ``drop_last=False`` encoding combined with ``drop_last_binary=True``.

    The test expects ``var_C`` and ``var_D`` to be reported as binary
    variables, each with a single dummy column in the output.
    """
    encoder = OneHotEncoder(
        top_categories=None,
        variables=None,
        drop_last=False,
        drop_last_binary=True,
    )
    encoded = encoder.fit_transform(df_enc_binary).astype("int32")

    # Expected output columns, written as runs of ones and zeros over 20 rows.
    expected = pd.DataFrame(
        {
            "var_num": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
            "var_A_A": [1] * 6 + [0] * 14,
            "var_A_B": [0] * 6 + [1] * 10 + [0] * 4,
            "var_A_C": [0] * 16 + [1] * 4,
            "var_B_A": [1] * 10 + [0] * 10,
            "var_B_B": [0] * 10 + [1] * 6 + [0] * 4,
            "var_B_C": [0] * 16 + [1] * 4,
            "var_C_AHA": [1] * 12 + [0] * 8,
            "var_D_OHO": [1] * 5 + [0] * 15,
        }
    ).astype("int32")

    # Attributes learned during fit.
    assert encoder.variables_ == ["var_A", "var_B", "var_C", "var_D"]
    assert encoder.variables_binary_ == ["var_C", "var_D"]
    assert encoder.n_features_in_ == 5
    assert encoder.encoder_dict_ == {
        "var_A": ["A", "B", "C"],
        "var_B": ["A", "B", "C"],
        "var_C": ["AHA"],
        "var_D": ["OHO"],
    }

    # Transformed output.
    pd.testing.assert_frame_equal(encoded, expected)
    assert "var_C_B" not in encoded.columns