예제 #1
0
def test_NumericalEncoder_default_and_null_values():
    """Check the ``__default__`` and ``__null__`` codes of a "num" encoder.

    The encoder is fitted on a categorical column whose first rows are null.
    The test expects the fitted modality mapping of that column to contain
    both a ``__default__`` and a ``__null__`` entry. It then transforms a
    column made only of a value absent from the training data, and a column
    made only of nulls, and requires *every* encoded row to carry the
    corresponding code.
    """
    np.random.seed(123)
    df = get_sample_df(100, seed=123)
    df.index = np.arange(len(df))

    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    # ``.loc`` slices by label and includes both ends: labels 0..10 are nulled.
    df.loc[0:10, "cat_col_1"] = None

    encoder = NumericalEncoder(encoding_type="num",
                               min_modalities_number=2,
                               max_cum_proba=0.8,
                               max_na_percentage=0)

    # Only the fitted state matters here; the transformed frame is not used.
    encoder.fit_transform(df)
    fitted_mapping = encoder.model.variable_modality_mapping["cat_col_1"]
    assert "__default__" in fitted_mapping
    assert "__null__" in fitted_mapping

    df["cat_col_1"] = "zzz"  # Never seen value
    encoded = encoder.transform(df)["cat_col_1"]
    default_code = encoder.model.variable_modality_mapping["cat_col_1"][
        "__default__"]
    # Compare the whole column: looking only at the first unique value would
    # let a partially wrong encoding go unnoticed.
    assert (encoded == default_code).all()

    df["cat_col_1"] = None
    encoded = encoder.transform(df)["cat_col_1"]
    null_code = encoder.model.variable_modality_mapping["cat_col_1"][
        "__null__"]
    assert (encoded == null_code).all()
예제 #2
0
def test_NumericalEncoder_num():
    """Exercise the "num" encoding mode on a frame with a shuffled index.

    Checks performed:
      * the transformed frame keeps the shape and the index of the input,
      * ``get_feature_names()`` matches both the model's feature names and
        the output columns,
      * a value absent from the training data, and a null in a column that
        had no null at fit time, are both encoded as ``-1``,
      * when nulls are present at fit time, null rows (and only them) are
        encoded as ``0``.
    """
    np.random.seed(123)
    df = get_sample_df(100, seed=123)

    # Shuffle BEFORE assigning the index so the frame really gets an unsorted
    # index. Shuffling the array after the assignment would only reach the
    # index if pandas happened to share memory with the array.
    ind = np.arange(len(df))
    np.random.shuffle(ind)
    df.index = ind

    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6])

    encoder = NumericalEncoder(encoding_type="num")
    encoder.fit(df)
    res = encoder.transform(df)

    assert res.shape == df.shape
    assert (res.index == df.index).all()

    assert encoder.get_feature_names() == encoder.model._feature_names
    assert encoder.get_feature_names() == list(res.columns)

    # Values the fitted encoder has never seen.
    df_unseen = df.copy()
    df_unseen.loc[0, "cat_col_1"] = "something-new"
    df_unseen.loc[1, "cat_col_2"] = None  # Something None

    res_unseen = encoder.transform(df_unseen)
    assert res_unseen.loc[0, "cat_col_1"] == -1
    assert res_unseen.loc[1, "cat_col_2"] == -1

    # Nulls present at fit time. An explicit label list is used because the
    # index is not sorted.
    df_with_none = df.copy()
    df_with_none["cat_col_3"] = df_with_none["cat_col_1"]
    df_with_none.loc[list(range(25)), "cat_col_3"] = None

    encoder2 = NumericalEncoder(encoding_type="num")
    res_with_none = encoder2.fit_transform(df_with_none)

    assert (df_with_none["cat_col_3"].isnull() == (
        res_with_none["cat_col_3"] == 0)).all()
예제 #3
0
def test_NumericalEncoder_dummy_output_dtype():
    """Every ``cat_col_*`` column produced in "dummy" mode must be int32."""
    np.random.seed(123)
    frame = get_sample_df(100, seed=123)
    frame.index = np.arange(len(frame))

    # Two categorical columns derived from slices of the text column.
    for name, chunk in (("cat_col_1", slice(0, 3)), ("cat_col_2", slice(3, 6))):
        frame[name] = frame["text_col"].apply(lambda s, chunk=chunk: s[chunk])

    encoder = NumericalEncoder(encoding_type="dummy")
    encoder.fit(frame)
    encoded = encoder.transform(frame)

    # Restrict the dtype check to the columns generated from the categories.
    is_dummy_column = encoded.columns.str.startswith("cat_col_")
    dummy_dtypes = encoded.dtypes[is_dummy_column]
    assert (dummy_dtypes == "int32").all()  # default encoding type is int32
예제 #4
0
def test_NumericalEncoder_num_output_dtype():
    """Columns encoded in "num" mode must come out with dtype int32.

    The input frame is given a shuffled integer index before the categorical
    columns are derived from slices of ``text_col``.
    """
    np.random.seed(123)
    df = get_sample_df(100, seed=123)

    # Shuffle BEFORE assigning the index so the frame really gets an unsorted
    # index. Shuffling the array after the assignment would only reach the
    # index if pandas happened to share memory with the array.
    ind = np.arange(len(df))
    np.random.shuffle(ind)
    df.index = ind

    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6])

    encoder = NumericalEncoder(encoding_type="num")
    encoder.fit(df)
    res = encoder.transform(df)

    assert res.dtypes["cat_col_1"] == "int32"
    assert res.dtypes["cat_col_2"] == "int32"
예제 #5
0
def test_NumericalEncoder_drop_used_unused_columns(drop_used_columns,
                                                   drop_unused_columns,
                                                   columns_to_use):
    """Check which columns the encoder keeps or drops, then a pickle round trip.

    Args:
        drop_used_columns: forwarded to ``NumericalEncoder``; when truthy the
            encoded source columns are expected to be absent from the output.
        drop_unused_columns: forwarded to ``NumericalEncoder``; when truthy
            the columns that were not encoded are expected to be absent.
        columns_to_use: ``"all"``, ``"object"`` or an explicit list of column
            names, forwarded to ``NumericalEncoder``.

    The arguments are supplied from outside this function; the grid
    originally noted here was (True, False) x (True, False) x
    ("all", "object", ["num1", "num2", "num3"]).

    The second half fits a default encoder on the whole frame and checks
    that a pickled-then-unpickled copy transforms the data identically.
    """
    df = pd.DataFrame({
        "obj1": ["a", "b", "c", "d"] * 25,
        "obj2": ["AA", "BB"] * 50,
        "num1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 10,
        "num2": [100, 101, 102, 103, 104] * 20,
        "num3": [0.01, 0.02, 0.03, 0.04, 0.05] * 20,
    })

    # Label-based slices include both ends, so label 20 is in both parts.
    df1 = df.loc[0:20]
    df2 = df.loc[20:]

    # Expected "<column>__<value>" names for every column of the frame.
    resulting_columns = {}
    for col in df.columns:
        modalities = df[col].value_counts().index
        resulting_columns[col] = [
            "%s__%s" % (col, str(v)) for v in modalities
        ]

    # Resolve the columns the encoder is asked to work on.
    if columns_to_use == "all":
        cols = list(df.columns)
    elif columns_to_use == "object":
        cols = list(df.columns[df.dtypes == "object"])
    else:
        cols = columns_to_use

    kept_used = [] if drop_used_columns else cols
    generated = [name for c in cols for name in resulting_columns[c]]
    kept_unused = ([] if drop_unused_columns else
                   [c for c in df.columns if c not in cols])

    final_columns = kept_used + kept_unused + generated

    encoder = NumericalEncoder(columns_to_use=columns_to_use,
                               drop_used_columns=drop_used_columns,
                               drop_unused_columns=drop_unused_columns)

    df1_transformed = encoder.fit_transform(df1)
    df2_transformed = encoder.transform(df2)

    # Row count, container type and index must survive the transformation.
    for transformed, source in ((df1_transformed, df1),
                                (df2_transformed, df2)):
        assert transformed.shape[0] == source.shape[0]
        assert type(transformed) is type(source)
        assert (transformed.index == source.index).all()

    # fit_transform and transform must agree on the output columns.
    assert df1_transformed.shape[1] == df2_transformed.shape[1]
    assert list(df1_transformed.columns) == list(df2_transformed.columns)

    # Same columns as expected; the column order is not checked.
    assert len(df1_transformed.columns) == len(final_columns)
    assert set(df1_transformed) == set(final_columns)

    # Pickle round trip of a default encoder fitted on the full frame.
    encoder = NumericalEncoder()
    encoder.fit(df)

    unpickled_encoder = pickle.loads(pickle.dumps(encoder))
    assert type(unpickled_encoder) is type(encoder)

    X1 = encoder.transform(df)
    X2 = unpickled_encoder.transform(df)

    assert X1.shape == X2.shape
    assert (X1 == X2).all().all()
예제 #6
0
def test_NumericalEncoder_dummy():
    """Exercise the "dummy" (one-hot) encoding mode.

    Four scenarios are covered:
      1. plain fit/transform: sizes, column names and 0/1 content,
      2. transform of values not seen at fit time (a new string and a null),
      3. a column containing nulls at fit time, which must produce a
         ``____null__`` dummy listed first among that column's dummies,
      4. a column with many modalities, where only those reaching
         ``min_nb_observations`` get a dummy, followed by ``__default__``.
    """
    np.random.seed(123)
    df = get_sample_df(100, seed=123)
    df.index = np.arange(len(df))

    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6])

    # --- 1. plain fit / transform ---------------------------------------
    encoder = NumericalEncoder(encoding_type="dummy")
    encoder.fit(df)
    encoded = encoder.transform(df)

    model = encoder.model
    assert model._dummy_size == len(model._dummy_feature_names)
    assert model._dummy_size == sum(
        len(v) for v in model.variable_modality_mapping.values())

    counts_1 = df["cat_col_1"].value_counts()
    counts_2 = df["cat_col_2"].value_counts()

    n_rows, n_cols = encoded.shape
    assert n_rows == df.shape[0]
    # The extra 3 matches the three non-categorical columns listed in ``col``.
    assert n_cols == len(counts_1) + len(counts_2) + 3
    assert (encoded.index == df.index).all()

    col = ["float_col", "int_col", "text_col"]
    col1 = ["cat_col_1__%s" % c for c in list(counts_1.index)]
    col2 = ["cat_col_2__%s" % c for c in list(counts_2.index)]

    assert col1 == encoder.columns_mapping["cat_col_1"]
    assert col2 == encoder.columns_mapping["cat_col_2"]

    assert encoder.get_feature_names() == col + col1 + col2

    dummies = encoded.loc[:, col1 + col2]
    assert dummies.isnull().sum().sum() == 0
    assert dummies.max().max() == 1
    assert dummies.min().min() == 0

    is_aaa = df["cat_col_1"] == "aaa"
    assert (is_aaa == (encoded["cat_col_1__aaa"] == 1)).all()

    # --- 2. values not seen at fit time ---------------------------------
    df_unseen = df.copy()
    df_unseen.loc[0, "cat_col_1"] = "something-new"
    df_unseen.loc[1, "cat_col_2"] = None  # Something None

    encoded_unseen = encoder.transform(df_unseen)

    # Row 0: unknown cat_col_1 value, untouched cat_col_2 value.
    assert encoded_unseen.loc[0, col1].sum() == 0  # no dummy activated
    untouched_0 = "cat_col_2__" + df_unseen.loc[0, "cat_col_2"]
    assert encoded_unseen.loc[0, untouched_0] == 1  # right position
    assert encoded_unseen.loc[0, col2].sum() == 1  # only one dummy activated

    # Row 1: null cat_col_2 value, untouched cat_col_1 value.
    assert encoded_unseen.loc[1, col2].sum() == 0  # no dummy activated
    untouched_1 = "cat_col_1__" + df_unseen.loc[1, "cat_col_1"]
    assert encoded_unseen.loc[1, untouched_1] == 1  # right position
    assert encoded_unseen.loc[1, col1].sum() == 1

    # --- 3. nulls present at fit time -----------------------------------
    df_with_none = df.copy()
    df_with_none["cat_col_3"] = df_with_none["cat_col_1"]
    # Label-based slice, both ends included: labels 0..25 are nulled.
    df_with_none.loc[0:25, "cat_col_3"] = None

    encoder2 = NumericalEncoder(encoding_type="dummy")
    encoded_none = encoder2.fit_transform(df_with_none)

    col3b = [c for c in encoded_none.columns if c.startswith("cat_col_3")]
    assert col3b[0] == "cat_col_3____null__"
    assert list(encoded_none.columns) == col + col1 + col2 + col3b
    assert list(encoded_none.columns) == encoder2.get_feature_names()

    dummies_none = encoded_none.loc[:, col1 + col2 + col3b]
    assert dummies_none.isnull().sum().sum() == 0
    assert dummies_none.max().max() == 1
    assert dummies_none.min().min() == 0

    null_rows = df_with_none["cat_col_3"].isnull()
    assert (null_rows == (encoded_none["cat_col_3____null__"] == 1)).all()

    # --- 4. column with many modalities ---------------------------------
    df3 = df.copy()
    # Integer labels obtained by exponentiating uniform draws, capping at 50
    # and rounding up.
    draws = np.exp(np.random.rand(100) * 5)
    labels = np.ceil(np.minimum(draws, 50)).astype(np.int32)
    df3["cat_col_many"] = ["m_%d" % x for x in labels]

    encoder3 = NumericalEncoder(encoding_type="dummy")
    encoded_many = encoder3.fit_transform(df3)

    colm = [c for c in encoded_many.columns if c.startswith("cat_col_many")]
    vc = df3["cat_col_many"].value_counts()
    frequent = list(vc.index[vc >= encoder3.min_nb_observations])
    colmb = ["cat_col_many__" + c for c in frequent + ["__default__"]]

    assert colm == colmb