Ejemplo n.º 1
0
def test_NumImputer_is_picklable():
    """Check that a fitted NumImputer survives a pickle round trip.

    The imputer is fitted on sample data in which a few ``float_col``
    entries were set to NaN, then serialized and restored. The restored
    object must have the same type as the original and produce an
    identical transformed output.
    """
    data = get_sample_df(100, seed=123)
    data.loc[[2, 10, 50], "float_col"] = np.nan

    original = NumImputer()
    original.fit_transform(data)  # result unused: only the fitted state matters

    restored = pickle.loads(pickle.dumps(original))
    assert type(restored) == type(original)

    expected = original.transform(data)
    actual = restored.transform(data)

    assert expected.shape == actual.shape
    assert (expected == actual).all().all()
Ejemplo n.º 2
0
def test_NumImputer_output_type():
    """Check that NumImputer preserves the dtype of ``float_col``.

    Four scenarios are covered: the column stored as float64 or float32,
    each with and without a missing value. In every case the dtype of the
    transformed column must equal the dtype of the input column.
    """
    base = get_sample_df(100, seed=123)

    for with_null in (False, True):
        for dtype in ("float64", "float32"):
            df = base.copy()
            # ``astype`` returns a new Series, so the result has to be
            # assigned back for the cast to take effect.
            df["float_col"] = df["float_col"].astype(dtype)
            if with_null:
                df.loc[0, "float_col"] = np.nan

            Xenc = NumImputer().fit_transform(df)

            assert Xenc.dtypes["float_col"] == df.dtypes["float_col"], (
                dtype, with_null)
Ejemplo n.º 3
0
def test__NumImputer():
    """Exercise _NumImputer / NumImputer on DataFrame, numpy and sparse input.

    First part (sample data with missing values): the output must contain
    no nulls, keep the input container type and row count, leave the input
    untouched and, when ``add_is_null`` is enabled, gain one indicator
    column for column 1 that is set on rows 0 and 5 only.

    It also checks that an imputer built with ``allow_unseen_null=False``
    raises ``ValueError`` when transform meets a null in a column that had
    none during fit.

    Second part (sample data without missing values): the output must have
    exactly the same shape and feature names as the input.
    """
    named_cols = ["col0", "col1", "col2", "col3", "col4", "col5", "col6"]
    positional_cols = ["0", "1", "2", "3", "4", "5", "6"]
    # Rows whose "is null" indicator must be 0 (rows 0 and 5 are asserted
    # to be flagged below).
    not_null_rows = np.array([1, 2, 3, 4, 6, 7, 8, 9])
    sparse_formats = (sps.coo_matrix, sps.csc_matrix, sps.csr_matrix)

    # ------------------------------------------------------------------
    # Data WITH missing values
    # ------------------------------------------------------------------
    xx, xxd, xxs = get_sample_data(add_na=True)
    # Non-contiguous index with a duplicated label (12), so that index
    # preservation is actually tested.
    xxd.index = np.array([0, 1, 2, 3, 4, 10, 11, 12, 12, 14])
    input_features = ["COL_%d" % i for i in range(xx.shape[1])]

    # DataFrame entry
    for inp in (_NumImputer(), NumImputer(), _NumImputer(add_is_null=False),
                NumImputer(add_is_null=False)):
        xx_out = inp.fit_transform(xxd)
        assert (xx_out.index == xxd.index).all()
        assert pd.isnull(xxd.loc[0, "col1"])  # Verify that it is still null
        assert xx_out.isnull().sum().sum() == 0
        # The value compared against is the mean of the non-null entries.
        assert xx_out["col1"][0] == xxd.loc[~xxd["col1"].isnull(),
                                            "col1"].mean()

        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xxd)

        if inp.add_is_null:
            assert inp.get_feature_names() == named_cols + ["col1_isnull"]
            assert xx_out.shape[1] == 1 + xxd.shape[1]
            assert xx_out["col1_isnull"].iloc[0] == 1
            assert xx_out["col1_isnull"].iloc[5] == 1
            assert (xx_out["col1_isnull"].iloc[not_null_rows] == 0).all()
        else:
            assert xx_out.shape[1] == xxd.shape[1]
            assert inp.get_feature_names() == named_cols

    # Unseen nulls: this check does not depend on the imputers above, so it
    # runs once, with its own variable instead of rebinding a loop variable.
    strict_inp = _NumImputer(add_is_null=False, allow_unseen_null=False)
    strict_inp.fit(xxd)
    xxd2 = xxd.copy()
    xxd2.iloc[0, 3] = np.nan
    try:
        strict_inp.transform(xxd2)
    except ValueError:
        pass
    else:
        raise AssertionError("Model should have fail its transformation")

    # Numpy array
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xx)
        assert pd.isnull(xx[0, 1])
        assert pd.isnull(xx_out).sum() == 0
        assert xx_out.shape[1] == 1 + xx.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xx)
        assert inp.get_feature_names() == positional_cols + ["1_isnull"]
        assert inp.get_feature_names(
            input_features) == input_features + ["COL_1_isnull"]
        assert xx_out[0, 7] == 1
        assert xx_out[5, 7] == 1
        assert (xx_out[not_null_rows, 7] == 0).all()

    # Sparse Array
    for inp in (_NumImputer(), NumImputer()):
        for f in sparse_formats:
            xxsf = f(xxs.copy())
            xx_out = inp.fit_transform(xxsf)
            dense_out = xx_out.todense()
            assert pd.isnull(xxs[0, 1])
            assert pd.isnull(dense_out).sum() == 0
            assert get_type(xx_out) == get_type(xxs)
            assert xx_out.shape[1] == 1 + xxs.shape[1]
            assert xx_out.shape[0] == xx.shape[0]
            assert inp.get_feature_names() == positional_cols + ["1_isnull"]
            assert inp.get_feature_names(
                input_features) == input_features + ["COL_1_isnull"]
            # Same indicator checks as for the numpy input: rows 0 and 5.
            assert dense_out[0, 7] == 1
            assert dense_out[5, 7] == 1
            assert (dense_out[not_null_rows, 7] == 0).all()

    # ------------------------------------------------------------------
    # Data WITHOUT missing values
    # ------------------------------------------------------------------
    xx, xxd, xxs = get_sample_data(add_na=False)
    xxd.index = np.array([0, 1, 2, 3, 4, 10, 11, 12, 12, 14])

    # DataFrame entry
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xxd)
        assert (xx_out.index == xxd.index).all()
        assert xx_out.isnull().sum().sum() == 0
        assert xx_out.shape[1] == xxd.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xxd)
        assert inp.get_feature_names() == named_cols

    # Numpy array
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xx)
        assert pd.isnull(xx_out).sum() == 0
        assert xx_out.shape[1] == xx.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xx)
        assert inp.get_feature_names() == positional_cols
        assert inp.get_feature_names(
            input_features=input_features) == input_features

    # Sparse Array
    for inp in (_NumImputer(), NumImputer()):
        for f in sparse_formats:
            xxs_f = f(xxs.copy())
            xx_out = inp.fit_transform(xxs_f)
            assert pd.isnull(xx_out.todense()).sum() == 0
            assert get_type(xx_out) == get_type(xxs)
            assert xx_out.shape[1] == xxs.shape[1]
            assert xx_out.shape[0] == xx.shape[0]
            assert inp.get_feature_names() == positional_cols
            assert inp.get_feature_names(
                input_features=input_features) == input_features