def test_wrap_polynomial_features():
    # load data
    X = fetch_california_housing(as_frame=True).frame

    # prepare selectors
    tr = PolynomialFeatures()
    tr_wrap = SklearnTransformerWrapper(transformer=PolynomialFeatures())

    # Test:
    # When passing variable list
    varlist = ["MedInc", "HouseAge", "AveRooms", "AveBedrms"]
    tr_wrap.set_params(variables=varlist)

    Xt = pd.DataFrame(tr.fit_transform(X[varlist]),
                      columns=tr.get_feature_names_out(varlist))
    Xw = tr_wrap.fit_transform(X)

    pd.testing.assert_frame_equal(Xw, pd.concat([X, Xt], axis=1))
    assert Xw.shape[1] == len(X.columns) + len(
        tr.get_feature_names_out(varlist))

    # when variable list is None
    tr_wrap.set_params(variables=None)

    Xt = pd.DataFrame(tr.fit_transform(X), columns=tr.get_feature_names_out())
    Xw = tr_wrap.fit_transform(X)

    pd.testing.assert_frame_equal(Xw, pd.concat([X, Xt], axis=1))
    assert Xw.shape[1] == len(X.columns) + len(
        tr.get_feature_names_out(X.columns))
def test_wrap_transformers(transformer):
    # load data
    X = fetch_california_housing(as_frame=True).frame

    # prepare selectors
    tr = clone(transformer)
    tr_wrap = SklearnTransformerWrapper(transformer=transformer)

    # Test:
    # When passing variable list
    varlist = ["MedInc", "HouseAge", "AveRooms", "AveBedrms"]
    tr_wrap.set_params(variables=varlist)

    Xt = pd.DataFrame(tr.fit_transform(X[varlist]), columns=X[varlist].columns)
    Xw = tr_wrap.fit_transform(X)

    remaining = [f for f in X.columns if f not in varlist]

    assert Xt.shape[1] == 4
    assert Xw.shape[1] == 9
    pd.testing.assert_frame_equal(Xt, Xw[varlist])
    pd.testing.assert_frame_equal(X[remaining], Xw[remaining])

    # when variable list is None
    tr_wrap.set_params(variables=None)

    Xt = pd.DataFrame(tr.fit_transform(X), columns=X.columns)
    Xw = tr_wrap.fit_transform(X)

    pd.testing.assert_frame_equal(Xt, Xw)
def test_sklearn_standardscaler_object(df_vartypes):
    variables_to_scale = ["Name"]
    transformer = SklearnTransformerWrapper(transformer=StandardScaler(),
                                            variables=variables_to_scale)

    with pytest.raises(TypeError):
        transformer.fit_transform(df_vartypes)

    # init params
    assert isinstance(transformer.transformer, StandardScaler)
    assert transformer.variables == variables_to_scale
def test_sklearn_ohe_all_features(df_vartypes):
    transformer = SklearnTransformerWrapper(
        transformer=OneHotEncoder(sparse=False, dtype=np.int64))

    ref = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        "Name_jack": [0, 0, 0, 1],
        "Name_krish": [0, 0, 1, 0],
        "Name_nick": [0, 1, 0, 0],
        "Name_tom": [1, 0, 0, 0],
        "City_Bristol": [0, 0, 0, 1],
        "City_Liverpool": [0, 0, 1, 0],
        "City_London": [1, 0, 0, 0],
        "City_Manchester": [0, 1, 0, 0],
        "Age_18": [0, 0, 0, 1],
        "Age_19": [0, 0, 1, 0],
        "Age_20": [1, 0, 0, 0],
        "Age_21": [0, 1, 0, 0],
        "Marks_0.6": [0, 0, 0, 1],
        "Marks_0.7": [0, 0, 1, 0],
        "Marks_0.8": [0, 1, 0, 0],
        "Marks_0.9": [1, 0, 0, 0],
        "dob_2020-02-24T00:00:00.000000000": [1, 0, 0, 0],
        "dob_2020-02-24T00:01:00.000000000": [0, 1, 0, 0],
        "dob_2020-02-24T00:02:00.000000000": [0, 0, 1, 0],
        "dob_2020-02-24T00:03:00.000000000": [0, 0, 0, 1],
    })

    transformed_df = transformer.fit_transform(df_vartypes)

    pd.testing.assert_frame_equal(ref, transformed_df)
Esempio n. 5
0
def test_sklearn_imputer_numeric_with_constant(dataframe_na):
    variables_to_impute = ['Age', 'Marks']
    na_variables_left_after_imputation = [
        col for col in dataframe_na.loc[:, dataframe_na.isna().any()].columns
        if col not in variables_to_impute
    ]

    transformer = SklearnTransformerWrapper(transformer=SimpleImputer(
        fill_value=-999, strategy='constant'),
                                            variables=variables_to_impute)

    # transformed dataframe
    ref = dataframe_na.copy()
    ref[variables_to_impute] = ref[variables_to_impute].fillna(-999)

    dataframe_na_transformed = transformer.fit_transform(dataframe_na)

    # init params
    assert isinstance(transformer.transformer, SimpleImputer)
    assert transformer.variables == variables_to_impute
    # fit params
    assert transformer.input_shape_ == (8, 6)
    # transformed output
    assert all(dataframe_na_transformed[na_variables_left_after_imputation].
               isna().sum() != 0)
    assert all(dataframe_na_transformed[variables_to_impute].isna().sum() == 0)
    pd.testing.assert_frame_equal(ref, dataframe_na_transformed)
def test_sklearn_ohe_numeric(df_vartypes):
    variables_to_encode = ["Age"]

    transformer = SklearnTransformerWrapper(
        transformer=OneHotEncoder(sparse=False, dtype=np.int64),
        variables=variables_to_encode,
    )

    ref = pd.DataFrame({
        "Age": [20, 21, 19, 18],
        "Age_18": [0, 0, 0, 1],
        "Age_19": [0, 0, 1, 0],
        "Age_20": [1, 0, 0, 0],
        "Age_21": [0, 1, 0, 0],
    })

    transformed_df = transformer.fit_transform(
        df_vartypes[variables_to_encode])

    # init params
    assert isinstance(transformer.transformer, OneHotEncoder)
    assert transformer.variables == variables_to_encode
    # fit params
    assert transformer.input_shape_ == (4, 1)
    pd.testing.assert_frame_equal(ref, transformed_df)
def test_sklearn_ohe_object_many_features(df_vartypes):
    variables_to_encode = ["Name", "City"]

    transformer = SklearnTransformerWrapper(
        transformer=OneHotEncoder(sparse=False, dtype=np.int64),
        variables=variables_to_encode,
    )

    ref = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Name_jack": [0, 0, 0, 1],
        "Name_krish": [0, 0, 1, 0],
        "Name_nick": [0, 1, 0, 0],
        "Name_tom": [1, 0, 0, 0],
        "City_Bristol": [0, 0, 0, 1],
        "City_Liverpool": [0, 0, 1, 0],
        "City_London": [1, 0, 0, 0],
        "City_Manchester": [0, 1, 0, 0],
    })

    transformed_df = transformer.fit_transform(
        df_vartypes[variables_to_encode])

    # init params
    assert isinstance(transformer.transformer, OneHotEncoder)
    assert transformer.variables == variables_to_encode
    # fit params
    assert transformer.input_shape_ == (4, 2)
    pd.testing.assert_frame_equal(ref, transformed_df)
def test_get_feature_names_out_polynomialfeatures(varlist):
    X = fetch_california_housing(as_frame=True).frame
    tr_wrap = SklearnTransformerWrapper(transformer=PolynomialFeatures(),
                                        variables=varlist)
    Xw = tr_wrap.fit_transform(X)
    assert Xw.columns.tolist() == tr_wrap.get_feature_names_out()

    if varlist is not None:
        output_feat = [
            "1",
            "MedInc",
            "HouseAge",
            "AveRooms",
            "AveBedrms",
            "MedInc^2",
            "MedInc HouseAge",
            "MedInc AveRooms",
            "MedInc AveBedrms",
            "HouseAge^2",
            "HouseAge AveRooms",
            "HouseAge AveBedrms",
            "AveRooms^2",
            "AveRooms AveBedrms",
            "AveBedrms^2",
        ]

        assert output_feat == tr_wrap.get_feature_names_out(varlist)
def test_sklearn_ohe_object_one_feature(df_vartypes):
    variables_to_encode = ["Name"]

    transformer = SklearnTransformerWrapper(
        transformer=OneHotEncoder(sparse=False, dtype=np.int64),
        variables=variables_to_encode,
    )

    ref = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "Name_jack": [0, 0, 0, 1],
        "Name_krish": [0, 0, 1, 0],
        "Name_nick": [0, 1, 0, 0],
        "Name_tom": [1, 0, 0, 0],
    })

    transformed_df = transformer.fit_transform(
        df_vartypes[variables_to_encode])

    # init params
    assert isinstance(transformer.transformer, OneHotEncoder)
    assert transformer.variables == variables_to_encode
    # fit params
    assert transformer.n_features_in_ == 1
    pd.testing.assert_frame_equal(ref, transformed_df)
Esempio n. 10
0
def test_sklearn_ohe_object_many_features(dataframe_vartypes):
    variables_to_encode = ['Name', 'City']

    transformer = SklearnTransformerWrapper(transformer=OneHotEncoder(
        sparse=False, dtype=np.int64),
                                            variables=variables_to_encode)

    ref = pd.DataFrame({
        'Name': ['tom', 'nick', 'krish', 'jack'],
        'City': ['London', 'Manchester', 'Liverpool', 'Bristol'],
        'Name_jack': [0, 0, 0, 1],
        'Name_krish': [0, 0, 1, 0],
        'Name_nick': [0, 1, 0, 0],
        'Name_tom': [1, 0, 0, 0],
        'City_Bristol': [0, 0, 0, 1],
        'City_Liverpool': [0, 0, 1, 0],
        'City_London': [1, 0, 0, 0],
        'City_Manchester': [0, 1, 0, 0]
    })

    transformed_df = transformer.fit_transform(
        dataframe_vartypes[variables_to_encode])

    # init params
    assert isinstance(transformer.transformer, OneHotEncoder)
    assert transformer.variables == variables_to_encode
    # fit params
    assert transformer.input_shape_ == (4, 2)
    pd.testing.assert_frame_equal(ref, transformed_df)
def test_sklearn_imputer_object_with_constant(df_na):
    variables_to_impute = ["Name", "City"]
    na_variables_left_after_imputation = [
        col for col in df_na.loc[:, df_na.isna().any()].columns
        if col not in variables_to_impute
    ]

    transformer = SklearnTransformerWrapper(
        transformer=SimpleImputer(fill_value="missing", strategy="constant"),
        variables=variables_to_impute,
    )

    # transformed dataframe
    ref = df_na.copy()
    ref[variables_to_impute] = ref[variables_to_impute].fillna("missing")

    dataframe_na_transformed = transformer.fit_transform(df_na)

    # init params
    assert isinstance(transformer.transformer, SimpleImputer)
    assert transformer.variables == variables_to_impute
    # fit params
    assert transformer.input_shape_ == (8, 6)
    # transformed output
    assert all(dataframe_na_transformed[na_variables_left_after_imputation].
               isna().sum() != 0)
    assert all(dataframe_na_transformed[variables_to_impute].isna().sum() == 0)
    pd.testing.assert_frame_equal(ref, dataframe_na_transformed)
def test_get_feature_names_out_transformers(varlist, transformer):

    X = fetch_california_housing(as_frame=True).frame
    tr_wrap = SklearnTransformerWrapper(transformer=transformer,
                                        variables=varlist)
    Xw = tr_wrap.fit_transform(X)

    assert Xw.columns.to_list() == tr_wrap.get_feature_names_out()
    assert Xw.columns.to_list() == tr_wrap.get_feature_names_out(
        ["MedInc", "HouseAge"])
def test_function_transformer_works_with_numericals():
    X = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})

    X_expected = pd.DataFrame({"col1": [2, 3, 4], "col2": ["a", "b", "c"]})

    transformer = SklearnTransformerWrapper(
        FunctionTransformer(lambda x: x + 1), variables=["col1"])

    X_tf = transformer.fit_transform(X)

    pd.testing.assert_frame_equal(X_expected, X_tf)
def test_get_feature_names_out_selectors(varlist, transformer):
    X = fetch_california_housing(as_frame=True).frame
    y = X["MedHouseVal"]
    X = X.drop(["MedHouseVal"], axis=1)
    tr_wrap = SklearnTransformerWrapper(transformer=transformer,
                                        variables=varlist)
    Xw = tr_wrap.fit_transform(X, y)

    assert Xw.columns.to_list() == tr_wrap.get_feature_names_out()
    assert Xw.columns.to_list() == tr_wrap.get_feature_names_out(
        ["MedInc", "HouseAge"])
def test_inverse_transform(transformer):
    X = fetch_california_housing(as_frame=True).frame
    X = X.drop(["Longitude"], axis=1)

    tr_wrap = SklearnTransformerWrapper(transformer=transformer)

    # When passing variable list
    varlist = ["MedInc", "HouseAge", "AveRooms", "AveBedrms"]
    tr_wrap.set_params(variables=varlist)
    X_tr = tr_wrap.fit_transform(X)
    X_inv = tr_wrap.inverse_transform(X_tr)

    pd.testing.assert_frame_equal(X_inv, X)

    # when variable list is None
    tr_wrap.set_params(variables=None)

    X_tr = tr_wrap.fit_transform(X)
    X_inv = tr_wrap.inverse_transform(X_tr)

    pd.testing.assert_frame_equal(X_inv, X)
def test_sklearn_imputer_allfeatures_with_constant(df_na):
    transformer = SklearnTransformerWrapper(
        transformer=SimpleImputer(fill_value="missing", strategy="constant"))

    # transformed dataframe
    ref = df_na.copy()
    ref = ref.fillna("missing")

    dataframe_na_transformed = transformer.fit_transform(df_na)

    # transformed output
    assert all(dataframe_na_transformed.isna().sum() == 0)
    pd.testing.assert_frame_equal(ref, dataframe_na_transformed)
def test_function_transformer_works_with_categoricals():
    X = pd.DataFrame({"col1": ["1", "2", "3"], "col2": ["a", "b", "c"]})

    X_expected = pd.DataFrame({
        "col1": [1.0, 2.0, 3.0],
        "col2": ["a", "b", "c"]
    })

    transformer = SklearnTransformerWrapper(
        FunctionTransformer(lambda x: x.astype(np.float64)),
        variables=["col1"])

    X_tf = transformer.fit_transform(X)

    pd.testing.assert_frame_equal(X_expected, X_tf)
def test_wrap_selectors(transformer):
    # load data
    X = fetch_california_housing(as_frame=True).frame
    y = X["MedHouseVal"]
    X = X.drop(["MedHouseVal"], axis=1)

    # prepare selectors
    sel = clone(transformer)
    sel_wrap = SklearnTransformerWrapper(transformer=transformer)

    # Test:
    # When passing variable list
    varlist = ["MedInc", "HouseAge", "AveRooms", "AveBedrms"]
    sel_wrap.set_params(variables=varlist)

    Xt = pd.DataFrame(
        sel.fit_transform(X[varlist], y),
        columns=X[varlist].columns[(sel.get_support())],
    )
    Xw = sel_wrap.fit_transform(X, y)

    selected = X[varlist].columns[(sel.get_support())]
    remaining = [f for f in X.columns if f not in varlist]

    pd.testing.assert_frame_equal(Xt, Xw[selected])
    pd.testing.assert_frame_equal(X[remaining], Xw[remaining])
    assert Xw.shape[1] == len(remaining) + len(selected)

    # when variable list is None
    sel_wrap.set_params(variables=None)

    Xt = pd.DataFrame(sel.fit_transform(X, y),
                      columns=X.columns[(sel.get_support())])
    Xw = sel_wrap.fit_transform(X, y)

    pd.testing.assert_frame_equal(Xt, Xw)
Esempio n. 19
0
def test_sklearn_imputer_allfeatures_with_constant(dataframe_na):
    transformer = SklearnTransformerWrapper(
        transformer=SimpleImputer(fill_value='missing', strategy='constant'))

    # transformed dataframe
    ref = dataframe_na.copy()
    ref = ref.fillna('missing')

    dataframe_na_transformed = transformer.fit_transform(dataframe_na)

    # init params
    assert isinstance(transformer.transformer, SimpleImputer)
    # fit params
    assert transformer.input_shape_ == (8, 6)
    # transformed output
    assert all(dataframe_na_transformed.isna().sum() == 0)
    pd.testing.assert_frame_equal(ref, dataframe_na_transformed)
Esempio n. 20
0
def test_sklearn_standardscaler_allfeatures(dataframe_vartypes):
    transformer = SklearnTransformerWrapper(transformer=StandardScaler())

    ref = dataframe_vartypes.copy()
    variables_to_scale = list(ref.select_dtypes(include='number').columns)
    ref[variables_to_scale] = (
        ref[variables_to_scale] -
        ref[variables_to_scale].mean()) / ref[variables_to_scale].std(ddof=0)

    transformed_df = transformer.fit_transform(dataframe_vartypes)

    # init params
    assert isinstance(transformer.transformer, StandardScaler)
    assert transformer.variables == variables_to_scale
    # fit params
    assert transformer.input_shape_ == (4, 5)
    assert (transformer.transformer.mean_.round(6) == np.array([19.5,
                                                                0.75])).all()
    assert all(transformer.transformer.scale_.round(6) == [1.118034, 0.111803])
    pd.testing.assert_frame_equal(ref, transformed_df)
Esempio n. 21
0
def test_sklearn_standardscaler_numeric(dataframe_vartypes):
    variables_to_scale = ['Age', 'Marks']
    transformer = SklearnTransformerWrapper(transformer=StandardScaler(),
                                            variables=variables_to_scale)

    ref = dataframe_vartypes.copy()
    ref[variables_to_scale] = (
        ref[variables_to_scale] -
        ref[variables_to_scale].mean()) / ref[variables_to_scale].std(ddof=0)

    transformed_df = transformer.fit_transform(dataframe_vartypes)

    # init params
    assert isinstance(transformer.transformer, StandardScaler)
    assert transformer.variables == variables_to_scale
    # fit params
    assert transformer.input_shape_ == (4, 5)
    assert (transformer.transformer.mean_.round(6) == np.array([19.5,
                                                                0.75])).all()
    assert all(transformer.transformer.scale_.round(6) == [1.118034, 0.111803])
    pd.testing.assert_frame_equal(ref, transformed_df)
Esempio n. 22
0
def test_sklearn_ohe_all_features(dataframe_vartypes):
    transformer = SklearnTransformerWrapper(
        transformer=OneHotEncoder(sparse=False, dtype=np.int64))

    ref = pd.DataFrame({
        'Name': ['tom', 'nick', 'krish', 'jack'],
        'City': ['London', 'Manchester', 'Liverpool', 'Bristol'],
        'Age': [20, 21, 19, 18],
        'Marks': [0.9, 0.8, 0.7, 0.6],
        'dob': pd.date_range('2020-02-24', periods=4, freq='T'),
        'Name_jack': [0, 0, 0, 1],
        'Name_krish': [0, 0, 1, 0],
        'Name_nick': [0, 1, 0, 0],
        'Name_tom': [1, 0, 0, 0],
        'City_Bristol': [0, 0, 0, 1],
        'City_Liverpool': [0, 0, 1, 0],
        'City_London': [1, 0, 0, 0],
        'City_Manchester': [0, 1, 0, 0],
        'Age_18': [0, 0, 0, 1],
        'Age_19': [0, 0, 1, 0],
        'Age_20': [1, 0, 0, 0],
        'Age_21': [0, 1, 0, 0],
        'Marks_0.6': [0, 0, 0, 1],
        'Marks_0.7': [0, 0, 1, 0],
        'Marks_0.8': [0, 1, 0, 0],
        'Marks_0.9': [1, 0, 0, 0],
        'dob_2020-02-24T00:00:00.000000000': [1, 0, 0, 0],
        'dob_2020-02-24T00:01:00.000000000': [0, 1, 0, 0],
        'dob_2020-02-24T00:02:00.000000000': [0, 0, 1, 0],
        'dob_2020-02-24T00:03:00.000000000': [0, 0, 0, 1]
    })

    transformed_df = transformer.fit_transform(dataframe_vartypes)

    # init params
    assert isinstance(transformer.transformer, OneHotEncoder)
    # fit params
    assert transformer.input_shape_ == (4, 5)
    pd.testing.assert_frame_equal(ref, transformed_df)
def test_wrap_simple_imputer(df_na):
    variables_to_impute = ["Age", "Marks"]
    na_variables_left_after_imputation = [
        col for col in df_na.loc[:, df_na.isna().any()].columns
        if col not in variables_to_impute
    ]

    transformer = SklearnTransformerWrapper(
        transformer=SimpleImputer(fill_value=-999, strategy="constant"),
        variables=variables_to_impute,
    )

    # transformed dataframe
    ref = df_na.copy()
    ref[variables_to_impute] = ref[variables_to_impute].fillna(-999)

    dataframe_na_transformed = transformer.fit_transform(df_na)

    # transformed output
    assert all(dataframe_na_transformed[na_variables_left_after_imputation].
               isna().sum() != 0)
    assert all(dataframe_na_transformed[variables_to_impute].isna().sum() == 0)
    pd.testing.assert_frame_equal(ref, dataframe_na_transformed)
def test_get_feature_names_out_ohe(varlist, df_vartypes):

    transformer = SklearnTransformerWrapper(
        transformer=OneHotEncoder(sparse=False, dtype=np.int64),
        variables=varlist,
    )

    df_tr = transformer.fit_transform(df_vartypes)

    assert df_tr.columns.to_list() == transformer.get_feature_names_out()

    if varlist is not None:
        output_feat = [
            "Name_jack",
            "Name_krish",
            "Name_nick",
            "Name_tom",
            "City_Bristol",
            "City_Liverpool",
            "City_London",
            "City_Manchester",
        ]

        assert output_feat == transformer.get_feature_names_out(varlist)