def test_log_base_e_plus_automatically_find_variables(df_vartypes):
    """Base-e log with ``variables=None``: check params, output and round trip.

    The transformer is expected to pick "Age" and "Marks" on its own, leave
    the remaining columns of the fixture untouched, and restore the original
    frame through ``inverse_transform`` (after rounding back).
    """
    log_tr = LogTransformer(base="e", variables=None)
    transformed = log_tr.fit_transform(df_vartypes)

    # Reference frame: the fixture with the two selected columns replaced
    # by the expected log values.
    expected = df_vartypes.copy()
    expected_columns = {
        "Age": [2.99573, 3.04452, 2.94444, 2.89037],
        "Marks": [-0.105361, -0.223144, -0.356675, -0.510826],
    }
    for column, values in expected_columns.items():
        expected[column] = values

    # Constructor arguments are stored unchanged.
    assert log_tr.base == "e"
    assert log_tr.variables is None
    # Attributes learned during fit.
    assert log_tr.variables_ == ["Age", "Marks"]
    assert log_tr.n_features_in_ == 5
    # Transformed data matches the reference frame.
    pd.testing.assert_frame_equal(transformed, expected)

    # Round trip: undo the transformation.
    restored = log_tr.inverse_transform(transformed)

    # Bring the recovered floats back to the fixture's original format
    # before comparing.
    restored["Age"] = restored["Age"].round().astype("int64")
    restored["Marks"] = restored["Marks"].round(1)

    pd.testing.assert_frame_equal(restored, df_vartypes)
def test_log_base_10_plus_user_passes_var_list(df_vartypes):
    """Base-10 log on a user-chosen variable: check params, output, round trip.

    Only "Age" is passed, so it is the only column expected to change.
    """
    log_tr = LogTransformer(base="10", variables="Age")
    transformed = log_tr.fit_transform(df_vartypes)

    # Reference frame: the fixture with "Age" replaced by the expected values.
    expected = df_vartypes.copy()
    expected["Age"] = [1.30103, 1.32222, 1.27875, 1.25527]

    # Constructor arguments are stored unchanged.
    assert log_tr.base == "10"
    assert log_tr.variables == "Age"
    # Attributes learned during fit.
    assert log_tr.variables_ == ["Age"]
    assert log_tr.n_features_in_ == 5
    # Transformed data matches the reference frame.
    pd.testing.assert_frame_equal(transformed, expected)

    # Round trip: undo the transformation.
    restored = log_tr.inverse_transform(transformed)

    # Bring the recovered floats back to the fixture's integer format.
    restored["Age"] = restored["Age"].round().astype("int64")

    pd.testing.assert_frame_equal(restored, df_vartypes)
def test_error_if_df_contains_negative_values(df_vartypes):
    """``fit`` and ``transform`` must both raise ValueError on negative data.

    Setup (construction, fitting on clean data) is kept outside the
    ``pytest.raises`` blocks so that only the call under test can satisfy
    the expectation; a ValueError raised during setup fails the test
    instead of silently passing it.
    """
    # Work on a copy so the shared fixture is not modified.
    df_neg = df_vartypes.copy()
    df_neg.loc[1, "Age"] = -1

    # test case 5: when variable contains negative value, fit
    transformer = LogTransformer()
    with pytest.raises(ValueError):
        transformer.fit(df_neg)

    # test case 6: when variable contains negative value, transform
    transformer = LogTransformer()
    transformer.fit(df_vartypes)
    with pytest.raises(ValueError):
        transformer.transform(df_neg)
def test_log_base_10_plus_user_passes_var_list(df_vartypes):
    """Base-10 log on a user-chosen variable: check params and output.

    Only "Age" is passed, so it is the only column expected to change.
    """
    log_tr = LogTransformer(base="10", variables="Age")
    transformed = log_tr.fit_transform(df_vartypes)

    # Reference frame: the fixture with "Age" replaced by the expected values.
    expected = df_vartypes.copy()
    expected["Age"] = [1.30103, 1.32222, 1.27875, 1.25527]

    # Init params as this test expects them to be exposed.
    assert log_tr.base == "10"
    assert log_tr.variables == ["Age"]
    # Shape of the data seen during fit.
    assert log_tr.input_shape_ == (4, 5)
    # Transformed data matches the reference frame.
    pd.testing.assert_frame_equal(transformed, expected)
def test_log_base_e_plus_automatically_find_variables(df_vartypes):
    """Base-e log with ``variables=None``: check params and output.

    The transformer is expected to select "Age" and "Marks" by itself and
    leave the other columns of the fixture unchanged.
    """
    log_tr = LogTransformer(base="e", variables=None)
    transformed = log_tr.fit_transform(df_vartypes)

    # Reference frame: the fixture with the two selected columns replaced
    # by the expected log values.
    expected = df_vartypes.copy()
    expected_columns = {
        "Age": [2.99573, 3.04452, 2.94444, 2.89037],
        "Marks": [-0.105361, -0.223144, -0.356675, -0.510826],
    }
    for column, values in expected_columns.items():
        expected[column] = values

    # Init params as this test expects them to be exposed.
    assert log_tr.base == "e"
    assert log_tr.variables == ["Age", "Marks"]
    # Shape of the data seen during fit.
    assert log_tr.input_shape_ == (4, 5)
    # Transformed data matches the reference frame.
    pd.testing.assert_frame_equal(transformed, expected)
def test_inverse_e_plus_user_passes_var_list(df_vartypes):
    """Round trip with the default base on a user-chosen variable.

    Transforming and then inverse-transforming "Age" must give back the
    original fixture once the values are rounded to integers again.
    """
    log_tr = LogTransformer(variables="Age")
    transformed = log_tr.fit_transform(df_vartypes)
    restored = log_tr.inverse_transform(transformed)

    # The inverse yields floats; restore the fixture's integer format.
    restored["Age"] = restored["Age"].round().astype("int64")

    # Constructor arguments: default base, variable stored as passed.
    assert log_tr.base == "e"
    assert log_tr.variables == "Age"
    # Attributes learned during fit.
    assert log_tr.variables_ == ["Age"]
    assert log_tr.n_features_in_ == 5
    # Round-tripped data equals the input.
    pd.testing.assert_frame_equal(restored, df_vartypes)
Exemple #7
0
import pytest
from sklearn.utils.estimator_checks import check_estimator

from feature_engine.transformation import (
    BoxCoxTransformer,
    LogTransformer,
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)


@pytest.mark.parametrize(
    "Estimator",
    [
        BoxCoxTransformer(),
        LogTransformer(),
        PowerTransformer(),
        ReciprocalTransformer(),
        YeoJohnsonTransformer(),
    ],
)
def test_all_transformers(Estimator):
    """Run scikit-learn's ``check_estimator`` on each transformer instance.

    The parameter receives an already-constructed transformer from the
    parametrize list above.
    """
    # Call without `return`: a test function should return None, and pytest
    # warns (PytestReturnNotNoneWarning) when a test returns a value.
    check_estimator(Estimator)
def test_non_fitted_error(df_vartypes):
    """Calling ``transform`` before ``fit`` must raise NotFittedError."""
    # Construct outside the context manager so that only the transform()
    # call can satisfy the expected exception.
    transformer = LogTransformer()
    with pytest.raises(NotFittedError):
        transformer.transform(df_vartypes)
def test_transform_raises_error_if_na_in_df(df_vartypes, df_na):
    """``transform`` must raise ValueError when the data contains NA.

    The transformer is fitted on the clean fixture first; construction,
    fitting and column selection happen outside ``pytest.raises`` so that a
    failure in the setup cannot be mistaken for the expected error.
    """
    # test case 4: when dataset contains na, transform method
    transformer = LogTransformer()
    transformer.fit(df_vartypes)

    # Restrict the NA fixture to the columns listed here before transforming.
    df_na_subset = df_na[["Name", "City", "Age", "Marks", "dob"]]

    with pytest.raises(ValueError):
        transformer.transform(df_na_subset)
def test_fit_raises_error_if_na_in_df(df_na):
    """``fit`` must raise ValueError when the data contains NA."""
    # test case 3: when dataset contains na, fit method
    # Build the transformer outside the context manager: the constructor can
    # itself raise ValueError (e.g. for an invalid base), which would
    # otherwise let this test pass for the wrong reason.
    transformer = LogTransformer()
    with pytest.raises(ValueError):
        transformer.fit(df_na)
def test_error_if_base_value_not_allowed():
    """Constructing a LogTransformer with an unsupported base raises ValueError."""
    invalid_base = "other"
    with pytest.raises(ValueError):
        LogTransformer(base=invalid_base)
# Show the per-variable values stored on the fitted imputer.
print(mean_imputer.imputer_dict_)

# Apply the already-fitted imputer to both the train and the test set.
X_train = mean_imputer.transform(X_train)
X_test = mean_imputer.transform(X_test)

# %% Verify whether there are missing values.
# NOTE(review): the check below uses cat_vars_with_na although the step above
# applies mean_imputer -- confirm this is the intended variable list.
# Count of remaining nulls per variable in the train set.
X_train[cat_vars_with_na].isnull().sum()
# Variables that still contain nulls in the test set (empty list if none).
[var for var in cat_vars_with_na if X_test[var].isnull().sum() > 0]
#%% Temporal variables.
def elapsed_years(df, var):
    """Overwrite column ``var`` with ``YrSold`` minus its current value.

    The frame is modified in place and the same object is returned, so the
    result can be reassigned or chained by the caller.
    """
    sale_year = df["YrSold"]
    df[var] = sale_year - df[var]
    return df
# Replace each year column by the number of years between it and the sale year.
for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    X_train = elapsed_years(X_train, var)
    X_test = elapsed_years(X_test, var)

# now we drop YrSold.
# Apply the DropFeatures step itself (not the mean imputer) so that YrSold is
# actually removed from both sets once the elapsed-year columns are computed.
drop_features = DropFeatures(features_to_drop=['YrSold'])
X_train = drop_features.fit_transform(X_train)
X_test = drop_features.transform(X_test)

# %% Numerical variable -- transformation.
# Fit the log transformer on the train set only, then reuse it on the test set.
log_transformer = LogTransformer(
    variables=["LotFrontage", "1stFlrSF", "GrLivArea"],
)
X_train = log_transformer.fit_transform(X_train)
X_test = log_transformer.transform(X_test)

# check that test set does not contain null values in the engineered variables
[var for var in ["LotFrontage", "1stFlrSF", "GrLivArea"] if X_test[var].isnull().sum() > 0]
Exemple #13
0
         imputation_method="mean",
         variables=config.model_config.numerical_vars_with_na,
     ),
 ),
 # == TEMPORAL VARIABLES ====
 (
     "elapsed_time",
     pp.TemporalVariableTransformer(
         variables=config.model_config.temporal_vars,
         reference_variable=config.model_config.ref_var,
     ),
 ),
 ("drop_features",
  DropFeatures(features_to_drop=[config.model_config.ref_var])),
 # ==== VARIABLE TRANSFORMATION =====
 ("log", LogTransformer(variables=config.model_config.numericals_log_vars)),
 (
     "binarizer",
     SklearnTransformerWrapper(
         transformer=Binarizer(threshold=0),
         variables=config.model_config.binarize_vars,
     ),
 ),
 # === mappers ===
 (
     "mapper_qual",
     pp.Mapper(
         variables=config.model_config.qual_vars,
         mappings=config.model_config.qual_mappings,
     ),
 ),