def test_impute_with_99_and_automatically_select_variables(df_na):
    # set up the transformer
    imputer = ArbitraryNumberImputer(arbitrary_number=99, variables=None)
    X_transformed = imputer.fit_transform(df_na)

    # set up output reference
    X_reference = df_na.copy()
    X_reference["Age"] = X_reference["Age"].fillna(99)
    X_reference["Marks"] = X_reference["Marks"].fillna(99)

    # test init params
    assert imputer.arbitrary_number == 99
    assert imputer.variables is None

    # test fit attributes
    assert imputer.variables_ == ["Age", "Marks"]
    assert imputer.n_features_in_ == 6
    assert imputer.imputer_dict_ == {"Age": 99, "Marks": 99}

    # test transform output
    # selected variables should not contain NA
    # non selected variables should still contain NA
    assert X_transformed[["Age", "Marks"]].isnull().sum().sum() == 0
    assert X_transformed[["Name", "City"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
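
These tests receive a df_na fixture that is not shown in the snippets. A minimal sketch of a compatible fixture, assuming 8 rows and 6 columns so that the assertions hold; the Studies and dob columns and all concrete values are illustrative assumptions, not the library's actual fixture:

import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def df_na():
    # illustrative data: Age and Marks are the only numeric columns, so the
    # automatic variable selection picks them up; Name and City contain NaN
    return pd.DataFrame({
        "Name": ["tom", "nick", "krish", np.nan, "peter", np.nan, "fred", "sam"],
        "City": ["London", "Manchester", np.nan, np.nan, "London", "London", "Bristol", "Manchester"],
        "Studies": ["Bachelor", "Bachelor", "PhD", "Masters", "Bachelor", "PhD", "None", "Masters"],
        "Age": [20, 21, 19, np.nan, 23, 40, 41, 37],
        "Marks": [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6],
        "dob": pd.date_range("2020-02-24", periods=8, freq="min"),
    })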
Example #2
def test_dictionary_of_imputation_values(df_na):
    # set up transformer
    imputer = ArbitraryNumberImputer(imputer_dict={"Age": -42, "Marks": -999})
    X_transformed = imputer.fit_transform(df_na)

    # set up expected output
    X_reference = df_na.copy()
    X_reference["Age"] = X_reference["Age"].fillna(-42)
    X_reference["Marks"] = X_reference["Marks"].fillna(-999)

    # test fit params
    assert imputer.input_shape_ == (8, 6)
    assert imputer.imputer_dict_ == {"Age": -42, "Marks": -999}

    # test transform params
    assert X_transformed[["Age", "Marks"]].isnull().sum().sum() == 0
    assert X_transformed[["Name", "City"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
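
Outside a test, the same imputer_dict API can be used directly; a minimal standalone sketch on a made-up toy frame:

import numpy as np
import pandas as pd

from feature_engine.imputation import ArbitraryNumberImputer

toy = pd.DataFrame({"Age": [20, np.nan, 31], "Marks": [0.7, 0.9, np.nan]})

# each key of imputer_dict selects a variable and maps it to its own fill value
imputer = ArbitraryNumberImputer(imputer_dict={"Age": -42, "Marks": -999})
print(imputer.fit_transform(toy))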
Example #3
def test_impute_with_minus_1_and_single_variable_entered_by_user(df_na):
    # set up transformer
    imputer = ArbitraryNumberImputer(arbitrary_number=-1, variables=["Age"])
    X_transformed = imputer.fit_transform(df_na)

    # set up output reference
    X_reference = df_na.copy()
    X_reference["Age"] = X_reference["Age"].fillna(-1)

    # test init params
    assert imputer.arbitrary_number == -1
    assert imputer.variables == ["Age"]

    # test fit attributes
    assert imputer.input_shape_ == (8, 6)
    assert imputer.imputer_dict_ == {"Age": -1}

    # test transform output
    assert X_transformed["Age"].isnull().sum() == 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Example #4
def test_imputer_error_when_dictionary_value_is_string():
    with pytest.raises(ValueError):
        ArbitraryNumberImputer(imputer_dict={"Age": "arbitrary_number"})
Example #5
def test_non_fitted_error(df_na):
    with pytest.raises(NotFittedError):
        imputer = ArbitraryNumberImputer()
        imputer.transform(df_na)
Example #6
def test_error_when_arbitrary_number_is_string():
    with pytest.raises(ValueError):
        ArbitraryNumberImputer(arbitrary_number="arbitrary")
Example #7
from sklearn.utils.estimator_checks import parametrize_with_checks

from feature_engine.encoding import (
    CountFrequencyEncoder,
    DecisionTreeEncoder,
    MeanEncoder,
)
from feature_engine.imputation import (
    AddMissingIndicator,
    ArbitraryNumberImputer,
    CategoricalImputer,
    DropMissingData,
    EndTailImputer,
    MeanMedianImputer,
    RandomSampleImputer,
)
from feature_engine.timeseries.forecasting import LagFeatures
from feature_engine.transformation import (
    BoxCoxTransformer,
    LogTransformer,
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)
from feature_engine.wrappers import SklearnTransformerWrapper


# imputation
@parametrize_with_checks([
    MeanMedianImputer(),
    ArbitraryNumberImputer(),
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    check(estimator)
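
parametrize_with_checks generates one pytest case per (estimator, check) pair, so every transformer listed above is run through scikit-learn's estimator API checks. The same checks can also be run outside pytest with scikit-learn's check_estimator; a minimal sketch for one of the imputers:

from sklearn.utils.estimator_checks import check_estimator

from feature_engine.imputation import ArbitraryNumberImputer

# runs the scikit-learn estimator API checks and raises if any check fails
check_estimator(ArbitraryNumberImputer())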


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
Example #8
# drop rows where every feature is missing
df_full = df_full.dropna(subset=features, how="all")
print("ok.")

# %%
# Split into train and test sets

print("Separando em base de treino e teste...")
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_full[features], df_full[target], random_state=42, test_size=0.1)

print("ok.")

# %%

print("Ajustando modelo em nosso pipeline...")
arbitrary_imputer = ArbitraryNumberImputer(arbitrary_number=-999,
                                           variables=features)

disc = DecisionTreeDiscretiser(cv=3,
                               scoring='roc_auc',
                               variables=features,
                               regression=False,
                               random_state=42)

pca = decomposition.PCA(n_components=120, random_state=42)

best_pars = {
    'subsample': 0.7,
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.2
}
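
A minimal sketch of how these components could be assembled and fitted, assuming the final estimator is a scikit-learn GradientBoostingClassifier (an assumption; best_pars matches its subsample, n_estimators, max_depth and learning_rate parameters):

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ("imputer", arbitrary_imputer),   # fill missing feature values with -999
    ("discretiser", disc),            # tree-based discretisation of the features
    ("pca", pca),                     # project onto 120 principal components
    ("gbm", GradientBoostingClassifier(random_state=42, **best_pars)),
])

model.fit(X_train, y_train)
print("ok.")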