コード例 #1
0
def test_detect_variables_with_na_in_variables_entered_by_user(df_na):
    """Check variable selection when the user passes an explicit list.

    With ``missing_only=True`` the fitted ``variables_`` attribute is expected
    to omit "dob" (presumably because that column has no NA in the fixture),
    while the ``variables`` init parameter stays exactly as entered.
    """
    user_vars = ["City", "Studies", "Age", "dob"]
    transformer = DropMissingData(missing_only=True, variables=user_vars)
    result = transformer.fit_transform(df_na)

    # init parameter is left untouched
    assert transformer.variables == ["City", "Studies", "Age", "dob"]
    # fitted attribute keeps only a subset of the requested variables
    assert transformer.variables_ == ["City", "Studies", "Age"]
    # transform output
    assert result.shape == (6, 6)
コード例 #2
0
def test_selelct_all_variables_with_na_when_variables_is_none(df_na):
    """Check that ``missing_only=False`` with no variables selects all columns.

    All six columns of the fixture must end up in ``variables_``, and the
    transformed frame must contain no NA in those columns.
    """
    transformer = DropMissingData(variables=None, missing_only=False)
    result = transformer.fit_transform(df_na)

    # fit params
    assert transformer.n_features_in_ == 6
    assert transformer.variables_ == [
        "Name",
        "City",
        "Studies",
        "Age",
        "Marks",
        "dob",
    ]

    # transform outputs
    assert result.shape == (5, 6)
    remaining_na = result[transformer.variables_].isna().sum().sum()
    assert remaining_na == 0
コード例 #3
0
def test_threshold_value_error(df_na):
    """Check that out-of-range ``threshold`` values raise ``ValueError``.

    The values tried are one above 1, one below 0, and exactly 0. The
    ``df_na`` fixture is requested but not used by the checks.
    """
    for invalid_threshold in (1.01, -0.01, 0):
        with pytest.raises(ValueError):
            DropMissingData(threshold=invalid_threshold)
コード例 #4
0
def test_detect_variables_with_na(df_na):
    """Check automatic detection of variables with missing data.

    With ``missing_only=True`` and ``variables=None`` the transformer is
    expected to select five of the six fixture columns and return a frame
    without any NA.
    """
    transformer = DropMissingData(variables=None, missing_only=True)
    result = transformer.fit_transform(df_na)

    # init params are stored as given
    assert transformer.missing_only is True
    assert transformer.variables is None

    # fit params
    detected = ["Name", "City", "Studies", "Age", "Marks"]
    assert transformer.variables_ == detected
    assert transformer.n_features_in_ == 6

    # transform outputs
    assert result.shape == (5, 6)
    assert result["Name"].shape[0] == 5
    total_na = result.isna().sum().sum()
    assert total_na == 0
コード例 #5
0
def test_return_na_data_method(df_na):
    """Check the row index returned by ``return_na_data``.

    Two configurations are exercised: an explicit variable list combined with
    ``threshold=0.5``, and a transformer built with default parameters.

    NOTE(review): a second test with this same name appears further down; if
    both live in the same module, the later definition shadows this one.
    """
    # with variables and a threshold
    with_vars = DropMissingData(
        threshold=0.5,
        variables=["City", "Studies", "Age", "Marks"],
    )
    with_vars.fit_transform(df_na)
    na_rows = with_vars.return_na_data(df_na)
    assert list(na_rows.index) == [2, 3]

    # without variables and without threshold
    defaults = DropMissingData()
    defaults.fit_transform(df_na)
    na_rows = defaults.return_na_data(df_na)
    assert list(na_rows.index) == [2, 3, 5]
コード例 #6
0
def test_threshold_with_variables(df_na):
    """Check which rows are kept when ``threshold`` is combined with variables.

    The expected row indices are asserted for a single-variable case with
    ``threshold=1`` and a four-variable case with ``threshold=0.75``.
    """
    # Each row must have 100% of its data available in column "Marks"
    marks_only = DropMissingData(variables=["Marks"], threshold=1)
    kept = marks_only.fit_transform(df_na)
    assert list(kept.index) == [0, 1, 2, 4, 6, 7]

    # Each row must have at least 75% of its data available across
    # "City", "Studies", "Age" and "Marks"
    four_vars = DropMissingData(
        variables=["City", "Studies", "Age", "Marks"],
        threshold=0.75,
    )
    kept = four_vars.fit_transform(df_na)
    assert list(kept.index) == [0, 1, 4, 5, 6, 7]
コード例 #7
0
def test_non_fitted_error(df_na):
    """Check that ``transform`` on an unfitted transformer raises NotFittedError.

    The transformer is built outside the ``pytest.raises`` block so that only
    the ``transform`` call is covered by the expectation; an exception raised
    by the constructor would otherwise be able to satisfy the test.
    """
    imputer = DropMissingData()
    with pytest.raises(NotFittedError):
        imputer.transform(df_na)
コード例 #8
0
def test_error_when_missing_only_not_bool():
    """Check that a non-boolean ``missing_only`` raises ``ValueError``."""
    not_a_bool = "missing_only"
    with pytest.raises(ValueError):
        DropMissingData(missing_only=not_a_bool)
コード例 #9
0
def test_return_na_data_method(df_na):
    """Check the shape of the frame returned by ``return_na_data``.

    The transformer is fitted with ``missing_only=True`` on an explicit
    variable list, and two rows of six columns are expected back.

    NOTE(review): an earlier test carries this same name; if both live in the
    same module, this definition shadows the earlier one.
    """
    transformer = DropMissingData(
        missing_only=True,
        variables=["City", "Studies", "Age", "dob"],
    )
    transformer.fit(df_na)

    na_rows = transformer.return_na_data(df_na)
    assert na_rows.shape == (2, 6)
コード例 #10
0
def test_threshold(df_na):
    """Check which rows survive for several ``threshold`` settings.

    Each case pairs constructor keyword arguments with the row index expected
    after ``fit_transform``. The same three thresholds are tried first with
    the default ``missing_only`` and then with ``missing_only=False``.
    """
    cases = [
        # Each row must have 100% data available
        ({"threshold": 1}, [0, 1, 4, 6, 7]),
        # Each row must have at least 1% data available
        ({"threshold": 0.01}, [0, 1, 2, 3, 4, 5, 6, 7]),
        # Each row must have at least 50% data available
        ({"threshold": 0.50}, [0, 1, 2, 4, 5, 6, 7]),
        # Same three thresholds, now with missing_only switched off
        ({"threshold": 1, "missing_only": False}, [0, 1, 4, 6, 7]),
        (
            {"threshold": 0.01, "missing_only": False},
            [0, 1, 2, 3, 4, 5, 6, 7],
        ),
        (
            {"threshold": 0.50, "missing_only": False},
            [0, 1, 2, 4, 5, 6, 7],
        ),
    ]

    for init_kwargs, expected_index in cases:
        transformer = DropMissingData(**init_kwargs)
        kept = transformer.fit_transform(df_na)
        assert list(kept.index) == expected_index
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)
from feature_engine.wrappers import SklearnTransformerWrapper


# imputation
# Estimator instances handed to the scikit-learn estimator checks below.
_IMPUTATION_ESTIMATORS = [
    MeanMedianImputer(),
    ArbitraryNumberImputer(),
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
]


@parametrize_with_checks(_IMPUTATION_ESTIMATORS)
def test_sklearn_compatible_imputer(estimator, check):
    """Run one generated check against one imputation estimator."""
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    RareLabelEncoder(
        tol=0.00000000001,
        n_categories=100000000000,