Esempio n. 1
0
def test_add_indicators_to_one_variable(df_na):
    """Check the indicator added when a single variable name is given."""
    transformer = AddMissingIndicator(variables="Name")
    result = transformer.fit_transform(df_na)

    # the variable passed as a plain string is expected back as a list
    assert transformer.variables_ == ["Name"]

    # transformed frame: expected shape and the new indicator column
    indicator_name = "Name_na"
    assert result.shape == (8, 7)
    assert indicator_name in result.columns
    assert result[indicator_name].sum() == 2
Esempio n. 2
0
def test_add_indicators_to_all_variables_when_variables_is_none(df_na):
    """Check that missing_only=False with variables=None covers all
    columns, including one whose indicator sums to zero."""
    transformer = AddMissingIndicator(variables=None, missing_only=False)
    result = transformer.fit_transform(df_na)

    expected_variables = ["Name", "City", "Studies", "Age", "Marks", "dob"]
    assert transformer.variables_ == expected_variables
    assert result.shape == (8, 12)

    # "dob" gets an indicator column even though nothing is flagged in it
    dob_indicator = "dob_na"
    assert dob_indicator in result.columns
    assert result[dob_indicator].sum() == 0
Esempio n. 3
0
def test_detect_variables_with_missing_data_in_variables_entered_by_user(
        df_na):
    """Check that missing_only=True narrows a user-supplied variable list
    down to the variables selected at fit time."""
    requested = ["City", "Studies", "Age", "dob"]
    transformer = AddMissingIndicator(variables=requested, missing_only=True)
    result = transformer.fit_transform(df_na)

    # init parameter: compared against a fresh literal rather than the
    # `requested` object itself, so in-place changes would be noticed
    assert transformer.variables == ["City", "Studies", "Age", "dob"]
    # fitted attribute: "dob" is expected to be dropped
    assert transformer.variables_ == ["City", "Studies", "Age"]

    # transformed frame
    output_columns = result.columns
    assert result.shape == (8, 9)
    assert "City_na" in output_columns
    assert "dob_na" not in output_columns
    assert result["City_na"].sum() == 2
Esempio n. 4
0
def test_detect_variables_with_missing_data_when_variables_is_none(df_na):
    """Check automatic variable selection with missing_only=True and
    variables=None."""
    transformer = AddMissingIndicator(variables=None, missing_only=True)
    result = transformer.fit_transform(df_na)

    # constructor arguments are stored as given
    assert transformer.missing_only is True
    assert transformer.variables is None

    # attributes set by fit
    expected_variables = ["Name", "City", "Studies", "Age", "Marks"]
    assert transformer.variables_ == expected_variables
    assert transformer.n_features_in_ == 6

    # transformed frame
    indicator_name = "Name_na"
    assert result.shape == (8, 11)
    assert indicator_name in result.columns
    assert result[indicator_name].sum() == 2
Esempio n. 5
0
def test_get_feature_names_out_from_pipeline(df_na):
    """Check get_feature_names_out when the transformer is wrapped in a
    one-step Pipeline."""
    input_names = df_na.columns.to_list()

    steps = [("transformer", AddMissingIndicator(missing_only=False))]
    pipe = Pipeline(steps)
    pipe.fit(df_na)

    indicator_names = [name + "_na" for name in input_names]

    # no input features given: original columns followed by the indicators
    all_names = pipe.get_feature_names_out(input_features=None)
    assert all_names == input_names + indicator_names

    # explicit input features: only the matching indicator names
    from_all = pipe.get_feature_names_out(input_features=input_names)
    assert from_all == indicator_names

    from_first_two = pipe.get_feature_names_out(
        input_features=input_names[:2])
    assert from_first_two == indicator_names[:2]

    from_single = pipe.get_feature_names_out(input_features=["Name"])
    assert from_single == ["Name_na"]
Esempio n. 6
0
def create_pipeline(params: dict = None):
    """
    Assemble the preprocessing steps and the XGBoost classifier into a
    single pipeline.

    Parameters
    ----------
    params : dict
        parameters applied to the assembled pipeline through
        ``set_params``; skipped when None or empty

    Returns
    -------
    sklearn.pipeline.Pipeline
    """

    # numeric branch: missing indicators -> default imputation -> drop
    # quasi-constant columns
    numeric_steps = [
        ("num_nan_ind", AddMissingIndicator(missing_only=True)),
        ("rmmean", MeanMedianImputer()),
        ("drop_quasi_constant", DropConstantFeatures(tol=0.97)),
    ]
    numeric_branch = Pipeline(numeric_steps)

    # categorical branch: fill NAs -> group rare labels -> one-hot encode
    categorical_steps = [
        ("fill_cat_nas", CategoricalImputer(fill_value='MISSING')),
        ("rlc", RareLabelEncoder()),
        ("one_hot_encoder", OneHotEncoder()),
    ]
    categorical_branch = Pipeline(categorical_steps)

    # route columns to a branch according to their dtype
    numeric_selector = make_column_selector(dtype_include=np.number)
    categorical_selector = make_column_selector(dtype_include=object)
    preprocessing = ColumnTransformer([
        ("num", numeric_branch, numeric_selector),
        ("cat", categorical_branch, categorical_selector),
    ])

    # NOTE(review): gpu_id / tree_method='gpu_hist' presumably need a
    # GPU-enabled XGBoost build -- confirm for the target environment.
    xgb_settings = dict(
        min_child_weight=1,
        gamma=0,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=1,
        gpu_id=0,
        tree_method='gpu_hist',
    )
    classifier = XGBClassifier(**xgb_settings)

    model = Pipeline([
        ("col_transformers", preprocessing),
        ("xgb", classifier),
    ])

    # nothing to override: hand back the pipeline as built
    if not params:
        return model

    model.set_params(**params)
    return model
Esempio n. 7
0
def test_get_feature_names_out(df_na):
    """Check get_feature_names_out for both missing_only settings and for
    invalid input_features."""
    input_names = df_na.columns.to_list()

    # --- missing_only=False: an indicator is expected for every column ---
    transformer = AddMissingIndicator(missing_only=False)
    transformer.fit(df_na)

    indicator_names = [name + "_na" for name in input_names]

    all_names = transformer.get_feature_names_out(input_features=None)
    assert all_names == input_names + indicator_names

    from_all = transformer.get_feature_names_out(input_features=input_names)
    assert from_all == indicator_names

    from_first_two = transformer.get_feature_names_out(
        input_features=input_names[:2])
    assert from_first_two == indicator_names[:2]

    from_single = transformer.get_feature_names_out(input_features=["Name"])
    assert from_single == ["Name_na"]

    # --- missing_only=True: the expected indicators exclude the last
    # input column ---
    transformer = AddMissingIndicator(missing_only=True)
    transformer.fit(df_na)

    indicator_names = [name + "_na" for name in input_names[:-1]]

    all_names = transformer.get_feature_names_out(input_features=None)
    assert all_names == input_names + indicator_names

    from_all = transformer.get_feature_names_out(input_features=input_names)
    assert from_all == indicator_names

    # a bare string and a list with an unknown name must both be rejected
    for bad_input in ("Name", ["Name", "hola"]):
        with pytest.raises(ValueError):
            transformer.get_feature_names_out(bad_input)
Esempio n. 8
0
def test_error_when_missing_only_not_bool():
    """Check that a non-boolean missing_only is rejected at construction."""
    not_a_bool = "missing_only"
    with pytest.raises(ValueError):
        AddMissingIndicator(missing_only=not_a_bool)
# Apply cat_imputer_frequent (set up outside this excerpt) to both splits.
X_train = cat_imputer_frequent.transform(X_train)
X_test = cat_imputer_frequent.transform(X_test)

# %% Verify whether there are missing values left.
X_train[cat_vars_with_na].isnull().sum()
[col for col in cat_vars_with_na if X_test[col].isnull().sum() > 0]
# %% Numerical variables.
# every column that is neither categorical nor the 'SalePrice' column
num_vars = [
    col for col in X_train.columns
    if not (col in cat_vars or col == 'SalePrice')
]
# numerical variables with at least one missing value in the train split
vars_with_na = [
    col for col in num_vars
    if X_train[col].isnull().sum() > 0
]
print(len(vars_with_na))

# fraction of missing values per variable
X_train[vars_with_na].isnull().mean()

# %% Missing values -- Numerical -- add missing indicator.
# fit on the train split only, then apply to both splits
missing_ind = AddMissingIndicator(variables=vars_with_na)
missing_ind.fit(X_train)
X_train = missing_ind.transform(X_train)
X_test = missing_ind.transform(X_test)

# check the binary missing indicator variables
X_train[['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']].head()
# %% Missing values -- Numerical -- mean imputation.
# Fill the numerical variables that contain NaNs with the mean learned
# from the train split.
mean_imputer = MeanMedianImputer(
    # The keyword is `imputation_method`; `imputer_method` is not an
    # accepted argument and makes the constructor raise a TypeError.
    imputation_method='mean',
    variables=vars_with_na
)
mean_imputer.fit(X_train)
# show the value learned for each variable
print(mean_imputer.imputer_dict_)

X_train = mean_imputer.transform(X_train)
    BoxCoxTransformer,
    LogTransformer,
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)
from feature_engine.wrappers import SklearnTransformerWrapper


# imputation
# One default-configured instance per imputer; CategoricalImputer is the
# only one given explicit arguments (fill_value=0, ignore_format=True).
@parametrize_with_checks([
    MeanMedianImputer(),
    ArbitraryNumberImputer(),
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    """Run a single compatibility check against a single imputer.

    Both arguments are supplied by the ``parametrize_with_checks``
    decorator (presumably scikit-learn's estimator-check helper; confirm
    against the file's imports).
    """
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    RareLabelEncoder(
def test_non_fitted_error(df_na):
    """Check that transform() on an unfitted imputer raises NotFittedError."""
    imputer = AddMissingIndicator()
    # Only the call expected to fail sits inside the context manager, so an
    # exception raised while constructing the imputer cannot satisfy the
    # assertion by accident.
    with pytest.raises(NotFittedError):
        imputer.transform(df_na)
Esempio n. 12
0
     CategoricalImputer(
         imputation_method="missing",
         variables=config.model_config.categorical_vars_with_na_missing,
     ),
 ),
 (
     "frequent_imputation",
     CategoricalImputer(
         imputation_method="frequent",
         variables=config.model_config.categorical_vars_with_na_frequent,
     ),
 ),
 # add missing indicator
 (
     "missing_indicator",
     AddMissingIndicator(
         variables=config.model_config.numerical_vars_with_na),
 ),
 # impute numerical variables with the mean
 (
     "mean_imputation",
     MeanMedianImputer(
         imputation_method="mean",
         variables=config.model_config.numerical_vars_with_na,
     ),
 ),
 # == TEMPORAL VARIABLES ====
 (
     "elapsed_time",
     pp.TemporalVariableTransformer(
         variables=config.model_config.temporal_vars,
         reference_variable=config.model_config.ref_var,