Ejemplo n.º 1
0
def test_impute_numerical_variables_with_mode(df_na):
    # set up transformer
    imputer = CategoricalImputer(
        imputation_method="frequent",
        variables=["City", "Studies", "Marks"],
        ignore_format=True,
    )
    X_transformed = imputer.fit_transform(df_na)

    # set up expected output
    X_reference = df_na.copy()
    X_reference["City"] = X_reference["City"].fillna("London")
    X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor")
    X_reference["Marks"] = X_reference["Marks"].fillna(0.8)

    # test init params
    assert imputer.variables == ["City", "Studies", "Marks"]

    # test fit attributes
    assert imputer.variables_ == ["City", "Studies", "Marks"]
    assert imputer.n_features_in_ == 6
    assert imputer.imputer_dict_ == {
        "City": "London",
        "Studies": "Bachelor",
        "Marks": 0.8,
    }

    # test transform output
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Ejemplo n.º 2
0
def clean_data(X):
    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)
    X['v22'] = X['v22'].apply(az_to_int)
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()
    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")
    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])
    # more cleaning
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, Y = undersampler.fit_resample(X, y)
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")
    # Remove duplicated features¶
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")
    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlared features {drop_corr.features_to_drop_}")
    X['target'] = Y
    return X
Ejemplo n.º 3
0
def test_variables_cast_as_category_missing(df_na):
    # string missing
    df_na = df_na.copy()
    df_na["City"] = df_na["City"].astype("category")

    imputer = CategoricalImputer(imputation_method="missing", variables=None)
    X_transformed = imputer.fit_transform(df_na)

    # set up expected output
    X_reference = df_na.copy()
    X_reference["Name"] = X_reference["Name"].fillna("Missing")
    X_reference["Studies"] = X_reference["Studies"].fillna("Missing")

    X_reference["City"].cat.add_categories("Missing", inplace=True)
    X_reference["City"] = X_reference["City"].fillna("Missing")

    # test fit attributes
    assert imputer.variables_ == ["Name", "City", "Studies"]
    assert imputer.imputer_dict_ == {
        "Name": "Missing",
        "City": "Missing",
        "Studies": "Missing",
    }

    # test transform output
    # selected columns should have no NA
    # non selected columns should still have NA
    assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0
    assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Ejemplo n.º 4
0
def test_variables_cast_as_category_frequent(df_na):

    df_na = df_na.copy()
    df_na["City"] = df_na["City"].astype("category")

    # this variable does not have a mode, so drop
    df_na.drop(labels=["Name"], axis=1, inplace=True)

    imputer = CategoricalImputer(imputation_method="frequent", variables=None)
    X_transformed = imputer.fit_transform(df_na)

    # set up expected output
    X_reference = df_na.copy()
    X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor")
    X_reference["City"] = X_reference["City"].fillna("London")

    # test fit attributes
    assert imputer.variables_ == ["City", "Studies"]
    assert imputer.imputer_dict_ == {
        "City": "London",
        "Studies": "Bachelor",
    }

    # test transform output
    # selected columns should have no NA
    # non selected columns should still have NA
    assert X_transformed[["City", "Studies"]].isnull().sum().sum() == 0
    assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Ejemplo n.º 5
0
def test_impute_with_string_missing_and_automatically_find_variables(df_na):
    # set up transformer
    imputer = CategoricalImputer(imputation_method="missing", variables=None)
    X_transformed = imputer.fit_transform(df_na)

    # set up expected output
    X_reference = df_na.copy()
    X_reference["Name"] = X_reference["Name"].fillna("Missing")
    X_reference["City"] = X_reference["City"].fillna("Missing")
    X_reference["Studies"] = X_reference["Studies"].fillna("Missing")

    # test init params
    assert imputer.imputation_method == "missing"
    assert imputer.variables is None

    # test fit attributes
    assert imputer.variables_ == ["Name", "City", "Studies"]
    assert imputer.n_features_in_ == 6
    assert imputer.imputer_dict_ == {
        "Name": "Missing",
        "City": "Missing",
        "Studies": "Missing",
    }

    # test transform output
    # selected columns should have no NA
    # non selected columns should still have NA
    assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0
    assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Ejemplo n.º 6
0
def test_user_defined_string_and_automatically_find_variables(df_na):
    # set up imputer
    imputer = CategoricalImputer(
        imputation_method="missing", fill_value="Unknown", variables=None
    )
    X_transformed = imputer.fit_transform(df_na)

    # set up expected output
    X_reference = df_na.copy()
    X_reference["Name"] = X_reference["Name"].fillna("Unknown")
    X_reference["City"] = X_reference["City"].fillna("Unknown")
    X_reference["Studies"] = X_reference["Studies"].fillna("Unknown")

    # test init params
    assert imputer.imputation_method == "missing"
    assert imputer.fill_value == "Unknown"
    assert imputer.variables is None

    # tes fit attributes
    assert imputer.variables_ == ["Name", "City", "Studies"]
    assert imputer.n_features_in_ == 6
    assert imputer.imputer_dict_ == {
        "Name": "Unknown",
        "City": "Unknown",
        "Studies": "Unknown",
    }

    # test transform output:
    assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0
    assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Ejemplo n.º 7
0
def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_object(df_na):
    # test case 6: imputing of numerical variables cast as object + return as object
    # after imputation
    df_na = df_na.copy()
    df_na["Marks"] = df_na["Marks"].astype("O")
    imputer = CategoricalImputer(
        imputation_method="frequent",
        variables=["City", "Studies", "Marks"],
        return_object=True,
    )
    X_transformed = imputer.fit_transform(df_na)
    assert X_transformed["Marks"].dtype == "O"
def test_mode_imputation_with_multiple_variables(df_na):
    # set up imputer
    imputer = CategoricalImputer(imputation_method="frequent",
                                 variables=["Studies", "City"])
    X_transformed = imputer.fit_transform(df_na)

    # set up expected output
    X_reference = df_na.copy()
    X_reference["City"] = X_reference["City"].fillna("London")
    X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor")

    # test fit attr and transform output
    assert imputer.imputer_dict_ == {"Studies": "Bachelor", "City": "London"}
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Ejemplo n.º 9
0
def test_mode_imputation_and_single_variable(df_na):
    # set up imputer
    imputer = CategoricalImputer(imputation_method="frequent", variables="City")
    X_transformed = imputer.fit_transform(df_na)

    # set up expected result
    X_reference = df_na.copy()
    X_reference["City"] = X_reference["City"].fillna("London")

    # test init, fit and transform params, attr and output
    assert imputer.imputation_method == "frequent"
    assert imputer.variables == ["City"]
    assert imputer.input_shape_ == (8, 6)
    assert imputer.imputer_dict_ == {"City": "London"}
    assert X_transformed["City"].isnull().sum() == 0
    assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_numerical(
        df_na):
    # test case: imputing of numerical variables cast as object + return numeric
    df_na["Marks"] = df_na["Marks"].astype("O")
    imputer = CategoricalImputer(imputation_method="frequent",
                                 variables=["City", "Studies", "Marks"])
    X_transformed = imputer.fit_transform(df_na)

    X_reference = df_na.copy()
    X_reference["Marks"] = X_reference["Marks"].fillna(0.8)
    X_reference["City"] = X_reference["City"].fillna("London")
    X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor")
    assert imputer.variables == ["City", "Studies", "Marks"]
    assert imputer.imputer_dict_ == {
        "Studies": "Bachelor",
        "City": "London",
        "Marks": 0.8,
    }
    assert X_transformed["Marks"].dtype == "float"
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Ejemplo n.º 11
0
def test_impute_numerical_variables(df_na):
    # set up transformer
    imputer = CategoricalImputer(
        imputation_method="missing",
        fill_value=0,
        variables=["Name", "City", "Studies", "Age", "Marks"],
        ignore_format=True,
    )
    X_transformed = imputer.fit_transform(df_na)

    # set up expected output
    X_reference = df_na.copy()
    X_reference = X_reference.fillna(0)

    # test init params
    assert imputer.imputation_method == "missing"
    assert imputer.variables == ["Name", "City", "Studies", "Age", "Marks"]

    # test fit attributes
    assert imputer.variables_ == ["Name", "City", "Studies", "Age", "Marks"]
    assert imputer.n_features_in_ == 6

    # test transform params
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Ejemplo n.º 12
0
def create_pipeline(params: dict = None):
    """
    Create sklearn.pipeline.Pipeline

    Parameters
    ----------
    params : dict
        dictionary of parameters for the pipeline

    Returns
    -------
    sklearn.pipeline.Pipeline
    """

    # pipeline for numeric variables
    p_num = Pipeline([("num_nan_ind", AddMissingIndicator(missing_only=True)),
                      ("rmmean", MeanMedianImputer()),
                      ("drop_quasi_constant", DropConstantFeatures(tol=0.97))])

    # pipeline for categorical variables
    p_cat = Pipeline([("fill_cat_nas",
                       CategoricalImputer(fill_value='MISSING')),
                      ("rlc", RareLabelEncoder()),
                      ("one_hot_encoder", OneHotEncoder())])

    # list of pipelines to combine
    transformers = [("num", p_num,
                     make_column_selector(dtype_include=np.number)),
                    ("cat", p_cat, make_column_selector(dtype_include=object))]

    # combine pipelines and add XGBClassifier
    col_transforms = ColumnTransformer(transformers)
    p = Pipeline([("col_transformers", col_transforms),
                  ("xgb",
                   XGBClassifier(min_child_weight=1,
                                 gamma=0,
                                 objective='binary:logistic',
                                 nthread=4,
                                 scale_pos_weight=1,
                                 seed=1,
                                 gpu_id=0,
                                 tree_method='gpu_hist'))])

    if params:
        p.set_params(**params)
    return p
def test_non_fitted_error(df_na):
    with pytest.raises(NotFittedError):
        imputer = CategoricalImputer()
        imputer.transform(df_na)
)
from feature_engine.transformation import (
    BoxCoxTransformer,
    LogTransformer,
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)
from feature_engine.wrappers import SklearnTransformerWrapper


# imputation
@parametrize_with_checks([
    MeanMedianImputer(),
    ArbitraryNumberImputer(),
    CategoricalImputer(ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    check(estimator)


@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
Ejemplo n.º 15
0
def test_error_when_variable_contains_multiple_modes(df_na):

    msg = "The variable Name contains multiple frequent categories."
    imputer = CategoricalImputer(imputation_method="frequent", variables="Name")
    with pytest.raises(ValueError) as record:
        imputer.fit(df_na)
    # check that error message matches
    assert str(record.value) == msg

    msg = "The variable(s) Name contain(s) multiple frequent categories."
    imputer = CategoricalImputer(imputation_method="frequent")
    with pytest.raises(ValueError) as record:
        imputer.fit(df_na)
    # check that error message matches
    assert str(record.value) == msg

    df_ = df_na.copy()
    df_["Name_dup"] = df_["Name"]
    msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories."
    imputer = CategoricalImputer(imputation_method="frequent")
    with pytest.raises(ValueError) as record:
        imputer.fit(df_)
    # check that error message matches
    assert str(record.value) == msg
Ejemplo n.º 16
0
def test_error_when_imputation_method_not_frequent_or_missing():
    with pytest.raises(ValueError):
        CategoricalImputer(imputation_method="arbitrary")
from feature_engine.timeseries.forecasting import LagFeatures
from feature_engine.transformation import (
    BoxCoxTransformer,
    LogTransformer,
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)
from feature_engine.wrappers import SklearnTransformerWrapper


# imputation
@parametrize_with_checks([
    MeanMedianImputer(),
    ArbitraryNumberImputer(),
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
Ejemplo n.º 18
0
from feature_engine.transformation import LogTransformer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer, MinMaxScaler

from regression_model.config.core import config
from regression_model.processing import features as pp

price_pipe = Pipeline([
    # ===== IMPUTATION =====
    # impute categorical variables with string missing
    (
        "missing_imputation",
        CategoricalImputer(
            imputation_method="missing",
            variables=config.model_config.categorical_vars_with_na_missing,
        ),
    ),
    (
        "frequent_imputation",
        CategoricalImputer(
            imputation_method="frequent",
            variables=config.model_config.categorical_vars_with_na_frequent,
        ),
    ),
    # add missing indicator
    (
        "missing_indicator",
        AddMissingIndicator(
            variables=config.model_config.numerical_vars_with_na),
    ),
Ejemplo n.º 19
0
# %% Categorical.
X_train['MSSubClass'] = X_train['MSSubClass'].astype('O')
X_test['MSSubClass'] = X_test['MSSubClass'].astype('O')

cat_vars = [var for var in data.columns if data[var].dtype == 'O']
cat_vars_with_na = [var for var in cat_vars if X_train[var].isnull().sum() > 0]

# variables to impute with the string missing. ()
with_string_missing = [var for var in cat_vars_with_na if X_train[var].isnull().mean() > 0.1]
with_frequent_category = [var for var in cat_vars_with_na if X_train[var].isnull().mean() < 0.1]

# variables to impute with the most frequent category
# %% Missing values -- Categorical -- Missing.
cat_imputer_missing = CategoricalImputer(
    imputation_method='missing',
    variables=with_string_missing
)
cat_imputer_missing.fit(X_train)
print(cat_imputer_missing.imputer_dict_)

X_train = cat_imputer_missing.transform(X_train)
X_test = cat_imputer_missing.transform(X_test)

# %% Missing values -- Categorical -- Frequency.
cat_imputer_frequent = CategoricalImputer(
    imputation_method='frequent',
    variables=with_frequent_category
)
cat_imputer_frequent.fit(X_train)
print(cat_imputer_frequent.imputer_dict_)
def test_error_when_variable_contains_multiple_modes(df_na):
    with pytest.raises(ValueError):
        imputer = CategoricalImputer(imputation_method="frequent")
        imputer.fit(df_na)
Ejemplo n.º 21
0
    #get numerical labels
    numerical_labels = list(X_train._get_numeric_data().columns)
    categorical_labels = X_train.select_dtypes(
        include=['object']).columns.tolist()

    #moving 'MSSubClass' feature from numerical to categorical
    numerical_labels.remove('MSSubClass')
    categorical_labels.append('MSSubClass')

    print(f'Numerical labels are (contains ordinal cat):{numerical_labels}')
    print(f'Categorical labels are:{categorical_labels}')
    #print(X_train.head())

    num_pipeline = Pipeline([
        ('imputer', MeanMedianImputer(imputation_method='median'))  #,
        #('std_scaler',StandardScaler())
    ])
    cat_pipeline = Pipeline([('imputer',
                              CategoricalImputer(imputation_method='missing',
                                                 fill_value='Missing')),
                             ('one_hot',
                              OneHotEncoder(top_categories=None,
                                            drop_last=False))])

    full_pipeline = ColumnTransformer([('num', num_pipeline, numerical_labels),
                                       ('cat', cat_pipeline,
                                        categorical_labels)])

    X_converted = cat_pipeline.fit_transform(X_train)
    print(X_converted.head())
Ejemplo n.º 22
0
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingClassifier
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder, CountFrequencyEncoder
from feature_engine.discretisation import EqualFrequencyDiscretiser

import logging

_logger = logging.getLogger(__name__)

rf_pipe = Pipeline(
[
    ('numeric_impute', MeanMedianImputer(imputation_method='median', variables=config.CONTINUOUS_FEATURES)),
    
    ('categorical_impute', CategoricalImputer(imputation_method='missing', 
                                              variables=config.CATEGORICAL_FEATURES+
                                              config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                              config.DISCRETE_SET3_FEATURES)),
    
    ('rare_label_encode', RareLabelEncoder(tol=0.02, n_categories=10,
                                           variables=config.CATEGORICAL_FEATURES+
                                              config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                              config.DISCRETE_SET3_FEATURES,
                                            replace_with='Rare')),
    
    ('categorical_encode1', OrdinalEncoder(encoding_method='arbitrary', 
                                          variables=config.CATEGORICAL_FEATURES+config.DISCRETE_SET2_FEATURES)),
    
    ('categorical_encode2', OrdinalEncoder(encoding_method='ordered', 
                                          variables=config.DISCRETE_SET1_FEATURES)),
    
    ('categorical_encode3', CountFrequencyEncoder(encoding_method='count',