def test_mean_imputation_and_automatically_select_variables(df_na):
    """Check mean imputation when no variables are passed (``variables=None``).

    The imputer is expected to end up with ``Age`` and ``Marks`` as its
    variables, learn their means, and leave ``Name`` and ``City`` untouched.
    """
    # means the imputer is expected to learn from the fixture
    expected_means = {
        "Age": 28.714285714285715,
        "Marks": 0.6833333333333332,
    }

    # fit and transform in a single step
    imputer = MeanMedianImputer(imputation_method="mean", variables=None)
    X_transformed = imputer.fit_transform(df_na)

    # expected output: the input with NA in each variable filled by its mean
    X_reference = df_na.copy()
    for column, mean_value in expected_means.items():
        X_reference[column] = X_reference[column].fillna(mean_value)

    # init params
    assert imputer.imputation_method == "mean"
    assert imputer.variables == ["Age", "Marks"]

    # fit attributes
    assert imputer.imputer_dict_ == expected_means
    assert imputer.input_shape_ == (8, 6)

    # transform output: imputed columns have no NA left,
    # columns that were not selected still contain NA
    imputed_columns = X_transformed[["Age", "Marks"]]
    untouched_columns = X_transformed[["Name", "City"]]
    assert imputed_columns.isnull().sum().sum() == 0
    assert untouched_columns.isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
def test_median_imputation_when_user_enters_single_variables(df_na):
    """Check median imputation restricted to the single variable ``Age``."""
    # median the imputer is expected to learn for "Age"
    expected_median = 23.0

    # set up transformer
    imputer = MeanMedianImputer(imputation_method="median", variables=["Age"])
    X_transformed = imputer.fit_transform(df_na)

    # expected output: the input with NA in "Age" filled by the median
    X_reference = df_na.copy()
    X_reference["Age"] = X_reference["Age"].fillna(expected_median)

    # init params
    assert imputer.imputation_method == "median"
    assert imputer.variables == ["Age"]

    # fit attributes
    assert imputer.input_shape_ == (8, 6)
    assert imputer.imputer_dict_ == {"Age": expected_median}

    # transform output
    remaining_na = X_transformed["Age"].isnull().sum()
    assert remaining_na == 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
Exemple #3
0
def create_pipeline(params: dict = None):
    """
    Build the preprocessing + XGBClassifier pipeline.

    Columns selected with ``dtype_include=np.number`` go through a missing
    indicator, the default ``MeanMedianImputer`` and a constant-feature
    filter (``tol=0.97``). Columns selected with ``dtype_include=object``
    are imputed with ``'MISSING'``, rare-label encoded and one-hot encoded.
    Both branches feed an ``XGBClassifier``.

    Parameters
    ----------
    params : dict
        parameters passed to ``set_params`` on the assembled pipeline;
        skipped when ``None`` or empty

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # steps applied to numeric variables
    numeric_steps = [
        ("num_nan_ind", AddMissingIndicator(missing_only=True)),
        ("rmmean", MeanMedianImputer()),
        ("drop_quasi_constant", DropConstantFeatures(tol=0.97)),
    ]
    numeric_pipeline = Pipeline(numeric_steps)

    # steps applied to categorical variables
    categorical_steps = [
        ("fill_cat_nas", CategoricalImputer(fill_value='MISSING')),
        ("rlc", RareLabelEncoder()),
        ("one_hot_encoder", OneHotEncoder()),
    ]
    categorical_pipeline = Pipeline(categorical_steps)

    # route each dtype family to its own pipeline
    numeric_selector = make_column_selector(dtype_include=np.number)
    categorical_selector = make_column_selector(dtype_include=object)
    preprocessing = ColumnTransformer([
        ("num", numeric_pipeline, numeric_selector),
        ("cat", categorical_pipeline, categorical_selector),
    ])

    # final estimator
    classifier = XGBClassifier(
        min_child_weight=1,
        gamma=0,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=1,
        gpu_id=0,
        tree_method='gpu_hist',
    )

    pipeline = Pipeline([
        ("col_transformers", preprocessing),
        ("xgb", classifier),
    ])

    # nothing to override: return the pipeline as built
    if not params:
        return pipeline

    pipeline.set_params(**params)
    return pipeline
def missing_inputation():
    """Median-impute selected columns of the credit approval dataset.

    Reads ``creditApprovalUCI.csv``, holds out 30% of the rows as a test
    set (``random_state=0``), fits a median imputer on the train set and
    applies it to both sets. Nothing is returned.
    """
    target = "A16"
    columns_to_impute = ["A2", "A3", "A8", "A11", "A15"]

    # load the data and separate predictors from the target
    credit = pd.read_csv("creditApprovalUCI.csv")
    predictors = credit.drop(target, axis=1)
    labels = credit[target]

    # train / test split
    X_train, X_test, y_train, y_test = train_test_split(
        predictors, labels, test_size=0.3, random_state=0
    )

    # the imputer is fitted on the train set only
    median_imputer = MeanMedianImputer(
        imputation_method="median", variables=columns_to_impute
    )
    median_imputer.fit(X_train)

    # both sets are transformed with the fitted imputer
    X_train = median_imputer.transform(X_train)
    X_test = median_imputer.transform(X_test)
Exemple #5
0
    # Split the column names of X_train into two lists: columns returned by
    # _get_numeric_data() and columns of 'object' dtype.
    numerical_labels = list(X_train._get_numeric_data().columns)
    categorical_labels = X_train.select_dtypes(
        include=['object']).columns.tolist()

    # Move the 'MSSubClass' feature from the numerical list to the
    # categorical list.
    numerical_labels.remove('MSSubClass')
    categorical_labels.append('MSSubClass')

    print(f'Numerical labels are (contains ordinal cat):{numerical_labels}')
    print(f'Categorical labels are:{categorical_labels}')
    #print(X_train.head())

    # Numerical branch: median imputation only (the scaler step is
    # commented out).
    num_pipeline = Pipeline([
        ('imputer', MeanMedianImputer(imputation_method='median'))  #,
        #('std_scaler',StandardScaler())
    ])
    # Categorical branch: CategoricalImputer configured with
    # imputation_method='missing' and fill_value='Missing', followed by
    # one-hot encoding with top_categories=None and drop_last=False.
    cat_pipeline = Pipeline([('imputer',
                              CategoricalImputer(imputation_method='missing',
                                                 fill_value='Missing')),
                             ('one_hot',
                              OneHotEncoder(top_categories=None,
                                            drop_last=False))])

    # Combined transformer that pairs each pipeline with its list of columns.
    full_pipeline = ColumnTransformer([('num', num_pipeline, numerical_labels),
                                       ('cat', cat_pipeline,
                                        categorical_labels)])

    # NOTE(review): only cat_pipeline is fitted here, and on the whole of
    # X_train; full_pipeline is built above but not used in the visible
    # code. Confirm whether this is intentional.
    X_converted = cat_pipeline.fit_transform(X_train)
    print(X_converted.head())
def test_non_fitted_error(df_na):
    """Calling ``transform`` before ``fit`` must raise ``NotFittedError``."""
    # Build the imputer outside the ``raises`` context so that only the
    # ``transform`` call can satisfy the expectation; an error raised while
    # constructing the imputer then fails the test instead of passing it.
    imputer = MeanMedianImputer()
    with pytest.raises(NotFittedError):
        imputer.transform(df_na)
def test_error_with_wrong_imputation_method():
    """Passing ``imputation_method="arbitrary"`` must raise ``ValueError``."""
    invalid_method = "arbitrary"
    with pytest.raises(ValueError):
        MeanMedianImputer(imputation_method=invalid_method)
# Variables from num_vars that have at least one missing value in the
# train set.
vars_with_na = [var for var in num_vars if X_train[var].isnull().sum() > 0]
print(len(vars_with_na))

# Fraction of missing observations per variable (shown as cell output).
X_train[vars_with_na].isnull().mean()

# %% Missing values -- Numerical -- add missing indicator.
# The indicator is fitted on the train set and applied to both sets.
missing_ind = AddMissingIndicator(variables=vars_with_na)
missing_ind.fit(X_train)
X_train = missing_ind.transform(X_train)
X_test = missing_ind.transform(X_test)

# check the binary missing indicator variables
X_train[['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']].head()
# %% Missing values -- Numerical -- replace with the mean.
# The keyword is ``imputation_method``; the previous ``imputer_method``
# spelling is not a parameter of MeanMedianImputer.
mean_imputer = MeanMedianImputer(
    imputation_method='mean',
    variables=vars_with_na
)
# The imputer is fitted on the train set and applied to both sets.
mean_imputer.fit(X_train)
print(mean_imputer.imputer_dict_)

X_train = mean_imputer.transform(X_train)
X_test = mean_imputer.transform(X_test)

# %% Verify whether there are missing values left.
# NOTE(review): these checks look at cat_vars_with_na, while the cell above
# imputed vars_with_na. Confirm which list is intended here.
X_train[cat_vars_with_na].isnull().sum()
[var for var in cat_vars_with_na if X_test[var].isnull().sum() > 0]

#%% Temporal variables.
def elapsed_years(df, var):
    """Replace ``df[var]`` with ``df['YrSold'] - df[var]`` and return ``df``.

    The update is done in place: the object passed in is modified and the
    same object is returned.
    """
    sale_year = df['YrSold']
    df[var] = sale_year - df[var]
    return df
    SmartCorrelatedSelection,
)
from feature_engine.timeseries.forecasting import LagFeatures
from feature_engine.transformation import (
    BoxCoxTransformer,
    LogTransformer,
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)
from feature_engine.wrappers import SklearnTransformerWrapper


# imputation
@parametrize_with_checks(
    [
        MeanMedianImputer(),
        ArbitraryNumberImputer(),
        CategoricalImputer(fill_value=0, ignore_format=True),
        EndTailImputer(),
        AddMissingIndicator(),
        RandomSampleImputer(),
        DropMissingData(),
    ]
)
def test_sklearn_compatible_imputer(estimator, check):
    """Apply the supplied ``check`` callable to the supplied imputer."""
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
Exemple #10
0
     CategoricalImputer(
         imputation_method="frequent",
         variables=config.model_config.categorical_vars_with_na_frequent,
     ),
 ),
 # add missing indicator
 (
     "missing_indicator",
     AddMissingIndicator(
         variables=config.model_config.numerical_vars_with_na),
 ),
 # impute numerical variables with the mean
 (
     "mean_imputation",
     MeanMedianImputer(
         imputation_method="mean",
         variables=config.model_config.numerical_vars_with_na,
     ),
 ),
 # == TEMPORAL VARIABLES ====
 (
     "elapsed_time",
     pp.TemporalVariableTransformer(
         variables=config.model_config.temporal_vars,
         reference_variable=config.model_config.ref_var,
     ),
 ),
 ("drop_features",
  DropFeatures(features_to_drop=[config.model_config.ref_var])),
 # ==== VARIABLE TRANSFORMATION =====
 ("log", LogTransformer(variables=config.model_config.numericals_log_vars)),
 (
Exemple #11
0
from classification_model.processing import preprocessors as pp
from classification_model.processing import features
from classification_model.config import config

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingClassifier
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder, CountFrequencyEncoder
from feature_engine.discretisation import EqualFrequencyDiscretiser

import logging

# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

rf_pipe = Pipeline(
[
    ('numeric_impute', MeanMedianImputer(imputation_method='median', variables=config.CONTINUOUS_FEATURES)),
    
    ('categorical_impute', CategoricalImputer(imputation_method='missing', 
                                              variables=config.CATEGORICAL_FEATURES+
                                              config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                              config.DISCRETE_SET3_FEATURES)),
    
    ('rare_label_encode', RareLabelEncoder(tol=0.02, n_categories=10,
                                           variables=config.CATEGORICAL_FEATURES+
                                              config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                              config.DISCRETE_SET3_FEATURES,
                                            replace_with='Rare')),
    
    ('categorical_encode1', OrdinalEncoder(encoding_method='arbitrary', 
                                          variables=config.CATEGORICAL_FEATURES+config.DISCRETE_SET2_FEATURES)),