def test_mean_imputation_and_automatically_select_variables(df_na):
    """Check mean imputation when the imputer is created with ``variables=None``.

    The test fits and transforms ``df_na`` in one step and then verifies the
    init parameters, the learned attributes and the transformed frame against
    a reference built by filling "Age" and "Marks" with the expected means.
    """
    # Means the imputer is expected to learn; reused for the reference frame
    # and for the ``imputer_dict_`` comparison so the two cannot drift apart.
    expected_means = {
        "Age": 28.714285714285715,
        "Marks": 0.6833333333333332,
    }

    # Fit and transform without naming any variables explicitly.
    imputer = MeanMedianImputer(imputation_method="mean", variables=None)
    X_transformed = imputer.fit_transform(df_na)

    # Reference result: a copy of the input with the expected means filled in.
    X_reference = df_na.copy()
    for column, mean_value in expected_means.items():
        X_reference[column] = X_reference[column].fillna(mean_value)

    # Init params.
    assert imputer.imputation_method == "mean"
    assert imputer.variables == ["Age", "Marks"]

    # Fit attributes.
    assert imputer.imputer_dict_ == expected_means
    assert imputer.input_shape_ == (8, 6)

    # Transform output: imputed columns have no NA left, while the
    # non-selected "Name" and "City" columns still contain NA.
    assert X_transformed[["Age", "Marks"]].isnull().sum().sum() == 0
    assert X_transformed[["Name", "City"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
def test_median_imputation_when_user_enters_single_variables(df_na):
    """Check median imputation when the user selects the single variable "Age".

    The test fits and transforms ``df_na`` in one step and then verifies the
    init parameters, the learned attributes and the transformed frame against
    a reference in which "Age" is filled with the expected median.
    """
    # Median the imputer is expected to learn for "Age".
    expected_median = 23.0

    # Set up the transformer restricted to one user-supplied variable.
    imputer = MeanMedianImputer(imputation_method="median", variables=["Age"])
    X_transformed = imputer.fit_transform(df_na)

    # Reference output: only "Age" is filled, everything else is untouched.
    X_reference = df_na.copy()
    X_reference["Age"] = X_reference["Age"].fillna(expected_median)

    # Init params.
    assert imputer.imputation_method == "median"
    assert imputer.variables == ["Age"]

    # Fit attributes.
    assert imputer.input_shape_ == (8, 6)
    assert imputer.imputer_dict_ == {"Age": expected_median}

    # Transform output.
    assert X_transformed["Age"].isnull().sum() == 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
def create_pipeline(params: dict = None):
    """
    Assemble the preprocessing + XGBClassifier pipeline.

    Columns selected with ``dtype_include=np.number`` go through a missing
    indicator, a ``MeanMedianImputer`` (default settings) and a constant
    feature dropper (``tol=0.97``). Columns selected with
    ``dtype_include=object`` go through a categorical imputer
    (``fill_value='MISSING'``), a rare label encoder and a one-hot encoder.
    Both branches are combined in a ``ColumnTransformer`` that feeds an
    ``XGBClassifier``.

    Parameters
    ----------
    params : dict
        Optional pipeline parameters, forwarded to ``Pipeline.set_params``
        when the dictionary is non-empty.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # Steps applied to the numeric columns.
    numeric_steps = [
        ("num_nan_ind", AddMissingIndicator(missing_only=True)),
        ("rmmean", MeanMedianImputer()),
        ("drop_quasi_constant", DropConstantFeatures(tol=0.97)),
    ]
    numeric_pipeline = Pipeline(numeric_steps)

    # Steps applied to the categorical (object dtype) columns.
    categorical_steps = [
        ("fill_cat_nas", CategoricalImputer(fill_value='MISSING')),
        ("rlc", RareLabelEncoder()),
        ("one_hot_encoder", OneHotEncoder()),
    ]
    categorical_pipeline = Pipeline(categorical_steps)

    # Route each dtype family to its own sub-pipeline.
    column_transformer = ColumnTransformer([
        ("num", numeric_pipeline, make_column_selector(dtype_include=np.number)),
        ("cat", categorical_pipeline, make_column_selector(dtype_include=object)),
    ])

    # Final estimator; the settings are fixed here and can be overridden
    # through ``params``.
    classifier = XGBClassifier(
        min_child_weight=1,
        gamma=0,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=1,
        gpu_id=0,
        tree_method='gpu_hist',
    )

    pipeline = Pipeline([
        ("col_transformers", column_transformer),
        ("xgb", classifier),
    ])

    # Nothing to override: return the pipeline as configured above.
    if not params:
        return pipeline

    pipeline.set_params(**params)
    return pipeline
def missing_inputation():
    """Median-impute selected columns of the credit approval dataset.

    Reads ``creditApprovalUCI.csv``, splits it 70/30 into train and test sets
    (``random_state=0``) with "A16" as the target, fits a median imputer on
    the training set only and applies it to both sets.

    The transformed frames are kept in local variables; the function does not
    return anything.
    """
    target_column = "A16"
    columns_to_impute = ["A2", "A3", "A8", "A11", "A15"]

    # Load the dataset and separate predictors from the target.
    data = pd.read_csv("creditApprovalUCI.csv")
    predictors = data.drop(target_column, axis=1)
    target = data[target_column]

    # Separate into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, test_size=0.3, random_state=0
    )

    # Learn the medians from the training data only.
    median_imputer = MeanMedianImputer(
        imputation_method="median", variables=columns_to_impute
    )
    median_imputer.fit(X_train)

    # Apply the learned medians to both sets.
    X_train = median_imputer.transform(X_train)
    X_test = median_imputer.transform(X_test)
# Split the columns of X_train into numerical and categorical label lists.
numerical_labels = list(X_train._get_numeric_data().columns)
categorical_labels = X_train.select_dtypes(include=['object']).columns.tolist()

# 'MSSubClass' is moved from the numerical list to the categorical list.
numerical_labels.remove('MSSubClass')
categorical_labels.append('MSSubClass')

print(f'Numerical labels are (contains ordinal cat):{numerical_labels}')
print(f'Categorical labels are:{categorical_labels}')

# Numerical branch: median imputation only (no scaling step is active).
num_pipeline = Pipeline([
    ('imputer', MeanMedianImputer(imputation_method='median')),
])

# Categorical branch: fill missing values with 'Missing', then one-hot encode
# every category without dropping the last one.
cat_pipeline = Pipeline([
    ('imputer', CategoricalImputer(imputation_method='missing',
                                   fill_value='Missing')),
    ('one_hot', OneHotEncoder(top_categories=None, drop_last=False)),
])

# Combined transformer routing each label list to its branch.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, numerical_labels),
    ('cat', cat_pipeline, categorical_labels),
])

# NOTE(review): full_pipeline is built above, but only cat_pipeline is fitted
# here, on the whole of X_train -- confirm this is intended.
X_converted = cat_pipeline.fit_transform(X_train)
print(X_converted.head())
def test_non_fitted_error(df_na):
    """Calling ``transform`` before ``fit`` must raise ``NotFittedError``."""
    # Build the imputer outside the ``pytest.raises`` block so that only the
    # ``transform`` call can satisfy the expected exception; an error raised
    # by the constructor would otherwise make the test pass for the wrong
    # reason.
    imputer = MeanMedianImputer()

    with pytest.raises(NotFittedError):
        imputer.transform(df_na)
def test_error_with_wrong_imputation_method():
    """Creating the imputer with an unsupported method must raise ValueError."""
    # A value the test expects the imputer to reject at construction time.
    unsupported_method = "arbitrary"

    with pytest.raises(ValueError):
        MeanMedianImputer(imputation_method=unsupported_method)
# Numerical variables with at least one missing value in the training set.
vars_with_na = [var for var in num_vars if X_train[var].isnull().sum() > 0]
print(len(vars_with_na))

# Fraction of missing values per variable.
X_train[vars_with_na].isnull().mean()

# %% Missing values -- Numerical -- add missing indicator.
missing_ind = AddMissingIndicator(variables=vars_with_na)
missing_ind.fit(X_train)
X_train = missing_ind.transform(X_train)
X_test = missing_ind.transform(X_test)

# check the binary missing indicator variables
X_train[['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']].head()

# %%
# %% Missing values -- Numerical -- mean imputation.
# The keyword is ``imputation_method``; the previous ``imputer_method``
# spelling is not a parameter of MeanMedianImputer and raised a TypeError.
mean_imputer = MeanMedianImputer(
    imputation_method='mean', variables=vars_with_na
)
mean_imputer.fit(X_train)
print(mean_imputer.imputer_dict_)

# The means are learned on the training set and applied to both sets.
X_train = mean_imputer.transform(X_train)
X_test = mean_imputer.transform(X_test)

# %% Verify whether there are missing values left.
# NOTE(review): this checks ``cat_vars_with_na`` right after the numerical
# imputation -- confirm whether ``vars_with_na`` was meant here.
X_train[cat_vars_with_na].isnull().sum()
[var for var in cat_vars_with_na if X_test[var].isnull().sum() > 0]


#%% Temporal variables.
def elapsed_years(df, var):
    """Replace ``df[var]`` with ``df['YrSold'] - df[var]``.

    Parameters
    ----------
    df : DataFrame-like object with a 'YrSold' column and a ``var`` column.
    var : name of the column to overwrite.

    Returns
    -------
    The same ``df`` object, modified in place.
    """
    df[var] = df['YrSold'] - df[var]
    return df
SmartCorrelatedSelection, ) from feature_engine.timeseries.forecasting import LagFeatures from feature_engine.transformation import ( BoxCoxTransformer, LogTransformer, PowerTransformer, ReciprocalTransformer, YeoJohnsonTransformer, ) from feature_engine.wrappers import SklearnTransformerWrapper # imputation @parametrize_with_checks([ MeanMedianImputer(), ArbitraryNumberImputer(), CategoricalImputer(fill_value=0, ignore_format=True), EndTailImputer(), AddMissingIndicator(), RandomSampleImputer(), DropMissingData(), ]) def test_sklearn_compatible_imputer(estimator, check): check(estimator) # encoding @parametrize_with_checks([ CountFrequencyEncoder(ignore_format=True), DecisionTreeEncoder(regression=False, ignore_format=True),
CategoricalImputer( imputation_method="frequent", variables=config.model_config.categorical_vars_with_na_frequent, ), ), # add missing indicator ( "missing_indicator", AddMissingIndicator( variables=config.model_config.numerical_vars_with_na), ), # impute numerical variables with the mean ( "mean_imputation", MeanMedianImputer( imputation_method="mean", variables=config.model_config.numerical_vars_with_na, ), ), # == TEMPORAL VARIABLES ==== ( "elapsed_time", pp.TemporalVariableTransformer( variables=config.model_config.temporal_vars, reference_variable=config.model_config.ref_var, ), ), ("drop_features", DropFeatures(features_to_drop=[config.model_config.ref_var])), # ==== VARIABLE TRANSFORMATION ===== ("log", LogTransformer(variables=config.model_config.numericals_log_vars)), (
from classification_model.processing import preprocessors as pp from classification_model.processing import features from classification_model.config import config from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingClassifier from feature_engine.imputation import MeanMedianImputer, CategoricalImputer from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder, CountFrequencyEncoder from feature_engine.discretisation import EqualFrequencyDiscretiser import logging _logger = logging.getLogger(__name__) rf_pipe = Pipeline( [ ('numeric_impute', MeanMedianImputer(imputation_method='median', variables=config.CONTINUOUS_FEATURES)), ('categorical_impute', CategoricalImputer(imputation_method='missing', variables=config.CATEGORICAL_FEATURES+ config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+ config.DISCRETE_SET3_FEATURES)), ('rare_label_encode', RareLabelEncoder(tol=0.02, n_categories=10, variables=config.CATEGORICAL_FEATURES+ config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+ config.DISCRETE_SET3_FEATURES, replace_with='Rare')), ('categorical_encode1', OrdinalEncoder(encoding_method='arbitrary', variables=config.CATEGORICAL_FEATURES+config.DISCRETE_SET2_FEATURES)),