def test_add_indicators_to_one_variable(df_na):
    imputer = AddMissingIndicator(variables="Name")
    X_transformed = imputer.fit_transform(df_na)
    assert imputer.variables_ == ["Name"]
    assert X_transformed.shape == (8, 7)
    assert "Name_na" in X_transformed.columns
    assert X_transformed["Name_na"].sum() == 2
def test_add_indicators_to_all_variables_when_variables_is_none(df_na):
    imputer = AddMissingIndicator(missing_only=False, variables=None)
    X_transformed = imputer.fit_transform(df_na)
    assert imputer.variables_ == ["Name", "City", "Studies", "Age", "Marks", "dob"]
    assert X_transformed.shape == (8, 12)
    assert "dob_na" in X_transformed.columns
    assert X_transformed["dob_na"].sum() == 0
def test_detect_variables_with_missing_data_in_variables_entered_by_user(df_na):
    imputer = AddMissingIndicator(
        missing_only=True, variables=["City", "Studies", "Age", "dob"]
    )
    X_transformed = imputer.fit_transform(df_na)
    assert imputer.variables == ["City", "Studies", "Age", "dob"]
    assert imputer.variables_ == ["City", "Studies", "Age"]
    assert X_transformed.shape == (8, 9)
    assert "City_na" in X_transformed.columns
    assert "dob_na" not in X_transformed.columns
    assert X_transformed["City_na"].sum() == 2
def test_detect_variables_with_missing_data_when_variables_is_none(df_na):
    # test case 1: automatically detect variables with missing data
    imputer = AddMissingIndicator(missing_only=True, variables=None)
    X_transformed = imputer.fit_transform(df_na)

    # init params
    assert imputer.missing_only is True
    assert imputer.variables is None

    # fit params
    assert imputer.variables_ == ["Name", "City", "Studies", "Age", "Marks"]
    assert imputer.n_features_in_ == 6

    # transform outputs
    assert X_transformed.shape == (8, 11)
    assert "Name_na" in X_transformed.columns
    assert X_transformed["Name_na"].sum() == 2
def test_get_feature_names_out_from_pipeline(df_na):
    original_features = df_na.columns.to_list()

    tr = Pipeline([("transformer", AddMissingIndicator(missing_only=False))])
    tr.fit(df_na)

    out = [f + "_na" for f in original_features]
    assert tr.get_feature_names_out(input_features=None) == original_features + out
    assert tr.get_feature_names_out(input_features=original_features) == out
    assert tr.get_feature_names_out(input_features=original_features[0:2]) == out[0:2]
    assert tr.get_feature_names_out(input_features=["Name"]) == ["Name_na"]
def create_pipeline(params: dict = None):
    """
    Create a sklearn.pipeline.Pipeline.

    Parameters
    ----------
    params : dict
        Dictionary of parameters for the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # pipeline for numeric variables
    p_num = Pipeline([
        ("num_nan_ind", AddMissingIndicator(missing_only=True)),
        ("rmmean", MeanMedianImputer()),
        ("drop_quasi_constant", DropConstantFeatures(tol=0.97)),
    ])

    # pipeline for categorical variables
    p_cat = Pipeline([
        ("fill_cat_nas", CategoricalImputer(fill_value='MISSING')),
        ("rlc", RareLabelEncoder()),
        ("one_hot_encoder", OneHotEncoder()),
    ])

    # list of pipelines to combine
    transformers = [
        ("num", p_num, make_column_selector(dtype_include=np.number)),
        ("cat", p_cat, make_column_selector(dtype_include=object)),
    ]

    # combine pipelines and add XGBClassifier
    col_transforms = ColumnTransformer(transformers)
    p = Pipeline([
        ("col_transformers", col_transforms),
        ("xgb", XGBClassifier(
            min_child_weight=1,
            gamma=0,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=1,
            gpu_id=0,
            tree_method='gpu_hist',
        )),
    ])

    if params:
        p.set_params(**params)

    return p
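# Usage sketch (not part of the source module; the parameter values below are
# hypothetical). Because the pipeline nests a ColumnTransformer, any step can be
# tuned through the `params` argument using sklearn's "step__substep__param" naming:
pipe = create_pipeline(
    params={
        "col_transformers__num__drop_quasi_constant__tol": 0.99,
        "xgb__max_depth": 4,
    }
)
# the overrides are applied to the nested estimators
assert pipe.get_params()["xgb__max_depth"] == 4
# pipe.fit(X_train, y_train) would then train end to end; note the XGBClassifier
# is configured for GPU ('gpu_hist'), so fitting requires a CUDA-enabled machine
# unless 'xgb__tree_method' is also overridden.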
def test_get_feature_names_out(df_na):
    original_features = df_na.columns.to_list()

    tr = AddMissingIndicator(missing_only=False)
    tr.fit(df_na)

    out = [f + "_na" for f in original_features]
    assert tr.get_feature_names_out(input_features=None) == original_features + out
    assert tr.get_feature_names_out(input_features=original_features) == out
    assert tr.get_feature_names_out(input_features=original_features[0:2]) == out[0:2]
    assert tr.get_feature_names_out(input_features=["Name"]) == ["Name_na"]

    tr = AddMissingIndicator(missing_only=True)
    tr.fit(df_na)

    out = [f + "_na" for f in original_features[0:-1]]
    assert tr.get_feature_names_out(input_features=None) == original_features + out
    assert tr.get_feature_names_out(input_features=original_features) == out

    with pytest.raises(ValueError):
        tr.get_feature_names_out("Name")

    with pytest.raises(ValueError):
        tr.get_feature_names_out(["Name", "hola"])
def test_error_when_missing_only_not_bool():
    with pytest.raises(ValueError):
        AddMissingIndicator(missing_only="missing_only")
X_train = cat_imputer_frequent.transform(X_train)
X_test = cat_imputer_frequent.transform(X_test)

# %% Verify whether there are missing values left.
X_train[cat_vars_with_na].isnull().sum()
[var for var in cat_vars_with_na if X_test[var].isnull().sum() > 0]

# %% Numerical variables.
num_vars = [
    var for var in X_train.columns
    if var not in cat_vars and var != 'SalePrice'
]
vars_with_na = [var for var in num_vars if X_train[var].isnull().sum() > 0]
print(len(vars_with_na))
X_train[vars_with_na].isnull().mean()

# %% Missing values -- Numerical -- add missing indicator.
missing_ind = AddMissingIndicator(variables=vars_with_na)
missing_ind.fit(X_train)
X_train = missing_ind.transform(X_train)
X_test = missing_ind.transform(X_test)

# check the binary missing indicator variables
X_train[['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']].head()

# %% Missing values -- Numerical -- impute with the mean.
mean_imputer = MeanMedianImputer(
    imputation_method='mean',
    variables=vars_with_na,
)
mean_imputer.fit(X_train)
print(mean_imputer.imputer_dict_)
X_train = mean_imputer.transform(X_train)
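# %% Sketch (not in the original script): transform the test set with the fitted
# mean imputer as well, and verify that the numerical variables with NA are now
# fully imputed in both sets. All names reused here come from the cells above.
X_test = mean_imputer.transform(X_test)
assert X_train[vars_with_na].isnull().sum().sum() == 0
assert X_test[vars_with_na].isnull().sum().sum() == 0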
    BoxCoxTransformer,
    LogTransformer,
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)
from feature_engine.wrappers import SklearnTransformerWrapper


# imputation
@parametrize_with_checks(
    [
        MeanMedianImputer(),
        ArbitraryNumberImputer(),
        CategoricalImputer(fill_value=0, ignore_format=True),
        EndTailImputer(),
        AddMissingIndicator(),
        RandomSampleImputer(),
        DropMissingData(),
    ]
)
def test_sklearn_compatible_imputer(estimator, check):
    check(estimator)


# encoding
@parametrize_with_checks(
    [
        CountFrequencyEncoder(ignore_format=True),
        DecisionTreeEncoder(regression=False, ignore_format=True),
        MeanEncoder(ignore_format=True),
        OneHotEncoder(ignore_format=True),
        OrdinalEncoder(ignore_format=True),
        RareLabelEncoder(
def test_non_fitted_error(df_na):
    with pytest.raises(NotFittedError):
        imputer = AddMissingIndicator()
        imputer.transform(df_na)
            CategoricalImputer(
                imputation_method="missing",
                variables=config.model_config.categorical_vars_with_na_missing,
            ),
        ),
        (
            "frequent_imputation",
            CategoricalImputer(
                imputation_method="frequent",
                variables=config.model_config.categorical_vars_with_na_frequent,
            ),
        ),
        # add missing indicator
        (
            "missing_indicator",
            AddMissingIndicator(variables=config.model_config.numerical_vars_with_na),
        ),
        # impute numerical variables with the mean
        (
            "mean_imputation",
            MeanMedianImputer(
                imputation_method="mean",
                variables=config.model_config.numerical_vars_with_na,
            ),
        ),
        # == TEMPORAL VARIABLES ====
        (
            "elapsed_time",
            pp.TemporalVariableTransformer(
                variables=config.model_config.temporal_vars,
                reference_variable=config.model_config.ref_var,