def test_automatically_find_variables_and_return_as_numeric(df_normal_dist):
    """Check variable auto-detection when ``return_object=False``.

    The transformer is fitted on ``df_normal_dist`` without naming any
    variable, and its learned bin edges are compared against the edges
    produced by ``pd.cut`` with the outer limits opened to +/- infinity.
    """
    # test case 1: automatically select variables, return_object=False
    discretiser = EqualWidthDiscretiser(bins=10, variables=None, return_object=False)
    transformed = discretiser.fit_transform(df_normal_dist)

    # fit parameters: reference edges from pandas, with open-ended extremes
    _, expected_edges = pd.cut(
        x=df_normal_dist["var"], bins=10, retbins=True, duplicates="drop"
    )
    expected_edges[0] = float("-inf")
    expected_edges[-1] = float("inf")

    # transform output: allowed bin labels and allowed per-bin counts
    allowed_labels = list(range(10))
    allowed_counts = [18, 17, 16, 13, 11, 7, 7, 5, 5, 1]

    # init params
    assert discretiser.bins == 10
    assert discretiser.variables == ["var"]
    assert discretiser.return_object is False

    # fit params
    assert discretiser.input_shape_ == (100, 1)

    # transform params
    assert (discretiser.binner_dict_["var"] == expected_edges).all()
    assert all(label in allowed_labels for label in transformed["var"].unique())

    # in equal width discretisation, intervals get different number of values
    assert all(
        count in allowed_counts for count in transformed["var"].value_counts()
    )
def test_custom_models_template(scenario_with_custom_models_template):
    """Every discretiser must be paired with every custom model.

    The comparison is done on the string representation of the pipelines,
    so the expected list has to be built in the same order as the output:
    discretisers in the outer loop, models in the inner loop.
    """
    aml, pipeline, param_grid = scenario_with_custom_models_template
    produced = aml._make_aml_combinations(pipeline, param_grid)

    discretiser_steps = (
        ("disc1", EqualFrequencyDiscretiser),
        ("disc2", EqualWidthDiscretiser),
    )
    model_steps = (
        ("model1", LinearRegression),
        ("model2", RandomForestRegressor),
    )
    # A fresh estimator instance is created for every expected pipeline.
    expected = [
        Pipeline(steps=[(disc_name, disc_cls()), (model_name, model_cls())])
        for disc_name, disc_cls in discretiser_steps
        for model_name, model_cls in model_steps
    ]

    assert str(produced) == str(expected)
def scenario_with_default_models_template():
    """Build a search scenario that uses the default regressor template.

    The last pipeline entry is the first two items of
    ``aml_basic_regressors`` passed as a single list, not a
    ``(name, estimator)`` tuple.

    Returns
    -------
    tuple
        ``(aml, pipeline, param_grid)`` where ``param_grid`` is empty.
    """
    steps = [
        ("disc1", EqualFrequencyDiscretiser()),
        ("disc2", EqualWidthDiscretiser()),
        aml_basic_regressors[:2],
    ]
    pipeline = Pipeline(steps)
    param_grid = {}
    return AMLGridSearchCV(pipeline, param_grid), pipeline, param_grid
def scenario_with_grid_search_for_one_model():
    """Build a search scenario with a non-empty parameter grid.

    The grid lists two values for ``disc1__q`` and an empty list under the
    ``model2__*`` key.

    Returns
    -------
    tuple
        ``(aml, pipeline, param_grid)``.
    """
    steps = [
        ("disc1", EqualFrequencyDiscretiser()),
        ("disc2", EqualWidthDiscretiser()),
        ("model1", LinearRegression()),
        ("model2", RandomForestRegressor()),
    ]
    pipeline = Pipeline(steps)
    param_grid = {
        "disc1__q": [5, 15],
        "model2__*": [],
    }
    return AMLGridSearchCV(pipeline, param_grid), pipeline, param_grid
def scenario_with_custom_models_template():
    """Build a search scenario whose models are given as a custom template.

    The two regressors are passed to the pipeline as one list entry rather
    than as separate ``(name, estimator)`` steps.

    Returns
    -------
    tuple
        ``(aml, pipeline, param_grid)`` where ``param_grid`` is empty.
    """
    regressors = [
        ("model1", LinearRegression()),
        ("model2", RandomForestRegressor()),
    ]
    steps = [
        ("disc1", EqualFrequencyDiscretiser()),
        ("disc2", EqualWidthDiscretiser()),
        regressors,
    ]
    pipeline = Pipeline(steps)
    param_grid = {}
    return AMLGridSearchCV(pipeline, param_grid), pipeline, param_grid
def scenario_without_params():
    """Build a search scenario with two discretisers, two models and no grid.

    Returns
    -------
    tuple
        ``(aml, pipeline, param_grid)`` where ``param_grid`` is empty.
    """
    steps = [
        ("disc1", EqualFrequencyDiscretiser()),
        ("disc2", EqualWidthDiscretiser()),
        ("model1", LinearRegression()),
        ("model2", RandomForestRegressor()),
    ]
    pipeline = Pipeline(steps)
    param_grid = {}
    return AMLGridSearchCV(pipeline, param_grid), pipeline, param_grid
def _make_discretiser(self):
    """
    Instantiate the EqualWidthDiscretiser or EqualFrequencyDiscretiser.

    ``self.strategy == "equal_width"`` selects the equal-width transformer;
    any other value selects the equal-frequency one. Both are configured
    with ``self.bins``, ``self.variables_numerical_`` and
    ``return_boundaries=True``.
    """
    if self.strategy != "equal_width":
        return EqualFrequencyDiscretiser(
            q=self.bins,
            variables=self.variables_numerical_,
            return_boundaries=True,
        )

    return EqualWidthDiscretiser(
        bins=self.bins,
        variables=self.variables_numerical_,
        return_boundaries=True,
    )
def _make_numerical_pipeline(self):
    """Assemble the two-step pipeline applied to the numerical variables.

    Step ``"discretization"`` is an equal-width discretiser when
    ``self.strategy == "equal_width"`` and an equal-frequency one
    otherwise, created with ``return_object=True``. Step ``"encoder"`` is
    a ``MeanEncoder`` over the same variables.
    """
    use_equal_width = self.strategy == "equal_width"
    discretiser_cls = (
        EqualWidthDiscretiser if use_equal_width else EqualFrequencyDiscretiser
    )
    # The two discretisers name their "number of intervals" argument differently.
    size_kwarg = "bins" if use_equal_width else "q"

    binning_step = discretiser_cls(
        **{size_kwarg: self.bins},
        variables=self.variables_numerical_,
        return_object=True,
    )
    encoding_step = MeanEncoder(variables=self.variables_numerical_)

    return Pipeline(
        [
            ("discretization", binning_step),
            ("encoder", encoding_step),
        ]
    )
def test_4th_step_in_scenario_without_params(scenario_without_params):
    """The fourth generated pipeline must pair ``disc2`` with ``model2``.

    Steps are compared through their string representation.
    """
    aml, pipeline, param_grid = scenario_without_params
    combinations = aml._make_aml_combinations(pipeline, param_grid)

    expected_steps = [
        ("disc2", EqualWidthDiscretiser()),
        ("model2", RandomForestRegressor()),
    ]
    fourth_pipeline = combinations[3]

    assert str(fourth_pipeline.steps) == str(expected_steps)
def test_non_fitted_error(df_vartypes):
    """Calling ``transform`` on an unfitted transformer raises ``NotFittedError``."""
    with pytest.raises(NotFittedError):
        EqualWidthDiscretiser().transform(df_vartypes)
def test_error_if_input_df_contains_na_in_transform(df_vartypes, df_na):
    """``transform`` must raise ``ValueError`` when the input contains NA.

    Construction, fitting and column selection happen outside the
    ``pytest.raises`` block, so a ``ValueError`` raised while fitting
    cannot make the test pass for the wrong reason.
    """
    # test case 4: when dataset contains na, transform method
    transformer = EqualWidthDiscretiser()
    transformer.fit(df_vartypes)
    df_with_na = df_na[["Name", "City", "Age", "Marks", "dob"]]

    # Only the call under test is guarded.
    with pytest.raises(ValueError):
        transformer.transform(df_with_na)
def test_error_if_input_df_contains_na_in_fit(df_na):
    """Fitting on ``df_na`` is expected to raise ``ValueError``."""
    # test case 3: when dataset contains na, fit method
    with pytest.raises(ValueError):
        EqualWidthDiscretiser().fit(df_na)
def test_error_if_return_object_not_bool():
    """A non-boolean ``return_object`` is rejected at construction time."""
    invalid_flag = "other"
    with pytest.raises(ValueError):
        EqualWidthDiscretiser(return_object=invalid_flag)
def test_error_when_bins_not_number():
    """A non-numeric ``bins`` value is rejected at construction time."""
    invalid_bins = "other"
    with pytest.raises(ValueError):
        EqualWidthDiscretiser(bins=invalid_bins)
def test_automatically_find_variables_and_return_as_object(df_normal_dist):
    """With ``return_object=True`` the transformed column has object dtype."""
    discretiser = EqualWidthDiscretiser(bins=10, variables=None, return_object=True)
    transformed = discretiser.fit_transform(df_normal_dist)

    assert transformed["var"].dtypes == "O"
def fit(self, X: pd.DataFrame, y: pd.Series = None):
    """
    Find features with high PSI values.

    The input is split into a basis and a test set, every selected numerical
    variable is discretised on the basis set, and the PSI between the two
    per-bin distributions is stored in ``psi_values_``. Features whose PSI
    exceeds ``self.threshold`` are collected in ``features_to_drop_``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training dataset.

    y : pandas series. Default = None
        y is not needed in this transformer. You can pass y or None.

    Returns
    -------
    self
        The fitted transformer.

    Raises
    ------
    ValueError
        If the basis or the test set has fewer rows than ``self.bins``.
        The input checks called at the start may raise as well; their
        exception types are not visible from this method.
    """
    # check input dataframe
    X = check_X(X)

    # If required exclude variables that are not in the input dataframe
    self._confirm_variables(X)

    # find numerical variables or check those entered are present in the dataframe
    self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

    # Remove the split_col from the variables list. It might be added if the
    # variables are not defined at initialization.
    if self.split_col in self.variables_:
        self.variables_.remove(self.split_col)

    if self.missing_values == "raise":
        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

    # Split the dataframe into basis and test.
    basis_df, test_df = self._split_dataframe(X)

    # Check the shape of the returned dataframes for PSI calculations.
    # The number of observations must be at least equal to the
    # number of bins.
    if min(basis_df.shape[0], test_df.shape[0]) < self.bins:
        raise ValueError(
            "The number of rows in the basis and test datasets that will be used "
            f"in the PSI calculations must be at least {self.bins}. "
            "After splitting the original dataset based on the given cut_off or "
            f"split_frac we have {basis_df.shape[0]} samples in the basis set, "
            f"and {test_df.shape[0]} samples in the test set. "
            "Please adjust the value of the cut_off or split_frac."
        )

    # Switch basis and test dataframes if required.
    if self.switch:
        test_df, basis_df = basis_df, test_df

    # set up the discretizer
    if self.strategy == "equal_width":
        bucketer = EqualWidthDiscretiser(bins=self.bins)
    else:
        bucketer = EqualFrequencyDiscretiser(q=self.bins)

    # Compute the PSI by looping over the features
    self.psi_values_ = {}
    self.features_to_drop_ = []

    for feature in self.variables_:
        # Discretize the features. The bin limits are learned on the basis set
        # only and then applied to the test set; rows with NA in this feature
        # are dropped from both before binning.
        basis_discrete = bucketer.fit_transform(basis_df[[feature]].dropna())
        test_discrete = bucketer.transform(test_df[[feature]].dropna())

        # Determine percentage of observations per bin
        basis_distrib, test_distrib = self._observation_frequency_per_bin(
            basis_discrete, test_discrete
        )

        # Calculate the PSI value
        self.psi_values_[feature] = np.sum(
            (test_distrib - basis_distrib) * np.log(test_distrib / basis_distrib)
        )

        # Assess if feature should be dropped
        if self.psi_values_[feature] > self.threshold:
            self.features_to_drop_.append(feature)

    # save input features
    self._get_feature_names_in(X)

    return self
import numpy as np
import pytest
from sklearn.utils.estimator_checks import check_estimator

from feature_engine.discretisation import (
    ArbitraryDiscretiser,
    DecisionTreeDiscretiser,
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)
from tests.estimator_checks.estimator_checks import check_feature_engine_estimator

# Discretisers exercised by both estimator-check suites below.
# ``np.inf`` is used instead of the ``np.Inf`` alias, which NumPy 2.0 removed.
_estimators = [
    DecisionTreeDiscretiser(regression=False),
    EqualFrequencyDiscretiser(),
    EqualWidthDiscretiser(),
    ArbitraryDiscretiser(binning_dict={"0": [-np.inf, 0, np.inf]}),
]


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_sklearn(estimator):
    """Run scikit-learn's ``check_estimator`` on each discretiser."""
    # The result is deliberately not returned: a test should return None.
    check_estimator(estimator)


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_feature_engine(estimator):
    """Run ``check_feature_engine_estimator`` on each discretiser."""
    if estimator.__class__.__name__ == "ArbitraryDiscretiser":
        # This suite gets a binning_dict keyed by "var_1" instead of "0".
        estimator.set_params(binning_dict={"var_1": [-np.inf, 0, np.inf]})
    check_feature_engine_estimator(estimator)
min_lr=0.01) early_stop = EarlyStopping(monitor='val_loss', mode='min', min_delta=0, verbose=1, patience=20) pump_pipeline = Pipeline( steps=[("feature_to_keeper", pp.FeatureKeeper(variables_to_keep=config.VARIABLES_TO_KEEP)), ("missing_imputer", pp.MissingImputer(numerical_variables=config.NUMERICAL_VARIABLES)), ("yeoJohnson", YeoJohnsonTransformer(variables=config.YEO_JHONSON_VARIABLES)), ("discretization", EqualWidthDiscretiser(bins=5, variables=config.NUMERICAL_VARIABLES) ), ("categorical_grouper", pp.CategoricalGrouping(config_dict=config.VARIABLES_TO_GROUP)), ("rareCategories_grouper", pp.RareCategoriesGrouping(threshold=config.VARIABLES_THRESHOLD)), ("one_hot_encoder", OneHotEncoder(variables=config.REAL_CATEGORICAL_VARIABLES, drop_last=False)), ("scaler", MinMaxScaler()), ("model", KerasClassifier(build_fn=create_model, epochs=1, validation_split=0.2, batch_size=256, verbose=1, callbacks=[early_stop, reduce_lr],