# Parametrized over each attribute present in the ``params`` dict below.
@pytest.mark.parametrize("attribute",
                         ["regroup", "regroup_name", "keep_missing",
                          "category_size_threshold", "p_value_threshold",
                          "scale_contingency_table", "forced_categories",
                          "_cleaned_categories_by_column"])
def test_set_attributes_from_dict(self, attribute):
    processor = CategoricalDataProcessor()

    cleaned_categories = ["a", "b", "c"]
    params = {
        "regroup": True,
        "regroup_name": "Other",
        "keep_missing": True,
        "category_size_threshold": 5,
        "p_value_threshold": 0.001,
        "scale_contingency_table": True,
        "forced_categories": {},
        "_cleaned_categories_by_column": {"variable": cleaned_categories}
    }

    expected = params[attribute]
    if attribute == "_cleaned_categories_by_column":
        # list is transformed to a set in CategoricalDataProcessor
        expected = {"variable": set(cleaned_categories)}

    processor.set_attributes_from_dict(params)

    actual = getattr(processor, attribute)

    assert actual == expected
def test_attributes_to_dict(self):
    processor = CategoricalDataProcessor()

    cleaned_categories = ["a", "b", "c"]
    processor._cleaned_categories_by_column = {
        "variable": set(cleaned_categories)
    }

    actual = processor.attributes_to_dict()

    expected = {
        "regroup": True,
        "regroup_name": "Other",
        "keep_missing": True,
        "category_size_threshold": 5,
        "p_value_threshold": 0.001,
        "scale_contingency_table": True,
        "forced_categories": {},
        "_cleaned_categories_by_column": {
            "variable": list(set(cleaned_categories))
        }
    }

    assert actual == expected
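# Round-trip sketch: attributes_to_dict() emits JSON-serializable values
# (sets become lists), so a processor's state can be persisted and restored,
# as the two tests above verify. The import path is an assumption, not
# confirmed by this excerpt.
#
#     import json
#     from cobra.preprocessing import CategoricalDataProcessor
#
#     processor = CategoricalDataProcessor()
#     processor._cleaned_categories_by_column = {"variable": {"a", "b", "c"}}
#     serialized = json.dumps(processor.attributes_to_dict())
#
#     restored = CategoricalDataProcessor()
#     restored.set_attributes_from_dict(json.loads(serialized))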
def test_all_cats_not_significant(self):
    # Expected
    e = {
        'categorical_var': ['A', 'A', 'A', 'A',
                            'B', 'B', 'B', 'B',
                            'C', 'C', 'C', 'C'],
        'target': [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0],
        'categorical_var_processed': ['A', 'A', 'A', 'A',
                                      'B', 'B', 'B', 'B',
                                      'C', 'C', 'C', 'C']
    }

    # data -> actual
    d = {
        'categorical_var': ['A', 'A', 'A', 'A',
                            'B', 'B', 'B', 'B',
                            'C', 'C', 'C', 'C'],
        'target': [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0]
    }

    discrete_vars = ['categorical_var']
    target_column_name = 'target'

    data = pd.DataFrame(d, columns=['categorical_var', 'target'])
    expected = pd.DataFrame(
        e,
        columns=['categorical_var', 'target', 'categorical_var_processed'])

    categorical_data_processor = CategoricalDataProcessor(
        category_size_threshold=0,
        p_value_threshold=0.0001)

    categorical_data_processor.fit(data, discrete_vars, target_column_name)

    actual = categorical_data_processor.transform(data, discrete_vars)

    pd.testing.assert_frame_equal(actual, expected)
# The parametrize decorator supplying cleaned_categories and the matching
# expected Series is omitted in this excerpt.
def test_replace_categories(self, cleaned_categories, expected):
    data = pd.Series(data=["c1", "c2", "c3", "c4"])

    actual = (CategoricalDataProcessor
              ._replace_categories(data, cleaned_categories))

    pd.testing.assert_series_equal(actual, expected)
def test_replace_missings(self):
    data = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]})
    expected = pd.DataFrame(
        {"variable": ["c1", "c2", "Missing", "Missing", "Missing"]})

    actual = (CategoricalDataProcessor
              ._replace_missings(data, ["variable"]))

    pd.testing.assert_frame_equal(actual, expected)
def test_force_category(self):
    # Expected
    e = {
        'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
                            'B', 'B', 'B', 'B', 'B', 'B',
                            'C', 'C', 'C', 'C', 'C', 'C'],
        'target': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0],
        'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A',
                                      'B', 'B', 'B', 'B', 'B', 'B',
                                      'C', 'C', 'C', 'C', 'C', 'C']
    }

    # data -> actual
    d = {
        'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
                            'B', 'B', 'B', 'B', 'B', 'B',
                            'C', 'C', 'C', 'C', 'C', 'C'],
        'target': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0]
    }

    discrete_vars = ['categorical_var']
    target_column_name = 'target'

    data = pd.DataFrame(d, columns=['categorical_var', 'target'])
    expected = pd.DataFrame(
        e,
        columns=['categorical_var', 'target', 'categorical_var_processed'])
    expected['categorical_var_processed'] = (
        expected['categorical_var_processed'].astype("category"))

    categorical_data_processor = CategoricalDataProcessor(
        category_size_threshold=0,
        forced_categories={'categorical_var': ['C']},
        p_value_threshold=0.05)

    categorical_data_processor.fit(data, discrete_vars, target_column_name)

    actual = categorical_data_processor.transform(data, discrete_vars)

    pd.testing.assert_frame_equal(actual, expected)
# Parametrized over scale_contingency_table and the matching expected
# p-value (decorator omitted in this excerpt).
def test_compute_p_value(self, scale_contingency_table, expected):
    X = pd.Series(data=(["c1"] * 70 + ["c2"] * 20 + ["c3"] * 10))
    y = pd.Series(data=([0] * 35 + [1] * 35
                        + [0] * 15 + [1] * 5
                        + [0] * 8 + [1] * 2))
    category = "c1"

    actual = (CategoricalDataProcessor
              ._compute_p_value(X, y, category, scale_contingency_table))

    assert pytest.approx(actual, abs=1e-5) == expected
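# How such a per-category p-value can be computed for a binary target: build
# a 2x2 contingency table ("is this category" vs. target value) and run a
# chi-squared test. This is a sketch of the general technique; the exact
# table construction and the scaling step inside _compute_p_value may differ.
#
#     import pandas as pd
#     from scipy import stats
#
#     X = pd.Series(["c1"] * 70 + ["c2"] * 20 + ["c3"] * 10)
#     y = pd.Series([0] * 35 + [1] * 35 + [0] * 15 + [1] * 5
#                   + [0] * 8 + [1] * 2)
#
#     contingency_table = pd.crosstab(X == "c1", y)
#     # chi2_contingency returns (statistic, p-value, dof, expected freqs)
#     _, p_value, _, _ = stats.chi2_contingency(contingency_table,
#                                               correction=False)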
def test_get_small_categories(self):
    data = pd.Series(data=(["c1"] * 50 + ["c2"] * 25
                           + ["c3"] * 15 + ["c4"] * 5))
    incidence = 0.35
    threshold = 10  # chosen to make the expected result easy to compute manually

    expected = {"c3", "c4"}

    actual = (CategoricalDataProcessor
              ._get_small_categories(data, incidence, threshold))

    assert actual == expected
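# Why c3 (count 15) is "small" despite threshold = 10: the expected output is
# consistent with counts being scaled by max(incidence, 1 - incidence) before
# comparison. This is an inference from the test data, not confirmed by this
# excerpt.
#
#     counts = {"c1": 50, "c2": 25, "c3": 15, "c4": 5}
#     factor = max(0.35, 1 - 0.35)  # 0.65
#     small = {cat for cat, n in counts.items() if n * factor <= 10}
#     assert small == {"c3", "c4"}  # c3: 9.75 and c4: 3.25 fall below 10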
# Parametrized over seed and the matching expected p-value (decorator
# omitted in this excerpt).
def test_compute_p_value_regression(self, seed, expected):
    np.random.seed(seed)

    X = pd.Series(data=(["c1"] * 70 + ["c2"] * 20 + ["c3"] * 10))
    y = pd.Series(data=np.random.uniform(0, 1, 100) * 5)
    category = "c1"

    actual = (CategoricalDataProcessor
              ._compute_p_value(X, y, category, "regression", None))

    assert pytest.approx(actual, abs=1e-5) == expected
@classmethod
def from_pipeline(cls, pipeline: dict):
    """Constructor to instantiate PreProcessor from a (fitted) pipeline
    which was stored as a JSON file and passed to this function as a dict.

    Parameters
    ----------
    pipeline : dict
        The (fitted) pipeline as a dictionary.

    Returns
    -------
    PreProcessor
        Instance of PreProcessor instantiated from a stored pipeline.

    Raises
    ------
    ValueError
        If the loaded pipeline does not contain exactly the required
        parameters.
    """
    if not PreProcessor._is_valid_pipeline(pipeline):
        raise ValueError("Invalid pipeline, as it does not "
                         "contain all and only the required parameters.")

    categorical_data_processor = CategoricalDataProcessor()
    categorical_data_processor.set_attributes_from_dict(
        pipeline["categorical_data_processor"])

    discretizer = KBinsDiscretizer()
    discretizer.set_attributes_from_dict(pipeline["discretizer"])

    target_encoder = TargetEncoder()
    target_encoder.set_attributes_from_dict(pipeline["target_encoder"])

    return cls(categorical_data_processor, discretizer, target_encoder,
               is_fitted=pipeline["_is_fitted"])
@classmethod
def from_pipeline(cls, pipeline_path: str):
    """Constructor to instantiate PreProcessor from a (fitted) pipeline,
    stored as a JSON file.

    Parameters
    ----------
    pipeline_path : str
        Path to the (fitted) pipeline.

    Returns
    -------
    PreProcessor
        Instance of PreProcessor instantiated from a stored pipeline.

    Raises
    ------
    ValueError
        If the loaded pipeline does not contain exactly the required
        parameters.
    """
    with open(pipeline_path, "r") as file:
        pipeline = json.load(file)

    if not PreProcessor._is_valid_pipeline(pipeline):
        raise ValueError("Invalid pipeline, as it does not "
                         "contain all and only the required parameters.")

    categorical_data_processor = CategoricalDataProcessor()
    categorical_data_processor.set_attributes_from_dict(
        pipeline["categorical_data_processor"])

    discretizer = KBinsDiscretizer()
    discretizer.set_attributes_from_dict(pipeline["discretizer"])

    target_encoder = TargetEncoder()
    target_encoder.set_attributes_from_dict(pipeline["target_encoder"])

    return cls(categorical_data_processor, discretizer, target_encoder,
               is_fitted=pipeline["_is_fitted"])
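# Usage sketch: persist a fitted pipeline as JSON, then restore it. The file
# name and the serialization shape are illustrative; the keys mirror the ones
# read back by from_pipeline above.
#
#     import json
#
#     pipeline = {
#         "categorical_data_processor":
#             categorical_data_processor.attributes_to_dict(),
#         "discretizer": discretizer.attributes_to_dict(),
#         "target_encoder": target_encoder.attributes_to_dict(),
#         "_is_fitted": True,
#     }
#     with open("pipeline.json", "w") as file:
#         json.dump(pipeline, file)
#
#     preprocessor = PreProcessor.from_pipeline("pipeline.json")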
@classmethod
def from_params(cls,
                n_bins: int = 10,
                strategy: str = "quantile",
                closed: str = "right",
                auto_adapt_bins: bool = False,
                starting_precision: int = 0,
                label_format: str = "{} - {}",
                change_endpoint_format: bool = False,
                regroup: bool = True,
                regroup_name: str = "Other",
                keep_missing: bool = True,
                category_size_threshold: int = 5,
                p_value_threshold: float = 0.001,
                scale_contingency_table: bool = True,
                forced_categories: dict = {},
                weight: float = 0.0,
                imputation_strategy: str = "mean"):
    """Constructor to instantiate PreProcessor from all the parameters
    that can be set in its required (attribute) classes, along with
    sensible default values.

    Parameters
    ----------
    n_bins : int, optional
        Number of bins to produce. Raises ValueError if ``n_bins < 2``.
    strategy : str, optional
        Binning strategy. Currently only ``uniform`` and ``quantile``
        (i.e. equal frequency) are supported.
    closed : str, optional
        Whether to close the bins (intervals) from the left or right.
    auto_adapt_bins : bool, optional
        Reduces the number of bins (starting from ``n_bins``) as a
        function of the number of missing values.
    starting_precision : int, optional
        Initial precision for the bin edges to start from; can also be
        negative. Given a list of bin edges, the class will automatically
        choose the minimal precision required to have proper bins, e.g.
        ``[5.5555, 5.5744, ...]`` will be rounded to ``[5.56, 5.57, ...]``.
        In case of a negative number, an attempt will be made to round up
        the numbers of the bin edges, e.g. ``5.55 -> 10``, ``146 -> 100``, ...
    label_format : str, optional
        Format string to display the bin labels, e.g. ``min - max``,
        ``(min, max]``, ...
    change_endpoint_format : bool, optional
        Whether or not to change the format of the lower and upper bins
        into ``< x`` and ``> y`` respectively.
    regroup : bool
        Whether or not to regroup categories.
    regroup_name : str
        New name for the non-significant regrouped categories.
    keep_missing : bool
        Whether or not to keep missing as a separate category.
    category_size_threshold : int
        Minimal size of a category to keep it as a separate category.
    p_value_threshold : float
        Significance threshold for regrouping.
    scale_contingency_table : bool
        Whether the contingency table should be scaled before the
        chi-squared test.
    forced_categories : dict
        Map to prevent certain categories from being grouped into
        ``Other`` for each column; dict of the form
        ``{col: [forced categories]}``.
    weight : float, optional
        Smoothing parameter (non-negative). The higher the value of the
        parameter, the bigger the contribution of the overall mean.
        When set to zero, there is no smoothing (i.e. the pure target
        incidence is used).
    imputation_strategy : str, optional
        In case a particular column contains new categories, the encoding
        will lead to NULL values which should be imputed. Valid strategies
        are to replace with the global mean of the train set or the min
        (resp. max) incidence of the categories of that particular
        variable.

    Returns
    -------
    PreProcessor
        Instance of PreProcessor instantiated with the given parameters.
    """
    categorical_data_processor = CategoricalDataProcessor(
        regroup,
        regroup_name,
        keep_missing,
        category_size_threshold,
        p_value_threshold,
        scale_contingency_table,
        forced_categories)

    discretizer = KBinsDiscretizer(n_bins, strategy, closed,
                                   auto_adapt_bins, starting_precision,
                                   label_format, change_endpoint_format)

    target_encoder = TargetEncoder(weight, imputation_strategy)

    return cls(categorical_data_processor, discretizer, target_encoder)
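# Construction sketch: from_params wires the three components together with
# defaults, so only the knobs you care about need overriding. The column and
# category names below are illustrative.
#
#     preprocessor = PreProcessor.from_params(
#         n_bins=8,
#         p_value_threshold=0.01,
#         forced_categories={"channel": ["web"]},  # keep "web" out of "Other"
#         weight=0.5,
#     )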