# Example #1
    def test_set_attributes_from_dict(self, attribute):
        """set_attributes_from_dict should restore every stored attribute.

        ``attribute`` is parametrized with each key of the serialized
        parameter dict below.
        """
        categories = ["a", "b", "c"]
        stored_params = {
            "regroup": True,
            "regroup_name": "Other",
            "keep_missing": True,
            "category_size_threshold": 5,
            "p_value_threshold": 0.001,
            "scale_contingency_table": True,
            "forced_categories": {},
            "_cleaned_categories_by_column": {
                "variable": categories
            }
        }

        # list is transformed to a set in CategoricalDataProcessor
        if attribute == "_cleaned_categories_by_column":
            expected = {"variable": set(categories)}
        else:
            expected = stored_params[attribute]

        processor = CategoricalDataProcessor()
        processor.set_attributes_from_dict(stored_params)

        assert getattr(processor, attribute) == expected
# Example #2
    def test_attributes_to_dict(self):
        """attributes_to_dict should serialize state into JSON-safe types
        (fitted category sets become lists)."""
        categories = ["a", "b", "c"]

        processor = CategoricalDataProcessor()
        processor._cleaned_categories_by_column = {
            "variable": set(categories)
        }

        expected = {
            "regroup": True,
            "regroup_name": "Other",
            "keep_missing": True,
            "category_size_threshold": 5,
            "p_value_threshold": 0.001,
            "scale_contingency_table": True,
            "forced_categories": {},
            "_cleaned_categories_by_column": {
                "variable": list(set(categories))
            }
        }

        assert processor.attributes_to_dict() == expected
    def test_all_cats_not_significant(self):
        """With a very strict p-value threshold, no category is regrouped
        and the processed column mirrors the input column."""
        categories = ["A"] * 4 + ["B"] * 4 + ["C"] * 4
        targets = [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0]

        data = pd.DataFrame(
            {"categorical_var": categories, "target": targets},
            columns=["categorical_var", "target"])

        expected = pd.DataFrame(
            {
                "categorical_var": categories,
                "target": targets,
                "categorical_var_processed": categories,
            },
            columns=["categorical_var", "target",
                     "categorical_var_processed"])

        processor = CategoricalDataProcessor(
            category_size_threshold=0, p_value_threshold=0.0001)

        processor.fit(data, ["categorical_var"], "target")
        actual = processor.transform(data, ["categorical_var"])

        pd.testing.assert_frame_equal(actual, expected)
# Example #4
    def test_replace_categories(self, cleaned_categories, expected):
        """_replace_categories keeps values in ``cleaned_categories`` and
        replaces the rest (parametrized expectations)."""
        series = pd.Series(data=["c1", "c2", "c3", "c4"])

        result = CategoricalDataProcessor._replace_categories(
            series, cleaned_categories)

        pd.testing.assert_series_equal(result, expected)
# Example #5
    def test_replace_missings(self):
        """NaN, empty and whitespace-only strings are replaced by the
        literal category "Missing"."""
        frame = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]})

        result = CategoricalDataProcessor._replace_missings(
            frame, ["variable"])

        expected = pd.DataFrame(
            {"variable": ["c1", "c2", "Missing", "Missing", "Missing"]})
        pd.testing.assert_frame_equal(result, expected)
    def test_force_category(self):
        """A category listed in ``forced_categories`` must survive
        regrouping, so the processed column equals the input column."""
        categories = ["A"] * 6 + ["B"] * 6 + ["C"] * 6
        targets = [1] * 6 + [0] * 6 + [1, 0, 1, 0, 1, 0]

        data = pd.DataFrame(
            {"categorical_var": categories, "target": targets},
            columns=["categorical_var", "target"])

        expected = pd.DataFrame(
            {
                "categorical_var": categories,
                "target": targets,
                "categorical_var_processed": categories,
            },
            columns=["categorical_var", "target",
                     "categorical_var_processed"])
        # transform() emits the processed column as a pandas Categorical
        expected["categorical_var_processed"] = (
            expected["categorical_var_processed"].astype("category"))

        processor = CategoricalDataProcessor(
            category_size_threshold=0,
            forced_categories={"categorical_var": ["C"]},
            p_value_threshold=0.05)

        processor.fit(data, ["categorical_var"], "target")
        actual = processor.transform(data, ["categorical_var"])

        pd.testing.assert_frame_equal(actual, expected)
# Example #7
    def test_compute_p_value(self, scale_contingency_table, expected):
        """_compute_p_value for one category against a binary target,
        with and without contingency-table scaling (parametrized)."""
        X = pd.Series(data=["c1"] * 70 + ["c2"] * 20 + ["c3"] * 10)
        y = pd.Series(
            data=[0] * 35 + [1] * 35 + [0] * 15 + [1] * 5
            + [0] * 8 + [1] * 2)

        actual = CategoricalDataProcessor._compute_p_value(
            X, y, "c1", scale_contingency_table)

        assert pytest.approx(actual, abs=1e-5) == expected
# Example #8
    def test_get_small_categories(self):
        """Categories whose size falls below the threshold are reported."""
        series = pd.Series(
            data=["c1"] * 50 + ["c2"] * 25 + ["c3"] * 15 + ["c4"] * 5)
        incidence = 0.35
        # a threshold of 10 keeps the expected set easy to compute by hand
        threshold = 10

        result = CategoricalDataProcessor._get_small_categories(
            series, incidence, threshold)

        assert result == {"c3", "c4"}
# Example #9
    def test_compute_p_value_regression(self, seed, expected):
        """Regression-mode p-value on a seeded random continuous target
        (parametrized by seed and expected value)."""
        np.random.seed(seed)

        X = pd.Series(data=["c1"] * 70 + ["c2"] * 20 + ["c3"] * 10)
        y = pd.Series(data=np.random.uniform(0, 1, 100) * 5)

        actual = CategoricalDataProcessor._compute_p_value(
            X, y, "c1", "regression", None)

        assert pytest.approx(actual, abs=1e-5) == expected
# Example #10
    def from_pipeline(cls, pipeline: dict):
        """Constructor to instantiate PreProcessor from a (fitted) pipeline
        which was stored as a JSON file and passed to this function as a dict.

        Parameters
        ----------
        pipeline : dict
            The (fitted) pipeline as a dictionary.

        Returns
        -------
        PreProcessor
            Instance of PreProcessor instantiated from a stored pipeline.

        Raises
        ------
        ValueError
            If the loaded pipeline does not have all required parameters
            and no others.
        """

        if not PreProcessor._is_valid_pipeline(pipeline):
            raise ValueError("Invalid pipeline, as it does not "
                             "contain all and only the required parameters.")

        # Restore each fitted sub-processor from its serialized attributes.
        categorical_data_processor = CategoricalDataProcessor()
        categorical_data_processor.set_attributes_from_dict(
            pipeline["categorical_data_processor"])

        discretizer = KBinsDiscretizer()
        discretizer.set_attributes_from_dict(pipeline["discretizer"])

        target_encoder = TargetEncoder()
        target_encoder.set_attributes_from_dict(pipeline["target_encoder"])

        return cls(categorical_data_processor,
                   discretizer,
                   target_encoder,
                   is_fitted=pipeline["_is_fitted"])
# Example #11
    def from_pipeline(cls, pipeline_path: str):
        """Constructor to instantiate PreProcessor from a (fitted) pipeline,
        stored as a JSON file.

        Parameters
        ----------
        pipeline_path : str
            Path to the (fitted) pipeline JSON file.

        Returns
        -------
        PreProcessor
            Instance of PreProcessor instantiated from a stored pipeline.

        Raises
        ------
        ValueError
            If the loaded pipeline does not have all required parameters
            and no others.
        """
        with open(pipeline_path, "r") as file:
            pipeline = json.load(file)

        # Specific message instead of the bare "Invalid pipeline" placeholder,
        # matching the dict-based from_pipeline constructor.
        if not PreProcessor._is_valid_pipeline(pipeline):
            raise ValueError("Invalid pipeline, as it does not "
                             "contain all and only the required parameters.")

        categorical_data_processor = CategoricalDataProcessor()
        categorical_data_processor.set_attributes_from_dict(
            pipeline["categorical_data_processor"]
        )

        discretizer = KBinsDiscretizer()
        discretizer.set_attributes_from_dict(pipeline["discretizer"])

        target_encoder = TargetEncoder()
        target_encoder.set_attributes_from_dict(pipeline["target_encoder"])

        return cls(categorical_data_processor, discretizer, target_encoder,
                   is_fitted=pipeline["_is_fitted"])
# Example #12
    def from_pipeline(cls, pipeline: dict):
        """Constructor to instantiate PreProcessor from a (fitted) pipeline
        which was stored as a JSON file and passed to this function as a dict.

        Parameters
        ----------
        pipeline : dict
            The (fitted) pipeline as a dictionary.

        Returns
        -------
        PreProcessor
            Instance of PreProcessor instantiated from a stored pipeline.

        Raises
        ------
        ValueError
            If the loaded pipeline does not have all required parameters
            and no others.
        """

        # Specific message instead of the bare "Invalid pipeline" placeholder.
        if not PreProcessor._is_valid_pipeline(pipeline):
            raise ValueError("Invalid pipeline, as it does not "
                             "contain all and only the required parameters.")

        categorical_data_processor = CategoricalDataProcessor()
        categorical_data_processor.set_attributes_from_dict(
            pipeline["categorical_data_processor"])

        discretizer = KBinsDiscretizer()
        discretizer.set_attributes_from_dict(pipeline["discretizer"])

        target_encoder = TargetEncoder()
        target_encoder.set_attributes_from_dict(pipeline["target_encoder"])

        return cls(categorical_data_processor,
                   discretizer,
                   target_encoder,
                   is_fitted=pipeline["_is_fitted"])
# Example #13
    def from_params(cls,
                    n_bins: int = 10,
                    strategy: str = "quantile",
                    closed: str = "right",
                    auto_adapt_bins: bool = False,
                    starting_precision: int = 0,
                    label_format: str = "{} - {}",
                    change_endpoint_format: bool = False,
                    regroup: bool = True,
                    regroup_name: str = "Other",
                    keep_missing: bool = True,
                    category_size_threshold: int = 5,
                    p_value_threshold: float = 0.001,
                    scale_contingency_table: bool = True,
                    forced_categories: dict = None,
                    weight: float = 0.0,
                    imputation_strategy: str = "mean"):
        """Constructor to instantiate PreProcessor from all the parameters
        that can be set in all its required (attribute) classes
        along with good default values.

        Parameters
        ----------
        n_bins : int, optional
            Number of bins to produce. Raises ValueError if ``n_bins < 2``.
        strategy : str, optional
            Binning strategy. Currently only ``uniform`` and ``quantile``
            e.g. equifrequency is supported
        closed : str, optional
            Whether to close the bins (intervals) from the left or right
        auto_adapt_bins : bool, optional
            reduces the number of bins (starting from n_bins) as a function of
            the number of missings
        starting_precision : int, optional
            Initial precision for the bin edges to start from,
            can also be negative. Given a list of bin edges, the class will
            automatically choose the minimal precision required to have proper
            bins e.g. ``[5.5555, 5.5744, ...]`` will be rounded
            to ``[5.56, 5.57, ...]``. In case of a negative number, an attempt
            will be made to round up the numbers of the bin edges
            e.g. ``5.55 -> 10``, ``146 -> 100``, ...
        label_format : str, optional
            format string to display the bin labels
            e.g. ``min - max``, ``(min, max]``, ...
        change_endpoint_format : bool, optional
            Whether or not to change the format of the lower and upper bins
            into ``< x`` and ``> y`` resp.
        regroup : bool
            Whether or not to regroup categories
        regroup_name : str
            New name of the non-significant regrouped variables
        keep_missing : bool
            Whether or not to keep missing as a separate category
        category_size_threshold : int
            minimal size of a category to keep it as a separate category
        p_value_threshold : float
            Significance threshold for regrouping.
        forced_categories : dict, optional
            Map to prevent certain categories from being group into ``Other``
            for each column - dict of the form ``{col:[forced vars]}``.
            Defaults to no forced categories (``None`` -> empty dict).
        scale_contingency_table : bool
            Whether contingency table should be scaled before chi^2.'
        weight : float, optional
            Smoothing parameters (non-negative). The higher the value of the
            parameter, the bigger the contribution of the overall mean.
            When set to zero, there is no smoothing
            (e.g. the pure target incidence is used).
        imputation_strategy : str, optional
            in case there is a particular column which contains new categories,
            the encoding will lead to NULL values which should be imputed.
            Valid strategies are to replace with the global mean of the train
            set or the min (resp. max) incidence of the categories of that
            particular variable.

        Returns
        -------
        PreProcessor
            Instance of PreProcessor configured with the given parameters.
        """
        # None sentinel avoids the shared-mutable-default-argument pitfall
        # the previous ``forced_categories: dict = {}`` default had.
        if forced_categories is None:
            forced_categories = {}

        categorical_data_processor = CategoricalDataProcessor(
            regroup, regroup_name, keep_missing, category_size_threshold,
            p_value_threshold, scale_contingency_table, forced_categories)

        discretizer = KBinsDiscretizer(n_bins, strategy, closed,
                                       auto_adapt_bins, starting_precision,
                                       label_format, change_endpoint_format)

        target_encoder = TargetEncoder(weight)

        return cls(categorical_data_processor, discretizer, target_encoder)