Exemple #1
0
    def test_fit_exception(self, strategy, expectation):
        """Fit a discretizer built with ``strategy`` inside ``expectation``.

        ``expectation`` is entered as a context manager around the ``fit``
        call; it is presumably ``pytest.raises(...)`` or a no-op context
        supplied by a parametrization not visible here (to be confirmed).
        """
        binner = KBinsDiscretizer(strategy=strategy)

        # Integers 0..9 followed by a single missing value.
        values = list(range(0, 10)) + [np.nan]
        frame = pd.DataFrame({"variable": values})

        with expectation:
            binner.fit(frame, ["variable"])
Exemple #2
0
    def test_fit_column(self, n_bins, auto_adapt_bins, data, expected):
        """Check the result of ``_fit_column`` on the ``"variable"`` column.

        The discretizer is configured with ``n_bins`` and
        ``auto_adapt_bins``; the value returned for ``data`` must compare
        equal to ``expected``.
        """
        settings = {"n_bins": n_bins, "auto_adapt_bins": auto_adapt_bins}
        binner = KBinsDiscretizer(**settings)

        result = binner._fit_column(data, column_name="variable")

        assert result == expected
Exemple #3
0
    def test_set_attributes_from_dict(self, attribute):
        """Load a parameter dict and verify one attribute of the discretizer.

        ``attribute`` is both the key looked up in the parameter dict and
        the attribute name read back from the discretizer after
        ``set_attributes_from_dict`` has been called.
        """
        raw_bins = [[0.0, 3.0], [3.0, 6.0], [6.0, 9.0]]
        params = {
            "n_bins": 5,
            "strategy": "uniform",
            "closed": "left",
            "auto_adapt_bins": True,
            "starting_precision": 1,
            "label_format": "[,)",
            "change_endpoint_format": True,
            "_bins_by_column": {"variable": raw_bins},
        }

        if attribute == "_bins_by_column":
            # The bins are passed in as two-element lists but are expected
            # back from the discretizer as tuples.
            expected = {"variable": [tuple(pair) for pair in raw_bins]}
        else:
            expected = params[attribute]

        binner = KBinsDiscretizer()
        binner.set_attributes_from_dict(params)

        assert getattr(binner, attribute) == expected
Exemple #4
0
    def test_create_bin_labels(self, change_endpoint_format, closed, bins,
                               expected):
        """Check the labels ``_create_bin_labels`` produces for ``bins``.

        The discretizer is built with the given ``closed`` and
        ``change_endpoint_format`` settings; the returned labels must
        compare equal to ``expected``.
        """
        options = {
            "closed": closed,
            "change_endpoint_format": change_endpoint_format,
        }
        binner = KBinsDiscretizer(**options)

        labels = binner._create_bin_labels(bins)

        assert labels == expected
Exemple #5
0
    def test_compute_minimal_precision_of_bin_edges(self, bin_edges,
                                                    starting_precision,
                                                    expected):
        """Check ``_compute_minimal_precision_of_bin_edges`` for ``bin_edges``.

        The discretizer is created with ``starting_precision``; the value
        returned for ``bin_edges`` must compare equal to ``expected``.
        """
        binner = KBinsDiscretizer(starting_precision=starting_precision)
        precision = binner._compute_minimal_precision_of_bin_edges(bin_edges)

        assert precision == expected
Exemple #6
0
    def test_compute_bin_edges(self, strategy, n_bins, data, expected):
        """Check the edges ``_compute_bin_edges`` returns for ``data``.

        The minimum and maximum of ``data.variable`` are passed as
        ``col_min`` and ``col_max``; the returned edges must compare equal
        to ``expected``.
        """
        binner = KBinsDiscretizer(strategy=strategy)

        lowest = data.variable.min()
        highest = data.variable.max()

        edges = binner._compute_bin_edges(
            data,
            column_name="variable",
            n_bins=n_bins,
            col_min=lowest,
            col_max=highest,
        )

        assert edges == expected
Exemple #7
0
    def test_transform(self, scenario, expectation):
        """Run ``transform`` under ``expectation`` for the given ``scenario``.

        Scenarios handled explicitly:

        * ``"regular_test"``: data is 0..9 plus one NaN; the discretizer is
          fitted and a ``variable_bin`` categorical column is expected.
        * ``"constant_data"``: data is ten ones; the discretizer is fitted
          and the output is expected to equal the input.

        Any other scenario uses the constant data without fitting first.
        ``expectation`` is entered as a context manager around the
        transform and the comparison (presumably ``pytest.raises(...)`` or
        a no-op context from a parametrization not visible here).
        """
        binner = KBinsDiscretizer(n_bins=3, strategy="uniform")

        is_regular = scenario == "regular_test"

        if is_regular:
            # Non-constant variable with one missing value.
            values = list(range(0, 10)) + [np.nan]
        else:
            values = [1] * 10

        frame = pd.DataFrame({"variable": values})
        wanted = frame.copy()

        if is_regular or scenario == "constant_data":
            binner.fit(frame, ["variable"])

        if is_regular:
            labels = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"]
            counts = [4, 3, 3, 1]
            # Repeat each label as many times as rows are expected in it.
            binned = [label
                      for label, count in zip(labels, counts)
                      for _ in range(count)]
            wanted["variable_bin"] = pd.Categorical(binned,
                                                    categories=labels,
                                                    ordered=True)

        with expectation:
            result = binner.transform(frame, ["variable"])
            pd.testing.assert_frame_equal(result, wanted)
Exemple #8
0
    def test_transform_column(self):
        """Check ``_transform_column`` with three explicit bins.

        The input holds the integers 0..9 plus one NaN. The expected output
        is the same column plus an ordered categorical ``variable_bin``
        column with four rows in the first bin, three in each of the other
        two and one ``"Missing"`` row.
        """
        binner = KBinsDiscretizer(n_bins=3, strategy="uniform")
        frame = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})

        result = binner._transform_column(
            frame, "variable", [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)])

        labels = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"]
        counts = [4, 3, 3, 1]
        binned = [label
                  for label, count in zip(labels, counts)
                  for _ in range(count)]

        # Build the expectation from scratch rather than from ``frame`` so
        # it does not depend on what the call above did to its input.
        wanted = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})
        wanted["variable_bin"] = pd.Categorical(binned,
                                                categories=labels,
                                                ordered=True)

        pd.testing.assert_frame_equal(result, wanted)
Exemple #9
0
    def test_attributes_to_dict(self):
        """Check the dict ``attributes_to_dict`` returns for a discretizer.

        The discretizer is created without arguments and only
        ``_bins_by_column`` is set by hand. The other expected values are
        presumably the constructor defaults (to be confirmed against
        ``KBinsDiscretizer``). The bins are set as tuples and expected back
        as two-element lists.
        """
        bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]

        expected = {
            "n_bins": 10,
            "strategy": "quantile",
            "closed": "right",
            "auto_adapt_bins": False,
            "starting_precision": 0,
            "label_format": "{} - {}",
            "change_endpoint_format": False,
            "_bins_by_column": {"variable": [list(pair) for pair in bins]},
        }

        binner = KBinsDiscretizer()
        binner._bins_by_column = {"variable": bins}

        assert binner.attributes_to_dict() == expected
Exemple #10
0
    def test_compute_bins_from_edges(self, bin_edges, expected):
        """Check the bins ``_compute_bins_from_edges`` builds from edges.

        A discretizer created without arguments is used; the value returned
        for ``bin_edges`` must compare equal to ``expected``.
        """
        bins = KBinsDiscretizer()._compute_bins_from_edges(bin_edges)

        assert bins == expected
Exemple #11
0
 def test_validate_n_bins_exception(self, n_bins, expectation):
     """Call ``_validate_n_bins`` with ``n_bins`` inside ``expectation``.

     When the call returns normally, its result must be ``None``.
     ``expectation`` is entered as a context manager around the whole
     check (presumably ``pytest.raises(...)`` or a no-op context from a
     parametrization not visible here).
     """
     with expectation:
         outcome = KBinsDiscretizer()._validate_n_bins(n_bins=n_bins)
         assert outcome is None