Esempi in Python per Discretiser, esempi in Python per causalnex.discretiser.Discretiser

Esempio n. 1

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_outlier_upper_percentile(self):
        """With outlier_percentile=0.2 on 0..100000, the upper split must be 80000."""

        values = np.arange(100001)
        np.random.shuffle(values)

        discretiser = Discretiser(method="outlier", outlier_percentile=0.2)
        discretiser.fit(values)

        # Index 1 holds the upper outlier boundary.
        upper_split = discretiser.numeric_split_points[1]
        assert upper_split == 80000

Esempio n. 2

0

Mostra file

    def test_fit_uniform_data(self):
        """Quantile fitting of evenly spread data should yield evenly spaced splits."""

        values = np.arange(100001)
        np.random.shuffle(values)

        discretiser = Discretiser(method="quantile", num_buckets=4)
        discretiser.fit(values)

        expected_splits = [25000, 50000, 75000]
        assert np.array_equal(expected_splits, discretiser.numeric_split_points)

Esempio n. 3

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_fit_uniform_data(self):
        """Percentile fitting of evenly spread data should land on the matching values."""

        values = np.arange(100001)
        np.random.shuffle(values)

        requested_percentiles = [0.1, 0.4, 0.85]
        discretiser = Discretiser(
            method="percentiles", percentile_split_points=requested_percentiles
        )
        discretiser.fit(values)

        expected_splits = [10000, 40000, 85000]
        assert np.array_equal(expected_splits, discretiser.numeric_split_points)

Esempio n. 4

0

Mostra file

File: test_preprocessing.py Progetto: zeta1999/causalnex

    def test_fit_creates_exactly_uniform_splits_when_possible(self):
        """Gaps between the first three uniform split points must lie in (4, 5]."""

        values = np.arange(20)
        np.random.shuffle(values)

        discretiser = Discretiser(method="uniform", num_buckets=4)
        discretiser.fit(values)

        splits = discretiser.numeric_split_points
        # Explicit indexing keeps the IndexError if fewer than three splits exist.
        for lower_idx in (0, 1):
            gap = splits[lower_idx + 1] - splits[lower_idx]
            assert 4 < gap <= 5

Esempio n. 5

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_fixed_split_points_monotonic(self):
        """Decreasing numeric split points must be rejected with a ValueError."""

        # A non-decreasing sequence (repeated zero included) must be accepted.
        Discretiser(method="fixed", numeric_split_points=[-1, -0, 0, 1])

        expected_message = "numeric_split_points must be monotonically increasing"
        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method="fixed", numeric_split_points=[1, -1])

Esempio n. 6

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_outlier_lt_half(self):
        """An outlier_percentile of 0.5 must raise a ValueError; 0.49 must not."""

        # Just below the limit: construction is expected to succeed.
        Discretiser(method="outlier", outlier_percentile=0.49)

        expected_message = "outlier_percentile must be between 0 and 0.5"
        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method="outlier", outlier_percentile=0.5)

Esempio n. 7

0

Mostra file

    def test_transform_smaller_than_fit_range_goes_into_first_bucket(self):
        """A value below everything seen during fit must map to bucket 0."""

        training = np.arange(1, 11)
        np.random.shuffle(training)

        discretiser = Discretiser(method="uniform", num_buckets=4)
        discretiser.fit(training)

        below_range = np.array([-101])
        buckets = discretiser.transform(below_range)
        assert np.array_equal([0], buckets)

Esempio n. 8

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_percentile_split_points_monotonic(self):
        """Decreasing percentile split points must be rejected with a ValueError."""

        # A non-decreasing sequence (repeated zero included) must be accepted.
        Discretiser(method="percentiles", percentile_split_points=[0, -0, 0.1, 1])

        expected_message = "percentile_split_points must be monotonically increasing"
        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method="percentiles", percentile_split_points=[1, 0.1])

Esempio n. 9

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_fit_gauss_data(self):
        """Percentile splits of standard-normal samples should match the normal quantiles."""

        samples = np.random.normal(loc=0, scale=1, size=100001)
        np.random.shuffle(samples)

        discretiser = Discretiser(
            method="percentiles", percentile_split_points=[0.1, 0.4, 0.85]
        )
        discretiser.fit(samples)

        # Reference values for the 10th, 40th and 85th percentiles.
        expected_quantiles = (-1.2815, -0.253, 1.036)
        for position, target in enumerate(expected_quantiles):
            fitted = discretiser.numeric_split_points[position]
            assert math.isclose(target, fitted, abs_tol=0.025)

Esempio n. 10

0

Mostra file

    def test_fit_gauss_data(self):
        """Quartile splits of standard-normal samples should be near -0.675, 0 and 0.675."""

        samples = np.random.normal(loc=0, scale=1, size=100001)
        np.random.shuffle(samples)

        discretiser = Discretiser(method="quantile", num_buckets=4)
        discretiser.fit(samples)

        expected_quartiles = (-0.675, 0, 0.675)
        for position, target in enumerate(expected_quartiles):
            fitted = discretiser.numeric_split_points[position]
            assert math.isclose(target, fitted, abs_tol=0.025)

Esempio n. 11

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_percentile_geq_zero(self):
        """Any percentile split point below zero must raise a ValueError."""

        # Zero (including negative zero) is still a valid lower bound.
        Discretiser(method="percentiles", percentile_split_points=[-0.0, 0.0, 0.0001])

        expected_message = "percentile_split_points must be between 0 and 1"
        slightly_negative = [-0.0000001, 0.0001]
        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method="percentiles", percentile_split_points=slightly_negative)

Esempio n. 12

0

Mostra file

    def test_percentile_leq_1(self):
        """Any percentile split point above one must raise a ValueError."""

        # Exactly 1 is still a valid upper bound.
        Discretiser(method="percentiles", percentile_split_points=[0.0001, 1])

        expected_message = "{0} must be between 0 and 1".format(
            "percentile_split_points")
        slightly_above_one = [0.0001, 1.0000001]
        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method="percentiles",
                        percentile_split_points=slightly_above_one)

Esempio n. 13

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_outlier_geq_zero(self):
        """A negative outlier_percentile must raise a ValueError."""

        # Zero, negative zero and a small positive value must all be accepted.
        for accepted in (0.0, -0.0, 0.1):
            Discretiser(method="outlier", outlier_percentile=accepted)

        expected_message = "outlier_percentile must be between 0 and 0.5"
        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method="outlier", outlier_percentile=-0.0000001)

Esempio n. 14

0

Mostra file

    def test_transform_gauss(self):
        """Quartile-bucketing the fitted normal samples should fill four equal buckets."""

        samples = np.random.normal(loc=0, scale=1, size=1000000)
        np.random.shuffle(samples)

        discretiser = Discretiser(method="quantile", num_buckets=4)
        discretiser.fit(samples)

        bucket_ids, bucket_sizes = np.unique(
            discretiser.transform(samples), return_counts=True
        )

        # every one of the 4 buckets must appear
        assert np.array_equal([0, 1, 2, 3], bucket_ids)
        # and each must hold exactly a quarter of the samples
        assert np.array_equal([250000] * 4, bucket_sizes)

Esempio n. 15

0

Mostra file

File: test_preprocessing.py Progetto: zeta1999/causalnex

    def test_fit_creates_close_to_uniform_splits_when_uniform_not_possible(self):
        """Nine values in four buckets should give three splits spaced 2 to 3 apart."""

        values = np.arange(9)
        np.random.shuffle(values)

        discretiser = Discretiser(method="uniform", num_buckets=4)
        discretiser.fit(values)

        splits = discretiser.numeric_split_points
        assert len(splits) == 3

        for lower_idx in (0, 1):
            gap = splits[lower_idx + 1] - splits[lower_idx]
            assert 2 <= gap <= 3

Esempio n. 16

0

Mostra file

    def test_fit_does_not_attempt_to_deal_with_identical_split_points(self):
        """Fitting constant data with num_buckets > 1 yields identical split points.

        Distinct splits are impossible when every value is the same, so the
        three split points produced are expected to all equal the first one.
        """

        constant_values = np.array([1] * 20)

        discretiser = Discretiser(method="uniform", num_buckets=4)
        discretiser.fit(constant_values)

        splits = discretiser.numeric_split_points
        all_same_as_first = np.array([splits[0]] * 3)
        assert np.array_equal(all_same_as_first, splits)

Esempio n. 17

0

Mostra file

    def test_transform_uniform(self):
        """Percentile-bucketing evenly spread data should give the expected bucket sizes."""

        values = np.arange(100001)
        np.random.shuffle(values)

        discretiser = Discretiser(
            method="percentiles", percentile_split_points=[0.10, 0.40, 0.85]
        )
        discretiser.fit(values)

        bucket_ids, bucket_sizes = np.unique(
            discretiser.transform(values), return_counts=True
        )

        # every one of the 4 buckets must appear
        assert np.array_equal([0, 1, 2, 3], bucket_ids)
        expected_sizes = [10000, 30000, 45000, 15001]
        assert np.array_equal(expected_sizes, bucket_sizes)

Esempio n. 18

0

Mostra file

    def test_transform_outlier(self):
        """Transforming with the outlier method should balance the two outlier buckets.

        Fits on the shuffled integers 0..100000 with outlier_percentile=0.2 and
        checks that three buckets are produced and that the lowest and highest
        buckets differ in size by at most one item.
        """

        arr = np.array(range(100001))
        np.random.shuffle(arr)
        d = Discretiser(method="outlier", outlier_percentile=0.2)
        d.fit(arr)
        unique, counts = np.unique(d.transform(arr), return_counts=True)
        # check all 3 buckets are used
        assert np.array_equal([0, 1, 2], unique)
        # check largest difference in outliers is 1; the counts are reported in
        # the failure message instead of being printed on every run
        assert np.abs(counts[0] - counts[2]) <= 1, f"bucket counts: {counts}"

Esempio n. 19

0

Mostra file

File: test_preprocessing.py Progetto: zeta1999/causalnex

    def test_transform_uneven_split(self):
        """Ten values in four uniform buckets should give bucket sizes within one of each other."""

        values = np.arange(1, 11)
        np.random.shuffle(values)

        discretiser = Discretiser(method="uniform", num_buckets=4)
        discretiser.fit(values)

        bucket_ids, bucket_sizes = np.unique(
            discretiser.transform(values), return_counts=True
        )

        # every one of the 4 buckets must appear
        assert np.array_equal([0, 1, 2, 3], bucket_ids)
        # largest and smallest bucket may differ by at most 1 item
        spread = bucket_sizes.max() - bucket_sizes.min()
        assert spread <= 1

Esempio n. 20

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_fixed_split_points(self):
        """method="fixed" without numeric_split_points must raise a ValueError."""

        selected_method = "fixed"
        expected_message = f"{selected_method} method expects numeric_split_points"

        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method=selected_method)

Esempio n. 21

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_quantile_requires_num_buckets(self):
        """method="quantile" without num_buckets must raise a ValueError."""

        selected_method = "quantile"
        expected_message = f"{selected_method} method expects num_buckets"

        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method=selected_method)

Esempio n. 22

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_outlier_requires_outlier_percentile(self):
        """method="outlier" without outlier_percentile must raise a ValueError."""

        selected_method = "outlier"
        expected_message = f"{selected_method} method expects outlier_percentile"

        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method=selected_method)

Esempio n. 23

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_percentile_requires_percentile_split_points(self):
        """method="percentiles" without percentile_split_points must raise a ValueError."""

        selected_method = "percentiles"
        expected_message = f"{selected_method} method expects percentile_split_points"

        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method=selected_method)

Esempio n. 24

0

Mostra file

File: test_preprocessing.py Progetto: zeta1999/causalnex

    def test_uniform_requires_num_buckets(self):
        """method="uniform" without num_buckets must raise a ValueError."""

        selected_method = "uniform"
        expected_message = "{0} method expects {1}".format(
            selected_method, "num_buckets"
        )

        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method=selected_method)

Esempio n. 25

0

Mostra file

    def _discretise_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Helper method to discretise input data using parameters in
        `discretiser_kwargs` and `discretiser_alg`.

        Every column named in `discretiser_alg` is replaced by its discretised
        version. Except for the unsupervised "fixed" method (which is only
        transformed), the discretiser is fitted on `self._discretise_data`,
        not on `X`.

        Args:
            X (pd.DataFrame): a dataframe to be discretised

        Returns:
            a discretised copy of the input dataframe (the input is not modified)

        Raises:
            ValueError: if the algorithm configured for a column is not one of
                "unsupervised", "tree" or "mdlp"
        """

        X = X.copy()

        for col, alg in self.discretiser_alg.items():

            if alg == "unsupervised":
                kwargs = self.discretiser_kwargs[col]

                if kwargs["method"] == "fixed":
                    # Split points are given explicitly, so no fitting is needed.
                    discretiser = Discretiser(**kwargs)
                else:
                    discretiser = Discretiser(**kwargs).fit(
                        self._discretise_data[col].values)

                X[col] = discretiser.transform(X[col].values)
                continue

            if alg == "tree":
                discretiser = DecisionTreeSupervisedDiscretiserMethod(
                    mode="single",
                    tree_params=self.discretiser_kwargs[col])
            elif alg == "mdlp":
                discretiser = MDLPSupervisedDiscretiserMethod(
                    self.discretiser_kwargs[col])
            else:
                # Without this guard `discretiser` would be unbound, or worse,
                # silently reused from the previous column.
                raise ValueError(
                    f"Unknown discretiser algorithm {alg!r} for column {col!r}; "
                    "expected 'unsupervised', 'tree' or 'mdlp'"
                )

            discretiser.fit(
                dataframe=self._discretise_data,
                feat_names=[col],
                target=self._target_name,
                target_continuous=False,
            )

            X[col] = discretiser.transform(X[[col]])

        return X

Esempio n. 26

0

Mostra file

File: test_preprocessing.py Progetto: quantumblacklabs/causalnex

    def test_invalid_method(self):
        """An unrecognised method name must raise a ValueError listing the allowed ones."""

        allowed_methods = ["uniform", "quantile", "outlier", "fixed", "percentiles"]
        selected_method = "INVALID"
        expected_message = (
            f"{selected_method} is not a recognised method. "
            f"Use one of: {' '.join(allowed_methods)}"
        )

        with pytest.raises(ValueError, match=expected_message):
            Discretiser(method=selected_method)

Esempio n. 27

0

Mostra file

File: conftest.py Progetto: quantumblacklabs/causalnex

def iris_test_data() -> pd.DataFrame:
    """
    Build the Iris dataset as a dataframe for testing the sklearn wrappers.

    The feature columns use the dataset's own feature names, the target is
    stored in the "type" column, and "sepal length (cm)" is replaced by its
    3-bucket quantile discretisation.
    """
    dataset = load_iris()
    frame = pd.DataFrame(dataset["data"], columns=dataset["feature_names"])
    frame["type"] = dataset["target"]

    sepal_length = "sepal length (cm)"
    bucketiser = Discretiser(method="quantile", num_buckets=3)
    frame[sepal_length] = bucketiser.fit_transform(frame[sepal_length].values)
    return frame

Esempio n. 28

0

Mostra file

    def test_fit_transform(self):
        """fit_transform must match fit followed by transform (uniform method)."""

        values = np.arange(1, 11)
        np.random.shuffle(values)

        # Two-step path: fit, then transform.
        two_step = Discretiser(method="uniform", num_buckets=4)
        two_step.fit(values)
        separate_result = two_step.transform(values)

        # One-step path on an identically configured discretiser.
        one_step = Discretiser(method="uniform", num_buckets=4)
        combined_result = one_step.fit_transform(values)

        assert np.array_equal(separate_result, combined_result)

Esempio n. 29

0

Mostra file

    def test_fit_transform(self):
        """fit_transform must match fit followed by transform (outlier method)."""

        values = np.arange(1, 11)
        np.random.shuffle(values)

        # Two-step path: fit, then transform.
        two_step = Discretiser(method="outlier", outlier_percentile=0.2)
        two_step.fit(values)
        separate_result = two_step.transform(values)

        # One-step path on an identically configured discretiser.
        one_step = Discretiser(method="outlier", outlier_percentile=0.2)
        combined_result = one_step.fit_transform(values)

        assert np.array_equal(separate_result, combined_result)

Esempio n. 30

0

Mostra file

    def test_fit_transform(self):
        """fit_transform must match fit followed by transform (percentiles method)."""

        values = np.arange(1, 11)
        np.random.shuffle(values)

        # Two-step path: fit, then transform.
        two_step = Discretiser(
            method="percentiles", percentile_split_points=[0.10, 0.40, 0.85]
        )
        two_step.fit(values)
        separate_result = two_step.transform(values)

        # One-step path on an identically configured discretiser.
        one_step = Discretiser(
            method="percentiles", percentile_split_points=[0.10, 0.40, 0.85]
        )
        combined_result = one_step.fit_transform(values)

        assert np.array_equal(separate_result, combined_result)