def test_outlier_upper_percentile(self):
    """With outlier_percentile=0.2 on 0..100000, the upper split is 80000."""
    values = np.arange(100001)
    np.random.shuffle(values)

    discretiser = Discretiser(method="outlier", outlier_percentile=0.2)
    discretiser.fit(values)

    # index 1 holds the upper outlier boundary checked by this test
    upper_split = discretiser.numeric_split_points[1]
    assert upper_split == 80000
def test_fit_uniform_data(self):
    """Quantile fit on 0..100000 with 4 buckets splits at the quarter marks."""
    values = np.arange(100001)
    np.random.shuffle(values)

    discretiser = Discretiser(method="quantile", num_buckets=4)
    discretiser.fit(values)

    expected_splits = [25000, 50000, 75000]
    assert np.array_equal(expected_splits, discretiser.numeric_split_points)
def test_fit_uniform_data(self):
    """Percentile fit on 0..100000 places splits at the requested fractions."""
    values = np.arange(100001)
    np.random.shuffle(values)

    fractions = [0.1, 0.4, 0.85]
    discretiser = Discretiser(method="percentiles", percentile_split_points=fractions)
    discretiser.fit(values)

    expected_splits = [10000, 40000, 85000]
    assert np.array_equal(expected_splits, discretiser.numeric_split_points)
def test_fit_creates_exactly_uniform_splits_when_possible(self):
    """Uniform fit of 0..19 into 4 buckets spaces adjacent splits in (4, 5]."""
    values = np.arange(20)
    np.random.shuffle(values)

    discretiser = Discretiser(method="uniform", num_buckets=4)
    discretiser.fit(values)

    # compare each of the first two split points with its successor
    for idx in (0, 1):
        upper = discretiser.numeric_split_points[idx + 1]
        lower = discretiser.numeric_split_points[idx]
        gap = upper - lower
        assert 4 < gap <= 5
def test_fixed_split_points_monotonic(self):
    """Decreasing fixed split points must be rejected with a ValueError."""
    # a non-decreasing list (including repeated zero) must construct cleanly
    Discretiser(method="fixed", numeric_split_points=[-1, -0, 0, 1])

    expected_error = "numeric_split_points must be monotonically increasing"
    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method="fixed", numeric_split_points=[1, -1])
def test_outlier_lt_half(self):
    """An outlier_percentile of 0.5 must be rejected; 0.49 is accepted."""
    # just below the limit: construction must succeed
    Discretiser(method="outlier", outlier_percentile=0.49)

    expected_error = "outlier_percentile must be between 0 and 0.5"
    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method="outlier", outlier_percentile=0.5)
def test_transform_smaller_than_fit_range_goes_into_first_bucket(self):
    """A value below everything seen during fit is assigned to bucket 0."""
    training = np.arange(1, 11)
    np.random.shuffle(training)

    discretiser = Discretiser(method="uniform", num_buckets=4)
    discretiser.fit(training)

    below_range = np.array([-101])
    assert np.array_equal([0], discretiser.transform(below_range))
def test_percentile_split_points_monotonic(self):
    """Decreasing percentile split points must be rejected with a ValueError."""
    # a non-decreasing list (including repeated zero) must construct cleanly
    Discretiser(method="percentiles", percentile_split_points=[0, -0, 0.1, 1])

    expected_error = "percentile_split_points must be monotonically increasing"
    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method="percentiles", percentile_split_points=[1, 0.1])
def test_fit_gauss_data(self):
    """Percentile splits of standard-normal samples match the expected values."""
    samples = np.random.normal(loc=0, scale=1, size=100001)
    np.random.shuffle(samples)

    fractions = [0.1, 0.4, 0.85]
    discretiser = Discretiser(method="percentiles", percentile_split_points=fractions)
    discretiser.fit(samples)

    # expected split values for the 10%, 40% and 85% fractions, in order
    expected_splits = (-1.2815, -0.253, 1.036)
    for position, expected in enumerate(expected_splits):
        fitted = discretiser.numeric_split_points[position]
        assert math.isclose(expected, fitted, abs_tol=0.025)
def test_fit_gauss_data(self):
    """Quantile splits of standard-normal samples sit near -0.675, 0 and 0.675."""
    samples = np.random.normal(loc=0, scale=1, size=100001)
    np.random.shuffle(samples)

    discretiser = Discretiser(method="quantile", num_buckets=4)
    discretiser.fit(samples)

    # expected values for the three split points of a 4-bucket fit, in order
    expected_splits = (-0.675, 0, 0.675)
    for position, expected in enumerate(expected_splits):
        fitted = discretiser.numeric_split_points[position]
        assert math.isclose(expected, fitted, abs_tol=0.025)
def test_percentile_geq_zero(self):
    """A negative percentile split point must be rejected with a ValueError."""
    # zero (with either sign) is the lowest accepted value
    Discretiser(method="percentiles", percentile_split_points=[-0.0, 0.0, 0.0001])

    expected_error = "percentile_split_points must be between 0 and 1"
    slightly_negative = [-0.0000001, 0.0001]
    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method="percentiles", percentile_split_points=slightly_negative)
def test_percentile_leq_1(self):
    """A percentile split point above 1 must be rejected with a ValueError."""
    # exactly 1 is the highest accepted value
    Discretiser(method="percentiles", percentile_split_points=[0.0001, 1])

    expected_error = "percentile_split_points must be between 0 and 1"
    slightly_above_one = [0.0001, 1.0000001]
    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method="percentiles", percentile_split_points=slightly_above_one)
def test_outlier_geq_zero(self):
    """A negative outlier_percentile must be rejected with a ValueError."""
    # zero (with either sign) and a small positive value must all be accepted
    for accepted in (0.0, -0.0, 0.1):
        Discretiser(method="outlier", outlier_percentile=accepted)

    expected_error = "outlier_percentile must be between 0 and 0.5"
    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method="outlier", outlier_percentile=-0.0000001)
def test_transform_gauss(self):
    """Quantile bucketing of 1,000,000 normal samples fills 4 equal buckets."""
    samples = np.random.normal(loc=0, scale=1, size=1000000)
    np.random.shuffle(samples)

    discretiser = Discretiser(method="quantile", num_buckets=4)
    discretiser.fit(samples)

    labels = discretiser.transform(samples)
    buckets, bucket_sizes = np.unique(labels, return_counts=True)

    # every one of the 4 buckets must appear...
    assert np.array_equal([0, 1, 2, 3], buckets)
    # ...and each must hold exactly a quarter of the samples
    assert np.array_equal([250000] * 4, bucket_sizes)
def test_fit_creates_close_to_uniform_splits_when_uniform_not_possible(self):
    """Uniform fit of 0..8 into 4 buckets gives 3 splits spaced 2 to 3 apart."""
    values = np.arange(9)
    np.random.shuffle(values)

    discretiser = Discretiser(method="uniform", num_buckets=4)
    discretiser.fit(values)

    assert len(discretiser.numeric_split_points) == 3

    # compare each of the first two split points with its successor
    for idx in (0, 1):
        upper = discretiser.numeric_split_points[idx + 1]
        lower = discretiser.numeric_split_points[idx]
        gap = upper - lower
        assert 2 <= gap <= 3
def test_fit_does_not_attempt_to_deal_with_identical_split_points(self):
    """Fitting constant data with num_buckets=4 yields three identical splits.

    No attempt is made to de-duplicate the split points; every split point
    equals the first one.
    """
    constant_values = np.array([1] * 20)

    discretiser = Discretiser(method="uniform", num_buckets=4)
    discretiser.fit(constant_values)

    first_split = discretiser.numeric_split_points[0]
    expected_splits = np.array([first_split] * 3)
    assert np.array_equal(expected_splits, discretiser.numeric_split_points)
def test_transform_uniform(self):
    """Percentile bucketing of 0..100000 yields the expected bucket sizes."""
    values = np.arange(100001)
    np.random.shuffle(values)

    fractions = [0.10, 0.40, 0.85]
    discretiser = Discretiser(method="percentiles", percentile_split_points=fractions)
    discretiser.fit(values)

    labels = discretiser.transform(values)
    buckets, bucket_sizes = np.unique(labels, return_counts=True)

    # every one of the 4 buckets must appear
    assert np.array_equal([0, 1, 2, 3], buckets)
    assert np.array_equal([10000, 30000, 45000, 15001], bucket_sizes)
def test_transform_outlier(self):
    """Outlier bucketing of 0..100000 uses 3 buckets with balanced tails.

    With outlier_percentile=0.2 the transformed data must contain exactly the
    bucket labels 0, 1 and 2, and the sizes of the first and last buckets may
    differ by at most one item.
    """
    arr = np.array(range(100001))
    np.random.shuffle(arr)

    d = Discretiser(method="outlier", outlier_percentile=0.2)
    d.fit(arr)

    unique, counts = np.unique(d.transform(arr), return_counts=True)

    # check all 3 buckets are used
    assert np.array_equal([0, 1, 2], unique)

    # check largest difference in outliers is 1; the counts are reported in
    # the failure message instead of being printed unconditionally
    assert (
        np.abs(counts[0] - counts[2]) <= 1
    ), f"outlier bucket sizes differ by more than 1: {counts}"
def test_transform_uneven_split(self):
    """Ten values in 4 uniform buckets give bucket sizes within 1 of each other."""
    values = np.arange(1, 11)
    np.random.shuffle(values)

    discretiser = Discretiser(method="uniform", num_buckets=4)
    discretiser.fit(values)

    labels = discretiser.transform(values)
    buckets, bucket_sizes = np.unique(labels, return_counts=True)

    # every one of the 4 buckets must appear
    assert np.array_equal([0, 1, 2, 3], buckets)

    # largest and smallest bucket may differ by at most one item
    spread = bucket_sizes.max() - bucket_sizes.min()
    assert spread <= 1
def test_fixed_split_points(self):
    """method="fixed" without numeric_split_points must raise a ValueError."""
    method_name = "fixed"
    expected_error = method_name + " method expects numeric_split_points"

    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method=method_name)
def test_quantile_requires_num_buckets(self):
    """method="quantile" without num_buckets must raise a ValueError."""
    method_name = "quantile"
    expected_error = method_name + " method expects num_buckets"

    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method=method_name)
def test_outlier_requires_outlier_percentile(self):
    """method="outlier" without outlier_percentile must raise a ValueError."""
    method_name = "outlier"
    expected_error = method_name + " method expects outlier_percentile"

    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method=method_name)
def test_percentile_requires_percentile_split_points(self):
    """method="percentiles" without percentile_split_points must raise a ValueError."""
    method_name = "percentiles"
    expected_error = method_name + " method expects percentile_split_points"

    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method=method_name)
def test_uniform_requires_num_buckets(self):
    """method="uniform" without num_buckets must raise a ValueError."""
    method_name = "uniform"
    expected_error = f"{method_name} method expects num_buckets"

    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method=method_name)
def _discretise_features(self, X: pd.DataFrame) -> pd.DataFrame: """ Helper method to discretise input data using parameters in `discretiser_kwargs` and `discretiser_alg`. The splitting thresholds are extracted from the training data Args: X (pd.DataFrame): a dataframe to be discretised Returns: a discretised version of the input dataframe """ X = X.copy() for col in self.discretiser_alg.keys(): if self.discretiser_alg[col] == "unsupervised": if self.discretiser_kwargs[col]["method"] == "fixed": X[col] = Discretiser( **self.discretiser_kwargs[col]).transform( X[col].values) else: discretiser = Discretiser( **self.discretiser_kwargs[col]).fit( self._discretise_data[col].values) X[col] = discretiser.transform(X[col].values) else: if self.discretiser_alg[col] == "tree": discretiser = DecisionTreeSupervisedDiscretiserMethod( mode="single", tree_params=self.discretiser_kwargs[col]) elif self.discretiser_alg[col] == "mdlp": discretiser = MDLPSupervisedDiscretiserMethod( self.discretiser_kwargs[col]) discretiser.fit( dataframe=self._discretise_data, feat_names=[col], target=self._target_name, target_continuous=False, ) X[col] = discretiser.transform(X[[col]]) return X
def test_invalid_method(self):
    """An unknown method name must raise a ValueError listing the valid methods."""
    method_name = "INVALID"
    known_methods = ["uniform", "quantile", "outlier", "fixed", "percentiles"]

    method_listing = " ".join(known_methods)
    expected_error = (
        f"{method_name} is not a recognised method. Use one of: {method_listing}"
    )

    with pytest.raises(ValueError, match=expected_error):
        Discretiser(method=method_name)
def iris_test_data() -> pd.DataFrame:
    """
    Build the iris dataframe used to test the sklearn wrappers.

    The frame holds the iris feature columns plus a "type" column with the
    target; "sepal length (cm)" is replaced by its 3-bucket quantile
    discretisation.

    Returns:
        the prepared iris dataframe
    """
    dataset = load_iris()

    frame = pd.DataFrame(dataset["data"], columns=dataset["feature_names"])
    frame["type"] = dataset["target"]

    sepal_length = "sepal length (cm)"
    bucketer = Discretiser(method="quantile", num_buckets=3)
    frame[sepal_length] = bucketer.fit_transform(frame[sepal_length].values)

    return frame
def test_fit_transform(self):
    """fit_transform must match fit followed by transform (uniform method)."""
    values = np.arange(1, 11)
    np.random.shuffle(values)

    # two-step path: fit, then transform
    stepwise = Discretiser(method="uniform", num_buckets=4)
    stepwise.fit(values)
    stepwise_result = stepwise.transform(values)

    # one-step path on a fresh instance
    combined = Discretiser(method="uniform", num_buckets=4)
    combined_result = combined.fit_transform(values)

    assert np.array_equal(stepwise_result, combined_result)
def test_fit_transform(self):
    """fit_transform must match fit followed by transform (outlier method)."""
    values = np.arange(1, 11)
    np.random.shuffle(values)

    # two-step path: fit, then transform
    stepwise = Discretiser(method="outlier", outlier_percentile=0.2)
    stepwise.fit(values)
    stepwise_result = stepwise.transform(values)

    # one-step path on a fresh instance
    combined = Discretiser(method="outlier", outlier_percentile=0.2)
    combined_result = combined.fit_transform(values)

    assert np.array_equal(stepwise_result, combined_result)
def test_fit_transform(self):
    """fit_transform must match fit followed by transform (percentiles method)."""
    values = np.arange(1, 11)
    np.random.shuffle(values)

    # two-step path: fit, then transform
    stepwise = Discretiser(
        method="percentiles", percentile_split_points=[0.10, 0.40, 0.85]
    )
    stepwise.fit(values)
    stepwise_result = stepwise.transform(values)

    # one-step path on a fresh instance
    combined = Discretiser(
        method="percentiles", percentile_split_points=[0.10, 0.40, 0.85]
    )
    combined_result = combined.fit_transform(values)

    assert np.array_equal(stepwise_result, combined_result)