def test_conditional_entropy_1d_condition(self) -> None:
    # Draw a sample from a three-dimensional Gaussian distribution
    rng = np.random.default_rng(4)
    cov = np.asarray([[1, 0.6, 0.3], [0.6, 2, 0.1], [0.3, 0.1, 1]])

    data = rng.multivariate_normal([0, 0, 0], cov, size=1500)
    marginal = estimate_entropy(data, cond=data[:, 2])
    multidim = estimate_entropy(data[:, :2], cond=data[:, 2], multidim=True)

    # By the chain rule of entropy, H(X|Y) = H(X,Y) - H(Y)
    def expected(ix: int, iy: int) -> float:
        joint = 0.5 * math.log(
            (2 * math.pi * math.e)**2 *
            (cov[ix, ix] * cov[iy, iy] - cov[ix, iy]**2))
        single = 0.5 * math.log(2 * math.pi * math.e * cov[iy, iy])
        return joint - single

    self.assertAlmostEqual(marginal[0], expected(0, 2), delta=0.04)
    self.assertAlmostEqual(marginal[1], expected(1, 2), delta=0.09)
    self.assertLess(marginal[2], -4.5)

    expected_multi = 0.5 * (
        math.log(np.linalg.det(2 * math.pi * math.e * cov)) -
        math.log(2 * math.pi * math.e * 1))
    self.assertAlmostEqual(multidim, expected_multi, delta=0.1)
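# Worked check of the closed forms above (added for clarity; the numbers
# follow directly from the formulas in the test): the (X1, X3) block of cov
# has determinant 1 * 1 - 0.3**2 = 0.91, so the chain rule gives
#   H(X1 | X3) = H(X1, X3) - H(X3) = 0.5 * log(2*pi*e * 0.91) ~ 1.372 nats.
# Similarly, det(cov) = 1.486, so the multidimensional reference value is
#   H(X1, X2 | X3) = log(2*pi*e) + 0.5 * log(1.486) ~ 3.04 nats.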
def test_drop_nan_leaves_too_few_observations(self) -> None:
    data = [(np.nan, 2), (np.nan, 4), (5, np.nan), (7, np.nan), (9, 10),
            (11, 12)]
    cond = [(np.nan, 2), (3, 4), (5, np.nan), (7, 8), (9, 10), (11, 12)]

    # When multidim=False, there are three observations left per variable;
    # when multidim=True, there are just two
    for (multidim, k, should_throw) in [(False, 3, True), (False, 2, False),
                                        (True, 2, True), (True, 1, False)]:
        if should_throw:
            with self.assertRaises(ValueError) as cm:
                estimate_entropy(data, cond=cond, multidim=multidim,
                                 k=k, drop_nan=True)
            self.assertEqual(str(cm.exception), K_TOO_LARGE_MSG,
                             f"multidim={multidim}, k={k}")
        else:
            try:
                estimate_entropy(data, cond=cond, multidim=multidim,
                                 k=k, drop_nan=True)
            except Exception:
                self.fail(f"Exception occurred; multidim={multidim}, k={k}")
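# For reference, the surviving rows above: with multidim=False, variable 0
# keeps rows 3-5 and variable 1 keeps rows 1, 4 and 5 (three observations
# each); with multidim=True, only rows 4 and 5 are fully observed. As the
# parametrization shows, the estimator needs k <= n - 1 so that every point
# has a k-th nearest neighbor.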
def test_pandas_dataframe(self) -> None:
    rng = np.random.default_rng(1)
    data = pd.DataFrame({
        "N": rng.normal(0.0, 1.0, size=500),
        "Unif": rng.uniform(0.0, 0.5, size=500),
        "Exp": rng.exponential(1 / 2.0, size=500)
    })

    marginal = estimate_entropy(data)  # type: pd.DataFrame
    multidim = estimate_entropy(data, multidim=True)

    # multidim=False results in a DataFrame
    self.assertIsInstance(marginal, pd.DataFrame)
    self.assertEqual(marginal.shape, (1, 3))
    self.assertAlmostEqual(marginal.loc[0, "N"],
                           0.5 * math.log(2 * math.pi * math.e), delta=0.04)
    self.assertAlmostEqual(marginal.loc[0, "Unif"], math.log(0.5), delta=0.03)
    self.assertAlmostEqual(marginal.loc[0, "Exp"], 1.0 - math.log(2.0),
                           delta=0.07)

    # multidim=True results in a NumPy scalar.
    # There is no reference value; the check just guards against regressions
    self.assertEqual(multidim.shape, ())
    self.assertAlmostEqual(multidim.item(), 1.22, delta=0.02)
def test_conditional_entropy_with_independent_condition(self) -> None:
    # Conditioning on an independent variable should not change the entropy
    rng = np.random.default_rng(6)
    data = rng.normal(0.0, 1.0, size=1200)
    cond = rng.uniform(0.0, 1.0, size=1200)

    uncond_result = estimate_entropy(data)
    cond_result = estimate_entropy(data, cond=cond)

    self.assertAlmostEqual(cond_result, uncond_result, delta=0.05)
def test_mi_as_sum_of_entropies(self) -> None:
    # Make up another distribution
    rng = np.random.default_rng(1)
    x = rng.chisquare(5, size=8000)
    y = rng.gamma(x, scale=1.0, size=x.shape)

    # We should have I(X;Y) = H(X) + H(Y) - H(X,Y)
    mi = estimate_mi(y, x)
    marginal = estimate_entropy(np.column_stack((x, y)))
    joint = estimate_entropy(np.column_stack((x, y)), multidim=True)

    self.assertAlmostEqual(np.sum(marginal) - joint, mi, delta=0.02)
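# Note: all three entropies above are estimated from the same sample, so
# presumably much of the estimation error cancels in the sum; this is what
# makes such a tight delta feasible for skewed distributions at this
# sample size.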
def test_mi_as_conditional_entropy_difference(self) -> None:
    # Make up some kind of distribution
    rng = np.random.default_rng(0)
    x = rng.gamma(shape=2.0, scale=1.0, size=2000)
    y = rng.normal(x, scale=1.0, size=x.shape)

    # We should have I(X;Y) = H(X) - H(X|Y)
    mi = estimate_mi(y, x)
    ent_x = estimate_entropy(x)
    cond_ent = estimate_entropy(x, cond=y)

    self.assertAlmostEqual(ent_x - cond_ent, mi, delta=0.02)
def test_conditional_entropy_with_mask(self) -> None:
    # The unmasked observations of data and cond are identical, giving a
    # conditional entropy of -inf in theory. However, the masked-out
    # observations follow very different distributions.
    rng = np.random.default_rng(7)
    unif = rng.uniform(0, 1, size=1000)
    data = np.concatenate((unif, rng.beta(2, 3, size=400)))
    cond = np.concatenate((unif, rng.normal(0, 1, size=400)))
    mask = np.concatenate((np.full(1000, True), np.full(400, False)))

    unmasked = estimate_entropy(data, cond=cond)
    masked = estimate_entropy(data, cond=cond, mask=mask)

    self.assertLess(masked, unmasked - 1)
    self.assertLess(masked, -5)
def test_conditional_entropy_nd_condition(self) -> None:
    # Draw a sample from a three-dimensional Gaussian distribution
    rng = np.random.default_rng(5)
    cov = np.asarray([[1, 0.6, 0.3], [0.6, 2, 0.1], [0.3, 0.1, 1]])

    data = rng.multivariate_normal([0, 0, 0], cov, size=1000)
    marginal = estimate_entropy(data, cond=data)
    multidim = estimate_entropy(data, cond=data, multidim=True)

    # Conditioning a variable on itself should give -inf; in practice
    # the estimates are merely strongly negative
    self.assertLess(marginal[0], -0.5)
    self.assertLess(marginal[1], -0.5)
    self.assertLess(marginal[2], -0.5)
    self.assertLess(multidim, -1.5)
def test_entropy_bias(self) -> None:
    rng = np.random.default_rng(0)
    x = rng.normal(size=20_000)

    h_1 = estimate_entropy(x, k=1)
    h_100 = estimate_entropy(x, k=100)

    # Small k has positive bias, large k has negative bias
    expected = 0.5 * log(2 * pi * e)
    self.assertGreater(h_1, expected + 0.005)
    self.assertLess(h_100, expected - 0.005)

    # Still, both are reasonably close, and large k is closer
    self.assertAlmostEqual(h_1, expected, delta=0.04)
    self.assertAlmostEqual(h_100, expected, delta=0.01)
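# For reference, the true entropy of a standard normal is
# 0.5 * log(2*pi*e) ~ 1.419 nats, so both estimates land within a few
# hundredths of the truth despite their opposite-signed biases.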
def test_single_dimensional_variable_as_list(self) -> None:
    rng = np.random.default_rng(0)
    x = [rng.uniform(0, 2) for _ in range(400)]

    result = estimate_entropy(x)

    # H(Unif(a, b)) = log(b - a)
    self.assertEqual(result.shape, ())
    self.assertAlmostEqual(result, math.log(2 - 0), delta=0.01)
def test_nans_must_be_masked(self) -> None:
    rng = np.random.default_rng(3)
    data = rng.normal(0.0, 1.0, size=(800, 2))
    data[0:10, 0] = np.nan
    data[5:15, 1] = np.nan

    # Without masking, the NaNs are rejected
    with self.assertRaises(ValueError) as cm:
        estimate_entropy(data)
    self.assertEqual(str(cm.exception), NANS_LEFT_MSG)

    # With masking, a correct result is produced
    mask = np.full(800, True)
    mask[0:15] = False

    result = estimate_entropy(data, mask=mask)
    expected = 0.5 * math.log(2 * math.pi * math.e)
    self.assertAlmostEqual(result[0], expected, delta=0.03)
    self.assertAlmostEqual(result[1], expected, delta=0.03)
def test_drop_nan_cond(self) -> None:
    # Independent condition
    rng = np.random.default_rng(10)
    data = rng.uniform(size=1000)
    cond = rng.uniform(size=1000)
    cond[:10] = np.nan

    result = estimate_entropy(data, cond=cond, drop_nan=True)

    self.assertAlmostEqual(result, 0.0, delta=0.04)
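# The expected value follows from H(Unif(0, 1)) = log(1 - 0) = 0: an
# independent condition should leave the entropy unchanged, and drop_nan
# merely discards the ten incomplete observations.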
def test_pandas_series(self) -> None:
    rng = np.random.default_rng(2)
    data = pd.Series(rng.normal(0.0, 1.0, size=500), name="N")

    result = estimate_entropy(data)  # type: pd.DataFrame

    self.assertIsInstance(result, pd.DataFrame)
    self.assertEqual(result.shape, (1, 1))
    self.assertAlmostEqual(result.loc[0, "N"],
                           0.5 * math.log(2 * math.pi * math.e), delta=0.02)
def test_drop_nan_separate_vars(self) -> None:
    rng = np.random.default_rng(8)
    data = np.column_stack((rng.uniform(0, 2, size=2000),
                            rng.uniform(0, 3, size=2000)))
    data[:1000, 0] = np.nan
    data[1000:, 1] = np.nan

    result = estimate_entropy(data, drop_nan=True)

    self.assertAlmostEqual(result[0], math.log(2), delta=0.04)
    self.assertAlmostEqual(result[1], math.log(3), delta=0.04)
def test_conditional_entropy_bias(self) -> None:
    # This is especially interesting as errors might not cancel out
    # in the chain rule.
    # Use the 3D Gaussian distribution seen in the driver test
    rng = np.random.default_rng(0)
    cov = np.asarray([[1, 0.6, 0.3], [0.6, 2, 0.1], [0.3, 0.1, 1]])
    data = rng.multivariate_normal([0, 0, 0], cov, size=20_000)

    h_5 = estimate_entropy(data[:, :2], cond=data[:, 2], multidim=True, k=5)
    h_50 = estimate_entropy(data[:, :2], cond=data[:, 2], multidim=True, k=50)

    # Again, large k appears to have more negative bias
    expected = 0.5 * (log(np.linalg.det(2 * pi * e * cov)) - log(2 * pi * e))
    self.assertAlmostEqual(h_5, expected, delta=0.005)
    self.assertAlmostEqual(h_50, expected, delta=0.03)
def test_drop_nan_multidim(self) -> None:
    rng = np.random.default_rng(9)
    cov = np.asarray([[1, 0.6], [0.6, 2]])
    data = rng.multivariate_normal([0, 0], cov, size=1000)
    data[:50, 0] = np.nan
    data[950:, 1] = np.nan

    result = estimate_entropy(data, multidim=True, drop_nan=True)

    self.assertAlmostEqual(result,
                           math.log(2 * math.pi * math.e) +
                           0.5 * math.log(2 - 0.6**2),
                           delta=0.05)
def test_multidim_interpretation(self) -> None:
    # Generate a two-dimensional Gaussian variable
    rng = np.random.default_rng(1)
    cov = np.asarray([[1, 0.6], [0.6, 2]])
    data = rng.multivariate_normal([0, 0], cov, size=1500)

    marginal = estimate_entropy(data)
    multidim = estimate_entropy(data, multidim=True)

    # If multidim=False, we get marginal entropies
    self.assertEqual(marginal.shape, (2,))
    self.assertAlmostEqual(marginal[0],
                           0.5 * math.log(2 * math.pi * math.e * 1),
                           delta=0.03)
    self.assertAlmostEqual(marginal[1],
                           0.5 * math.log(2 * math.pi * math.e * 2),
                           delta=0.03)

    # If multidim=True, we get the combined entropy
    self.assertEqual(multidim.shape, ())
    self.assertAlmostEqual(multidim.item(),
                           math.log(2 * math.pi * math.e) +
                           0.5 * math.log(2 - 0.6**2),
                           delta=0.04)
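# All of the Gaussian reference values in these tests specialize one
# identity: a d-dimensional Gaussian with covariance S has differential
# entropy 0.5 * log((2*pi*e)**d * det(S)). A minimal sketch of that formula
# follows; the helper is illustrative only and not part of the tested API.
@staticmethod
def _gaussian_entropy(cov: np.ndarray) -> float:
    # Treat a scalar variance as a 1x1 covariance matrix
    cov = np.atleast_2d(cov)
    d = cov.shape[0]
    return 0.5 * math.log((2 * math.pi * math.e)**d * np.linalg.det(cov))

# For the 2D case above: det(cov) = 2 - 0.6**2 = 1.64, so
# _gaussian_entropy(cov) = log(2*pi*e) + 0.5 * log(1.64) ~ 3.085 nats,
# matching the multidim assertion.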
def test_drop_nan_cond_multidim(self) -> None:
    # See test_conditional_entropy_1d_condition
    rng = np.random.default_rng(11)
    cov = np.asarray([[1, 0.6, 0.3], [0.6, 2, 0.1], [0.3, 0.1, 1]])
    data = rng.multivariate_normal([0, 0, 0], cov, size=1500)
    data[10:20, 0] = np.nan
    data[20:30, 1] = np.nan
    data[25:40, 2] = np.nan

    result = estimate_entropy(data[:, :2], cond=data[:, 2],
                              multidim=True, drop_nan=True)

    expected = 0.5 * (math.log(np.linalg.det(2 * math.pi * math.e * cov)) -
                      math.log(2 * math.pi * math.e * 1))
    self.assertAlmostEqual(result, expected, delta=0.1)
def test_mask_is_not_boolean(self) -> None:
    with self.assertRaises(TypeError) as cm:
        estimate_entropy(np.zeros(5), mask=[1, 2, 3, 4, 5])
    self.assertEqual(str(cm.exception), INVALID_MASK_TYPE_MSG)
def test_cond_has_wrong_dimension(self) -> None:
    for dim in [(), (20, 2, 1)]:
        with self.subTest(dim=dim):
            with self.assertRaises(ValueError) as cm:
                estimate_entropy(np.zeros(20), cond=np.zeros(dim))
            self.assertEqual(str(cm.exception), COND_WRONG_DIMENSION_MSG)
def test_cond_must_have_same_length_as_x(self) -> None:
    with self.assertRaises(ValueError) as cm:
        estimate_entropy(np.zeros(5), cond=np.zeros(7))
    self.assertEqual(str(cm.exception), X_COND_DIFFERENT_LENGTH_MSG)
def test_mask_leaves_too_few_observations(self) -> None:
    with self.assertRaises(ValueError) as cm:
        estimate_entropy(np.zeros(5), mask=[False, False, False, True, True])
    self.assertEqual(str(cm.exception), K_TOO_LARGE_MSG)
def test_mask_has_wrong_dimension(self) -> None:
    with self.assertRaises(ValueError) as cm:
        estimate_entropy(np.zeros((5, 2)), mask=np.full((5, 2), True))
    self.assertEqual(str(cm.exception), MASK_WRONG_DIMENSION_MSG)
def test_input_shorter_than_k(self) -> None:
    with self.assertRaises(ValueError) as cm:
        estimate_entropy(np.zeros(3), k=3)
    self.assertEqual(str(cm.exception), K_TOO_LARGE_MSG)
def test_k_must_be_integer(self) -> None:
    with self.assertRaises(TypeError):
        estimate_entropy(np.zeros(20), k=2.71828)  # type: ignore
def test_k_must_be_positive(self) -> None:
    for k in [-2, 0]:
        with self.subTest(k=k):
            with self.assertRaises(ValueError) as cm:
                estimate_entropy(np.zeros(20), k=k)
            self.assertEqual(str(cm.exception), K_NEGATIVE_MSG)
def test_mask_has_wrong_size(self) -> None:
    with self.assertRaises(ValueError) as cm:
        estimate_entropy(np.zeros(5), mask=[True, False])
    self.assertEqual(str(cm.exception), INVALID_MASK_LENGTH_MSG)