def test_lag_too_small(self) -> None:
    # A lag of -4 with four-element inputs must be rejected
    # with the "too large lag" message.
    x_values = [1, 2, 3, 4]
    y_values = [5, 6, 7, 8]

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values, lag=-4)

    message = str(caught.exception)
    self.assertEqual(message, TOO_LARGE_LAG_MSG)
def test_x_with_too_large_dimension(self) -> None:
    # A three-dimensional x array must be rejected
    y_values = np.zeros(10)
    x_values = np.zeros((10, 2, 3))

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values)

    message = str(caught.exception)
    self.assertEqual(message, X_WRONG_DIMENSION_MSG)
def test_cond_lag_leaves_no_y_observations(self) -> None:
    # The condition lag of 4 equals the input length, so the call
    # must fail with the "too large lag" message.
    x_values = [1, 2, 3, 4]
    y_values = [5, 6, 7, 8]

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values, lag=1, cond=y_values, cond_lag=4)

    message = str(caught.exception)
    self.assertEqual(message, TOO_LARGE_LAG_MSG)
def test_lag_leaves_too_few_observations(self) -> None:
    # With 30 observations, lags of -5 and 10, and k=15,
    # the call must fail with the "k too large" message.
    y_values = np.zeros(30)
    x_values = np.zeros(30)
    lags = [-5, 10]

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values, lag=lags, k=15)

    message = str(caught.exception)
    self.assertEqual(message, K_TOO_LARGE_MSG)
def test_mask_with_wrong_length(self) -> None:
    # A two-element mask does not match the four-element inputs
    x_values = [1, 2, 3, 4]
    y_values = [5, 6, 7, 8]
    short_mask = [False, True]

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values, mask=short_mask)

    message = str(caught.exception)
    self.assertEqual(message, INVALID_MASK_LENGTH_MSG)
def test_inputs_of_different_length(self) -> None:
    # Arrays of length 10 and 20 cannot be paired together
    shorter = np.zeros(10)
    longer = np.zeros(20)

    with self.assertRaises(ValueError) as caught:
        estimate_mi(shorter, longer)

    message = str(caught.exception)
    self.assertEqual(message, X_Y_DIFFERENT_LENGTH_MSG)
def test_inputs_shorter_than_k(self) -> None:
    # Three observations are not enough for k=5
    first = np.zeros(3)
    second = np.zeros(3)

    with self.assertRaises(ValueError) as caught:
        estimate_mi(first, second, k=5)

    message = str(caught.exception)
    self.assertEqual(message, K_TOO_LARGE_MSG)
def test_y_with_wrong_dimension(self) -> None:
    # A two-dimensional y array must be rejected
    y_values = np.zeros((10, 2))
    x_values = np.zeros(10)

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values)

    message = str(caught.exception)
    self.assertEqual(message, Y_WRONG_DIMENSION_MSG)
def test_x_and_cond_different_length(self) -> None:
    # The condition has 20 elements while both variables have 10
    values = np.zeros(10)
    condition = np.zeros(20)

    with self.assertRaises(ValueError) as caught:
        estimate_mi(values, values, cond=condition)

    message = str(caught.exception)
    self.assertEqual(message, X_COND_DIFFERENT_LENGTH_MSG)
def test_unmasked_nans_in_discrete_y_are_rejected(self) -> None:
    # A single NaN in y, with no mask to hide it, must be rejected
    # when y is declared discrete.
    x_values = np.zeros(100)
    y_values = np.zeros(100)
    y_values[25] = np.nan

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values, discrete_y=True)

    message = str(caught.exception)
    self.assertEqual(message, NANS_LEFT_MSG)
def test_mask_leaves_no_observations(self) -> None:
    # Every mask element is False, so the call must fail
    # with the "k too large" message.
    y_values = np.zeros(30)
    x_values = np.zeros(30)
    all_false = np.zeros(30, dtype=bool)

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values, mask=all_false)

    message = str(caught.exception)
    self.assertEqual(message, K_TOO_LARGE_MSG)
def test_mask_with_wrong_dimension(self) -> None:
    # A two-dimensional mask must be rejected
    y_values = np.zeros(10)
    x_values = np.zeros(10)
    mask_2d = np.zeros((10, 2), dtype=bool)

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values, mask=mask_2d)

    message = str(caught.exception)
    self.assertEqual(message, MASK_WRONG_DIMENSION_MSG)
def test_cond_and_mask_as_list(self) -> None:
    # All inputs are plain Python lists. The numeric result is bogus
    # and deliberately not checked: the point is only that the call
    # type-checks and runs without crashing.
    y_values = [2, 4, 6, 8, 10, 12]
    x_values = [1, 2, 3, 4, 5, math.nan]
    condition = [1, 1, 2, 3, 5, 8]
    keep = [True, True, True, True, True, False]

    estimate_mi(y_values, x_values, cond=condition, mask=keep)
def test_k_must_be_positive(self) -> None:
    # Both a negative k and k=0 must be rejected
    first = np.zeros(30)
    second = np.zeros(30)

    for bad_k in (-2, 0):
        with self.subTest(k=bad_k):
            with self.assertRaises(ValueError) as caught:
                estimate_mi(first, second, k=bad_k)

            message = str(caught.exception)
            self.assertEqual(message, K_NEGATIVE_MSG)
def test_mask_with_mixed_element_types(self) -> None:
    # Integers in a mask lead to subsetting behavior that is
    # difficult to understand, so a TypeError is expected.
    x_values = [1, 2, 3, 4]
    y_values = [5, 6, 7, 8]
    mixed_mask = [True, 2, 1, 0]

    with self.assertRaises(TypeError) as caught:
        estimate_mi(y_values, x_values, mask=mixed_mask)

    message = str(caught.exception)
    self.assertEqual(message, INVALID_MASK_TYPE_MSG)
def test_mask_and_lag_leave_too_few_observations(self) -> None:
    # The mask is False for the first 15 of 30 observations and True
    # for the rest; with lag=-5 and k=10 the call must fail.
    y_values = np.zeros(30)
    x_values = np.zeros(30)
    second_half = np.arange(30) >= 15

    with self.assertRaises(ValueError) as caught:
        estimate_mi(y_values, x_values, lag=-5, mask=second_half, k=10)

    message = str(caught.exception)
    self.assertEqual(message, K_TOO_LARGE_MSG)
def test_unconditional_mi_independence(self) -> None:
    # Two uncorrelated normal variables (identity covariance matrix)
    rng = np.random.default_rng(0)
    cov = [[1, 0], [0, 1]]
    sample = rng.multivariate_normal([0, 0], cov, size=20_000)
    first, second = sample[:, 0], sample[:, 1]

    mi_small_k = estimate_mi(first, second, k=3)
    mi_large_k = estimate_mi(first, second, k=100)

    # A large k should work better for independence testing
    self.assertAlmostEqual(mi_large_k, 0.0, delta=0.004)
    difference = mi_small_k - mi_large_k
    self.assertGreater(difference, 0.002)
def test_discrete_y(self) -> None:
    # Compare with the two_disjoint_uniforms algorithm test.
    # y is chosen from {0, 2} and x is drawn uniformly between y and y + 1.
    rng = np.random.default_rng(51)
    labels = rng.choice([0, 2], size=800)
    samples = rng.uniform(labels, labels + 1)

    result = estimate_mi(labels, samples, discrete_y=True)
    self.assertAlmostEqual(result, math.log(2), delta=0.02)

    # Passing the parameters the wrong way around emits a warning
    with self.assertWarns(UserWarning):
        estimate_mi(samples, labels, discrete_y=True)
def test_conditional_mi_independence(self) -> None:
    # The first variable is also passed as the condition, so the
    # conditional MI between x and x + y is expected to be close to zero.
    rng = np.random.default_rng(0)
    x = rng.normal(0.0, 1.0, size=20_000)
    y = rng.normal(0.0, 1.0, size=20_000)
    total = x + y

    mi_small_k = estimate_mi(x, total, cond=x, k=3)
    mi_large_k = estimate_mi(x, total, cond=x, k=100)

    # A large k should work better for independence testing here as well
    self.assertAlmostEqual(mi_large_k, 0.0, delta=0.005)
    gap = abs(mi_small_k - mi_large_k)
    self.assertGreater(gap, 0.05)
def test_preprocess(self) -> None:
    # x and y are scaled by 1e3 and 1e-3 respectively; such different
    # variances should cause issues unless preprocessing is enabled.
    rng = np.random.default_rng(2020_07_16)
    cov = np.asarray([[1, 0.6], [0.6, 1]])
    sample = rng.multivariate_normal([0, 0], cov, size=800)

    x_scaled_up = 1e3 * sample[:, 0]
    y_scaled_down = 1e-3 * sample[:, 1]

    without_preprocess = estimate_mi(y_scaled_down, x_scaled_up, preprocess=False, normalize=True)
    with_preprocess = estimate_mi(y_scaled_down, x_scaled_up, preprocess=True, normalize=True)

    # Only the preprocessed estimate should recover the 0.6 correlation
    self.assertNotAlmostEqual(without_preprocess, 0.6, delta=0.1)
    self.assertAlmostEqual(with_preprocess, 0.6, delta=0.03)
def test_unconditional_mi_bias(self) -> None:
    # A highly correlated bivariate normal distribution (correlation 0.8)
    rng = np.random.default_rng(0)
    cov = [[1, 0.8], [0.8, 1]]
    data = rng.multivariate_normal([0, 0], cov, size=20_000)

    mi_3 = estimate_mi(data[:, 0], data[:, 1], k=3)
    mi_100 = estimate_mi(data[:, 0], data[:, 1], k=100)

    # Large k will have some bias, small k should not.
    # The reference value is -0.5 * log(1 - rho**2) with rho = 0.8.
    # math.log is used so that the test relies only on the "math" module,
    # like the other tests in this file, rather than on a bare "log" name.
    expected = -0.5 * math.log(1 - 0.8**2)
    self.assertAlmostEqual(mi_3, expected, delta=0.005)
    self.assertGreater(abs(mi_100 - expected), abs(mi_3 - expected) + 0.005)
def test_conditional_mi_with_multidimensional_cond(self) -> None:
    # X, Y, Z are standard normal and W = X+Y+Z.
    # Therefore I(X;W) < I(X;W | Y) < I(X;W | Y,Z).
    rng = np.random.default_rng(16)
    x = rng.normal(size=600)
    y = rng.normal(size=600)
    z = rng.normal(size=600)
    w = x + y + z

    # One condition column versus two condition columns
    two_columns = np.asarray([y, z]).T
    mi_given_y = estimate_mi(w, x, cond=y)
    mi_given_yz = estimate_mi(w, x, cond=two_columns)

    self.assertEqual(mi_given_yz.shape, (1, 1))
    self.assertAlmostEqual(mi_given_y.item(), 0.33, delta=0.03)
    self.assertGreater(mi_given_yz.item(), 1.0)
def test_pandas_data_frame(self) -> None:
    # Same data as in test_array_from_file()
    here = os.path.dirname(os.path.abspath(__file__))
    frame = pd.read_csv(os.path.join(here, "example_data.csv"))

    x_columns = frame[["x1", "x2", "x3"]]
    actual = estimate_mi(frame["y"], x_columns, lag=[0, 1, 3], k=5)  # type: pd.DataFrame

    # The returned object is a Pandas data frame, with row and column names!
    self.assertIsInstance(actual, pd.DataFrame)

    # y(t) depends on x1(t+1)
    self.assertAlmostEqual(actual.loc[0, "x1"], 0.0, delta=0.04)
    self.assertGreater(actual.loc[1, "x1"], 0.4)
    self.assertAlmostEqual(actual.loc[3, "x1"], 0.0, delta=0.04)

    # y(t) is completely independent of x2
    for lag in [0, 1, 3]:
        self.assertAlmostEqual(actual.loc[lag, "x2"], 0.0, delta=0.04)

    # y(t) depends on abs(x3(t+3))
    self.assertAlmostEqual(actual.loc[0, "x3"], 0.0, delta=0.04)
    self.assertAlmostEqual(actual.loc[1, "x3"], 0.0, delta=0.04)
    self.assertGreater(actual.loc[3, "x3"], 0.15)
def test_conditional_mi_with_mask_and_lags(self) -> None:
    # This is TestEstimateConditionalMi.test_three_gaussians(),
    # but with Z lagged by 2 and most of the observations deleted.
    rng = np.random.default_rng(12)
    cov = np.array([[1, 1, 1], [1, 4, 1], [1, 1, 9]])
    sample = rng.multivariate_normal([0, 0, 0], cov, size=2000)

    # Only every fifth observation is kept; the rest stay at zero
    phase = np.arange(2000) % 5
    mask = phase == 0

    x = np.zeros(2000)
    y = np.zeros(2000)
    z = np.zeros(2000)
    x[mask] = sample[:, 0][mask]
    y[mask] = sample[:, 1][mask]
    # The Z values are stored at the indices with remainder 3
    z[phase == 3] = sample[:, 2][mask]

    # Don't preprocess because with cond_lag=1 the cond array is all zeros
    actual = estimate_mi(y, x, lag=[0, -1], cond=z, cond_lag=[2, 1], mask=mask, preprocess=False)

    expected = 0.5 * (math.log(8) + math.log(35) - math.log(9) - math.log(24))
    self.assertAlmostEqual(actual[0, 0], expected, delta=0.03)
    self.assertAlmostEqual(actual[1, 0], 0.0, delta=0.01)
def test_array_from_file(self) -> None:
    # A realistic use case: the data is loaded from a CSV file and
    # the estimation is repeated with different max_threads values.
    here = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(here, "example_data.csv")
    table = np.loadtxt(csv_path, delimiter=",", skiprows=1)

    y_column = table[:, 0]
    x_columns = table[:, 1:4]

    for max_threads in [None, 1, 2]:
        with self.subTest(max_threads=max_threads):
            actual = estimate_mi(y_column, x_columns, lag=[0, 1, 3], k=5, max_threads=max_threads)

            # The returned object is a plain ndarray
            self.assertIsInstance(actual, np.ndarray)

            # y(t) depends on x1(t+1)
            self.assertAlmostEqual(actual[0, 0], 0.0, delta=0.04)
            self.assertGreater(actual[1, 0], 0.5)
            self.assertAlmostEqual(actual[2, 0], 0.0, delta=0.04)

            # y(t) is completely independent of x2
            for row in range(3):
                self.assertAlmostEqual(actual[row, 1], 0.0, delta=0.04)

            # y(t) depends on abs(x3(t+3))
            self.assertAlmostEqual(actual[0, 2], 0.0, delta=0.04)
            self.assertAlmostEqual(actual[1, 2], 0.0, delta=0.04)
            self.assertGreater(actual[2, 2], 0.15)
def test_autocorrelation(self) -> None:
    # Determine the autocorrelation of temperature, conditional on DOY.
    # Only the observations whose index hour is 13 are used.
    afternoon_mask = (self.data.index.hour == 13)
    lags = [0, -24, -10 * 24]
    result = estimate_mi(
        self.data["Temperature"],
        self.data["Temperature"],
        lag=lags,
        cond=self.data["DayOfYear"],
        mask=afternoon_mask,
        normalize=True,
    )  # type: pd.DataFrame

    # The result is a 3x1 data frame
    self.assertEqual(result.shape, (3, 1))
    self.assertIsInstance(result, pd.DataFrame)

    # Without lag, the autocorrelation coefficient should obviously be 1
    self.assertAlmostEqual(result.loc[0, "Temperature"], 1, delta=0.01)

    # With one day lag, the autocorrelation is still very strong
    self.assertAlmostEqual(result.loc[-24, "Temperature"], 0.69, delta=0.01)

    # With ten day lag, the autocorrelation is close to zero
    self.assertAlmostEqual(result.loc[-10 * 24, "Temperature"], 0, delta=0.01)
def test_normalization(self) -> None:
    # With normalize=True, the estimate for a bivariate normal sample
    # with correlation 0.6 is expected to be close to 0.6.
    rng = np.random.default_rng(17)
    cov = np.asarray([[1, 0.6], [0.6, 1]])
    sample = rng.multivariate_normal([0, 0], cov, 1000)
    first, second = sample[:, 0], sample[:, 1]

    normalized = estimate_mi(first, second, normalize=True)
    self.assertAlmostEqual(normalized, 0.6, delta=0.02)
def test_conditional_mi_with_separate_lags(self) -> None:
    # Columns 2 onward form the condition, with cond_lag given as
    # a nested list holding one lag per condition column.
    data, expected = self._create_4d_data()
    y_column = data[:, 1]
    x_column = data[:, 0]
    condition = data[:, 2:]

    actual = estimate_mi(y_column, x_column, cond=condition, cond_lag=[[1, -1]])
    self.assertAlmostEqual(actual, expected, delta=0.08)
def test_mask_as_list(self) -> None:
    # x holds NaN at every even index and the index itself elsewhere;
    # y counts down from 300 to 1; the mask is True at even indices.
    x = [math.nan if i % 2 == 0 else i for i in range(300)]  # type: List[float]
    y = list(range(300, 0, -1))
    mask = [True, False] * 150

    result = estimate_mi(y, x, lag=1, mask=mask)
    self.assertGreater(result, 3)
def test_drop_nan(self) -> None:
    # NaNs are placed at the start of one column and the end of the
    # other; with drop_nan=True the normalized estimate should still
    # be close to the 0.8 correlation.
    rng = np.random.default_rng(2020_07_28)
    cov = np.array([[1, 0.8], [0.8, 1]])
    sample = rng.multivariate_normal([0, 0], cov, size=1000)

    sample[:50, 0] = np.nan
    sample[950:, 1] = np.nan

    result = estimate_mi(sample[:, 1], sample[:, 0], normalize=True, drop_nan=True)
    self.assertAlmostEqual(result, 0.8, delta=0.02)