def test_cond_different_length(self) -> None:
    """A ``cond`` array shorter than ``data`` must be rejected.

    ``data`` has 10 rows while ``cond`` has only 9 entries; the call is
    expected to raise ``ValueError`` with ``X_COND_DIFFERENT_LENGTH_MSG``.
    """
    observations = np.full((10, 2), 0.0)
    short_cond = np.full(9, 0.0)

    with self.assertRaises(ValueError) as caught:
        pairwise_mi(observations, cond=short_cond)

    self.assertEqual(str(caught.exception), X_COND_DIFFERENT_LENGTH_MSG)
def test_pairwise_mi(self) -> None:
    """Pairwise MI on three columns of ``self.data``, with and without conditioning.

    Only the rows whose index hour is 13 are used, and the results are
    normalized. The conditional variant conditions on ``DayOfYear``.
    """
    # Determine the pairwise MI between three variables
    variables = ["Temperature", "WindDir", "DayOfYear"]
    hour_13 = (self.data.index.hour == 13)

    uncond = pairwise_mi(
        self.data[variables],
        mask=hour_13,
        normalize=True,
    )  # type: pd.DataFrame
    cond_doy = pairwise_mi(
        self.data[variables],
        mask=hour_13,
        normalize=True,
        cond=self.data["DayOfYear"],
    )  # type: pd.DataFrame

    # Both results are 3x3 data frames
    for matrix in (uncond, cond_doy):
        self.assertEqual(matrix.shape, (3, 3))
    for matrix in (uncond, cond_doy):
        self.assertIsInstance(matrix, pd.DataFrame)

    # The matrix is symmetric
    temp_vs_doy = uncond.loc["Temperature", "DayOfYear"]
    doy_vs_temp = uncond.loc["DayOfYear", "Temperature"]
    self.assertEqual(temp_vs_doy, doy_vs_temp)

    # Temperature is highly dependent on day of year
    self.assertAlmostEqual(
        uncond.loc["Temperature", "DayOfYear"], 0.9, delta=0.03)

    # There is no correlation with the conditioning variable
    self.assertLess(cond_doy.loc["Temperature", "DayOfYear"], 0.02)

    # The correlation between temperature and wind direction is
    # increased by conditioning on DOY
    baseline = uncond.loc["Temperature", "WindDir"] + 0.1
    self.assertGreater(cond_doy.loc["Temperature", "WindDir"], baseline)
def test_preprocess(self) -> None:
    """Preprocessing compensates for a conditioning variable of tiny scale.

    ``cond`` is the first data column multiplied by 1e-5. Without
    preprocessing the estimate is expected to be far from zero; with
    preprocessing it is expected to stay below 0.03.
    """
    sample = self.generate_normal(2020_07_16)
    # The low variance of cond breaks the algorithm
    tiny_cond = sample[:, 0] * 1e-5

    raw_estimate = pairwise_mi(sample, cond=tiny_cond, preprocess=False)
    rescaled_estimate = pairwise_mi(sample, cond=tiny_cond, preprocess=True)

    self.assertNotAlmostEqual(raw_estimate[0, 1], 0.0, delta=0.2)
    self.assertLess(rescaled_estimate[0, 1], 0.03)
def test_k_larger_than_observations(self) -> None:
    """``k`` equal to the number of available observations is rejected.

    Checked twice: with all 10 rows and ``k=10``, and with a mask that
    keeps 5 rows and ``k=5``. Both calls must raise ``ValueError`` with
    ``K_TOO_LARGE_MSG``.
    """
    flat = np.arange(20)
    samples = np.reshape(flat, (10, 2))

    # Without mask
    with self.assertRaises(ValueError) as caught:
        pairwise_mi(samples, k=10)
    self.assertEqual(str(caught.exception), K_TOO_LARGE_MSG)

    # With mask: the first five rows are excluded
    keep = np.full(10, True)
    keep[:5] = False
    with self.assertRaises(ValueError) as caught:
        pairwise_mi(samples, k=5, mask=keep)
    self.assertEqual(str(caught.exception), K_TOO_LARGE_MSG)
def test_normalization(self) -> None:
    """With ``normalize=True`` the off-diagonal entries are close to 0.6.

    The result for the two-variable sample must be a 2x2 matrix with NaN
    on the diagonal. Both off-diagonal entries are checked, so that a
    result that is wrong in only one triangle is detected.
    """
    # NOTE(review): presumably generate_normal draws a bivariate normal
    # with correlation 0.6 -- confirm against the helper.
    data = self.generate_normal(104)

    result = pairwise_mi(data, normalize=True)

    self.assertEqual(result.shape, (2, 2))
    self.assertTrue(np.isnan(result[0, 0]))
    self.assertTrue(np.isnan(result[1, 1]))
    self.assertAlmostEqual(result[0, 1], 0.6, delta=0.05)
    # Previously this line re-checked result[0, 1]; the mirrored entry
    # was never verified.
    self.assertAlmostEqual(result[1, 0], 0.6, delta=0.05)
def test_preprocess_two_cond_vars(self) -> None:
    """Preprocessing must rescale each conditioning column separately.

    The two ``cond`` columns differ wildly in scale (one multiplied by
    1e-5, one by 1e6). This is meant to detect normalization done over
    the whole ``cond`` array instead of per column.
    """
    sample = self.generate_normal(2020_07_27)

    uniform_rng = np.random.default_rng(2020_07_27)
    large_uniform = uniform_rng.uniform(size=len(sample)) * 1e6
    small_normal = sample[:, 0] * 1e-5
    cond = np.column_stack((small_normal, large_uniform))

    mi_scaled = pairwise_mi(sample, cond=cond, preprocess=True)

    self.assertLess(mi_scaled[0, 1], 0.03)
def test_conditioning(self) -> None:
    """Conditioning on one of the two variables leaves (almost) no MI.

    The second data column is passed as ``cond``; both off-diagonal
    entries must then be below 0.03 and the diagonal must be NaN.
    """
    sample = self.generate_normal(102)

    mi = pairwise_mi(sample, cond=sample[:, 1])

    self.assertEqual(mi.shape, (2, 2))
    for i in range(2):
        self.assertTrue(np.isnan(mi[i, i]))
    for cell in ((0, 1), (1, 0)):
        self.assertLess(mi[cell], 0.03)
def test_ndarray(self) -> None:
    """Plain array input yields a 2x2 result matching the reference value.

    The reference is ``-0.5 * log(1 - 0.6**2)``; both off-diagonal
    entries must be within 0.03 of it and the diagonal must be NaN.
    """
    # NOTE(review): the reference value assumes generate_normal draws a
    # bivariate normal with correlation 0.6 -- confirm against the helper.
    sample = self.generate_normal(100)
    reference = -0.5 * math.log(1 - 0.6**2)

    mi = pairwise_mi(sample)

    self.assertEqual(mi.shape, (2, 2))
    for i in range(2):
        self.assertTrue(np.isnan(mi[i, i]))
    for cell in ((0, 1), (1, 0)):
        self.assertAlmostEqual(mi[cell], reference, delta=0.03)
def test_mask_removes_nans(self) -> None:
    """NaNs in the data are an error unless the mask excludes them.

    Rows 0-14 contain a NaN in at least one column. Without a mask the
    call must raise ``ValueError`` with ``NANS_LEFT_MSG``; with those rows
    masked out, the estimate must match the reference value.
    """
    sample = self.generate_normal(102)
    reference = -0.5 * math.log(1 - 0.6**2)
    sample[0:10, 0] = np.nan
    sample[5:15, 1] = np.nan

    # Without mask, the estimation should fail
    with self.assertRaises(ValueError) as caught:
        pairwise_mi(sample)
    self.assertEqual(str(caught.exception), NANS_LEFT_MSG)

    # With mask, the estimation succeeds
    # NOTE(review): the mask length 1000 presumably matches the number of
    # rows produced by generate_normal -- confirm against the helper.
    valid_rows = np.full(1000, True)
    valid_rows[0:15] = False
    mi = pairwise_mi(sample, mask=valid_rows)

    self.assertEqual(mi.shape, (2, 2))
    for i in range(2):
        self.assertTrue(np.isnan(mi[i, i]))
    for cell in ((0, 1), (1, 0)):
        self.assertAlmostEqual(mi[cell], reference, delta=0.02)
def test_drop_nan(self) -> None:
    """``drop_nan=True`` tolerates NaNs in both ``data`` and ``cond``.

    The data is bivariate normal with covariance 0.8; NaNs are placed in
    both data columns and in the first ``cond`` column. The normalized
    estimate must still be within 0.02 of 0.8.
    """
    rng = np.random.default_rng(2020_07_28)
    covariance = np.array([[1, 0.8], [0.8, 1]])

    sample = rng.multivariate_normal([0, 0], covariance, size=1000)
    sample[:50, 0] = np.nan
    sample[950:, 1] = np.nan

    # Uniform conditioning data, drawn independently of the sample
    uniform_cond = rng.uniform(size=sample.shape)
    uniform_cond[100:120, 0] = np.nan
    uniform_cond[900:960, 0] = np.nan

    estimate = pairwise_mi(
        sample, cond=uniform_cond, normalize=True, drop_nan=True)

    self.assertAlmostEqual(estimate[0, 1], 0.8, delta=0.02)
def test_callback(self) -> None:
    """The callback is invoked once for every index pair with ``i < j``.

    For 10 variables this means 45 calls; no pair with ``i >= j`` may be
    reported. The check is repeated for two sample sizes.
    """
    # Use larger N to force multithreading
    for sample_count in [100, 3000]:
        with self.subTest(N=sample_count):
            reported = []

            def callback(i: int, j: int) -> None:
                reported.append((i, j))

            rng = np.random.default_rng(105)
            sample = rng.multivariate_normal(
                [0] * 10, np.eye(10), sample_count)
            pairwise_mi(sample, callback=callback)

            expected_calls = 10 * 9 / 2
            self.assertEqual(len(reported), expected_calls)

            for pair in product(range(10), repeat=2):
                first, second = pair
                if first < second:
                    self.assertIn(pair, reported)
                else:
                    self.assertNotIn(pair, reported)
def test_pandas(self) -> None:
    """Data frame input yields a labelled data frame result.

    Columns ``X`` and ``Y`` are bivariate normal with covariance 0.6 and
    ``Z`` is an independent uniform sample. The ``X``/``Y`` entries must
    match ``-0.5 * log(1 - 0.6**2)``; every entry involving ``Z`` must be
    near zero; the diagonal must be NaN.
    """
    rng = np.random.default_rng(101)
    covariance = np.asarray([[1, 0.6], [0.6, 1]])
    correlated = rng.multivariate_normal([0, 0], covariance, 1000)
    independent = rng.uniform(size=1000)
    reference = -0.5 * math.log(1 - 0.6**2)

    frame = pd.DataFrame({
        "X": correlated[:, 0],
        "Y": correlated[:, 1],
        "Z": independent,
    })

    result = pairwise_mi(frame)  # type: pd.DataFrame

    self.assertEqual(result.shape, (3, 3))
    self.assertIsInstance(result, pd.DataFrame)

    for name in "XYZ":
        self.assertTrue(np.isnan(result.loc[name, name]))

    for row, col in (("X", "Y"), ("Y", "X")):
        self.assertAlmostEqual(result.loc[row, col], reference, delta=0.04)

    for name in "XY":
        for row, col in ((name, "Z"), ("Z", name)):
            self.assertAlmostEqual(result.loc[row, col], 0.0, delta=0.03)
def test_data_has_three_dimensions(self) -> None:
    """Three-dimensional input is rejected with ``X_WRONG_DIMENSION_MSG``."""
    cube = np.full((10, 3, 2), 0.0)

    with self.assertRaises(ValueError) as caught:
        pairwise_mi(cube)

    self.assertEqual(str(caught.exception), X_WRONG_DIMENSION_MSG)
def test_only_one_variable_returns_nan(self) -> None:
    """A flat list (a single variable) gives a 1x1 result holding NaN."""
    single_variable = [1, 2, 3, 4]

    mi = pairwise_mi(single_variable)

    self.assertEqual(mi.shape, (1, 1))
    only_entry = mi[0, 0]
    self.assertTrue(np.isnan(only_entry))
def test_invalid_k(self) -> None:
    """``k=0`` is rejected with ``ValueError`` and ``K_NEGATIVE_MSG``."""
    observations = np.full((10, 2), 0.0)

    with self.assertRaises(ValueError) as caught:
        pairwise_mi(observations, k=0)

    self.assertEqual(str(caught.exception), K_NEGATIVE_MSG)
def test_only_one_variable_returns_nan_2d_array(self) -> None:
    """A nested list with one column gives a 1x1 result holding NaN."""
    single_column = [[1], [2], [3], [4]]

    mi = pairwise_mi(single_column)

    self.assertEqual(mi.shape, (1, 1))
    only_entry = mi[0, 0]
    self.assertTrue(np.isnan(only_entry))
#
# STEP 2: Preprocess
#
# Nothing to be done, because the distributions are roughly symmetric

#
# STEP 3: Create a mask
#
# Keep only the observations whose index hour is 13.
# NOTE(review): the plot title below says 15:00 local time; presumably the
# index is stored in a different time zone (e.g. UTC) -- confirm.
afternoon_mask = data.index.hour == 13

#
# STEP 4: Plot pairwise MI
#
pairwise = pairwise_mi(data, mask=afternoon_mask, normalize=True)

# Plot a matrix where the color represents the correlation coefficient.
# We clip the color values at 0.2 because of significant random noise,
# and at 0.8 to make the color contrast larger.
fig, ax = plt.subplots(figsize=(8, 6))
mesh = ax.pcolormesh(pairwise, vmin=0.2, vmax=0.8)
fig.colorbar(mesh, label="MI correlation coefficient", extend="both")

# Show the variable names on the axes; the 0.5 offset puts each tick in
# the middle of its cell.
tick_positions = np.arange(len(data.columns)) + 0.5
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(data.columns)
ax.set_yticklabels(data.columns)
ax.set_title("Unconditional MI at 15:00 local time")