def js_divergence(P, Q):
    """Jensen-Shannon divergence between `P` and `Q`.

    Parameters
    ----------
    P, Q (np.ndarray)
        Two discrete distributions represented as 1D arrays. They are
        assumed to have the same support.

    Returns
    -------
    float
        The Jensen-Shannon divergence between `P` and `Q`.

    """
    M = 0.5 * (P + Q)
    jsd = 0.5 * (sp_entropy(P, M, base=2) + sp_entropy(Q, M, base=2))

    # If the input distributions are identical, floating-point error in the
    # construction of the mixture matrix can result in negative values that
    # are very close to zero. If one wants to compute the root-JSD metric,
    # these negative values lead to undesirable nans.
    if np.isclose(jsd, 0.0):
        return 0
    else:
        return jsd
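# Hedged usage sketch (not part of the original source): it assumes `sp_entropy`
# is scipy.stats.entropy and `np` is numpy, as the function body implies. With
# identical inputs the clamp above returns exactly 0, so the root-JSD metric
# sqrt(JSD) stays real instead of producing a NaN.
def _example_js_divergence():
    P = np.array([0.5, 0.25, 0.25])
    Q = np.array([0.4, 0.4, 0.2])
    assert js_divergence(P, P) == 0           # clamped to exactly 0 for identical inputs
    assert np.sqrt(js_divergence(P, Q)) > 0   # root-JSD distance in bits, no NaN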
def joint_entropy(data):
    r"""Joint entropy of all variables in the data.

    Parameters
    ----------
    data (np.ndarray)
        Array of data with variables as columns and observations as rows.

    Returns
    -------
    float
        Joint entropy of the variables of interest.

    Notes
    -----
    1. :math:`H(\{X_i\}) = - \sum p(\{X_i\}) \log_2(p(\{X_i\}))`
    2. The data of variables must be categorical.

    """
    # Entropy is computed by summing the contribution of states with
    # non-zero empirical probability in the data.
    count = defaultdict(int)
    for state in data:
        key = tuple(state)
        count[key] += 1

    return sp_entropy(list(count.values()), base=2)
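# Hedged usage sketch (not part of the original source): it assumes `sp_entropy`
# is scipy.stats.entropy, `np` is numpy, and `defaultdict` comes from
# collections, as the function body implies.
def _example_joint_entropy():
    # Two categorical variables as columns; four equally likely joint states
    # give H = log2(4) = 2 bits.
    data = np.array([[0, 0],
                     [0, 1],
                     [1, 0],
                     [1, 1]])
    assert np.isclose(joint_entropy(data), 2.0)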
def error(alpha, n):
    """Return the actual error and the estimated uncertainty (normalized)."""
    k = len(alpha)
    # Draw a random distribution from a Dirichlet prior, then sample counts.
    pvals = dirichlet(alpha)
    counts = multinomial(n, pvals)
    # Compare the true entropy of the sampled distribution with the estimate.
    h0 = sp_entropy(pvals)
    h, std = ndd.entropy(counts, k=k, return_std=True)
    return (h - h0) / h0, std / h0
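# Hedged usage sketch (not part of the original source): it assumes `dirichlet`
# and `multinomial` are the numpy.random samplers and `ndd` is the ndd
# entropy-estimation package, as the call signatures suggest.
def _example_error():
    # Symmetric Dirichlet prior over 100 states, 1000 multinomial draws.
    rel_err, rel_std = error(alpha=np.ones(100), n=1000)
    print(rel_err, rel_std)  # normalized estimation error and its uncertainty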
def js_divergence(P, Q):
    """Jensen-Shannon divergence between `P` and `Q`.

    Parameters
    ----------
    P, Q (np.ndarray)
        Two discrete distributions represented as 1D arrays. They are
        assumed to have the same support.

    Returns
    -------
    float
        The Jensen-Shannon divergence between `P` and `Q`.

    """
    M = 0.5 * (P + Q)
    return 0.5 * (sp_entropy(P, M, base=2) + sp_entropy(Q, M, base=2))
def test_entropy_execution(setup):
    rs = np.random.RandomState(0)
    a = rs.rand(10)

    t1 = tensor(a, chunk_size=4)
    r = entropy(t1)

    result = r.execute().fetch()
    expected = sp_entropy(a)
    np.testing.assert_array_almost_equal(result, expected)

    b = rs.rand(10)
    base = 3.1

    t2 = tensor(b, chunk_size=4)
    r = entropy(t1, t2, base)

    result = r.execute().fetch()
    expected = sp_entropy(a, b, base)
    np.testing.assert_array_almost_equal(result, expected)

    b = rs.rand(10)
    base = 3.1

    t2 = tensor(b, chunk_size=4)
    r = entropy(t1, t2, base)

    result = r.execute().fetch()
    expected = sp_entropy(a, b, base)
    np.testing.assert_array_almost_equal(result, expected)

    r = entropy(t1, t2, t1.sum())
    result = r.execute().fetch()
    expected = sp_entropy(a, b, a.sum())
    np.testing.assert_array_almost_equal(result, expected)

    with pytest.raises(ValueError):
        entropy(t1, t2[:7])
def testEntropyExecution(self):
    rs = np.random.RandomState(0)
    a = rs.rand(10)

    t1 = tensor(a, chunk_size=4)
    r = entropy(t1)

    result = self.executor.execute_tensor(r, concat=True)[0]
    expected = sp_entropy(a)
    np.testing.assert_array_almost_equal(result, expected)

    b = rs.rand(10)
    base = 3.1

    t2 = tensor(b, chunk_size=4)
    r = entropy(t1, t2, base)

    result = self.executor.execute_tensor(r, concat=True)[0]
    expected = sp_entropy(a, b, base)
    np.testing.assert_array_almost_equal(result, expected)

    b = rs.rand(10)
    base = 3.1

    t2 = tensor(b, chunk_size=4)
    r = entropy(t1, t2, base)

    result = self.executor.execute_tensor(r, concat=True)[0]
    expected = sp_entropy(a, b, base)
    np.testing.assert_array_almost_equal(result, expected)

    r = entropy(t1, t2, t1.sum())
    result = self.executor.execute_tensor(r, concat=True)[0]
    expected = sp_entropy(a, b, a.sum())
    np.testing.assert_array_almost_equal(result, expected)

    with self.assertRaises(ValueError):
        entropy(t1, t2[:7])
def test_entropy_random(n_samples, base, use_handle):
    handle, stream = get_handle(use_handle)

    clustering, _ = \
        generate_random_labels(lambda rng: rng.randint(0, 1000, n_samples))

    # generate unnormalized probabilities from clustering
    pk = np.bincount(clustering)

    # scipy's entropy uses probabilities
    sp_S = sp_entropy(pk, base=base)
    # we use a clustering
    S = entropy(np.array(clustering, dtype=np.int32), base, handle=handle)

    assert_almost_equal(S, sp_S, decimal=2)
def test_entropy_random(n_samples, base, use_handle):
    if has_scipy():
        from scipy.stats import entropy as sp_entropy
    else:
        pytest.skip('Skipping test_entropy_random because Scipy is missing')

    handle, stream = get_handle(use_handle)

    clustering, _, _, _ = \
        generate_random_labels(lambda rng: rng.randint(0, 1000, n_samples))

    # generate unnormalized probabilities from clustering
    pk = np.bincount(clustering)

    # scipy's entropy uses probabilities
    sp_S = sp_entropy(pk, base=base)
    # we use a clustering
    S = entropy(np.array(clustering, dtype=np.int32), base, handle=handle)

    assert_almost_equal(S, sp_S, decimal=2)
def D_KL(p, q, base=None):
    """Compute Kullback-Leibler divergence between PDs p and q."""
    D = sp_entropy(p, q, base=base)
    return D
def entropy(p, base=None):
    """Compute entropy of probability distribution p."""
    H = sp_entropy(p, base=base)
    return H
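# Hedged usage sketch (not part of the original source): it assumes `sp_entropy`
# is scipy.stats.entropy, so unnormalized inputs are normalized and results
# are in nats unless `base` is given.
def _example_divergence_wrappers():
    p = [0.5, 0.5]
    q = [0.9, 0.1]
    print(entropy(p, base=2))   # 1.0 bit for a fair coin
    print(D_KL(p, q, base=2))   # KL(p || q) in bits, > 0 since p != q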
def scipy_entropy(counts, k):  # pylint: disable=unused-argument
    """scipy.stats.entropy() execution time"""
    start = time.time()
    _ = sp_entropy(counts)
    end = time.time()
    return end - start, 0