def jsd(freqs1, freqs2, validate=False):
    """Calculate Jensen–Shannon divergence between two probability distributions.

    Parameters
    ----------
    freqs1 : one dimensional array
        row vector frequencies, sum to 1
    freqs2 : one dimensional array
        row vector frequencies, sum to 1
    validate : bool
        if True, check that both inputs have the same shape, are one
        dimensional and each sum to 1 before computing

    Returns
    -------
    The summed ``safe_p_log_p`` terms of the mean of the two distributions
    minus the mean of the summed ``safe_p_log_p`` terms of each
    distribution.

    Raises
    ------
    AssertionError
        If ``validate`` is True and any of the checks fails.
    """
    # Convert input arrays into numpy arrays
    freqs1 = array(freqs1)
    freqs2 = array(freqs2)

    if validate:
        assert_equal(
            freqs1.shape, freqs2.shape, err_msg="freqs1/freqs2 mismatched shape"
        )
        # Raise explicitly instead of using ``assert`` statements: those are
        # removed when Python runs with -O, which would silently skip the
        # dimension checks even though the caller asked for validation.
        if freqs1.ndim != 1:
            raise AssertionError("freqs1 has incorrect dimension")
        if freqs2.ndim != 1:
            raise AssertionError("freqs2 has incorrect dimension")
        assert_allclose(sum(freqs1), 1, err_msg="invalid freqs1")
        assert_allclose(sum(freqs2), 1, err_msg="invalid freqs2")

    # entropy of the mean distribution
    H_mn = safe_p_log_p(freqs1 / 2 + freqs2 / 2).sum()
    # mean of the two individual entropies
    mn_H = sum([sum(i) for i in map(safe_p_log_p, [freqs1, freqs2])]) / 2
    return H_mn - mn_H
def jsd(freqs1, freqs2, validate=False):
    """Calculate Jensen–Shannon divergence between two probability distributions.

    Parameters
    ----------
    freqs1 : one dimensional array
        row vector frequencies, sum to 1
    freqs2 : one dimensional array
        row vector frequencies, sum to 1
    validate : bool
        if True, check that both inputs have the same shape, are one
        dimensional and pass ``validate_freqs_array`` before computing

    Returns
    -------
    The summed ``safe_p_log_p`` terms of the mean of the two distributions
    minus the mean of the summed ``safe_p_log_p`` terms of each
    distribution.

    Raises
    ------
    AssertionError
        If ``validate`` is True and any of the checks fails. A ValueError
        from ``validate_freqs_array`` is re-raised as AssertionError.
    """
    # Convert input arrays into numpy arrays
    freqs1 = array(freqs1)
    freqs2 = array(freqs2)

    if validate:
        assert_equal(
            freqs1.shape, freqs2.shape, err_msg="freqs1/freqs2 mismatched shape"
        )
        # Raise explicitly instead of using ``assert`` statements: those are
        # removed when Python runs with -O, which would silently skip the
        # dimension checks even though the caller asked for validation.
        if freqs1.ndim != 1:
            raise AssertionError("freqs1 has incorrect dimension")
        if freqs2.ndim != 1:
            raise AssertionError("freqs2 has incorrect dimension")
        try:
            validate_freqs_array(freqs1)
            validate_freqs_array(freqs2)
        except ValueError as err:
            # keep a single failure type for every validation problem
            raise AssertionError("freqs not valid") from err

    # entropy of the mean distribution
    H_mn = safe_p_log_p(freqs1 / 2 + freqs2 / 2).sum()
    # mean of the two individual entropies
    mn_H = sum([sum(i) for i in map(safe_p_log_p, [freqs1, freqs2])]) / 2
    return H_mn - mn_H
def entropy_terms(self):
    """Apply the safe log operation to every element of ``self.array``.

    Returns
    -------
    entropies : array
        Has same dimension as self.array with safe log operation applied.
        The result is passed through ``self.template.wrap`` before being
        returned.
    """
    return self.template.wrap(safe_p_log_p(self.array))
def test_safe_p_log_p(self):
    """safe_p_log_p: should handle pos/neg/zero/empty arrays"""
    # ordinary non-negative input, including a zero entry
    valid = array([[4, 0, 8], [2, 16, 4]])
    expected_valid = array([[-8, 0, -24], [-2, -64, -8]])
    assert_equal(safe_p_log_p(valid), expected_valid)

    # an all-zero array maps to all zeros
    zeros = array([[0, 0], [0, 0]])
    expected_zeros = array([[0, 0], [0, 0]])
    assert_equal(safe_p_log_p(zeros), expected_zeros)

    # a negative entry is an error
    negative = array([-4])
    with self.assertRaises(FloatingPointError):
        safe_p_log_p(negative)

    # integer input yields a floating point result
    got = safe_p_log_p(array([3]))
    assert_allclose(got, array([-4.75488750]))

    # empty in, empty out
    empty = array([])
    assert_equal(safe_p_log_p(empty), array([]))
def jsd(freqs1, freqs2, validate=False):
    """Calculate Jensen–Shannon divergence between two probability distributions.

    Parameters
    ----------
    freqs1 : one dimensional array
        row vector frequencies, sum to 1
    freqs2 : one dimensional array
        row vector frequencies, sum to 1
    validate : bool
        if True, check that both inputs have the same shape, are one
        dimensional and pass ``validate_freqs_array`` before computing

    Returns
    -------
    float
        The ``fsum`` of the ``safe_p_log_p`` terms of the mean distribution
        minus the mean of the ``fsum`` of the terms of each distribution.
        A negative result within 1e-10 of zero is returned as 0.0.

    Raises
    ------
    AssertionError
        If ``validate`` is True and any of the checks fails. A ValueError
        from ``validate_freqs_array`` is re-raised as AssertionError.
    ArithmeticError
        If the result is negative by more than the 1e-10 tolerance.
    """
    # Convert input arrays into numpy arrays
    freqs1 = array(freqs1)
    freqs2 = array(freqs2)

    if validate:
        assert_equal(
            freqs1.shape, freqs2.shape, err_msg="freqs1/freqs2 mismatched shape"
        )
        # Raise explicitly instead of using ``assert`` statements: those are
        # removed when Python runs with -O, which would silently skip the
        # dimension checks even though the caller asked for validation.
        if freqs1.ndim != 1:
            raise AssertionError("freqs1 has incorrect dimension")
        if freqs2.ndim != 1:
            raise AssertionError("freqs2 has incorrect dimension")
        try:
            validate_freqs_array(freqs1)
            validate_freqs_array(freqs2)
        except ValueError as err:
            # keep a single failure type for every validation problem
            raise AssertionError("freqs not valid") from err

    # entropy of the mean distribution
    H_mn = fsum(safe_p_log_p(freqs1 / 2 + freqs2 / 2))
    # mean of the two individual entropies
    mn_H = fsum([fsum(i) for i in map(safe_p_log_p, [freqs1, freqs2])]) / 2
    jsd_ = H_mn - mn_H

    if jsd_ < 0:
        if isclose(jsd_, 0, atol=1e-10):
            # tiny negative values are treated as rounding noise; clamp to
            # a float zero so the return type stays consistent
            jsd_ = 0.0
        else:
            raise ArithmeticError(
                f"{jsd_} is negative and below defined precision threshold"
            )

    return jsd_
def entropy(self):
    """Return the Shannon entropy (log2) of each position.

    Computed by summing the ``safe_p_log_p`` terms of ``self.array``
    along axis 1.
    """
    return safe_p_log_p(self.array).sum(axis=1)