def relative_entropy_terms(self, background=None):
    """Compute per-motif relative entropy terms against a background.

    Parameters
    ----------
    background : dict or None
        Mapping of motif to background probability, e.g.
        {motif_1: prob_1, motif_2: prob_2, ...}. Motifs of this object
        that are missing from the mapping are given probability 0.
        When None, every motif gets the same probability
        (1 / number of motifs).

    Returns
    -------
    DictArray
        The result of ``self.template.wrap`` applied to the computed terms.

    Notes
    -----
    Each term is ``background * (log(background) - log(self.array))``,
    with the logs taken by ``safe_log``. The background vector is passed
    to ``validate_freqs_array`` before use.
    """
    motifs = self.motifs
    if background is not None:
        # order the supplied probabilities to match this object's motifs
        bg_freqs = array([background.get(motif, 0) for motif in motifs])
    else:
        # equifrequent default
        count = len(motifs)
        bg_freqs = array([1 / count] * count)

    validate_freqs_array(bg_freqs)
    log_ratio = safe_log(bg_freqs) - safe_log(self.array)
    return self.template.wrap(bg_freqs * log_ratio)
def __init__(self, data, motifs, row_indices=None):
    """Build the array as floats, then validate it as frequencies.

    Parameters
    ----------
    data
        Values forwarded to the parent constructor.
    motifs
        Motif labels forwarded to the parent constructor.
    row_indices
        Optional row labels forwarded to the parent constructor.

    Notes
    -----
    ``validate_freqs_array`` is applied along axis 0 when the stored
    array is one dimensional, otherwise along axis 1.
    """
    super(MotifFreqsArray, self).__init__(data, motifs, row_indices, dtype=float)
    # a single row is validated along its only axis; otherwise row-wise
    if self.array.ndim == 1:
        axis = 0
    else:
        axis = 1
    validate_freqs_array(self.array, axis=axis)
def __init__(self, data, motifs, row_indices=None, background=None):
    """Create a PSSM from counts, frequencies or ready-made scores.

    Parameters
    ----------
    data
        Array-like with rows summed along axis 1. It is interpreted as:

        - counts, when the minimum is >= 0 and the maximum is > 1; each
          row is divided by its row sum to give frequencies;
        - frequencies, when all values are >= 0 and every non-NaN row
          sum is close to 1; scores are then
          ``safe_log(data) - safe_log(background)``;
        - scores, when it has both negative and positive values; it is
          stored unchanged.
    motifs
        Motif labels, forwarded to the parent constructor.
    row_indices
        Optional row labels, forwarded to the parent constructor.
    background
        Background frequencies, one per motif. Only used when the data
        are frequencies (or counts converted to frequencies). Defaults
        to equal frequencies.

    Raises
    ------
    ValueError
        If the data fit none of the three interpretations above.
    AssertionError
        If the number of background values differs from the number of
        motifs.
    """
    data = numpy.array(data)
    totals = data.sum(axis=1)

    # counts data: non-negative with at least one value above 1
    if 0 <= data.min() and 1 < data.max():
        # vstack turns the row sums into a column so each row is
        # divided by its own total
        data = data / numpy.vstack(totals)
        totals = data.sum(axis=1)

    # rows whose sum is NaN are ignored when checking the sums equal 1
    finite_totals = totals[~numpy.isnan(totals)]
    is_freqs = (data >= 0).all() and numpy.allclose(finite_totals, 1)

    if is_freqs:
        if background is None:
            background = numpy.ones(len(motifs), dtype=float) / len(motifs)
        self._background = numpy.array(background)
        assert len(background) == len(
            motifs
        ), "Mismatch between number of motifs and the background"
        validate_freqs_array(self._background)
        scores = safe_log(data) - safe_log(self._background)
    elif data.min() < 0 < data.max():
        # already pssm data, stored unchanged
        scores = data
    else:
        raise ValueError("PSSM has been supplied invalid data")

    super(PSSM, self).__init__(scores, motifs, row_indices=row_indices, dtype=float)
    self._indices = numpy.arange(self.shape[0])  # used for scoring
def jsd(freqs1, freqs2, validate=False):
    """Calculate the Jensen–Shannon divergence between two distributions.

    Parameters
    ----------
    freqs1 : one dimensional array
        row vector frequencies, sum to 1
    freqs2 : one dimensional array
        row vector frequencies, sum to 1
    validate : bool
        if True, check that both inputs have the same shape, are one
        dimensional and pass ``validate_freqs_array``

    Returns
    -------
    float
        ``safe_p_log_p`` summed over the averaged distribution, minus the
        mean of the same sum for each input. A negative result within
        1e-10 of zero is reported as 0.0.

    Raises
    ------
    AssertionError
        If validate is True and an input is not one dimensional or fails
        ``validate_freqs_array`` (the ValueError is re-raised as an
        AssertionError). A shape mismatch is reported by ``assert_equal``.
    ArithmeticError
        If the result is negative by more than the 1e-10 tolerance.

    Notes
    -----
    The divergence cannot be negative, but floating point rounding can
    produce a tiny negative value for near-identical inputs. ``fsum`` is
    used to limit accumulated rounding error and the tolerance check
    absorbs what remains.
    """
    # imported here so this function does not rely on module-level names
    from math import fsum

    from numpy import isclose

    # Convert input arrays into numpy arrays
    freqs1 = array(freqs1)
    freqs2 = array(freqs2)

    if validate:
        assert_equal(
            freqs1.shape, freqs2.shape, err_msg="freqs1/freqs2 mismatched shape"
        )
        assert freqs1.ndim == 1, "freqs1 has incorrect dimension"
        assert freqs2.ndim == 1, "freqs2 has incorrect dimension"
        try:
            validate_freqs_array(freqs1)
            validate_freqs_array(freqs2)
        except ValueError as err:
            raise AssertionError("freqs not valid") from err

    H_mn = fsum(safe_p_log_p(freqs1 / 2 + freqs2 / 2))
    mn_H = fsum([fsum(i) for i in map(safe_p_log_p, [freqs1, freqs2])]) / 2
    jsd_ = H_mn - mn_H

    if jsd_ < 0 and isclose(jsd_, 0, atol=1e-10):
        # rounding noise only
        jsd_ = 0.0
    elif jsd_ < 0:
        raise ArithmeticError(
            f"{jsd_} is negative and below defined precision threshold"
        )
    return jsd_
def jsd(freqs1, freqs2, validate=False):
    """Return the Jensen–Shannon divergence of two probability vectors.

    Parameters
    ----------
    freqs1 : one dimensional array
        row vector frequencies, sum to 1
    freqs2 : one dimensional array
        row vector frequencies, sum to 1
    validate : bool
        if True, check that both inputs have the same shape, are one
        dimensional and pass ``validate_freqs_array``

    Returns
    -------
    The ``safe_p_log_p`` sum for the averaged distribution minus the mean
    of the sums for each input. A negative value within 1e-10 of zero is
    returned as 0.

    Raises
    ------
    AssertionError
        If validate is True and an input is not one dimensional or fails
        ``validate_freqs_array``.
    ArithmeticError
        If the result is negative by more than the 1e-10 tolerance.
    """
    # Convert input arrays into numpy arrays
    freqs1 = array(freqs1)
    freqs2 = array(freqs2)

    if validate:
        assert_equal(
            freqs1.shape, freqs2.shape, err_msg="freqs1/freqs2 mismatched shape"
        )
        assert freqs1.ndim == 1, "freqs1 has incorrect dimension"
        assert freqs2.ndim == 1, "freqs2 has incorrect dimension"
        try:
            for freqs in (freqs1, freqs2):
                validate_freqs_array(freqs)
        except ValueError as err:
            raise AssertionError("freqs not valid") from err

    midpoint = freqs1 / 2 + freqs2 / 2
    entropy_of_mean = fsum(safe_p_log_p(midpoint))
    entropy_1 = fsum(safe_p_log_p(freqs1))
    entropy_2 = fsum(safe_p_log_p(freqs2))
    mean_of_entropies = fsum([entropy_1, entropy_2]) / 2

    divergence = entropy_of_mean - mean_of_entropies

    # written as "not < 0" so a NaN result is passed through unchanged
    if not divergence < 0:
        return divergence
    if isclose(divergence, 0, atol=1e-10):
        # rounding noise only
        return 0
    raise ArithmeticError(
        f"{divergence} is negative and below defined precision threshold"
    )