Example #1
0
    def relative_entropy_terms(self, background=None):
        """
        Computes the per-motif relative entropy terms, row-wise, against a
        background distribution and returns them wrapped by self.template.

        Parameters
        ----------
        background : dict or None
            {motif_1: prob_1, motif_2: prob_2, ...}, the background
            distribution. Motifs in self.motifs that are missing from the
            dict are given a probability of 0.

        Returns
        -------
        DictArray

        Notes
        -----
        If background is None, an equifrequent distribution over
        self.motifs is used. The background must pass
        validate_freqs_array. Each term is
        background * (safe_log(background) - safe_log(self.array)).
        """
        if background is not None:
            # order the supplied probabilities to match self.motifs
            bg_freqs = array([background.get(motif, 0) for motif in self.motifs])
        else:
            motif_count = len(self.motifs)
            bg_freqs = array([1 / motif_count] * motif_count)

        validate_freqs_array(bg_freqs)
        log_difference = safe_log(bg_freqs) - safe_log(self.array)
        return self.template.wrap(bg_freqs * log_difference)
Example #2
0
 def __init__(self, data, motifs, row_indices=None):
     """Initialise the parent with float data, then validate the result
     as frequencies.

     Parameters
     ----------
     data
         passed through to the parent constructor
     motifs
         passed through to the parent constructor
     row_indices
         passed through to the parent constructor

     Notes
     -----
     validate_freqs_array is applied to self.array along axis 0 when the
     array is one dimensional, otherwise along axis 1.
     """
     super(MotifFreqsArray, self).__init__(
         data, motifs, row_indices, dtype=float
     )
     # a 1D array has only axis 0 to validate along; otherwise use axis 1
     if self.array.ndim == 1:
         check_axis = 0
     else:
         check_axis = 1
     validate_freqs_array(self.array, axis=check_axis)
Example #3
0
    def __init__(self, data, motifs, row_indices=None, background=None):
        """Build a PSSM from counts, frequencies or ready-made PSSM values.

        The kind of data is detected from its values:

        - counts: minimum >= 0 and maximum > 1. Each row is divided by its
          row sum and then handled as frequencies.
        - frequencies: all values >= 0 and every non-NaN row sum close to 1.
          The stored values are safe_log(data) - safe_log(background).
        - PSSM values: the data contains both negative and positive values
          and is stored unchanged.

        Parameters
        ----------
        data
            two dimensional array-like, rows are summed along axis 1
        motifs
            the motifs, one per column; passed to the parent constructor
        row_indices
            passed to the parent constructor
        background
            sequence of background frequencies, one per motif. Only used
            for counts/frequencies data, where None means equifrequent.

        Raises
        ------
        ValueError
            if the data matches none of the three kinds above
        AssertionError
            if the number of background values differs from the number of
            motifs
        """
        data = numpy.array(data)
        totals = data.sum(axis=1)

        # non-negative values with a maximum above 1 are taken to be counts
        if 0 <= data.min() and 1 < data.max():
            # normalise each row by its total to get frequencies
            data = data / numpy.vstack(totals)
            totals = data.sum(axis=1)

        # frequencies: nothing negative, and the non-NaN rows sum to 1
        is_freqs = (data >= 0).all() and numpy.allclose(
            totals[~numpy.isnan(totals)], 1
        )

        if not is_freqs:
            # only data with values either side of zero is accepted as a PSSM
            if not (data.min() < 0 < data.max()):
                raise ValueError("PSSM has been supplied invalid data")

            super(PSSM, self).__init__(
                data, motifs, row_indices=row_indices, dtype=float
            )
            self._indices = numpy.arange(self.shape[0])  # used for scoring
            return

        # frequencies data, convert to a PSSM relative to the background
        if background is None:
            background = numpy.ones(len(motifs), dtype=float) / len(motifs)
        self._background = numpy.array(background)
        assert len(background) == len(
            motifs
        ), "Mismatch between number of motifs and the background"
        validate_freqs_array(self._background)

        scores = safe_log(data) - safe_log(self._background)
        super(PSSM, self).__init__(
            scores, motifs, row_indices=row_indices, dtype=float
        )
        self._indices = numpy.arange(self.shape[0])  # used for scoring
Example #4
0
def jsd(freqs1, freqs2, validate=False):
    """calculate Jensen–Shannon divergence between two probability distributions

    Parameters
    ----------
    freqs1 : one dimensional array
        row vector frequencies, sum to 1
    freqs2 : one dimensional array
        row vector frequencies, sum to 1
    validate : bool
        if True, check that both inputs have the same shape, are one
        dimensional and pass validate_freqs_array

    Returns
    -------
    The sum of safe_p_log_p over the mid-point distribution
    (freqs1 / 2 + freqs2 / 2) minus the mean of the sums of safe_p_log_p
    over freqs1 and freqs2. A result in the range [-1e-10, 0) is treated
    as rounding error and returned as 0.0.

    Raises
    ------
    AssertionError
        if validate is True and any of the checks fail
    """
    # Convert input arrays into numpy arrays
    freqs1 = array(freqs1)
    freqs2 = array(freqs2)

    if validate:
        assert_equal(freqs1.shape,
                     freqs2.shape,
                     err_msg="freqs1/freqs2 mismatched shape")
        assert freqs1.ndim == 1, "freqs1 has incorrect dimension"
        assert freqs2.ndim == 1, "freqs2 has incorrect dimension"
        try:
            validate_freqs_array(freqs1)
            validate_freqs_array(freqs2)
        except ValueError as err:
            # report invalid frequencies the same way as the other checks
            raise AssertionError("freqs not valid") from err

    H_mn = safe_p_log_p(freqs1 / 2 + freqs2 / 2).sum()
    mn_H = sum([sum(i) for i in map(safe_p_log_p, [freqs1, freqs2])]) / 2
    jsd_ = H_mn - mn_H

    # The divergence is non-negative by definition, but for identical (or
    # nearly identical) inputs floating point rounding can leave the
    # difference just below zero. Returning that would mislead callers,
    # e.g. one taking the square root of the result would get nan.
    if -1e-10 <= jsd_ < 0:
        jsd_ = 0.0

    return jsd_
Example #5
0
def jsd(freqs1, freqs2, validate=False):
    """calculate Jensen–Shannon divergence between two probability distributions

    Parameters
    ----------
    freqs1 : one dimensional array
        row vector frequencies, sum to 1
    freqs2 : one dimensional array
        row vector frequencies, sum to 1
    validate : bool
        if True, check that both inputs have the same shape, are one
        dimensional and pass validate_freqs_array

    Returns
    -------
    The divergence. A negative result within 1e-10 of zero is returned as 0.

    Raises
    ------
    AssertionError
        if validate is True and any of the checks fail
    ArithmeticError
        if the result is negative by more than the 1e-10 tolerance
    """
    # work on numpy arrays whatever was passed in
    freqs1 = array(freqs1)
    freqs2 = array(freqs2)

    if validate:
        assert_equal(
            freqs1.shape, freqs2.shape, err_msg="freqs1/freqs2 mismatched shape"
        )
        assert freqs1.ndim == 1, "freqs1 has incorrect dimension"
        assert freqs2.ndim == 1, "freqs2 has incorrect dimension"
        try:
            for freqs in (freqs1, freqs2):
                validate_freqs_array(freqs)
        except ValueError as err:
            raise AssertionError("freqs not valid") from err

    # term for the mid-point of the two distributions
    midpoint = freqs1 / 2 + freqs2 / 2
    midpoint_total = fsum(safe_p_log_p(midpoint))

    # mean of the terms for the individual distributions
    input_totals = [fsum(safe_p_log_p(freqs)) for freqs in (freqs1, freqs2)]
    mean_total = fsum(input_totals) / 2

    jsd_ = midpoint_total - mean_total
    if jsd_ < 0:
        # a tiny negative value is rounding error, anything larger is not
        if not isclose(jsd_, 0, atol=1e-10):
            raise ArithmeticError(
                f"{jsd_} is negative and below defined precision threshold")
        jsd_ = 0

    return jsd_