Example 1
    def _score(self, alignment):
        if self.use_seq_weights:
            seq_weights = alignment.get_seq_weights()
        else:
            seq_weights = [1.0] * len(alignment.msa)

        if self.bg_distribution is None:
            # Estimate bg distribution from this alignment
            q = weighted_freq_count_pseudocount((aa for seq in alignment.msa for aa in seq), seq_weights, PSEUDOCOUNT)
        else:
            q = self.bg_distribution

        scores = []
        for i in range(len(alignment.msa[0])):
            col = get_column(i, alignment.msa)
            n_gaps = col.count("-")
            assert n_gaps < len(col)
            if self.gap_cutoff != 1 and n_gaps / len(col) > self.gap_cutoff:
                score = self.SCORE_OVER_GAP_CUTOFF
            else:
                score = self._score_col(col, seq_weights, q)
                if self.use_gap_penalty:
                    # vn_entropy has this commented out for some reason
                    score *= weighted_gap_penalty(col, seq_weights)
            scores.append(score)
        return scores
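These scorers lean on several helpers that the excerpt does not show: get_column, weighted_gap_penalty, weighted_freq_count_pseudocount, aa_to_index, and PSEUDOCOUNT. The sketch below is only a guess that is consistent with how they are called here (the names and semantics follow Capra and Singh's score_conservation.py, which this code appears to derive from); the alphabet order, the pseudocount value, and the with_gap default are assumptions. Note the equal-weight fallback: it is what makes the background estimate in _score work, since that call pairs per-sequence weights with a per-residue generator.

import numpy as np

# Assumed module-level constants; neither is shown in this excerpt.
amino_acids = "ACDEFGHIKLMNPQRSTVWY-"  # 20 amino acids plus the gap symbol
aa_to_index = {aa: i for i, aa in enumerate(amino_acids)}
PSEUDOCOUNT = 1e-7  # value is a placeholder

def get_column(i, msa):
    # The i-th alignment column as a list of characters.
    return [seq[i] for seq in msa]

def weighted_gap_penalty(col, seq_weights):
    # 1 minus the weighted fraction of sequences gapped at this column.
    gap_sum = sum(w for aa, w in zip(col, seq_weights) if aa == "-")
    return 1.0 - gap_sum / sum(seq_weights)

def weighted_freq_count_pseudocount(col, seq_weights, pc, with_gap=True):
    # Weighted, pseudocounted symbol frequencies for a column, returned as
    # a numpy array (Examples 4 and 5 do array arithmetic on the result).
    col = list(col)  # _score passes a generator when estimating the background
    if len(seq_weights) != len(col):
        seq_weights = [1.0] * len(col)  # fall back to equal weights on mismatch
    symbols = amino_acids if with_gap else amino_acids[:-1]
    counts = np.full(len(symbols), float(pc))
    for aa, w in zip(col, seq_weights):
        idx = aa_to_index.get(aa)
        if idx is not None and idx < len(symbols):
            counts[idx] += w
    return counts / counts.sum()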
Example 2
    def _score_col(self, col, seq_weights):
        """
        Calculate the relative entropy of column col with respect to a
        partition of the amino acids. Similar to Williamson '95. See
        shannon_entropy() for more general info.
        """
        if len(self.bg_distribution) == len(self.property_partition):
            prop_bg_freq = self.bg_distribution
        else:
            # XXX: shouldn't we sum the bg distribution frequencies instead of using
            # some fixed prop bg freq?
            prop_bg_freq = self.prop_bg_freq

        fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

        # sum the aa frequencies to get the property frequencies
        prop_fc = [0.] * len(self.property_partition)
        for p in range(len(self.property_partition)):
            for aa in self.property_partition[p]:
                prop_fc[p] += fc[aa_to_index[aa]]

        d = 0.
        for i in range(len(prop_fc)):
            if prop_fc[i] and prop_bg_freq[i]:
                d += prop_fc[i] * math.log(prop_fc[i] / prop_bg_freq[i], 2)

        # Convert score so that it's between 0 and 1.
        # XXX: why is relative entropy assumed to be bounded?
        d /= math.log(len(prop_fc), 2)

        return d
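Examples 2 and 3 both index fc through self.property_partition, which groups the amino acids by physicochemical class. The excerpt does not define it; the grouping below is the Mirny-style partition used in score_conservation.py and should be treated as an assumption here (as should treating the gap as its own class). self.prop_bg_freq, the per-class background fallback in Example 2, is likewise not reproduced.

# Assumed partition: aliphatic, aromatic, polar, positively charged,
# negatively charged, conformationally special, gap.
property_partition = [
    ["A", "V", "L", "I", "M", "C"],
    ["F", "W", "Y", "H"],
    ["S", "T", "N", "Q"],
    ["K", "R"],
    ["D", "E"],
    ["G", "P"],
    ["-"],
]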
Example 3
    def _score_col(self, col, seq_weights):
        """
        Calculate the entropy of a column col relative to a partition of the
        amino acids. Similar to Mirny '99.
        """
        fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

        # sum the aa frequencies to get the property frequencies
        prop_fc = [0.] * len(self.property_partition)
        for p in range(len(self.property_partition)):
            for aa in self.property_partition[p]:
                prop_fc[p] += fc[aa_to_index[aa]]

        h = 0.
        for pfc_i in prop_fc:
            if pfc_i:
                h -= pfc_i * math.log(pfc_i)

        # Convert score so that it's between 0 and 1.
        # Recall that shannon entropy is between 0 and log(number of values with nonzero freq)
        # XXX: Why involve len(col) if we have a pseudocount?
        h /= math.log(min(len(self.property_partition), len(col)))

        # Convert score so that 1 is conserved, and 0 is not.
        return 1 - h
Example 4
    def _score_col(self, col, seq_weights):
        """
        Calculate the relative entropy of the column distribution against
        the background distribution self.bg_distribution. This is similar
        to the approach proposed in Wang and Samudrala 06.
        """
        q = self.bg_distribution

        with_gap = (len(q) == 21)
        fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT, with_gap)
        assert len(fc) == len(q)

        d = np.sum(fc * np.log(fc/q))

        # Convert score so that it's between 0 and 1.
        # XXX: why is relative entropy assumed to be bounded?
        d /= np.log(len(fc))

        return d
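The XXX question has a real answer: relative entropy is not bounded by log(len(fc)) in general. With a fixed background q it is bounded by the largest log(1/q[i]), so dividing by np.log(len(fc)) is a heuristic rescaling rather than a true normalization. A tiny self-contained check of the arithmetic, with made-up distributions:

import numpy as np

fc = np.array([0.7, 0.1, 0.1, 0.1])  # made-up column distribution
q = np.full(4, 0.25)                 # uniform background
d = np.sum(fc * np.log(fc / q)) / np.log(len(fc))  # same steps as _score_col
print(round(d, 3))  # 0.322; grows as the column deviates from q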
Example 5
    def _score_col(self, col, seq_weights, q):
        """
        Return the Jensen-Shannon Divergence for the column with the background
        distribution q.
        """
        lamb1 = self.lambda_prior
        lamb2 = 1 - self.lambda_prior

        # get frequency distribution
        with_gap = len(q) == 21
        pc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT, with_gap)
        assert len(pc) == len(q)

        # make the r distribution (mixture of pc and q)
        r = lamb1 * pc + lamb2 * q

        # sum relative entropies
        d1 = lamb1 * sum(pc[i] * math.log(pc[i] / r[i], 2) for i in range(len(pc)) if pc[i])
        d2 = lamb2 * sum(q[i] * math.log(q[i] / r[i], 2) for i in range(len(q)) if q[i])

        return d1 + d2
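A quick standalone check of the mixture arithmetic, assuming lambda_prior = 0.5: with equal mixing weights the Jensen-Shannon divergence is symmetric in pc and q, and with base-2 logs it cannot exceed 1.

import math

def jsd(pc, q, lamb=0.5):
    # Same steps as _score_col above, on plain lists of frequencies.
    r = [lamb * p + (1 - lamb) * b for p, b in zip(pc, q)]
    d1 = lamb * sum(p * math.log(p / ri, 2) for p, ri in zip(pc, r) if p)
    d2 = (1 - lamb) * sum(b * math.log(b / ri, 2) for b, ri in zip(q, r) if b)
    return d1 + d2

print(jsd([0.7, 0.1, 0.1, 0.1], [0.25] * 4))  # ~0.152
print(jsd([0.25] * 4, [0.7, 0.1, 0.1, 0.1]))  # same value: symmetric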
Example 6
    def _score_col(self, col, seq_weights):
        """
        Calculate the Shannon entropy of the column col.
        The score is normalized to lie between zero and one (equivalent to
        taking the log base to be the alphabet size); see p. 13 of Valdar 02
        for details. The information score 1 - h is returned for consistency
        with the other scores.
        """
        fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

        h = 0.
        for fc_i in fc:
            if fc_i:
                h -= fc_i * math.log(fc_i)

        # Convert score so that it's between 0 and 1.
        # Recall that shannon entropy is between 0 and log(number of values with nonzero freq)
        # XXX: Why involve len(col) if we have a pseudocount?
        h /= math.log(len(fc))  # alternative: math.log(min(len(fc), len(col)))

        # Convert score so that 1 is conserved, and 0 is not.
        return 1 - h
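Boundary behaviour of the normalized entropy score, using the helper sketch from Example 1 (so the exact values depend on the assumed PSEUDOCOUNT): a fully conserved column scores near 1 and a maximally mixed one near 0.

import math

w = [1.0] * 21
fc = weighted_freq_count_pseudocount("A" * 21, w, PSEUDOCOUNT)
h = -sum(f * math.log(f) for f in fc if f) / math.log(len(fc))
print(round(1 - h, 2))  # ~1.0: single residue throughout the column
fc = weighted_freq_count_pseudocount(amino_acids, w, PSEUDOCOUNT)
h = -sum(f * math.log(f) for f in fc if f) / math.log(len(fc))
print(round(1 - h, 2))  # ~0.0: every symbol equally frequent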