Beispiel #1
0
    def commonNucleotidesMultiplicativeDistance(a, b):
        """
        Compute a distance between two clusters from the nucleotides found at
        the offsets the clusters have in common.

        For every shared offset, the smaller of the C{OffsetBases}
        multiplicative and homogeneous distances is turned into a similarity
        (one minus that distance). The result is one minus the mean of those
        similarities.

        @param a: A C{ReadCluster} instance.
        @param b: A C{ReadCluster} instance.
        @return: The C{float} distance between C{a} and C{b}. This is C{1.0}
            when the clusters share no offsets. Otherwise it is expected to
            lie in [0.0, 1.0], assuming the per-offset C{OffsetBases}
            distances do.
        """
        first = a.nucleotides
        second = b.nucleotides
        shared = set(first) & set(second)

        if not shared:
            # Nothing can be compared, so report the maximum distance.
            return 1.0

        # One similarity per shared offset, based on whichever of the two
        # per-offset distance measures is smaller.
        similarities = [
            1.0 - min(
                OffsetBases.multiplicativeDistance(first[offset],
                                                   second[offset]),
                OffsetBases.homogeneousDistance(first[offset],
                                                second[offset]))
            for offset in shared
        ]

        return 1.0 - (sum(similarities) / len(shared))
Beispiel #2
0
    def mergeDescriptionWithOffsetScores(self, a, b, distance):
        """
        Make a textual description of a cluster merge, including per-offset
        score information.

        The description starts with four summary lines (the merge and its
        distance, one line per cluster, and the match count). These are
        followed by one line for each offset covered by either cluster. Each
        per-offset line shows the offset plus one, the base counts of each
        cluster at that offset (or C{'-'} when the cluster does not cover
        it), the per-offset score and a C{'*'} match marker.

        The score and match marker are only filled in for offsets covered by
        both clusters. The score is the smaller of the C{OffsetBases}
        multiplicative and homogeneous distances. The marker is shown when
        the C{&} of the two C{commonest} values is truthy.

        @param a: An C{int} cluster number.
        @param b: An C{int} cluster number.
        @param distance: The C{float} [0.0, 1.0] distance between the clusters.
        @return: A C{str} side-by-side descriptions of clusters C{a} and C{b}.
            If neither cluster covers any offset, only the four summary lines
            are returned.
        """
        cluster1 = self.readClusters[a]
        cluster2 = self.readClusters[b]
        nucleotides1 = cluster1.nucleotides
        nucleotides2 = cluster2.nucleotides

        result1 = []
        result2 = []
        offsetScores = []
        matches = []
        sharedCount = matchCount = 0

        allOffsets = sorted(set(nucleotides1) | set(nucleotides2))

        for offset in allOffsets:
            in1 = offset in nucleotides1
            in2 = offset in nucleotides2

            result1.append(
                nucleotides1[offset].baseCountsToStr() if in1 else '-')
            result2.append(
                nucleotides2[offset].baseCountsToStr() if in2 else '-')

            if in1 and in2:
                bases1 = nucleotides1[offset]
                bases2 = nucleotides2[offset]
                sharedCount += 1

                if bases1.commonest & bases2.commonest:
                    matches.append('*')
                    matchCount += 1
                else:
                    matches.append('')

                offsetScores.append('%.3f' % min(
                    OffsetBases.multiplicativeDistance(bases1, bases2),
                    OffsetBases.homogeneousDistance(bases1, bases2)))
            else:
                # Scores and matches only make sense for shared offsets.
                matches.append('')
                offsetScores.append('')

        # Column widths. The 'or [0]' fallback keeps max() from raising
        # ValueError when there are no offsets at all.
        result1Width = max([len(line) for line in result1] or [0])
        result2Width = max([len(line) for line in result2] or [0])
        offsetScoresWidth = max([len(line) for line in offsetScores] or [0])

        # NOTE(review): s() is defined elsewhere; presumably it returns a
        # plural suffix for the given count - confirm.
        summary = [
            ('Merging clusters %d and %d with distance %.2f' %
             (a, b, distance)),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (a, len(cluster1.reads), s(len(cluster1.reads)),
              len(nucleotides1), s(len(nucleotides1)))),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (b, len(cluster2.reads), s(len(cluster2.reads)),
              len(nucleotides2), s(len(nucleotides2)))),
            ('%d matches out of %d shared offsets' %
             (matchCount, sharedCount)),
        ]

        # Offsets are displayed with one added (presumably converting from
        # 0-based to 1-based - confirm).
        details = [
            '  %d: %*s    %*s    %*s    %s' %
            (offset + 1, result1Width, line1, result2Width, line2,
             offsetScoresWidth, offsetScore, match)
            for (offset, line1, line2, offsetScore, match
                 ) in zip(allOffsets, result1, result2, offsetScores, matches)
        ]

        return '\n'.join(summary + details)