def commonNucleotidesMultiplicativeDistance(a, b):
    """
    Measure the distance between two clusters using only the offsets that
    both of them cover.

    At each shared offset the smaller of
    C{OffsetBases.multiplicativeDistance} and
    C{OffsetBases.homogeneousDistance} is taken as the per-offset distance.
    The result is one minus the mean per-offset similarity (where the
    similarity at an offset is one minus its distance).

    @param a: A C{ReadCluster} instance.
    @param b: A C{ReadCluster} instance.
    @return: The C{float} [0.0, 1.0] distance between C{a} and C{b}. This
        is 1.0 when the clusters have no offsets in common.
    """
    first = a.nucleotides
    second = b.nucleotides
    shared = set(first) & set(second)

    # With nothing in common the clusters are considered maximally distant
    # (this also avoids dividing by zero below).
    if not shared:
        return 1.0

    def offsetSimilarity(offset):
        # Similarity at one shared offset, based on whichever of the two
        # distance measures is the smaller.
        return 1.0 - min(
            OffsetBases.multiplicativeDistance(first[offset],
                                               second[offset]),
            OffsetBases.homogeneousDistance(first[offset],
                                            second[offset]))

    meanSimilarity = sum(map(offsetSimilarity, shared)) / len(shared)
    return 1.0 - meanSimilarity
def mergeDescriptionWithOffsetScores(self, a, b, distance):
    """
    Make a textual description of a cluster merge, including per-offset
    score information.

    The description starts with four summary lines (the merge and its
    distance, the read and offset counts of each cluster, and the number
    of matching shared offsets). These are followed by one line for each
    offset covered by either cluster, showing the 1-based offset, the base
    counts string of each cluster ('-' if the cluster does not cover the
    offset), the per-offset score, and a '*' match marker. The score and
    marker are only non-empty for offsets covered by both clusters.

    @param a: An C{int} cluster number.
    @param b: An C{int} cluster number.
    @param distance: The C{float} [0.0, 1.0] distance between the clusters.
    @return: A C{str} side-by-side descriptions of clusters C{a} and C{b}.
        If neither cluster covers any offset, only the four summary lines
        are returned.
    """
    cluster1 = self.readClusters[a]
    cluster2 = self.readClusters[b]
    nucleotides1 = cluster1.nucleotides
    nucleotides2 = cluster2.nucleotides

    # Parallel lists, each with one entry per offset in allOffsets.
    result1 = []
    result2 = []
    offsetScores = []
    matches = []
    sharedCount = matchCount = 0

    allOffsets = sorted(set(nucleotides1) | set(nucleotides2))

    for offset in allOffsets:
        in1 = offset in nucleotides1
        in2 = offset in nucleotides2

        result1.append(
            nucleotides1[offset].baseCountsToStr() if in1 else '-')
        result2.append(
            nucleotides2[offset].baseCountsToStr() if in2 else '-')

        if in1 and in2:
            bases1 = nucleotides1[offset]
            bases2 = nucleotides2[offset]
            sharedCount += 1
            # A match is flagged when combining the two 'commonest' values
            # with & gives something true.
            if bases1.commonest & bases2.commonest:
                matches.append('*')
                matchCount += 1
            else:
                matches.append('')
            # The score is the smaller of the two distance measures.
            offsetScores.append('%.3f' % min(
                OffsetBases.multiplicativeDistance(bases1, bases2),
                OffsetBases.homogeneousDistance(bases1, bases2)))
        else:
            # No score or match marker unless both clusters cover the
            # offset.
            matches.append('')
            offsetScores.append('')

    # max() raises ValueError on an empty sequence, which would happen
    # here if neither cluster covered any offset. Fall back to a width of
    # zero in that case (there are then no per-offset lines to format).
    result1Width = max(len(line) for line in result1) if result1 else 0
    result2Width = max(len(line) for line in result2) if result2 else 0
    offsetScoresWidth = (
        max(len(line) for line in offsetScores) if offsetScores else 0)

    return '\n'.join([
        ('Merging clusters %d and %d with distance %.2f' %
         (a, b, distance)),

        ('Cluster %d has %d read%s, covering %d offset%s' %
         (a,
          len(cluster1.reads), s(len(cluster1.reads)),
          len(cluster1.nucleotides), s(len(cluster1.nucleotides)))),

        ('Cluster %d has %d read%s, covering %d offset%s' %
         (b,
          len(cluster2.reads), s(len(cluster2.reads)),
          len(cluster2.nucleotides), s(len(cluster2.nucleotides)))),

        ('%d matches out of %d shared offsets' %
         (matchCount, sharedCount)),
    ] + [
        # Offsets are shown 1-based; %*s right-aligns each column to the
        # widest entry in that column.
        ' %d: %*s %*s %*s %s' % (offset + 1,
                                 result1Width, line1,
                                 result2Width, line2,
                                 offsetScoresWidth, offsetScore,
                                 match)
        for (offset, line1, line2, offsetScore, match) in
        zip(allOffsets, result1, result2, offsetScores, matches)
    ])