Exemple #1
0
 def testInitialized(self):
     """
     A newly created instance that has been given a single base must
     report that base as its only commonest base.
     """
     bases = OffsetBases()
     bases.incorporateBase('a')
     observed = bases.commonest
     self.assertEqual({'a'}, observed)
Exemple #2
0
    def commonNucleotidesMultiplicativeDistance(a, b):
        """
        Measure the distance between two clusters using the offsets they
        have in common.

        At each shared offset the per-offset distance is the smaller of
        C{OffsetBases.multiplicativeDistance} and
        C{OffsetBases.homogeneousDistance} for the bases the two clusters
        hold at that offset. The per-offset similarity is 1.0 minus that
        distance, and the overall distance is 1.0 minus the mean per-offset
        similarity.

        @param a: A C{ReadCluster} instance.
        @param b: A C{ReadCluster} instance.
        @return: The C{float} distance between C{a} and C{b}. This is 1.0
            when the clusters have no offsets in common.
        """
        basesA = a.nucleotides
        basesB = b.nucleotides
        shared = set(basesA) & set(basesB)

        if not shared:
            # There is nothing to compare, so treat the clusters as being
            # as far apart as possible.
            return 1.0

        def similarityAt(offset):
            # The similarity of the two clusters at one shared offset.
            return 1.0 - min(
                OffsetBases.multiplicativeDistance(basesA[offset],
                                                   basesB[offset]),
                OffsetBases.homogeneousDistance(basesA[offset],
                                                basesB[offset]))

        totalSimilarity = sum(similarityAt(offset) for offset in shared)
        return 1.0 - (totalSimilarity / len(shared))
Exemple #3
0
 def testDraw(self):
     """
     When a second, different, base is added so that two bases are tied,
     both bases must be in the commonest set.
     """
     bases = OffsetBases()
     for base in 'ag':
         bases.incorporateBase(base)
     self.assertEqual({'a', 'g'}, bases.commonest)
Exemple #4
0
 def testAddNewInstanceOfCommonest(self):
     """
     Incorporating a second copy of the base that is already the
     commonest must not change the commonest set.
     """
     bases = OffsetBases()
     for _ in range(2):
         bases.incorporateBase('a')
     self.assertEqual({'a'}, bases.commonest)
Exemple #5
0
    def testHomogeneousDistanceThreeQuarters(self):
        """
        If the two instances between them hold one each of four different
        nucleotides, homogeneousDistance must return 0.75.
        """
        first = OffsetBases()
        for base in 'acg':
            first.incorporateBase(base)

        second = OffsetBases()
        second.incorporateBase('t')

        distance = homogeneousDistance(first, second)
        self.assertAlmostEqual(0.75, distance)
Exemple #6
0
 def testAddOne(self):
     """
     After one read is added to a cluster, the cluster must hold that
     read and the nucleotide found at the read's significant offset.
     """
     read = AlignedRead('id', '---ACGT--')
     read.setSignificantOffsets([3])

     cluster = ReadCluster()
     cluster.add(read)

     basesAt3 = OffsetBases()
     basesAt3.incorporateBase('A')
     expectedNucleotides = {3: basesAt3}

     self.assertEqual({read}, cluster.reads)
     self.assertEqual(expectedNucleotides, cluster.nucleotides)
     self.assertEqual(1, len(cluster))
Exemple #7
0
    def testHomogeneousDistanceOneHalf(self):
        """
        If the combined nucleotides of the two instances are split evenly
        between two bases, homogeneousDistance must return 0.5.
        """
        first = OffsetBases()
        for base in 'aag':
            first.incorporateBase(base)

        second = OffsetBases()
        second.incorporateBase('g')

        distance = homogeneousDistance(first, second)
        self.assertAlmostEqual(0.5, distance)
Exemple #8
0
    def testAddTwo(self):
        """
        After two reads are added to a cluster, the cluster must hold both
        reads and the nucleotides found at their significant offsets.
        """
        firstRead = AlignedRead('id1', '---ACCC--')
        firstRead.setSignificantOffsets([3, 4])

        secondRead = AlignedRead('id2', '---TG--')
        secondRead.setSignificantOffsets([3])

        cluster = ReadCluster()
        for read in (firstRead, secondRead):
            cluster.add(read)

        self.assertEqual({firstRead, secondRead}, cluster.reads)

        # Offset 3 is significant in both reads, offset 4 only in the first.
        basesAt3 = OffsetBases()
        for base in 'AT':
            basesAt3.incorporateBase(base)
        basesAt4 = OffsetBases()
        basesAt4.incorporateBase('C')

        expectedNucleotides = {3: basesAt3, 4: basesAt4}
        self.assertEqual(expectedNucleotides, cluster.nucleotides)
Exemple #9
0
    def commonNucleotidesAgreementDistance(a, b):
        """
        Measure the distance between two clusters as 1.0 minus the fraction
        of their shared offsets at which they are considered to match.

        A shared offset counts as matching in either of two situations:

            (a) The sets of most frequent nucleotides of the two clusters at
                the offset have at least one nucleotide in common. E.g., if
                one cluster has 6 x A and 2 x C at an offset and the other
                has 3 x A and 3 x G, the offset matches because A is among
                the commonest nucleotides of both clusters.

            (b) The commonest sets are disjoint, but the value returned by
                C{OffsetBases.highestFrequenciesMultiple} for the two sets
                of bases is at least C{ReadCluster.MIN_COMMONEST_MULTIPLE}.
                E.g., if one cluster has 100 x A and the other only 3 x G,
                the As would overwhelm the Gs in a merged cluster, so the
                clusters are not treated as really differing at the offset.

        @param a: A C{ReadCluster} instance.
        @param b: A C{ReadCluster} instance.
        @return: The C{float} distance between C{a} and C{b}. This is 1.0
            when the clusters have no offsets in common.
        """
        basesA = a.nucleotides
        basesB = b.nucleotides
        shared = set(basesA) & set(basesB)

        if not shared:
            return 1.0

        matchCount = 0
        for offset in shared:
            atA = basesA[offset]
            atB = basesB[offset]

            if atA.commonest & atB.commonest:
                # Case (a): the clusters agree on a commonest nucleotide.
                matchCount += 1
                continue

            multiple = OffsetBases.highestFrequenciesMultiple(atA, atB)
            # Sanity check. A None multiple would mean that only one
            # nucleotide is present, but then the commonest sets would have
            # intersected and case (a) would have applied.
            assert multiple is not None
            if multiple >= ReadCluster.MIN_COMMONEST_MULTIPLE:
                # Case (b): one nucleotide numerically dominates.
                matchCount += 1

        return 1.0 - (matchCount / len(shared))
Exemple #10
0
    def testHighestFrequenciesMultipleOneNucleotide(self):
        """
        If both instances contain only a single, identical, nucleotide then
        highestFrequenciesMultiple must return None.
        """
        first = OffsetBases()
        first.incorporateBase('a')

        second = OffsetBases()
        second.incorporateBase('a')

        multiple = highestFrequenciesMultiple(first, second)
        self.assertIs(None, multiple)
Exemple #11
0
    def testMultiplicativeDistanceOne(self):
        """
        If the two instances have no base in common, multiplicativeDistance
        must return 1.0.
        """
        first = OffsetBases()
        for base in 'tc':
            first.incorporateBase(base)

        second = OffsetBases()
        second.incorporateBase('a')

        distance = multiplicativeDistance(first, second)
        self.assertEqual(1.0, distance)
Exemple #12
0
    def testHomogeneousDistanceZero(self):
        """
        If both instances contain only the same single base,
        homogeneousDistance must return zero.
        """
        first = OffsetBases()
        for _ in range(2):
            first.incorporateBase('a')

        second = OffsetBases()
        second.incorporateBase('a')

        distance = homogeneousDistance(first, second)
        self.assertAlmostEqual(0.0, distance)
Exemple #13
0
    def testMultiplicativeDistanceZero(self):
        """
        If both instances contain only the same single base,
        multiplicativeDistance must return zero.
        """
        first = OffsetBases()
        for _ in range(2):
            first.incorporateBase('a')

        second = OffsetBases()
        second.incorporateBase('a')

        distance = multiplicativeDistance(first, second)
        self.assertEqual(0.0, distance)
Exemple #14
0
    def testMultiplicativeDistanceOneHalf(self):
        """
        If both instances hold the same two bases in equal proportion,
        multiplicativeDistance must return 0.5 (because
        (0.5 * 0.5) + (0.5 * 0.5) = 0.5, and 1 - 0.5 = 0.5).
        """
        first = OffsetBases()
        for base in 'ta':
            first.incorporateBase(base)

        second = OffsetBases()
        for base in 'ta':
            second.incorporateBase(base)

        distance = multiplicativeDistance(first, second)
        self.assertEqual(0.5, distance)
Exemple #15
0
 def testCreateDrawThenResolveIt(self):
     """
     If a draw between bases is created and further bases are then added
     so that one base becomes the most frequent, the commonest set must
     contain just that base.
     """
     bases = OffsetBases()
     for base in 'agtt':
         bases.incorporateBase(base)
     self.assertEqual({'t'}, bases.commonest)
Exemple #16
0
    def testFullMerge(self):
        """
        Analyzing three reads with a cutoff of 0.7 must give a single
        cluster holding the expected nucleotide counts at each offset.
        """
        firstRead = AlignedRead('id1', '---TCCCG-')
        firstRead.setSignificantOffsets([3, 4, 5, 6, 7])

        secondRead = AlignedRead('id2', '---AC--A')
        secondRead.setSignificantOffsets([3, 4, 7])

        thirdRead = AlignedRead('id3', '---TG--G')
        thirdRead.setSignificantOffsets([7])

        clusters = ReadClusters()
        for read in (firstRead, secondRead, thirdRead):
            clusters.add(read)

        # Unpacking into a 1-tuple also checks that only one cluster results.
        (cluster, ) = clusters.analyze(0.7)

        def basesFrom(nucleotides):
            # Make an OffsetBases holding the given bases, added in order.
            bases = OffsetBases()
            for nucleotide in nucleotides:
                bases.incorporateBase(nucleotide)
            return bases

        expectedNucleotides = {
            3: basesFrom('TA'),
            4: basesFrom('CC'),
            5: basesFrom('C'),
            6: basesFrom('C'),
            7: basesFrom('GAG'),
        }

        self.assertEqual(expectedNucleotides, cluster.nucleotides)
Exemple #17
0
    def testMerge(self):
        """
        After one cluster is merged into another, the receiving cluster must
        hold the combined nucleotides of both clusters.
        """
        firstRead = AlignedRead('id1', '---ACCC--')
        firstRead.setSignificantOffsets([3, 4])
        secondRead = AlignedRead('id2', '---TG--')
        secondRead.setSignificantOffsets([3])
        receiver = ReadCluster()
        for read in (firstRead, secondRead):
            receiver.add(read)

        thirdRead = AlignedRead('id3', '---GGCC--')
        thirdRead.setSignificantOffsets([3, 4, 6])
        other = ReadCluster()
        other.add(thirdRead)

        def basesFrom(nucleotides):
            # Make an OffsetBases holding the given bases, added in order.
            bases = OffsetBases()
            for nucleotide in nucleotides:
                bases.incorporateBase(nucleotide)
            return bases

        expectedNucleotides = {
            3: basesFrom('ATG'),
            4: basesFrom('CG'),
            6: basesFrom('C'),
        }

        receiver.merge(other)

        self.assertEqual(expectedNucleotides, receiver.nucleotides)
Exemple #18
0
    def testHomogeneousDistancePointOne(self):
        """
        If the most frequent nucleotide makes up 0.9 of the combined
        nucleotides of the two instances (here 90 x 'a' out of 100),
        homogeneousDistance must return 0.1.
        """
        first = OffsetBases()
        for base, count in (('a', 40), ('g', 3)):
            for _ in range(count):
                first.incorporateBase(base)

        second = OffsetBases()
        for base, count in (('a', 50), ('g', 7)):
            for _ in range(count):
                second.incorporateBase(base)

        distance = homogeneousDistance(first, second)
        self.assertAlmostEqual(0.1, distance)
Exemple #19
0
    def testMultiplicativeDistanceFiveNinths(self):
        """
        If both instances hold AAT, multiplicativeDistance must return 4/9
        because (2/3 * 2/3) + (1/3 * 1/3) = 5/9 (and 1 - 5/9 = 4/9).
        """
        first = OffsetBases()
        for base in 'aat':
            first.incorporateBase(base)

        second = OffsetBases()
        for base in 'aat':
            second.incorporateBase(base)

        distance = multiplicativeDistance(first, second)
        self.assertAlmostEqual(4 / 9, distance)
Exemple #20
0
    def mergeDescriptionWithOffsetScores(self, a, b, distance):
        """
        Make a textual description of a cluster merge, including per-offset
        score information.

        The description has four summary lines followed by one line for each
        offset present in either cluster. An offset line shows the (1-based)
        offset, the base counts of each cluster at the offset (or '-' if the
        cluster does not cover the offset) and, for offsets covered by both
        clusters, a score and a match marker. The score is the smaller of
        C{OffsetBases.multiplicativeDistance} and
        C{OffsetBases.homogeneousDistance} for the two sets of bases. The
        marker is '*' if the sets of commonest bases of the two clusters
        have a base in common.

        @param a: An C{int} cluster number.
        @param b: An C{int} cluster number.
        @param distance: The C{float} [0.0, 1.0] distance between the clusters.
        @return: A C{str} side-by-side descriptions of clusters C{a} and C{b}.
        """
        cluster1 = self.readClusters[a]
        cluster2 = self.readClusters[b]
        nucleotides1 = cluster1.nucleotides
        nucleotides2 = cluster2.nucleotides

        result1 = []
        result2 = []
        offsetScores = []
        matches = []
        sharedCount = matchCount = 0

        allOffsets = sorted(set(nucleotides1) | set(nucleotides2))

        for offset in allOffsets:
            in1 = offset in nucleotides1
            in2 = offset in nucleotides2

            result1.append(
                nucleotides1[offset].baseCountsToStr() if in1 else '-')
            result2.append(
                nucleotides2[offset].baseCountsToStr() if in2 else '-')

            if in1 and in2:
                bases1 = nucleotides1[offset]
                bases2 = nucleotides2[offset]
                sharedCount += 1

                if bases1.commonest & bases2.commonest:
                    matches.append('*')
                    matchCount += 1
                else:
                    matches.append('')

                offsetScores.append('%.3f' % min(
                    OffsetBases.multiplicativeDistance(bases1, bases2),
                    OffsetBases.homogeneousDistance(bases1, bases2)))
            else:
                # Only one cluster covers this offset, so there is nothing
                # to score or match.
                matches.append('')
                offsetScores.append('')

        # The 'or [0]' guards against max() raising ValueError when neither
        # cluster has any offsets (in which case no offset lines are made).
        result1Width = max([len(line) for line in result1] or [0])
        result2Width = max([len(line) for line in result2] or [0])
        offsetScoresWidth = max([len(line) for line in offsetScores] or [0])

        return '\n'.join([
            ('Merging clusters %d and %d with distance %.2f' %
             (a, b, distance)),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (a, len(cluster1.reads), s(len(cluster1.reads)),
              len(nucleotides1), s(len(nucleotides1)))),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (b, len(cluster2.reads), s(len(cluster2.reads)),
              len(nucleotides2), s(len(nucleotides2)))),
            ('%d matches out of %d shared offsets' %
             (matchCount, sharedCount)),
        ] + [
            '  %d: %*s    %*s    %*s    %s' %
            (offset + 1, result1Width, line1, result2Width, line2,
             offsetScoresWidth, offsetScore, match)
            for (offset, line1, line2, offsetScore, match
                 ) in zip(allOffsets, result1, result2, offsetScores, matches)
        ])
Exemple #21
0
    def testHighestFrequenciesMultipleTwo(self):
        """
        If, in the combined counts of two OffsetBases instances, the most
        common nucleotide occurs twice as often as the second most common,
        highestFrequenciesMultiple must return 2.0.
        """
        first = OffsetBases()
        for base in 'aaaaaac':
            first.incorporateBase(base)

        second = OffsetBases()
        for base in 'aaaaccccgt':
            second.incorporateBase(base)

        # Between them, the instances hold 10 x 'a' and 5 x 'c'.
        multiple = highestFrequenciesMultiple(first, second)
        self.assertEqual(2.0, multiple)
Exemple #22
0
    def mergeDescription(self, a, b, distance):
        """
        Make a textual description of a cluster merge.

        The description has four summary lines followed by one line for each
        offset present in either cluster. An offset line shows the (1-based)
        offset, the base counts of each cluster at the offset (or '-' if the
        cluster does not cover the offset) and a match marker. For offsets
        covered by both clusters the marker is '*' if the sets of commonest
        bases of the two clusters have a base in common, or '+' if they do
        not but C{OffsetBases.highestFrequenciesMultiple} is at least
        C{ReadCluster.MIN_COMMONEST_MULTIPLE}. Both kinds of marked offset
        are counted as matches.

        @param a: An C{int} cluster number.
        @param b: An C{int} cluster number.
        @param distance: The C{float} [0.0, 1.0] distance between the clusters.
        @return: A C{str} side-by-side descriptions of clusters C{a} and C{b}.
        """
        cluster1 = self.readClusters[a]
        cluster2 = self.readClusters[b]
        nucleotides1 = cluster1.nucleotides
        nucleotides2 = cluster2.nucleotides

        result1 = []
        result2 = []
        matches = []
        sharedCount = matchCount = 0

        allOffsets = sorted(set(nucleotides1) | set(nucleotides2))

        for offset in allOffsets:
            in1 = offset in nucleotides1
            in2 = offset in nucleotides2

            result1.append(
                nucleotides1[offset].baseCountsToStr() if in1 else '-')
            result2.append(
                nucleotides2[offset].baseCountsToStr() if in2 else '-')

            if not (in1 and in2):
                # Only one cluster covers this offset, so it cannot match.
                matches.append('')
                continue

            bases1 = nucleotides1[offset]
            bases2 = nucleotides2[offset]
            sharedCount += 1

            if bases1.commonest & bases2.commonest:
                matches.append('*')
                matchCount += 1
            else:
                multiple = OffsetBases.highestFrequenciesMultiple(
                    bases1, bases2)
                # Sanity: the multiple cannot be None because that
                # would mean only one nucleotide is present, and that
                # case is dealt with by the first part of this if/then.
                assert multiple is not None
                if multiple >= ReadCluster.MIN_COMMONEST_MULTIPLE:
                    matchCount += 1
                    matches.append('+')
                else:
                    matches.append('')

        # The 'or [0]' guards against max() raising ValueError when neither
        # cluster has any offsets (in which case no offset lines are made).
        result1Width = max([len(line) for line in result1] or [0])
        result2Width = max([len(line) for line in result2] or [0])

        return '\n'.join([
            ('Merging clusters %d and %d with distance %.2f' %
             (a, b, distance)),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (a, len(cluster1.reads), s(len(cluster1.reads)),
              len(nucleotides1), s(len(nucleotides1)))),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (b, len(cluster2.reads), s(len(cluster2.reads)),
              len(nucleotides2), s(len(nucleotides2)))),
            ('%d matches out of %d shared offsets' %
             (matchCount, sharedCount)),
        ] + [
            '  %d: %*s    %*s    %s' %
            (offset + 1, result1Width, line1, result2Width, line2, match)
            for (offset, line1, line2,
                 match) in zip(allOffsets, result1, result2, matches)
        ])
Exemple #23
0
    def testHighestFrequenciesMultipleDraw(self):
        """
        If, in the combined counts of two OffsetBases instances, the two
        most common nucleotides occur equally often,
        highestFrequenciesMultiple must return 1.0.
        """
        first = OffsetBases()
        for base in 'aaac':
            first.incorporateBase(base)

        second = OffsetBases()
        for base in 'acccgt':
            second.incorporateBase(base)

        # Between them, the instances hold 4 x 'a' and 4 x 'c'.
        multiple = highestFrequenciesMultiple(first, second)
        self.assertEqual(1.0, multiple)
Exemple #24
0
    def analyzeReferenceId(self, referenceId, alignmentFile, outputDir):
        """
        Analyze the given reference id in the given alignment file (if an
        alignment to the reference id is present).

        A consensus is seeded with the reference sequence. Reads are then
        repeatedly taken from a priority queue. A read whose mismatch
        fraction is no greater than C{self.cutoff} is written to the
        consensus SAM file and its bases at its significant offsets are
        incorporated into the consensus. Any other read is written to the
        non-consensus SAM file. Reads with no significant offsets are written
        to both SAM files. Finally the reference bases are removed from the
        consensus and a per-offset report and a FASTA consensus are written.

        The files written (all in C{outputDir}) are
        C{reference-consensus.sam}, C{reference-non-consensus.sam},
        C{reference-consensus.txt}, and C{reference-consensus.fasta}.

        @param referenceId: The C{str} id of the reference sequence to analyze.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        @return: C{None} if C{self.initialReferenceIdAnalysis} returns a false
            value (the original documentation gives this as C{referenceId}
            not being present in C{alignmentFile} or no significant offsets
            being found). Else, a C{dict} with keys C{'consensusRead'} (the
            consensus as a C{Read}) and C{'significantOffsets'}.
        """
        analysis = self.initialReferenceIdAnalysis(referenceId, alignmentFile,
                                                   outputDir)

        if not analysis:
            return

        (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset,
         readsAtOffset, significantOffsets, samFilter,
         paddedSAM) = analysis

        insignificantOffsets = set(
            range(genomeLength)) - set(significantOffsets)

        reference = self.referenceGenomes[referenceId]
        referenceSequence = reference.sequence

        # Seed the consensus with the reference base at every offset. These
        # seed bases are removed again once all reads have been processed.
        consensus = []
        for base in referenceSequence:
            ob = OffsetBases()
            ob.incorporateBase(base)
            consensus.append(ob)

        readQueue = PriorityQueue()
        self.updatePriorityQueue(readQueue, alignedReads, consensus,
                                 significantOffsets)

        consensusFilename = join(outputDir, 'reference-consensus.sam')
        nonConsensusFilename = join(outputDir, 'reference-non-consensus.sam')
        self.report('    Writing consensus SAM to', consensusFilename)
        self.report('    Writing non-consensus SAM to', nonConsensusFilename)

        with samfile(alignmentFile) as sam:
            consensusAlignment = AlignmentFile(consensusFilename,
                                               mode='w',
                                               template=sam)
            nonConsensusAlignment = AlignmentFile(nonConsensusFilename,
                                                  mode='w',
                                                  template=sam)

        # Make sure the output files are closed even if something goes
        # wrong while the reads are being processed.
        try:
            # Reads with no significant offsets get written to both output
            # files.
            readsWithNoSignificantOffsetsCount = 0
            for read in alignedReads:
                if not read.significantOffsets:
                    readsWithNoSignificantOffsetsCount += 1
                    consensusAlignment.write(read.alignment)
                    nonConsensusAlignment.write(read.alignment)

                    for offset in insignificantOffsets:
                        base = read.base(offset)
                        if base is not None:
                            consensus[offset].incorporateBase(base)

            self.report(
                '    %d read%s did not overlap any significant offsets' %
                (readsWithNoSignificantOffsetsCount,
                 s(readsWithNoSignificantOffsetsCount)))

            readsMatchingConsensusCount = readsNotMatchingConsensusCount = 0
            cutoff = self.cutoff
            while readQueue:
                mismatchFraction, _ = readQueue.lowestPriority()
                read = readQueue.pop()
                if mismatchFraction <= cutoff:
                    # We want this read. Incorporate it into the consensus.
                    readsMatchingConsensusCount += 1
                    consensusAlignment.write(read.alignment)
                    affectedReads = set()
                    for offset in read.significantOffsets:
                        readBase = read.base(offset)
                        consensus[offset].incorporateBase(readBase)
                        # The consensus changed at this offset, so any
                        # still-queued read covering it must be re-queued.
                        for readAtOffset in readsAtOffset[offset]:
                            if readAtOffset in readQueue:
                                affectedReads.add(readAtOffset)
                    self.updatePriorityQueue(readQueue, affectedReads,
                                             consensus, significantOffsets)
                else:
                    readsNotMatchingConsensusCount += 1
                    nonConsensusAlignment.write(read.alignment)
        finally:
            consensusAlignment.close()
            nonConsensusAlignment.close()

        self.report(
            '    %d read%s matched the consensus, %d did not.' %
            (readsMatchingConsensusCount, s(readsMatchingConsensusCount),
             readsNotMatchingConsensusCount))

        # Remove the reference bases from the consensus.
        for offset, base in enumerate(referenceSequence):
            consensus[offset].unincorporateBase(base)

        consensusInfoFilename = join(outputDir, 'reference-consensus.txt')
        self.report('    Writing consensus info to', consensusInfoFilename)

        with open(consensusInfoFilename, 'w') as fp:
            consensusSequence = []
            for offset in range(genomeLength):
                commonest = consensus[offset].commonest
                referenceBase = referenceSequence[offset]

                if len(commonest) > 1:
                    nucleotides = ' Nucleotides: %s' % (
                        consensus[offset].baseCountsToStr())
                else:
                    nucleotides = ''

                if referenceBase in commonest:
                    consensusBase = referenceBase
                elif commonest:
                    # Either a single non-reference base is the commonest,
                    # or there is a draw in which the reference base is not
                    # included (and so cannot be used to break the draw).
                    # Taking the minimum makes the choice in a draw
                    # reproducible from run to run.
                    #
                    # NOTE(review): the earlier code left consensusBase
                    # unassigned in the draw case and used '-' when there
                    # was a single non-reference commonest base. That did
                    # not match its own comments. Confirm this is the
                    # intended behavior.
                    consensusBase = min(commonest)
                else:
                    # Nothing in the included reads covers this offset.
                    consensusBase = '-'

                consensusSequence.append(consensusBase)

                mismatch = '' if referenceBase == consensusBase else (
                    ' Mismatch (reference has %s)' % referenceBase)

                print('%d: %s%s%s' %
                      (offset + 1, consensusBase, mismatch, nucleotides),
                      file=fp)

        consensusRead = Read('gready-consensus-%s' % referenceId,
                             ''.join(consensusSequence))
        consensusFilename = join(outputDir, 'reference-consensus.fasta')
        self.report('    Writing gready consensus info to', consensusFilename)
        Reads([consensusRead]).save(consensusFilename)

        return {
            'consensusRead': consensusRead,
            'significantOffsets': significantOffsets,
        }
Exemple #25
0
    def testMultiplicativeDistanceSixSixteenths(self):
        """
        If both instances hold AACT, multiplicativeDistance must return
        10/16 because (0.5 * 0.5) + (0.25 * 0.25) + (0.25 * 0.25) =
        4/16 + 1/16 + 1/16 = 6/16 (and 1 - 6/16 = 10/16).
        """
        first = OffsetBases()
        for base in 'aact':
            first.incorporateBase(base)

        second = OffsetBases()
        for base in 'aact':
            second.incorporateBase(base)

        distance = multiplicativeDistance(first, second)
        self.assertEqual(10 / 16, distance)
Exemple #26
0
    def testMerge(self):
        """
        After another instance is merged in, the commonest set must reflect
        the combined base counts of both instances.
        """
        receiver = OffsetBases()
        for base in 'agtt':
            receiver.incorporateBase(base)

        other = OffsetBases()
        for base in 'gcc':
            other.incorporateBase(base)

        receiver.merge(other)

        # The combined counts are 2 x 'c', 2 x 'g', 2 x 't' and 1 x 'a'.
        self.assertEqual({'c', 'g', 't'}, receiver.commonest)