Ejemplo n.º 1
0
 def testTrimZero(self):
     """
     Trimming by zero must report success and leave the gap-stripped
     sequence intact.
     """
     read = AlignedRead('id', '---ACGTACGT--')
     trimmed = read.trim(0)
     self.assertTrue(trimmed)
     self.assertEqual('ACGTACGT', read.sequence)
Ejemplo n.º 2
0
 def testBaseNone(self):
     """
     Asking for the base at an offset that the read does not have must
     give C{None}.
     """
     read = AlignedRead('id', '---ACGTACGT--')
     base = read.base(0)
     self.assertEqual(None, base)
Ejemplo n.º 3
0
 def testSetSignificantOffsets(self):
     """
     Setting significant offsets must store the expected offset-to-base
     mapping (offset 2 below lies in the leading gap and is expected to
     be left out).
     """
     offsets = [2, 3, 4]
     expected = {3: 'A', 4: 'C'}
     read = AlignedRead('id', '---ACGTACGT--')
     read.setSignificantOffsets(offsets)
     self.assertEqual(expected, read.significantOffsets)
Ejemplo n.º 4
0
 def testTrim(self):
     """
     Trimming an eight-base read by two must succeed and leave the
     middle four bases.
     """
     read = AlignedRead('id', '---ACGTACGT--')
     self.assertEqual(8, len(read))
     trimmed = read.trim(2)
     self.assertTrue(trimmed)
     self.assertEqual('GTAC', read.sequence)
Ejemplo n.º 5
0
 def testAddOne(self):
     """
     A cluster that has had one read added must hold just that read,
     the nucleotide found at the read's significant offset, and have a
     length of one.
     """
     alignedRead = AlignedRead('id', '---ACGT--')
     alignedRead.setSignificantOffsets([3])
     cluster = ReadCluster()
     cluster.add(alignedRead)
     basesAt3 = OffsetBases()
     basesAt3.incorporateBase('A')
     expectedNucleotides = {3: basesAt3}
     self.assertEqual({alignedRead}, cluster.reads)
     self.assertEqual(expectedNucleotides, cluster.nucleotides)
     self.assertEqual(1, len(cluster))
Ejemplo n.º 6
0
    def testAddTwo(self):
        """
        A cluster that has had two reads added must hold both reads and
        the nucleotides found at their significant offsets.
        """
        first = AlignedRead('id1', '---ACCC--')
        first.setSignificantOffsets([3, 4])

        second = AlignedRead('id2', '---TG--')
        second.setSignificantOffsets([3])

        cluster = ReadCluster()
        for read in (first, second):
            cluster.add(read)

        self.assertEqual({first, second}, cluster.reads)

        # Offset 3 is significant for both reads, offset 4 only for the
        # first.
        basesAt3 = OffsetBases()
        for base in 'AT':
            basesAt3.incorporateBase(base)
        basesAt4 = OffsetBases()
        basesAt4.incorporateBase('C')

        expected = {3: basesAt3, 4: basesAt4}
        self.assertEqual(expected, cluster.nucleotides)
Ejemplo n.º 7
0
 def testTrimWithNegativeAmount(self):
     """
     The trim function must raise an AssertionError if the amount to trim
     is negative.
     """
     ar = AlignedRead('id', '---ACGTACGT--')
     # A raw string is needed here: the backslashes are regular expression
     # escapes, and in a plain string literal '\(' and '\.' are invalid
     # escape sequences that Python warns about.
     error = r'^Trim amount \(-4\) cannot be negative\.$'
     self.assertRaisesRegex(AssertionError, error, ar.trim, -4)
Ejemplo n.º 8
0
def parseCommandLineOptions(args, returnSignificantOffsets=True):
    """
    Process the parsed command-line options that addCommandLineOptions
    arranged for an ArgumentParser instance to accept.

    @param args: The result of calling C{parse_args} on an C{ArgumentParser}
        instance (the one that was passed to C{addCommandLineOptions}, unless
        we're testing).
    @param returnSignificantOffsets: If C{True} also compute the significant
        offsets, set them on every aligned read, and return them as a list
        (else that element of the return value will be C{None}).
    @raise ValueError: If more than one reference id was given, or if the
        SAM filter does not report exactly one reference length.
    @return: A C{tuple}: (genomeLength, alignedReads, paddedSAM,
        readCountAtOffset, baseCountAtOffset, readsAtOffset,
        significantOffsets).
    """
    samFilter = SAMFilter.parseFilteringOptions(args)

    if samFilter.referenceIds and len(samFilter.referenceIds) > 1:
        raise ValueError('Only one reference id can be given.')

    referenceLengths = samFilter.referenceLengths()

    if len(referenceLengths) != 1:
        raise ValueError(
            'If you do not specify a reference sequence with '
            '--referenceId, the SAM/BAM file must contain exactly one '
            'reference. But %s contains %d.' %
            (args.samfile, len(referenceLengths)))

    # There is exactly one reference. Only its length is needed here.
    _, genomeLength = referenceLengths.popitem()

    paddedSAM = PaddedSAM(samFilter)

    alignedReads = [AlignedRead(query.id, query.sequence)
                    for query in paddedSAM.queries()]

    readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
        genomeLength, alignedReads)

    significantOffsets = None
    if returnSignificantOffsets:
        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   args.minReads, args.homogeneousCutoff))
        for read in alignedReads:
            read.setSignificantOffsets(significantOffsets)

    return (genomeLength, alignedReads, paddedSAM, readCountAtOffset,
            baseCountAtOffset, readsAtOffset, significantOffsets)
Ejemplo n.º 9
0
    def testCommonOffsetsMaxFractionZero(self):
        """
        Two clusters whose significant offsets do not overlap must have a
        commonOffsetsMaxFraction of 0.0.
        """
        firstRead = AlignedRead('id1', '---TCCCG-')
        firstRead.setSignificantOffsets([5, 6])
        firstCluster = ReadCluster()
        firstCluster.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--A')
        secondRead.setSignificantOffsets([3, 4, 7])
        secondCluster = ReadCluster()
        secondCluster.add(secondRead)

        fraction = ReadCluster.commonOffsetsMaxFraction(firstCluster,
                                                        secondCluster)
        self.assertAlmostEqual(0.0, fraction)
Ejemplo n.º 10
0
    def testMultiplicativeDistanceZero(self):
        """
        Two clusters made from identical reads must have a multiplicative
        distance of 0.0.
        """
        readA = AlignedRead('id1', '-----CCG-')
        readA.setSignificantOffsets([5, 6, 7])

        readB = AlignedRead('id2', '-----CCG-')
        readB.setSignificantOffsets([5, 6, 7])

        clusters = ReadClusters()
        clusterA = clusters.add(readA)
        clusterB = clusters.add(readB)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(0.0, distance)
Ejemplo n.º 11
0
    def testMultiplicativeDistanceOneHalf(self):
        """
        The commonNucleotidesMultiplicativeDistance method must return 0.5
        for two single-read clusters that have the same base at one of
        their two common offsets and different bases at the other.
        """
        firstRead = AlignedRead('id1', '---TCTC-')
        firstRead.setSignificantOffsets([3, 4, 5, 6])
        firstCluster = ReadCluster()
        firstCluster.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--')
        secondRead.setSignificantOffsets([3, 4])
        secondCluster = ReadCluster()
        secondCluster.add(secondRead)

        distance = ReadCluster.commonNucleotidesMultiplicativeDistance(
            firstCluster, secondCluster)
        self.assertAlmostEqual(0.5, distance)
Ejemplo n.º 12
0
    def testNonZeroDistance(self):
        """
        The agreement distance between two clusters that disagree at some
        of their common offsets must be as expected.
        """
        firstRead = AlignedRead('id1', '---TCCCG-')
        firstRead.setSignificantOffsets([3, 4, 5, 6, 7])
        firstCluster = ReadCluster()
        firstCluster.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--A')
        secondRead.setSignificantOffsets([3, 4, 7])
        secondCluster = ReadCluster()
        secondCluster.add(secondRead)

        # Of the three common offsets (3, 4 and 7) only offset 3 matches.
        expected = 2 / 3
        distance = ReadCluster.commonNucleotidesAgreementDistance(
            firstCluster, secondCluster)
        self.assertAlmostEqual(expected, distance)
Ejemplo n.º 13
0
    def testMultiplicativeDistanceOne(self):
        """
        Two clusters that share no significant offsets must have a
        multiplicative distance of 1.0.
        """
        readA = AlignedRead('id1', '-----CCG-')
        readA.setSignificantOffsets([5, 6, 7])

        readB = AlignedRead('id2', '---TG--')
        readB.setSignificantOffsets([3, 4])

        clusters = ReadClusters()
        clusterA = clusters.add(readA)
        clusterB = clusters.add(readB)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(1.0, distance)
Ejemplo n.º 14
0
    def testDistanceOne(self):
        """
        The agreement distance between two clusters must be 1.0 when the
        nucleotides they have at their common significant offsets never
        match.
        """
        firstRead = AlignedRead('id1', '---ACCCG-')
        firstRead.setSignificantOffsets([3, 4, 5, 6, 7])
        firstCluster = ReadCluster()
        firstCluster.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--A')
        secondRead.setSignificantOffsets([3, 4, 7])
        secondCluster = ReadCluster()
        secondCluster.add(secondRead)

        distance = ReadCluster.commonNucleotidesAgreementDistance(
            firstCluster, secondCluster)
        self.assertEqual(1.0, distance)
Ejemplo n.º 15
0
    def testMultiplicativeDistanceOne(self):
        """
        The commonNucleotidesMultiplicativeDistance method must return 1.0
        for two clusters that share no significant offsets.
        """
        firstRead = AlignedRead('id1', '-----TC-')
        firstRead.setSignificantOffsets([5, 6])
        firstCluster = ReadCluster()
        firstCluster.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--')
        secondRead.setSignificantOffsets([3, 4])
        secondCluster = ReadCluster()
        secondCluster.add(secondRead)

        distance = ReadCluster.commonNucleotidesMultiplicativeDistance(
            firstCluster, secondCluster)
        self.assertAlmostEqual(1.0, distance)
Ejemplo n.º 16
0
    def testNonZeroCommonNucleotidesAgreementDistance(self):
        """
        The commonNucleotidesAgreementDistance method of a ReadClusters
        instance must give the expected distance for two clusters that
        disagree at some of their common offsets.
        """
        readA = AlignedRead('id1', '---TCCCG-')
        readA.setSignificantOffsets([3, 4, 5, 6, 7])

        readB = AlignedRead('id2', '---TG--A')
        readB.setSignificantOffsets([3, 4, 7])

        clusters = ReadClusters()
        clusterA = clusters.add(readA)
        clusterB = clusters.add(readB)

        # Of the three common offsets (3, 4 and 7) only offset 3 matches.
        expected = 2 / 3
        distance = clusters.commonNucleotidesAgreementDistance(clusterA,
                                                               clusterB)
        self.assertAlmostEqual(expected, distance)
Ejemplo n.º 17
0
    def testCommonOffsetsMaxFractionOneHalf(self):
        """
        The commonOffsetsMaxFraction method must return 0.5 when half
        the offsets of one cluster are in common with the offsets of another
        cluster.
        """
        read1 = AlignedRead('id1', '---TCCCG-')
        read1.setSignificantOffsets([5, 6])
        rc1 = ReadCluster()
        rc1.add(read1)

        # Offset 6 holds a real base in this read (not a gap) so that it is
        # unambiguously the one offset the two clusters share: one of the
        # first cluster's two offsets and one of the second cluster's four.
        read2 = AlignedRead('id2', '---TG-CA')
        read2.setSignificantOffsets([3, 4, 6, 7])
        rc2 = ReadCluster()
        rc2.add(read2)

        # The larger of the two fractions (1/2 and 1/4) is expected.
        self.assertAlmostEqual(0.5,
                               ReadCluster.commonOffsetsMaxFraction(rc1, rc2))
Ejemplo n.º 18
0
    def testMultiplicativeDistanceOneQuarterLowOffsetCoverage(self):
        """
        Two clusters that agree at all their common offsets, but for which
        the maximum fraction of common offsets is only one half, must have
        a multiplicative distance of 0.1 (the 0.9 minimum offset coverage
        fraction is applied, as explained in the docstring of the
        testMultiplicativeDistanceOneQuarter test).
        """
        readA = AlignedRead('id1', '-----CCGTTT')
        readA.setSignificantOffsets([5, 6, 7, 8, 9, 10])

        readB = AlignedRead('id2', '--TTTCCG-')
        readB.setSignificantOffsets([2, 3, 4, 5, 6, 7])

        clusters = ReadClusters()
        clusterA = clusters.add(readA)
        clusterB = clusters.add(readB)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(0.1, distance)
Ejemplo n.º 19
0
    def testMultiplicativeDistanceThreeQuartersLowOffsetCoverage(self):
        """
        Two clusters that agree at 50% of their common offsets, and for
        which the maximum fraction of common offsets is 0.5, must have a
        multiplicative distance of 0.55. The 0.9 minimum offset coverage
        fraction is applied (as described in the docstring of the
        testMultiplicativeDistanceOneQuarter test), giving
        1.0 - (0.5 * max(0.9, 0.5)) = 0.55.
        """
        readA = AlignedRead('id1', '-----CAGT')
        readA.setSignificantOffsets([5, 6, 7, 8])

        readB = AlignedRead('id2', '--TTTCC--')
        readB.setSignificantOffsets([2, 3, 4, 5, 6])

        clusters = ReadClusters()
        clusterA = clusters.add(readA)
        clusterB = clusters.add(readB)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(0.55, distance)
Ejemplo n.º 20
0
    def testMultiplicativeDistanceOneQuarter(self):
        """
        Two clusters that agree at all their common offsets must have a
        multiplicative distance of 0.1 when the minimum fraction of common
        offsets (ReadClusters.COMMON_OFFSETS_MAX_FRACTION_MIN) is 0.9.

        Three of the first cluster's four offsets and three of the second
        cluster's six are in common. Those fractions (3/4 and 3/6) are both
        below the 0.9 value of ReadClusters.COMMON_OFFSETS_MAX_FRACTION_MIN,
        so it is 0.9 that is used to scale the
        commonNucleotidesAgreementDistance distance.
        """
        readA = AlignedRead('id1', '-----CCGT')
        readA.setSignificantOffsets([5, 6, 7, 8])

        readB = AlignedRead('id2', '--TTTCCG-')
        readB.setSignificantOffsets([2, 3, 4, 5, 6, 7])

        clusters = ReadClusters()
        clusterA = clusters.add(readA)
        clusterB = clusters.add(readB)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(0.1, distance)
Ejemplo n.º 21
0
    def testMultiplicativeDistanceZero(self):
        """
        The commonNucleotidesMultiplicativeDistance method must return 0.0
        for two clusters whose nucleotides are the same at every common
        offset.
        """
        loneRead = AlignedRead('id1', '-----TC-')
        loneRead.setSignificantOffsets([5, 6])
        firstCluster = ReadCluster()
        firstCluster.add(loneRead)

        pairedReadA = AlignedRead('id2', '-----TC-')
        pairedReadA.setSignificantOffsets([5, 6])
        pairedReadB = AlignedRead('id3', '-----TC-')
        pairedReadB.setSignificantOffsets([5, 6])
        secondCluster = ReadCluster()
        for read in (pairedReadA, pairedReadB):
            secondCluster.add(read)

        distance = ReadCluster.commonNucleotidesMultiplicativeDistance(
            firstCluster, secondCluster)
        self.assertAlmostEqual(0.0, distance)
Ejemplo n.º 22
0
 def testTrimWithReadTooShort(self):
     """
     Trimming must give a false result when the read is too short for
     the requested trim amount.
     """
     read = AlignedRead('id', '---ACGTACGT--')
     trimmed = read.trim(4)
     self.assertFalse(trimmed)
Ejemplo n.º 23
0
    def testFullMerge(self):
        """
        Analyzing three reads that all merge into one cluster must give a
        cluster holding the expected nucleotide counts.
        """
        def bases(nucleotides):
            # Build an OffsetBases holding the given nucleotides, added in
            # the order given.
            offsetBases = OffsetBases()
            for nucleotide in nucleotides:
                offsetBases.incorporateBase(nucleotide)
            return offsetBases

        readA = AlignedRead('id1', '---TCCCG-')
        readA.setSignificantOffsets([3, 4, 5, 6, 7])

        readB = AlignedRead('id2', '---AC--A')
        readB.setSignificantOffsets([3, 4, 7])

        readC = AlignedRead('id3', '---TG--G')
        readC.setSignificantOffsets([7])

        clusters = ReadClusters()
        for read in (readA, readB, readC):
            clusters.add(read)

        # The analysis is expected to produce exactly one cluster.
        [merged] = clusters.analyze(0.7)

        expected = {
            3: bases('TA'),
            4: bases('CC'),
            5: bases('C'),
            6: bases('C'),
            7: bases('GAG'),
        }

        self.assertEqual(expected, merged.nucleotides)
Ejemplo n.º 24
0
 def testBase(self):
     """
     Asking for the base at an offset the read has must give the
     nucleotide at that offset.
     """
     read = AlignedRead('id', '---ACGTGCGT--')
     base = read.base(3)
     self.assertEqual('A', base)
Ejemplo n.º 25
0
 def testToPaddedString(self):
     """
     The toPaddedString method must give the read id and the sequence
     with its gaps, in the expected format.
     """
     read = AlignedRead('id', '---ACGTGCGT--')
     padded = read.toPaddedString()
     self.assertEqual('>id\n---ACGTGCGT--\n', padded)
Ejemplo n.º 26
0
    def testCommonOffsetsMaxFractionOne(self):
        """
        The commonOffsetsMaxFraction method must return 1.0 when every
        offset of one cluster is also an offset of the other cluster.
        """
        readA = AlignedRead('id1', '---TCCCG-')
        readA.setSignificantOffsets([3, 4])
        readB = AlignedRead('id2', '---TCCCG-')
        readB.setSignificantOffsets([6, 7])
        firstCluster = ReadCluster()
        for read in (readA, readB):
            firstCluster.add(read)

        readC = AlignedRead('id3', '---TG--A')
        readC.setSignificantOffsets([3, 4, 7])
        secondCluster = ReadCluster()
        secondCluster.add(readC)

        fraction = ReadCluster.commonOffsetsMaxFraction(firstCluster,
                                                        secondCluster)
        self.assertAlmostEqual(1.0, fraction)
Ejemplo n.º 27
0
    def testDistanceZero(self):
        """
        The agreement distance between two clusters must be 0.0 when they
        match at all their common significant offsets.
        """
        readA = AlignedRead('id1', '---TCCCG-')
        readA.setSignificantOffsets([3, 4, 5, 6, 7])
        firstCluster = ReadCluster()
        firstCluster.add(readA)

        readB = AlignedRead('id2', '---TC--A')
        readB.setSignificantOffsets([3, 4, 7])
        readC = AlignedRead('id3', '---TG--G')
        readC.setSignificantOffsets([7])
        secondCluster = ReadCluster()
        for read in (readB, readC):
            secondCluster.add(read)

        # All three common offsets match.
        distance = ReadCluster.commonNucleotidesAgreementDistance(
            firstCluster, secondCluster)
        self.assertEqual(0.0, distance)
Ejemplo n.º 28
0
    def testMerge(self):
        """
        Merging one cluster into another must leave the receiving cluster
        holding the expected nucleotides from both.
        """
        def bases(nucleotides):
            # Build an OffsetBases holding the given nucleotides, added in
            # the order given.
            offsetBases = OffsetBases()
            for nucleotide in nucleotides:
                offsetBases.incorporateBase(nucleotide)
            return offsetBases

        readA = AlignedRead('id1', '---ACCC--')
        readA.setSignificantOffsets([3, 4])
        readB = AlignedRead('id2', '---TG--')
        readB.setSignificantOffsets([3])
        receiver = ReadCluster()
        for read in (readA, readB):
            receiver.add(read)

        readC = AlignedRead('id3', '---GGCC--')
        readC.setSignificantOffsets([3, 4, 6])
        donor = ReadCluster()
        donor.add(readC)

        expected = {
            3: bases('ATG'),
            4: bases('CG'),
            6: bases('C'),
        }

        receiver.merge(donor)

        self.assertEqual(expected, receiver.nucleotides)
Ejemplo n.º 29
0
    def initialReferenceIdAnalysis(self, referenceId, alignmentFile,
                                   outputDir):
        """
        Analyze the given reference id in the given alignment file (if an
        alignment to the reference id is present).

        Along the way a number of files are saved into C{outputDir} (base
        frequencies, significant offsets, plots, the reference sequence and
        a bcftools consensus) and progress is reported via C{self.report}.

        @param referenceId: The C{str} id of the reference sequence to analyze.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        @return: C{None} if C{referenceId} is not present in C{alignmentFile}
            or if no significant offsets are found. Else, an 8-C{tuple}:
            (genomeLength, alignedReads, readCountAtOffset,
            baseCountAtOffset, readsAtOffset, significantOffsets, samFilter,
            paddedSAM).
        """

        # Make sure this reference id is in this alignment file and if so
        # get its length (and check it's the same as the length of the
        # sequence given in the reference file).
        with samfile(alignmentFile) as sam:
            tid = sam.get_tid(referenceId)
            if tid == -1:
                # This referenceId is not in this alignment file.
                self.report('    Reference %s not in alignment file.' %
                            referenceId)
                return
            else:
                genomeLength = sam.lengths[tid]
                # Sanity check.
                assert genomeLength == len(self.referenceGenomes[referenceId])

        # Optionally save a plot of the reads aligned to this reference.
        if self.plotSAM:
            filename = join(outputDir, 'reads.html')
            self.report('    Saving reads alignment plot to %s' % filename)
            plotSAM(SAMFilter(alignmentFile, referenceIds={referenceId}),
                    filename,
                    title=referenceId,
                    jitter=0.45)

        # Collect the padded queries that align to this reference as
        # AlignedRead instances.
        alignedReads = []
        samFilter = SAMFilter(
            alignmentFile,
            referenceIds={referenceId},
            dropDuplicates=True,
            dropSupplementary=True,
            # dropSecondary=True,
            storeQueryIds=True)
        paddedSAM = PaddedSAM(samFilter)
        for query in paddedSAM.queries(addAlignment=True):
            # Each padded query is expected to span the whole reference.
            assert len(query) == genomeLength
            alignedReads.append(
                AlignedRead(query.id, query.sequence, query.alignment))

        # Sanity check that all aligned reads have different ids. This
        # should be the case because the padded SAM queries method adds /2,
        # /3 etc to queries that have more than one alignment.
        assert len(alignedReads) == len(set(read.id for read in alignedReads))

        readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
            genomeLength, alignedReads)

        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   self.minReads, self.homogeneousCutoff))

        # Report summary statistics about what was read.
        self.report(
            '    %d alignment%s (of %d unique %s) read from %s' %
            (samFilter.alignmentCount, s(
                samFilter.alignmentCount), len(samFilter.queryIds), 'query'
             if len(samFilter.queryIds) == 1 else 'queries', alignmentFile))
        self.report('    %d of which %s aligned to %s' %
                    (len(alignedReads),
                     'was' if len(alignedReads) == 1 else 'were', referenceId))
        self.report('    Reference genome length %d' % genomeLength)
        self.report('    Found %d significant location%s' %
                    (len(significantOffsets), s(len(significantOffsets))))

        # The base frequencies are saved even if no significant offsets
        # were found.
        self.saveBaseFrequencies(outputDir, genomeLength, baseCountAtOffset)

        if not significantOffsets:
            self.report('    No significant locations found.')
            return

        if self.saveReducedFASTA:
            self.saveReducedFasta(significantOffsets, outputDir)

        self._plotCoverageAndSignificantLocations(referenceId, alignmentFile,
                                                  readCountAtOffset,
                                                  genomeLength,
                                                  significantOffsets,
                                                  outputDir)

        self.saveSignificantOffsets(significantOffsets, outputDir)

        # Tell every read which offsets are significant.
        for read in alignedReads:
            read.setSignificantOffsets(significantOffsets)

        self.saveReferenceBaseFrequencyPlot(referenceId, genomeLength,
                                            significantOffsets,
                                            baseCountAtOffset,
                                            readCountAtOffset, outputDir)

        # Save the reference.
        filename = join(outputDir, 'reference.fasta')
        # NOTE(review): unlike the other report calls in this method, this
        # one passes filename as a second argument instead of %-formatting
        # it into the message. Confirm that self.report accepts multiple
        # positional arguments.
        self.report('    Saving reference to', filename)
        reference = self.referenceGenomes[referenceId]
        Reads([reference]).save(filename)

        # Extract a consensus according to bcftools.
        self.writeBcftoolsConsensus(referenceId, alignmentFile, outputDir)

        return (genomeLength, alignedReads, readCountAtOffset,
                baseCountAtOffset, readsAtOffset, significantOffsets,
                samFilter, paddedSAM)