Python AlignedRead Examples, midtools.read.AlignedRead Python Examples

Example #1

0

Show file

File: test_alignedRead.py Project: acorg/midtools

 def testTrimZero(self):
     """
     A trim amount of zero must be accepted and must leave all eight
     bases of the read in its sequence.
     """
     read = AlignedRead('id', '---ACGTACGT--')
     trimmed = read.trim(0)
     self.assertTrue(trimmed)
     self.assertEqual('ACGTACGT', read.sequence)

Example #2

0

Show file

File: test_alignedRead.py Project: acorg/midtools

 def testBaseNone(self):
     """
     Asking for the base at an offset the read does not have must give
     C{None}.
     """
     read = AlignedRead('id', '---ACGTACGT--')
     # Offset 0 falls in the leading run of '-' characters.
     base = read.base(0)
     self.assertEqual(None, base)

Example #3

0

Show file

File: test_alignedRead.py Project: acorg/midtools

 def testSetSignificantOffsets(self):
     """
     Setting significant offsets must record the base at each offset the
     read has, and must leave out offsets it does not have.
     """
     read = AlignedRead('id', '---ACGTACGT--')
     # Offset 2 is in the leading gap, so only 3 and 4 are expected.
     read.setSignificantOffsets([2, 3, 4])
     expected = {3: 'A', 4: 'C'}
     self.assertEqual(expected, read.significantOffsets)

Example #4

0

Show file

File: test_alignedRead.py Project: acorg/midtools

 def testTrim(self):
     """
     Trimming by two on an eight-base read must succeed and leave the
     middle four bases as the sequence.
     """
     read = AlignedRead('id', '---ACGTACGT--')
     self.assertEqual(8, len(read))
     trimmed = read.trim(2)
     self.assertTrue(trimmed)
     self.assertEqual('GTAC', read.sequence)

Example #5

0

Show file

 def testAddOne(self):
     """
     A cluster to which a single read has been added must hold that read
     and the read's nucleotide at its significant offset.
     """
     alignedRead = AlignedRead('id', '---ACGT--')
     alignedRead.setSignificantOffsets([3])
     cluster = ReadCluster()
     cluster.add(alignedRead)

     basesAt3 = OffsetBases()
     basesAt3.incorporateBase('A')
     expectedNucleotides = {3: basesAt3}

     self.assertEqual({alignedRead}, cluster.reads)
     self.assertEqual(expectedNucleotides, cluster.nucleotides)
     self.assertEqual(1, len(cluster))

Example #6

0

Show file

    def testAddTwo(self):
        """
        A cluster to which two reads have been added must hold both reads
        and the nucleotides both contribute at their significant offsets.
        """
        firstRead = AlignedRead('id1', '---ACCC--')
        firstRead.setSignificantOffsets([3, 4])
        secondRead = AlignedRead('id2', '---TG--')
        secondRead.setSignificantOffsets([3])

        cluster = ReadCluster()
        for read in (firstRead, secondRead):
            cluster.add(read)

        self.assertEqual({firstRead, secondRead}, cluster.reads)

        # Offset 3 is covered by both reads, offset 4 only by the first.
        basesAt3 = OffsetBases()
        for base in 'AT':
            basesAt3.incorporateBase(base)
        basesAt4 = OffsetBases()
        basesAt4.incorporateBase('C')

        expectedNucleotides = {3: basesAt3, 4: basesAt4}
        self.assertEqual(expectedNucleotides, cluster.nucleotides)

Example #7

0

Show file

File: test_alignedRead.py Project: acorg/midtools

 def testTrimWithNegativeAmount(self):
     """
     The trim function must raise an AssertionError if the amount to trim
     is negative.
     """
     ar = AlignedRead('id', '---ACGTACGT--')
     # A raw string keeps the regex backslashes literal. In a normal string
     # '\(' and '\.' are invalid escape sequences that Python warns about.
     error = r'^Trim amount \(-4\) cannot be negative\.$'
     self.assertRaisesRegex(AssertionError, error, ar.trim, -4)

Example #8

0

Show file

def parseCommandLineOptions(args, returnSignificantOffsets=True):
    """
    Process the command-line options that addCommandLineOptions added to
    an ArgumentParser instance.

    @param args: The result of calling C{parse_args} on an C{ArgumentParser}
        instance (the one that was passed to C{addCommandLineOptions}, unless
        we're testing).
    @param returnSignificantOffsets: If C{True}, find the significant
        offsets, set them on every aligned read, and return them as a
        C{list}. Otherwise that element of the return value is C{None}.
    @raise ValueError: If more than one reference id is given, or if the
        SAM filter does not report exactly one reference length.
    @return: A C{tuple}: (genomeLength, alignedReads, paddedSAM,
        readCountAtOffset, baseCountAtOffset, readsAtOffset,
        significantOffsets).
    """
    samFilter = SAMFilter.parseFilteringOptions(args)

    if samFilter.referenceIds and len(samFilter.referenceIds) > 1:
        raise ValueError('Only one reference id can be given.')

    referenceLengths = samFilter.referenceLengths()
    referenceCount = len(referenceLengths)

    if referenceCount != 1:
        raise ValueError(
            'If you do not specify a reference sequence with '
            '--referenceId, the SAM/BAM file must contain exactly one '
            'reference. But %s contains %d.' %
            (args.samfile, referenceCount))

    # There is exactly one reference. Only its length is needed here.
    _, genomeLength = referenceLengths.popitem()

    paddedSAM = PaddedSAM(samFilter)
    alignedReads = [AlignedRead(query.id, query.sequence)
                    for query in paddedSAM.queries()]

    readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
        genomeLength, alignedReads)

    significantOffsets = None
    if returnSignificantOffsets:
        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   args.minReads, args.homogeneousCutoff))
        for alignedRead in alignedReads:
            alignedRead.setSignificantOffsets(significantOffsets)

    return (genomeLength, alignedReads, paddedSAM, readCountAtOffset,
            baseCountAtOffset, readsAtOffset, significantOffsets)

Example #9

0

Show file

    def testCommonOffsetsMaxFractionZero(self):
        """
        The commonOffsetsMaxFraction method must give 0.0 for two clusters
        that share no offsets.
        """
        firstRead = AlignedRead('id1', '---TCCCG-')
        firstRead.setSignificantOffsets([5, 6])
        clusterA = ReadCluster()
        clusterA.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--A')
        secondRead.setSignificantOffsets([3, 4, 7])
        clusterB = ReadCluster()
        clusterB.add(secondRead)

        fraction = ReadCluster.commonOffsetsMaxFraction(clusterA, clusterB)
        self.assertAlmostEqual(0.0, fraction)

Example #10

0

Show file

    def testMultiplicativeDistanceZero(self):
        """
        Two clusters made from identical reads must have a multiplicative
        distance of 0.0.
        """
        firstRead = AlignedRead('id1', '-----CCG-')
        firstRead.setSignificantOffsets([5, 6, 7])

        secondRead = AlignedRead('id2', '-----CCG-')
        secondRead.setSignificantOffsets([5, 6, 7])

        clusters = ReadClusters()
        clusterA = clusters.add(firstRead)
        clusterB = clusters.add(secondRead)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(0.0, distance)

Example #11

0

Show file

    def testMultiplicativeDistanceOneHalf(self):
        """
        The commonNucleotidesMultiplicativeDistance method must return 0.5
        for two single-read clusters that share two offsets and have the
        same base at only one of them.
        """
        firstRead = AlignedRead('id1', '---TCTC-')
        firstRead.setSignificantOffsets([3, 4, 5, 6])
        clusterA = ReadCluster()
        clusterA.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--')
        secondRead.setSignificantOffsets([3, 4])
        clusterB = ReadCluster()
        clusterB.add(secondRead)

        distance = ReadCluster.commonNucleotidesMultiplicativeDistance(
            clusterA, clusterB)
        self.assertAlmostEqual(0.5, distance)

Example #12

0

Show file

    def testNonZeroDistance(self):
        """
        The agreement distance between two clusters that disagree at two of
        their three common offsets must be two thirds.
        """
        firstRead = AlignedRead('id1', '---TCCCG-')
        firstRead.setSignificantOffsets([3, 4, 5, 6, 7])
        clusterA = ReadCluster()
        clusterA.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--A')
        secondRead.setSignificantOffsets([3, 4, 7])
        clusterB = ReadCluster()
        clusterB.add(secondRead)

        # Offsets 3, 4 and 7 are shared; only offset 3 has the same base.
        distance = ReadCluster.commonNucleotidesAgreementDistance(
            clusterA, clusterB)
        self.assertAlmostEqual(2 / 3, distance)

Example #13

0

Show file

    def testMultiplicativeDistanceOne(self):
        """
        Two clusters that share no offsets must have a multiplicative
        distance of 1.0.
        """
        firstRead = AlignedRead('id1', '-----CCG-')
        firstRead.setSignificantOffsets([5, 6, 7])

        secondRead = AlignedRead('id2', '---TG--')
        secondRead.setSignificantOffsets([3, 4])

        clusters = ReadClusters()
        clusterA = clusters.add(firstRead)
        clusterB = clusters.add(secondRead)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(1.0, distance)

Example #14

0

Show file

    def testDistanceOne(self):
        """
        The agreement distance between two clusters must be 1.0 when the
        nucleotides they have at their common significant offsets never
        match.
        """
        firstRead = AlignedRead('id1', '---ACCCG-')
        firstRead.setSignificantOffsets([3, 4, 5, 6, 7])
        clusterA = ReadCluster()
        clusterA.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--A')
        secondRead.setSignificantOffsets([3, 4, 7])
        clusterB = ReadCluster()
        clusterB.add(secondRead)

        distance = ReadCluster.commonNucleotidesAgreementDistance(
            clusterA, clusterB)
        self.assertEqual(1.0, distance)

Example #15

0

Show file

    def testMultiplicativeDistanceOne(self):
        """
        The commonNucleotidesMultiplicativeDistance method must give 1.0
        for two clusters that share no offsets.
        """
        firstRead = AlignedRead('id1', '-----TC-')
        firstRead.setSignificantOffsets([5, 6])
        clusterA = ReadCluster()
        clusterA.add(firstRead)

        secondRead = AlignedRead('id2', '---TG--')
        secondRead.setSignificantOffsets([3, 4])
        clusterB = ReadCluster()
        clusterB.add(secondRead)

        distance = ReadCluster.commonNucleotidesMultiplicativeDistance(
            clusterA, clusterB)
        self.assertAlmostEqual(1.0, distance)

Example #16

0

Show file

    def testNonZeroCommonNucleotidesAgreementDistance(self):
        """
        The commonNucleotidesAgreementDistance method of a ReadClusters
        instance must give two thirds for clusters that disagree at two of
        their three common offsets.
        """
        firstRead = AlignedRead('id1', '---TCCCG-')
        firstRead.setSignificantOffsets([3, 4, 5, 6, 7])

        secondRead = AlignedRead('id2', '---TG--A')
        secondRead.setSignificantOffsets([3, 4, 7])

        clusters = ReadClusters()
        clusterA = clusters.add(firstRead)
        clusterB = clusters.add(secondRead)

        # Offsets 3, 4 and 7 are shared; only offset 3 has the same base.
        distance = clusters.commonNucleotidesAgreementDistance(
            clusterA, clusterB)
        self.assertAlmostEqual(2 / 3, distance)

Example #17

0

Show file

    def testCommonOffsetsMaxFractionOneHalf(self):
        """
        The commonOffsetsMaxFraction method must return 0.5 when half
        the offsets of one cluster are in common with the offsets of another
        cluster.
        """
        read1 = AlignedRead('id1', '---TCCCG-')
        read1.setSignificantOffsets([5, 6])
        rc1 = ReadCluster()
        rc1.add(read1)

        read2 = AlignedRead('id2', '---TG--A')
        read2.setSignificantOffsets([3, 4, 6, 7])
        rc2 = ReadCluster()
        rc2.add(read2)

        # Offset 6 is the only shared offset: one of the two offsets in the
        # first cluster and one of the four in the second, so the larger
        # fraction is 1/2. The previous expected value of 0.0 contradicted
        # both the test name and its docstring.
        self.assertAlmostEqual(0.5,
                               ReadCluster.commonOffsetsMaxFraction(rc1, rc2))

Example #18

0

Show file

    def testMultiplicativeDistanceOneQuarterLowOffsetCoverage(self):
        """
        Two clusters that agree at every common offset, but for which the
        maximum fraction of common offsets is only one half, must have a
        multiplicative distance of 0.1. The 0.9 minimum offset coverage
        fraction is applied in that case, as explained in the docstring of
        the testMultiplicativeDistanceOneQuarter test.
        """
        firstRead = AlignedRead('id1', '-----CCGTTT')
        firstRead.setSignificantOffsets([5, 6, 7, 8, 9, 10])

        secondRead = AlignedRead('id2', '--TTTCCG-')
        secondRead.setSignificantOffsets([2, 3, 4, 5, 6, 7])

        clusters = ReadClusters()
        clusterA = clusters.add(firstRead)
        clusterB = clusters.add(secondRead)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(0.1, distance)

Example #19

0

Show file

    def testMultiplicativeDistanceThreeQuartersLowOffsetCoverage(self):
        """
        Two clusters that agree at 50% of their common offsets, and for
        which the maximum fraction of common offsets is 0.5, must have a
        multiplicative distance of 0.55. The 0.9 minimum offset coverage
        fraction is applied (as described in the docstring of the
        testMultiplicativeDistanceOneQuarter test), and
        1.0 - (0.5 * max(0.9, 0.5)) = 0.55.
        """
        firstRead = AlignedRead('id1', '-----CAGT')
        firstRead.setSignificantOffsets([5, 6, 7, 8])

        secondRead = AlignedRead('id2', '--TTTCC--')
        secondRead.setSignificantOffsets([2, 3, 4, 5, 6])

        clusters = ReadClusters()
        clusterA = clusters.add(firstRead)
        clusterB = clusters.add(secondRead)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(0.55, distance)

Example #20

0

Show file

    def testMultiplicativeDistanceOneQuarter(self):
        """
        Two clusters that agree at every common offset must have a
        multiplicative distance of 0.1 when the minimum fraction of common
        offsets (ReadClusters.COMMON_OFFSETS_MAX_FRACTION_MIN) is 0.9.

        Three of the first cluster's four sites and three of the second's
        six are in common. Both fractions (3/4 and 3/6) are below the 0.9
        value of ReadClusters.COMMON_OFFSETS_MAX_FRACTION_MIN, so it is
        used to scale the commonNucleotidesAgreementDistance distance.
        """
        firstRead = AlignedRead('id1', '-----CCGT')
        firstRead.setSignificantOffsets([5, 6, 7, 8])

        secondRead = AlignedRead('id2', '--TTTCCG-')
        secondRead.setSignificantOffsets([2, 3, 4, 5, 6, 7])

        clusters = ReadClusters()
        clusterA = clusters.add(firstRead)
        clusterB = clusters.add(secondRead)

        distance = clusters.multiplicativeDistance(clusterA, clusterB)
        self.assertAlmostEqual(0.1, distance)

Example #21

0

Show file

    def testMultiplicativeDistanceZero(self):
        """
        The commonNucleotidesMultiplicativeDistance method must give 0.0
        for two clusters whose nucleotides agree completely at their common
        offsets.
        """
        firstRead = AlignedRead('id1', '-----TC-')
        firstRead.setSignificantOffsets([5, 6])
        clusterA = ReadCluster()
        clusterA.add(firstRead)

        secondRead = AlignedRead('id2', '-----TC-')
        secondRead.setSignificantOffsets([5, 6])
        thirdRead = AlignedRead('id3', '-----TC-')
        thirdRead.setSignificantOffsets([5, 6])
        clusterB = ReadCluster()
        for read in (secondRead, thirdRead):
            clusterB.add(read)

        distance = ReadCluster.commonNucleotidesMultiplicativeDistance(
            clusterA, clusterB)
        self.assertAlmostEqual(0.0, distance)

Example #22

0

Show file

File: test_alignedRead.py Project: acorg/midtools

 def testTrimWithReadTooShort(self):
     """
     Trimming must report failure (a false value) when the read is too
     short for the requested trim amount.
     """
     read = AlignedRead('id', '---ACGTACGT--')
     trimmed = read.trim(4)
     self.assertFalse(trimmed)

Example #23

0

Show file

    def testFullMerge(self):
        """
        Analyzing three reads that merge into one cluster must produce a
        cluster with the expected nucleotide counts at every offset.
        """
        def basesFrom(letters):
            # Make an OffsetBases holding the given bases, added in order.
            offsetBases = OffsetBases()
            for letter in letters:
                offsetBases.incorporateBase(letter)
            return offsetBases

        firstRead = AlignedRead('id1', '---TCCCG-')
        firstRead.setSignificantOffsets([3, 4, 5, 6, 7])

        secondRead = AlignedRead('id2', '---AC--A')
        secondRead.setSignificantOffsets([3, 4, 7])

        thirdRead = AlignedRead('id3', '---TG--G')
        thirdRead.setSignificantOffsets([7])

        clusters = ReadClusters()
        for read in (firstRead, secondRead, thirdRead):
            clusters.add(read)

        # Exactly one cluster must result; the unpacking fails otherwise.
        [merged] = clusters.analyze(0.7)

        basesAt3 = basesFrom('TA')
        basesAt4 = basesFrom('CC')
        basesAt5 = basesFrom('C')
        basesAt6 = basesFrom('C')
        basesAt7 = basesFrom('GAG')

        expectedNucleotides = {
            3: basesAt3,
            4: basesAt4,
            5: basesAt5,
            6: basesAt6,
            7: basesAt7,
        }
        self.assertEqual(expectedNucleotides, merged.nucleotides)

Example #24

0

Show file

File: test_alignedRead.py Project: acorg/midtools

 def testBase(self):
     """
     Asking for the base at an offset the read has must give the
     nucleotide found there.
     """
     read = AlignedRead('id', '---ACGTGCGT--')
     # Offset 3 is the first position after the three leading gaps.
     base = read.base(3)
     self.assertEqual('A', base)

Example #25

0

Show file

File: test_alignedRead.py Project: acorg/midtools

 def testToPaddedString(self):
     """
     The toPaddedString method must give a FASTA-style record holding the
     read id and the sequence with its gap padding.
     """
     read = AlignedRead('id', '---ACGTGCGT--')
     padded = read.toPaddedString()
     self.assertEqual('>id\n---ACGTGCGT--\n', padded)

Example #26

0

Show file

    def testCommonOffsetsMaxFractionOne(self):
        """
        The commonOffsetsMaxFraction method must give 1.0 when every offset
        of one cluster is also an offset of the other cluster.
        """
        firstRead = AlignedRead('id1', '---TCCCG-')
        firstRead.setSignificantOffsets([3, 4])
        secondRead = AlignedRead('id2', '---TCCCG-')
        secondRead.setSignificantOffsets([6, 7])
        clusterA = ReadCluster()
        for read in (firstRead, secondRead):
            clusterA.add(read)

        thirdRead = AlignedRead('id3', '---TG--A')
        thirdRead.setSignificantOffsets([3, 4, 7])
        clusterB = ReadCluster()
        clusterB.add(thirdRead)

        fraction = ReadCluster.commonOffsetsMaxFraction(clusterA, clusterB)
        self.assertAlmostEqual(1.0, fraction)

Example #27

0

Show file

    def testDistanceZero(self):
        """
        The agreement distance between two clusters must be 0.0 when they
        match at all of their common significant offsets.
        """
        firstRead = AlignedRead('id1', '---TCCCG-')
        firstRead.setSignificantOffsets([3, 4, 5, 6, 7])
        clusterA = ReadCluster()
        clusterA.add(firstRead)

        secondRead = AlignedRead('id2', '---TC--A')
        secondRead.setSignificantOffsets([3, 4, 7])
        thirdRead = AlignedRead('id3', '---TG--G')
        thirdRead.setSignificantOffsets([7])
        clusterB = ReadCluster()
        for read in (secondRead, thirdRead):
            clusterB.add(read)

        # All three common offsets (3, 4 and 7) match.
        distance = ReadCluster.commonNucleotidesAgreementDistance(
            clusterA, clusterB)
        self.assertEqual(0.0, distance)

Example #28

0

Show file

    def testMerge(self):
        """
        Merging one cluster into another must leave the receiving cluster
        holding the nucleotides of both.
        """
        def basesFrom(letters):
            # Make an OffsetBases holding the given bases, added in order.
            offsetBases = OffsetBases()
            for letter in letters:
                offsetBases.incorporateBase(letter)
            return offsetBases

        firstRead = AlignedRead('id1', '---ACCC--')
        firstRead.setSignificantOffsets([3, 4])
        secondRead = AlignedRead('id2', '---TG--')
        secondRead.setSignificantOffsets([3])
        receiver = ReadCluster()
        for read in (firstRead, secondRead):
            receiver.add(read)

        thirdRead = AlignedRead('id3', '---GGCC--')
        thirdRead.setSignificantOffsets([3, 4, 6])
        donor = ReadCluster()
        donor.add(thirdRead)

        basesAt3 = basesFrom('ATG')
        basesAt4 = basesFrom('CG')
        basesAt6 = basesFrom('C')

        receiver.merge(donor)

        expectedNucleotides = {
            3: basesAt3,
            4: basesAt4,
            6: basesAt6,
        }
        self.assertEqual(expectedNucleotides, receiver.nucleotides)

Example #29

0

Show file

File: analysis.py Project: acorg/midtools

    def initialReferenceIdAnalysis(self, referenceId, alignmentFile,
                                   outputDir):
        """
        Analyze the given reference id in the given alignment file (if an
        alignment to the reference id is present).

        Reads the alignments for the reference, finds the significant
        offsets, and saves a number of files and plots to C{outputDir}
        along the way.

        @param referenceId: The C{str} id of the reference sequence to analyze.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        @return: C{None} if C{referenceId} is not present in C{alignmentFile}
            or if no significant offsets are found. Else, a C{tuple}:
            (genomeLength, alignedReads, readCountAtOffset,
            baseCountAtOffset, readsAtOffset, significantOffsets,
            samFilter, paddedSAM).
        """

        # Make sure this reference id is in this alignment file and if so
        # get its length (and check it's the same as the length of the
        # sequence given in the reference file).
        with samfile(alignmentFile) as sam:
            tid = sam.get_tid(referenceId)
            if tid == -1:
                # This referenceId is not in this alignment file.
                self.report('    Reference %s not in alignment file.' %
                            referenceId)
                return
            else:
                genomeLength = sam.lengths[tid]
                # Sanity check.
                assert genomeLength == len(self.referenceGenomes[referenceId])

        # Optionally save a plot of the reads aligned to this reference.
        if self.plotSAM:
            filename = join(outputDir, 'reads.html')
            self.report('    Saving reads alignment plot to %s' % filename)
            plotSAM(SAMFilter(alignmentFile, referenceIds={referenceId}),
                    filename,
                    title=referenceId,
                    jitter=0.45)

        # Collect every padded query for this reference as an AlignedRead.
        alignedReads = []
        samFilter = SAMFilter(
            alignmentFile,
            referenceIds={referenceId},
            dropDuplicates=True,
            dropSupplementary=True,
            # dropSecondary=True,
            storeQueryIds=True)
        paddedSAM = PaddedSAM(samFilter)
        for query in paddedSAM.queries(addAlignment=True):
            # Each padded query must span the whole reference.
            assert len(query) == genomeLength
            alignedReads.append(
                AlignedRead(query.id, query.sequence, query.alignment))

        # Sanity check that all aligned reads have different ids. This
        # should be the case because the padded SAM queries method adds /2,
        # /3 etc to queries that have more than one alignment.
        assert len(alignedReads) == len(set(read.id for read in alignedReads))

        readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
            genomeLength, alignedReads)

        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   self.minReads, self.homogeneousCutoff))

        self.report(
            '    %d alignment%s (of %d unique %s) read from %s' %
            (samFilter.alignmentCount, s(
                samFilter.alignmentCount), len(samFilter.queryIds), 'query'
             if len(samFilter.queryIds) == 1 else 'queries', alignmentFile))
        self.report('    %d of which %s aligned to %s' %
                    (len(alignedReads),
                     'was' if len(alignedReads) == 1 else 'were', referenceId))
        self.report('    Reference genome length %d' % genomeLength)
        self.report('    Found %d significant location%s' %
                    (len(significantOffsets), s(len(significantOffsets))))

        # The base frequencies are saved even when no significant offsets
        # are found (this call precedes the early return below).
        self.saveBaseFrequencies(outputDir, genomeLength, baseCountAtOffset)

        if not significantOffsets:
            self.report('    No significant locations found.')
            return

        if self.saveReducedFASTA:
            self.saveReducedFasta(significantOffsets, outputDir)

        self._plotCoverageAndSignificantLocations(referenceId, alignmentFile,
                                                  readCountAtOffset,
                                                  genomeLength,
                                                  significantOffsets,
                                                  outputDir)

        self.saveSignificantOffsets(significantOffsets, outputDir)

        # Tell every read which of the significant offsets it covers.
        for read in alignedReads:
            read.setSignificantOffsets(significantOffsets)

        self.saveReferenceBaseFrequencyPlot(referenceId, genomeLength,
                                            significantOffsets,
                                            baseCountAtOffset,
                                            readCountAtOffset, outputDir)

        # Save the reference.
        filename = join(outputDir, 'reference.fasta')
        # NOTE(review): unlike the other report calls in this method, this
        # one passes filename as a second argument instead of %-formatting
        # it into the message. Confirm that report accepts multiple
        # arguments.
        self.report('    Saving reference to', filename)
        reference = self.referenceGenomes[referenceId]
        Reads([reference]).save(filename)

        # Extract a consensus according to bcftools.
        self.writeBcftoolsConsensus(referenceId, alignmentFile, outputDir)

        return (genomeLength, alignedReads, readCountAtOffset,
                baseCountAtOffset, readsAtOffset, significantOffsets,
                samFilter, paddedSAM)