def testTrimZero(self): """ The trim function must work as expected when the trim quantity is 0. """ ar = AlignedRead('id', '---ACGTACGT--') self.assertTrue(ar.trim(0)) self.assertEqual('ACGTACGT', ar.sequence)
def testBaseNone(self): """ The base method must return C{None} for an offset the read does not have. """ ar = AlignedRead('id', '---ACGTACGT--') self.assertEqual(None, ar.base(0))
def testSetSignificantOffsets(self): """ Adding some significant offsets must work as expected. """ ar = AlignedRead('id', '---ACGTACGT--') ar.setSignificantOffsets([2, 3, 4]) self.assertEqual({3: 'A', 4: 'C'}, ar.significantOffsets)
def testTrim(self): """ The trim function must work as expected. """ ar = AlignedRead('id', '---ACGTACGT--') self.assertEqual(8, len(ar)) self.assertTrue(ar.trim(2)) self.assertEqual('GTAC', ar.sequence)
def testAddOne(self): """ Adding a read must result in the expected nucleotides and read being stored. """ read = AlignedRead('id', '---ACGT--') read.setSignificantOffsets([3]) rc = ReadCluster() rc.add(read) expectedBases = OffsetBases() expectedBases.incorporateBase('A') self.assertEqual({read}, rc.reads) self.assertEqual({3: expectedBases}, rc.nucleotides) self.assertEqual(1, len(rc))
def testAddTwo(self): """ Adding two reads must result in the expected nucleotides and reads being stored. """ read1 = AlignedRead('id1', '---ACCC--') read1.setSignificantOffsets([3, 4]) read2 = AlignedRead('id2', '---TG--') read2.setSignificantOffsets([3]) rc = ReadCluster() rc.add(read1) rc.add(read2) self.assertEqual({read1, read2}, rc.reads) expectedBases3 = OffsetBases() expectedBases3.incorporateBase('A') expectedBases3.incorporateBase('T') expectedBases4 = OffsetBases() expectedBases4.incorporateBase('C') self.assertEqual({ 3: expectedBases3, 4: expectedBases4 }, rc.nucleotides)
def testTrimWithNegativeAmount(self): """ The trim function must raise an AssertionError if the amount to trim is negative. """ ar = AlignedRead('id', '---ACGTACGT--') error = '^Trim amount \(-4\) cannot be negative\.$' self.assertRaisesRegex(AssertionError, error, ar.trim, -4)
def parseCommandLineOptions(args, returnSignificantOffsets=True): """ Deal with the various command-line options added to the ArgumentParser instance by addCommandLineOptions. @param args: The result of calling C{parse_args} on an C{ArgumentParser} instance (the one that was passed to C{addCommandLineOptions}, unless we're testing). @param returnSignificantOffsets: If C{True} also return a list of the significant offsets (else that element of the return value will be C{None}). @return: A C{tuple}: (genomeLength, alignedReads, padddedSAM, readCountAtOffset, baseCountAtOffset, readsAtOffset, significantOffsets). """ genomeLength = None alignedReads = [] samFilter = SAMFilter.parseFilteringOptions(args) if samFilter.referenceIds and len(samFilter.referenceIds) > 1: raise ValueError('Only one reference id can be given.') referenceLengths = samFilter.referenceLengths() if len(referenceLengths) == 1: referenceId, genomeLength = referenceLengths.popitem() else: raise ValueError( 'If you do not specify a reference sequence with ' '--referenceId, the SAM/BAM file must contain exactly one ' 'reference. But %s contains %d.' % (args.samfile, len(referenceLengths))) paddedSAM = PaddedSAM(samFilter) for query in paddedSAM.queries(): alignedReads.append(AlignedRead(query.id, query.sequence)) readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData( genomeLength, alignedReads) if returnSignificantOffsets: significantOffsets = list( findSignificantOffsets(baseCountAtOffset, readCountAtOffset, args.minReads, args.homogeneousCutoff)) for read in alignedReads: read.setSignificantOffsets(significantOffsets) else: significantOffsets = None return (genomeLength, alignedReads, paddedSAM, readCountAtOffset, baseCountAtOffset, readsAtOffset, significantOffsets)
def testCommonOffsetsMaxFractionZero(self): """ The commonOffsetsMaxFraction method must return 0.0 when two clusters have no offsets in common. """ read1 = AlignedRead('id1', '---TCCCG-') read1.setSignificantOffsets([5, 6]) rc1 = ReadCluster() rc1.add(read1) read2 = AlignedRead('id2', '---TG--A') read2.setSignificantOffsets([3, 4, 7]) rc2 = ReadCluster() rc2.add(read2) self.assertAlmostEqual(0.0, ReadCluster.commonOffsetsMaxFraction(rc1, rc2))
def testMultiplicativeDistanceZero(self): """ The multiplicative distance between two clusters must be 0.0 when they are identical. """ read1 = AlignedRead('id1', '-----CCG-') read1.setSignificantOffsets([5, 6, 7]) read2 = AlignedRead('id2', '-----CCG-') read2.setSignificantOffsets([5, 6, 7]) rc = ReadClusters() cluster1 = rc.add(read1) cluster2 = rc.add(read2) self.assertAlmostEqual(0.0, rc.multiplicativeDistance(cluster1, cluster2))
def testMultiplicativeDistanceOneHalf(self): """ The commonNucleotidesMultiplicativeDistance method must return 0.5 when the ratio of two bases in one cluster is 50:50. """ read1 = AlignedRead('id1', '---TCTC-') read1.setSignificantOffsets([3, 4, 5, 6]) rc1 = ReadCluster() rc1.add(read1) read2 = AlignedRead('id2', '---TG--') read2.setSignificantOffsets([3, 4]) rc2 = ReadCluster() rc2.add(read2) self.assertAlmostEqual( 0.5, ReadCluster.commonNucleotidesMultiplicativeDistance(rc1, rc2))
def testNonZeroDistance(self): """ The distance between two clusters must be as expected. """ read1 = AlignedRead('id1', '---TCCCG-') read1.setSignificantOffsets([3, 4, 5, 6, 7]) rc1 = ReadCluster() rc1.add(read1) read2 = AlignedRead('id2', '---TG--A') read2.setSignificantOffsets([3, 4, 7]) rc2 = ReadCluster() rc2.add(read2) # Two of the three common offsets do not match. self.assertAlmostEqual( 2 / 3, ReadCluster.commonNucleotidesAgreementDistance(rc1, rc2))
def testMultiplicativeDistanceOne(self): """ The multiplicative distance between two clusters must be 1.0 when they have no offsets in common. """ read1 = AlignedRead('id1', '-----CCG-') read1.setSignificantOffsets([5, 6, 7]) read2 = AlignedRead('id2', '---TG--') read2.setSignificantOffsets([3, 4]) rc = ReadClusters() cluster1 = rc.add(read1) cluster2 = rc.add(read2) self.assertAlmostEqual(1.0, rc.multiplicativeDistance(cluster1, cluster2))
def testDistanceOne(self): """ The distance between two clusters must be zero when the nucleotides they have at their signifcant offsets do not match. """ read1 = AlignedRead('id1', '---ACCCG-') read1.setSignificantOffsets([3, 4, 5, 6, 7]) rc1 = ReadCluster() rc1.add(read1) read2 = AlignedRead('id2', '---TG--A') read2.setSignificantOffsets([3, 4, 7]) rc2 = ReadCluster() rc2.add(read2) self.assertEqual( 1.0, ReadCluster.commonNucleotidesAgreementDistance(rc1, rc2))
def testMultiplicativeDistanceOne(self): """ The commonNucleotidesMultiplicativeDistance method must return 1.0 when two clusters have no offsets in common. """ read1 = AlignedRead('id1', '-----TC-') read1.setSignificantOffsets([5, 6]) rc1 = ReadCluster() rc1.add(read1) read2 = AlignedRead('id2', '---TG--') read2.setSignificantOffsets([3, 4]) rc2 = ReadCluster() rc2.add(read2) self.assertAlmostEqual( 1.0, ReadCluster.commonNucleotidesMultiplicativeDistance(rc1, rc2))
def testNonZeroCommonNucleotidesAgreementDistance(self): """ The distance between two clusters must be as expected, as calculated by the commonNucleotidesAgreementDistance method. """ read1 = AlignedRead('id1', '---TCCCG-') read1.setSignificantOffsets([3, 4, 5, 6, 7]) read2 = AlignedRead('id2', '---TG--A') read2.setSignificantOffsets([3, 4, 7]) rc = ReadClusters() cluster1 = rc.add(read1) cluster2 = rc.add(read2) # Two of the three common offsets do not match. self.assertAlmostEqual( 2 / 3, rc.commonNucleotidesAgreementDistance(cluster1, cluster2))
def testCommonOffsetsMaxFractionOneHalf(self): """ The commonOffsetsMaxFraction method must return 0.5 when half the offsets of one cluster are in common with the offsets of another cluster. """ read1 = AlignedRead('id1', '---TCCCG-') read1.setSignificantOffsets([5, 6]) rc1 = ReadCluster() rc1.add(read1) read2 = AlignedRead('id2', '---TG--A') read2.setSignificantOffsets([3, 4, 6, 7]) rc2 = ReadCluster() rc2.add(read2) self.assertAlmostEqual(0.0, ReadCluster.commonOffsetsMaxFraction(rc1, rc2))
def testMultiplicativeDistanceOneQuarterLowOffsetCoverage(self): """ The multiplicative distance between two clusters must be 0.1 when they are identical but the maximum fraction of common offsets is one half (in which case the 0.9 minimum offset coverage fraction will be applied, as explained in the docstring for the testMultiplicativeDistanceOneQuarter test above). """ read1 = AlignedRead('id1', '-----CCGTTT') read1.setSignificantOffsets([5, 6, 7, 8, 9, 10]) read2 = AlignedRead('id2', '--TTTCCG-') read2.setSignificantOffsets([2, 3, 4, 5, 6, 7]) rc = ReadClusters() cluster1 = rc.add(read1) cluster2 = rc.add(read2) self.assertAlmostEqual(0.1, rc.multiplicativeDistance(cluster1, cluster2))
def testMultiplicativeDistanceThreeQuartersLowOffsetCoverage(self): """ The multiplicative distance between two clusters must be 0.55 when they agree 50% and the maximum fraction of common offsets is 0.5 (in which case the 0.9 minimum offset coverage fraction will be applied as described in the testMultiplicativeDistanceOneQuarter test above) because 1.0 - (0.5 * max(0.9, 0.5)) = 0.55). """ read1 = AlignedRead('id1', '-----CAGT') read1.setSignificantOffsets([5, 6, 7, 8]) read2 = AlignedRead('id2', '--TTTCC--') read2.setSignificantOffsets([2, 3, 4, 5, 6]) rc = ReadClusters() cluster1 = rc.add(read1) cluster2 = rc.add(read2) self.assertAlmostEqual(0.55, rc.multiplicativeDistance(cluster1, cluster2))
def testMultiplicativeDistanceOneQuarter(self): """ The multiplicative distance between two clusters must be 0.1 when they are identical and the minimum fraction of common offsets (ReadClusters.COMMON_OFFSETS_MAX_FRACTION_MIN) is 0.9. That fraction is used in the following because 3 of the first cluster's sites and 3 of the second's are in common, and those fractions are 3/4 and 3/6 which are both less than the 0.9 value of ReadClusters.COMMON_OFFSETS_MAX_FRACTION_MIN so it it used to scale the commonNucleotidesAgreementDistance distance. """ read1 = AlignedRead('id1', '-----CCGT') read1.setSignificantOffsets([5, 6, 7, 8]) read2 = AlignedRead('id2', '--TTTCCG-') read2.setSignificantOffsets([2, 3, 4, 5, 6, 7]) rc = ReadClusters() cluster1 = rc.add(read1) cluster2 = rc.add(read2) self.assertAlmostEqual(0.1, rc.multiplicativeDistance(cluster1, cluster2))
def testMultiplicativeDistanceZero(self): """ The commonNucleotidesMultiplicativeDistance method must return 0.0 when two clusters agree completely on nucleotides at their common offsets. """ read1 = AlignedRead('id1', '-----TC-') read1.setSignificantOffsets([5, 6]) rc1 = ReadCluster() rc1.add(read1) read2 = AlignedRead('id2', '-----TC-') read2.setSignificantOffsets([5, 6]) read3 = AlignedRead('id3', '-----TC-') read3.setSignificantOffsets([5, 6]) rc2 = ReadCluster() rc2.add(read2) rc2.add(read3) self.assertAlmostEqual( 0.0, ReadCluster.commonNucleotidesMultiplicativeDistance(rc1, rc2))
def testTrimWithReadTooShort(self): """ The trim function must return False if the read is too short. """ ar = AlignedRead('id', '---ACGTACGT--') self.assertFalse(ar.trim(4))
def testFullMerge(self): """ A merge of three reads must result in a cluster with the expected nucleotide counts. """ read1 = AlignedRead('id1', '---TCCCG-') read1.setSignificantOffsets([3, 4, 5, 6, 7]) read2 = AlignedRead('id2', '---AC--A') read2.setSignificantOffsets([3, 4, 7]) read3 = AlignedRead('id3', '---TG--G') read3.setSignificantOffsets([7]) rc = ReadClusters() rc.add(read1) rc.add(read2) rc.add(read3) (cluster, ) = rc.analyze(0.7) expectedBases3 = OffsetBases() expectedBases3.incorporateBase('T') expectedBases3.incorporateBase('A') expectedBases4 = OffsetBases() expectedBases4.incorporateBase('C') expectedBases4.incorporateBase('C') expectedBases5 = OffsetBases() expectedBases5.incorporateBase('C') expectedBases6 = OffsetBases() expectedBases6.incorporateBase('C') expectedBases7 = OffsetBases() expectedBases7.incorporateBase('G') expectedBases7.incorporateBase('A') expectedBases7.incorporateBase('G') self.assertEqual( { 3: expectedBases3, 4: expectedBases4, 5: expectedBases5, 6: expectedBases6, 7: expectedBases7, }, cluster.nucleotides)
def testBase(self): """ The base method must return the expected nucleotide. """ ar = AlignedRead('id', '---ACGTGCGT--') self.assertEqual('A', ar.base(3))
def testToPaddedString(self): """ The toPaddedString method must return the expected string. """ ar = AlignedRead('id', '---ACGTGCGT--') self.assertEqual('>id\n---ACGTGCGT--\n', ar.toPaddedString())
def testCommonOffsetsMaxFractionOne(self): """ The commonOffsetsMaxFraction method must return 1.0 when all of one of the offsets in one cluster are in the intersection of the offsets of another cluster. """ read1 = AlignedRead('id1', '---TCCCG-') read1.setSignificantOffsets([3, 4]) read2 = AlignedRead('id2', '---TCCCG-') read2.setSignificantOffsets([6, 7]) rc1 = ReadCluster() rc1.add(read1) rc1.add(read2) read3 = AlignedRead('id3', '---TG--A') read3.setSignificantOffsets([3, 4, 7]) rc2 = ReadCluster() rc2.add(read3) self.assertAlmostEqual(1.0, ReadCluster.commonOffsetsMaxFraction(rc1, rc2))
def testDistanceZero(self): """ The distance between two clusters must be 0.0 when they match at all common significant offsets. """ read1 = AlignedRead('id1', '---TCCCG-') read1.setSignificantOffsets([3, 4, 5, 6, 7]) rc1 = ReadCluster() rc1.add(read1) read2 = AlignedRead('id2', '---TC--A') read2.setSignificantOffsets([3, 4, 7]) read3 = AlignedRead('id3', '---TG--G') read3.setSignificantOffsets([7]) rc2 = ReadCluster() rc2.add(read2) rc2.add(read3) # All three common offsets match. self.assertEqual( 0.0, ReadCluster.commonNucleotidesAgreementDistance(rc1, rc2))
def testMerge(self): """ Merging two clusters must result in the expected nucleotides and reads being stored. """ read1 = AlignedRead('id1', '---ACCC--') read1.setSignificantOffsets([3, 4]) read2 = AlignedRead('id2', '---TG--') read2.setSignificantOffsets([3]) rc1 = ReadCluster() rc1.add(read1) rc1.add(read2) read3 = AlignedRead('id3', '---GGCC--') read3.setSignificantOffsets([3, 4, 6]) rc2 = ReadCluster() rc2.add(read3) expectedBases3 = OffsetBases() expectedBases3.incorporateBase('A') expectedBases3.incorporateBase('T') expectedBases3.incorporateBase('G') expectedBases4 = OffsetBases() expectedBases4.incorporateBase('C') expectedBases4.incorporateBase('G') expectedBases6 = OffsetBases() expectedBases6.incorporateBase('C') rc1.merge(rc2) self.assertEqual( { 3: expectedBases3, 4: expectedBases4, 6: expectedBases6, }, rc1.nucleotides)
def initialReferenceIdAnalysis(self, referenceId, alignmentFile, outputDir): """ Analyze the given reference id in the given alignment file (if an alignment to the reference id is present). @param referenceId: The C{str} id of the reference sequence to analyze. @param alignmentFile: The C{str} name of an alignment file. @param outputDir: The C{str} name of the output directory. @return: C{None} if C{referenceId} is not present in C{alignmentFile} or if no significant offsets are found. Else, a C{dict} containing the signifcant offsets and the consensus sequence that best matches C{referenceId}. """ # Make sure this reference id is in this alignment file and if so # get its length (and check it's the same as the length of the # sequence given in the reference file). with samfile(alignmentFile) as sam: tid = sam.get_tid(referenceId) if tid == -1: # This referenceId is not in this alignment file. self.report(' Reference %s not in alignment file.' % referenceId) return else: genomeLength = sam.lengths[tid] # Sanity check. assert genomeLength == len(self.referenceGenomes[referenceId]) if self.plotSAM: filename = join(outputDir, 'reads.html') self.report(' Saving reads alignment plot to %s' % filename) plotSAM(SAMFilter(alignmentFile, referenceIds={referenceId}), filename, title=referenceId, jitter=0.45) alignedReads = [] samFilter = SAMFilter( alignmentFile, referenceIds={referenceId}, dropDuplicates=True, dropSupplementary=True, # dropSecondary=True, storeQueryIds=True) paddedSAM = PaddedSAM(samFilter) for query in paddedSAM.queries(addAlignment=True): assert len(query) == genomeLength alignedReads.append( AlignedRead(query.id, query.sequence, query.alignment)) # Sanity check that all aligned reads have different ids. This # should be the case because the padded SAM queries method adds /2, # /3 etc to queries that have more than one alignment. assert len(alignedReads) == len(set(read.id for read in alignedReads)) readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData( genomeLength, alignedReads) significantOffsets = list( findSignificantOffsets(baseCountAtOffset, readCountAtOffset, self.minReads, self.homogeneousCutoff)) self.report( ' %d alignment%s (of %d unique %s) read from %s' % (samFilter.alignmentCount, s( samFilter.alignmentCount), len(samFilter.queryIds), 'query' if len(samFilter.queryIds) == 1 else 'queries', alignmentFile)) self.report(' %d of which %s aligned to %s' % (len(alignedReads), 'was' if len(alignedReads) == 1 else 'were', referenceId)) self.report(' Reference genome length %d' % genomeLength) self.report(' Found %d significant location%s' % (len(significantOffsets), s(len(significantOffsets)))) self.saveBaseFrequencies(outputDir, genomeLength, baseCountAtOffset) if not significantOffsets: self.report(' No significant locations found.') return if self.saveReducedFASTA: self.saveReducedFasta(significantOffsets, outputDir) self._plotCoverageAndSignificantLocations(referenceId, alignmentFile, readCountAtOffset, genomeLength, significantOffsets, outputDir) self.saveSignificantOffsets(significantOffsets, outputDir) for read in alignedReads: read.setSignificantOffsets(significantOffsets) self.saveReferenceBaseFrequencyPlot(referenceId, genomeLength, significantOffsets, baseCountAtOffset, readCountAtOffset, outputDir) # Save the reference. filename = join(outputDir, 'reference.fasta') self.report(' Saving reference to', filename) reference = self.referenceGenomes[referenceId] Reads([reference]).save(filename) # Extract a consensus according to bcftools. self.writeBcftoolsConsensus(referenceId, alignmentFile, outputDir) return (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset, readsAtOffset, significantOffsets, samFilter, paddedSAM)