def testMinScoreNoScores(self):
    """
    Asking for alignments whose score is at least a given value must
    yield nothing when none of the alignments has a score.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
    ]
    # SAM fields are tab-separated; the fixture is written with spaces
    # for readability and converted here.
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, minScore=6)
        self.assertEqual([], list(samFilter.alignments()))
def testDropSecondary(self):
    """
    When secondary matches are dropped, only the non-secondary alignment
    must be returned.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        # Flag 256 marks query2 as a secondary alignment.
        'query2 256 ref1 2 60 2= * 0 0 TC ZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, dropSecondary=True)
        [kept] = list(samFilter.alignments())
        self.assertEqual('query1', kept.query_name)
def testStoreQueryIds(self):
    """
    Query ids must be saved when we ask for them to be stored.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG 123456',
        'query2 0 ref1 2 60 2= * 0 0 TC XY',
        'query2 0 ref1 2 60 2= * 0 0 TC XY',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, storeQueryIds=True)
        # Consume every alignment before inspecting the stored ids.
        list(samFilter.alignments())
        self.assertEqual({'query1', 'query2'}, samFilter.queryIds)
def testAlignmentCount(self):
    """
    Once every query has been yielded, the alignment count must match
    the number of alignments in the input.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG 123456',
        'query2 0 ref1 2 60 2= * 0 0 TC XY',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path)
        # Consume every alignment before checking the count.
        list(samFilter.alignments())
        self.assertEqual(2, samFilter.alignmentCount)
def testMaxScore(self):
    """
    Asking for alignments whose score is at most a given value must give
    the expected result when only some alignments have scores.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:10',
        'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
        'query3 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:3',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, maxScore=6)
        [kept] = list(samFilter.alignments())
        self.assertEqual('query3', kept.query_name)
def testKeepQualityControlFailures(self):
    """
    When quality control failures are kept, matches flagged as such must
    appear in the result.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        # Flag 512 marks query2 as a quality control failure.
        'query2 512 ref1 4 60 2= * 0 0 TC ZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, keepQCFailures=True)
        first, second = list(samFilter.alignments())
        self.assertEqual('query1', first.query_name)
        self.assertEqual('query2', second.query_name)
def testNoQuality(self):
    """
    An alignment whose quality string is * must come back from the
    filter with a C{None} quality value.
    """
    lines = [
        '@SQ SN:ref LN:10',
        'query1 4 * 0 0 6M * 0 0 TCTAGG *',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path)
        [result] = list(samFilter.alignments())
        self.assertEqual('query1', result.query_name)
        self.assertEqual('TCTAGG', result.query_sequence)
        self.assertIsNone(result.query_qualities)
def testHardClippingInCIGARButQueryNotHardClipped(self):
    """
    We must correctly handle the case documented in
    https://github.com/acorg/dark-matter/issues/630 where the CIGAR
    string says a query is hard-clipped but the query sequence in the SAM
    file actually isn't. This can be due to a prior alignment with a soft
    clip, in which case the full query sequence has to be given before
    the secondary alignment with the hard clip.
    """
    lines = [
        '@SQ SN:Chimp-D00220 LN:8',
        '@SQ SN:D-AM494716 LN:8',
        '@SQ SN:D-XXX LN:8',
        '@SQ SN:Chimp-YYY LN:8',
        'query1 0 Chimp-D00220 1 0 3S5M * 0 0 TTTTGGTT 12345678',
        'query1 256 D-AM494716 1 0 3H5M * 0 0 * *',
        'query1 256 D-XXX 1 0 5H3M * 0 0 * *',
        'query1 0 Chimp-YYY 1 0 8M * 0 0 * *',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    # One row per expected read: (id, padded sequence, padded quality,
    # query sequence of the attached alignment).
    expected = (
        ('query1', 'TGGTT---', '45678!!!', 'TTTTGGTT'),
        ('query1/1', 'TGGTT---', '45678!!!', 'TGGTT'),
        ('query1/2', 'GTT-----', '678!!!!!', 'GTT'),
        ('query1/3', 'TTTTGGTT', '12345678', 'TTTTGGTT'),
    )

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        # Unpack explicitly so exactly four reads are required.
        (read1, read2, read3, read4) = list(
            paddedSAM.queries(addAlignment=True))
        reads = (read1, read2, read3, read4)
        for (readId, sequence, quality, alignmentSeq), read in zip(
                expected, reads):
            self.assertEqual(Read(readId, sequence, quality), read)
            self.assertEqual(alignmentSeq, read.alignment.query_sequence)
def testMinLength(self):
    """
    Filtering with a minimum read length must return only the alignment
    whose query is long enough.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        lengthFilter = ReadFilter(minLength=6)
        samFilter = SAMFilter(path, filterRead=lengthFilter.filter)
        [kept] = list(samFilter.alignments())
        self.assertEqual('query1', kept.query_name)
def testQueryTooLong(self):
    """
    A ValueError must be raised if the query sequence is longer than the
    total of the lengths in the CIGAR operations.
    """
    # This test deliberately returns straight away. Before the fix of
    # https://github.com/acorg/dark-matter/issues/630 it was possible to
    # reach the "Query ... not fully consumed when parsing CIGAR string."
    # ValueError in sam.py. That is no longer the case: a CIGAR string
    # whose total length differs from the sequence length cannot get
    # through to our code in sam.py because pysam catches the error
    # first. The body is kept because it documents that the error
    # checked for in sam.py is currently unreachable, and because the
    # test may become useful again.
    return

    lines = [
        '@SQ SN:ref1 LN:90',
        'query1 0 ref1 1 60 4M * 0 0 TCTAGG ZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        error = ('^Query TCTAGG not fully consumed when parsing CIGAR '
                 'string\\.')
        assertRaisesRegex(self, ValueError, error, list,
                          paddedSAM.queries())
def testCloseButNoCIGAR(self):
    """
    If dropUnmapped is not specified, an unmapped query that has no CIGAR
    string must be passed through unchanged.
    """
    lines = [
        '@SQ SN:ref LN:10',
        'query1 4 * 0 0 * * 0 0 TCTAGG ZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path)
        [result] = list(samFilter.alignments())
        self.assertEqual('query1', result.query_name)
        self.assertEqual('TCTAGG', result.query_sequence)
        # Add 33 to each numeric quality to get back the character that
        # was in the quality string.
        qualityString = ''.join(
            chr(quality + 33) for quality in result.query_qualities)
        self.assertEqual('ZZZZZZ', qualityString)
def testMinAndMaxScore(self):
    """
    Asking for alignments whose score lies between a minimum and a
    maximum value must give the expected result.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:10',
        'query2 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:12',
        'query3 0 ref1 2 60 2= * 0 0 TC ZZ',
        'query4 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:3',
        'query5 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:2',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, minScore=3, maxScore=10)
        first, second = list(samFilter.alignments())
        self.assertEqual('query1', first.query_name)
        self.assertEqual('query4', second.query_name)
def addCommandLineOptions(parser, outfileDefaultName=None):
    """
    Register the standard command-line options on an argument parser.

    The common options and the SAM filtering options are added first,
    followed by C{--outfile} and C{--show}.

    @param parser: An C{ArgumentParser} instance.
    @param outfileDefaultName: The C{str} output file to use as a default
        in case the user does not give one on the command line.
    """
    addCommonOptions(parser)
    SAMFilter.addFilteringOptions(parser)

    # Options specific to this command, added in the order listed.
    localOptions = (
        ('--outfile', {
            'default': outfileDefaultName,
            'help': 'The filename to store the resulting HTML.',
        }),
        ('--show', {
            'action': 'store_true',
            'default': False,
            'help': 'If specified, show the figure interactively.',
        }),
    )
    for flag, settings in localOptions:
        parser.add_argument(flag, **settings)
def parseCommandLineOptions(args, returnSignificantOffsets=True):
    """
    Process the command-line options that were added to the
    C{ArgumentParser} instance by C{addCommandLineOptions}.

    @param args: The result of calling C{parse_args} on an
        C{ArgumentParser} instance (the one that was passed to
        C{addCommandLineOptions}, unless we're testing).
    @param returnSignificantOffsets: If C{True} also return a list of the
        significant offsets (else that element of the return value will
        be C{None}).
    @raise ValueError: If more than one reference id is given, or if the
        SAM filter does not report exactly one reference length.
    @return: A C{tuple}: (genomeLength, alignedReads, paddedSAM,
        readCountAtOffset, baseCountAtOffset, readsAtOffset,
        significantOffsets).
    """
    samFilter = SAMFilter.parseFilteringOptions(args)

    requestedIds = samFilter.referenceIds
    if requestedIds and len(requestedIds) > 1:
        raise ValueError('Only one reference id can be given.')

    referenceLengths = samFilter.referenceLengths()
    if len(referenceLengths) != 1:
        raise ValueError(
            'If you do not specify a reference sequence with '
            '--referenceId, the SAM/BAM file must contain exactly one '
            'reference. But %s contains %d.' %
            (args.samfile, len(referenceLengths)))

    # Exactly one reference remains; only its length is needed.
    _, genomeLength = referenceLengths.popitem()

    paddedSAM = PaddedSAM(samFilter)
    alignedReads = [AlignedRead(query.id, query.sequence)
                    for query in paddedSAM.queries()]

    readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
        genomeLength, alignedReads)

    significantOffsets = None
    if returnSignificantOffsets:
        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   args.minReads, args.homogeneousCutoff))
        for alignedRead in alignedReads:
            alignedRead.setSignificantOffsets(significantOffsets)

    return (genomeLength, alignedReads, paddedSAM, readCountAtOffset,
            baseCountAtOffset, readsAtOffset, significantOffsets)
def testAllMMatch(self):
    """
    A straightforward match consisting only of 'M' operations must give
    the expected padded sequence.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 6M * 0 0 TCTAGG ZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        [padded] = list(paddedSAM.queries())
        self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'),
                         padded)
def testMixedMatchSpecificReferenceButNoMatches(self):
    """
    Asking for reads aligned against a reference that is present in the
    file but has no matches must give an empty list.
    """
    lines = [
        '@SQ SN:ref1 LN:15',
        '@SQ SN:ref2 LN:15',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, referenceIds={'ref2'})
        paddedSAM = PaddedSAM(samFilter)
        self.assertEqual([], list(paddedSAM.queries()))
def testReferenceDeletion(self):
    """
    A deletion of reference bases must give the expected padded sequence
    (with Ns inserted for the deleted reference bases).
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2M2D4M * 0 0 TCTAGG ZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        [padded] = list(paddedSAM.queries())
        self.assertEqual(Read('query1', '-TCNNTAGG-', '!ZZ!!ZZZZ!'),
                         padded)
def testQuerySoftClipProtrudesBothSides(self):
    """
    When a match has soft-clipped regions that extend past both the left
    and the right end of the reference, the expected padded sequence
    must result.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 4 60 5S5M5S * 0 0 TCTAGGCTGACTAAG ZZZZZZZZZZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        [padded] = list(paddedSAM.queries())
        self.assertEqual(Read('query1', 'TAGGCTGACT', 'ZZZZZZZZZZ'),
                         padded)
def testQuerySoftClipReachesRightEdge(self):
    """
    When a match has a soft-clipped region that reaches exactly to the
    right edge of the reference, the expected padded sequence must
    result.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 5 60 2M4S * 0 0 TCTAGG ZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        [padded] = list(paddedSAM.queries())
        self.assertEqual(Read('query1', '----TCTAGG', '!!!!ZZZZZZ'),
                         padded)
def testQuerySoftClipProtrudesLeft(self):
    """
    When a match has a soft-clipped region that extends past the left
    end of the reference, the expected padded sequence must result.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 4S2M * 0 0 TCTAGG ZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        [padded] = list(paddedSAM.queries())
        self.assertEqual(Read('query1', 'AGG-------', 'ZZZ!!!!!!!'),
                         padded)
def testMixedMatch(self):
    """
    A match made up of a mixture of M, =, and X operations must give the
    expected padded sequence.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        [padded] = list(paddedSAM.queries())
        self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'),
                         padded)
def testDropSecondary(self):
    """
    When secondary matches are dropped, only the non-secondary query
    must be returned, padded as expected.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        # Flag 256 marks query2 as a secondary alignment.
        'query2 256 ref1 2 60 2= * 0 0 TC ZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, dropSecondary=True)
        paddedSAM = PaddedSAM(samFilter)
        [padded] = list(paddedSAM.queries())
        self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'),
                         padded)
def testReferenceSkip(self):
    """
    A skip of reference bases must give the expected padded sequence,
    using the passed unknown quality character when the
    unknownQualityChar argument is given.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2M2N4M * 0 0 TCTAGG ZZZZZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        [padded] = list(paddedSAM.queries(unknownQualityChar='.'))
        self.assertEqual(Read('query1', '-TCNNTAGG-', '.ZZ..ZZZZ.'),
                         padded)
def testRcNeeded(self):
    """
    With rcNeeded=True, a reverse-complemented match (flag = 16) must
    give the expected (reverse complemented) padded sequence and a
    reversed quality string.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 16 ref1 2 60 6M * 0 0 TCTAGG 123456',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        [padded] = list(paddedSAM.queries(rcNeeded=True))
        self.assertEqual(Read('query1', '-CCTAGA---', '!654321!!!'),
                         padded)
def testUnequalReferenceLengths(self):
    """
    If no reference ids are passed and the references differ in length,
    an UnequalReferenceLengthError exception must be raised.
    """
    lines = [
        '@SQ SN:id1 LN:90',
        '@SQ SN:id2 LN:91',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        error = ('^Your 2 SAM/BAM file reference sequence lengths '
                 '\\(id1=90, id2=91\\) are not all identical\\.$')
        samFilter = SAMFilter(path)
        assertRaisesRegex(self, UnequalReferenceLengthError, error,
                          PaddedSAM, samFilter)
def testRcSuffix(self):
    """
    When an rcSuffix value is passed, a reverse-complemented sequence
    must have that string appended to its id.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 16 ref1 2 60 6M * 0 0 TCTAGG 123456',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        [padded] = list(paddedSAM.queries(rcSuffix='-rc', rcNeeded=True))
        self.assertEqual(Read('query1-rc', '-CCTAGA---', '!654321!!!'),
                         padded)
def testUnknownReferences(self):
    """
    Calling the referenceLengths method on a filter that was given an
    unknown reference id must raise an UnknownReference exception.
    """
    lines = [
        '@SQ SN:id1 LN:90',
        '@SQ SN:id2 LN:90',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, referenceIds={'unknown'})
        error = ("^Reference 'unknown' is not present in the "
                 "SAM/BAM file\\.$")
        assertRaisesRegex(self, UnknownReference, error,
                          samFilter.referenceLengths)
def testMinLength(self):
    """
    Filtering with a minimum read length must return only the query that
    is long enough, padded as expected.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        lengthFilter = ReadFilter(minLength=6)
        samFilter = SAMFilter(path, filterRead=lengthFilter.filter)
        paddedSAM = PaddedSAM(samFilter)
        [padded] = list(paddedSAM.queries())
        self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'),
                         padded)
def testKeepQualityControlFailures(self):
    """
    When quality control failures are kept, matches flagged as such must
    appear in the padded result.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        # Flag 512 marks query2 as a quality control failure.
        'query2 512 ref1 4 60 2= * 0 0 TC ZZ',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        samFilter = SAMFilter(path, keepQCFailures=True)
        paddedSAM = PaddedSAM(samFilter)
        first, second = list(paddedSAM.queries())
        self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'),
                         first)
        self.assertEqual(Read('query2', '---TC-----', '!!!ZZ!!!!!'),
                         second)
def testNotSecondaryAndNotSupplementaryWithNoSequence(self):
    """
    If an alignment that is neither secondary nor supplementary has '*'
    as its sequence, an InvalidSAM exception (a ValueError, according to
    the original docstring) must be raised.
    """
    lines = [
        '@SQ SN:ref1 LN:10',
        'query 0 ref1 3 60 4M * 0 0 * *',
    ]
    samText = '\n'.join(lines).replace(' ', '\t')

    with dataFile(samText) as path:
        paddedSAM = PaddedSAM(SAMFilter(path))
        error = ('^pysam produced an alignment \\(number 1\\) with no '
                 'query sequence without previously giving an alignment '
                 'with a sequence\\.$')
        pending = paddedSAM.queries()
        # The error is only expected once the queries are consumed by
        # list().
        assertRaisesRegex(self, InvalidSAM, error, list, pending)
help=('If given, information about reference sequence insertions will be ' 'printed to standard error. These correspond to "I" CIGAR ' 'operations that for the match would require inserting query bases ' 'into the reference. Because we cannot change the reference (in ' 'fact we typically do not have the reference in the SAM/BAM file), ' 'we cut the inserted bases out of the aligned query and save the ' 'information about what would have been inserted and where. That ' 'information is printed by this option. The output gives the ' '0-based offset where the inserted base would be placed, followed ' 'by a list of the nucleotides that were suggested as being ' 'inserted and the number of times each nucleotide was suggested. ' 'So for example the output might contain "27: T:3, G:10" which ' 'indicates that 13 query (3 with T and 10 with G) matches would ' 'insert a nucleotide into the reference at offset 27.')) SAMFilter.addFilteringOptions(parser) addFASTAFilteringCommandLineOptions(parser) args = parser.parse_args() reads = parseFASTAFilteringCommandLineOptions(args, Reads()) samFilter = SAMFilter.parseFilteringOptions( args, filterRead=reads.filterRead) paddedSAM = PaddedSAM(samFilter) for read in paddedSAM.queries(rcSuffix=args.rcSuffix, rcNeeded=args.rcNeeded): print(read.toString('fasta'), end='') if args.listReferenceInsertions: if paddedSAM.referenceInsertions: print('(0-based) insertions into the reference:\n%s' % nucleotidesToStr(paddedSAM.referenceInsertions, ' '),