def testUnsupportedCigarToken(self):
    # The '3X' token is expected to be rejected with a RuntimeError whose
    # message names the offending token.
    cigar = '3M3X6M'
    seq = 'AAACAACCACCC'
    quality = 'BBBDDDEEEFFF'

    with self.assertRaises(RuntimeError) as result:
        apply_cigar(cigar, seq, quality)

    # Read the text from args[0]: the ``message`` attribute of exceptions
    # was removed in Python 3, while args[0] works on both major versions.
    self.assertEqual("Unsupported CIGAR token: '3X'.",
                     result.exception.args[0])
def testInvalidCigar(self):
    # The malformed CIGAR string is expected to raise a RuntimeError whose
    # message quotes the whole string.
    cigar = '3M...6M'
    seq = 'AAACAACCACCC'
    quality = 'BBBDDDEEEFFF'

    with self.assertRaises(RuntimeError) as result:
        apply_cigar(cigar, seq, quality)

    # Read the text from args[0]: the ``message`` attribute of exceptions
    # was removed in Python 3, while args[0] works on both major versions.
    self.assertEqual("Invalid CIGAR string: '3M...6M'.",
                     result.exception.args[0])
def testLongCigar(self):
    # A CIGAR string that asks for 10 bases from a 9-base read is expected
    # to raise a RuntimeError naming both the CIGAR string and the sequence.
    cigar = '10M'
    seq = 'AAACAACCA'
    quality = 'BBBDDDEEE'

    with self.assertRaises(RuntimeError) as result:
        apply_cigar(cigar, seq, quality)

    # Read the text from args[0]: the ``message`` attribute of exceptions
    # was removed in Python 3, while args[0] works on both major versions.
    self.assertEqual(
        "CIGAR string '10M' is too long for sequence 'AAACAACCA'.",
        result.exception.args[0])
def testUnsupportedCigarToken(self):
    # The '3X' token is expected to be rejected with a RuntimeError whose
    # message names the offending token.
    cigar = '3M3X6M'
    seq = 'AAACAACCACCC'
    quality = 'BBBDDDEEEFFF'

    with self.assertRaises(RuntimeError) as result:
        apply_cigar(cigar, seq, quality)

    # args[0] is used instead of ``.message`` because exceptions have no
    # ``message`` attribute on Python 3.
    self.assertEqual(
        "Unsupported CIGAR token: '3X'.",
        result.exception.args[0])
def testInvalidCigar(self):
    # The malformed CIGAR string is expected to raise a RuntimeError whose
    # first argument quotes the whole string.
    bad_cigar = '3M...6M'
    bases = 'AAACAACCACCC'
    phred = 'BBBDDDEEEFFF'
    expected_message = "Invalid CIGAR string: '3M...6M'."

    with self.assertRaises(RuntimeError) as caught:
        apply_cigar(bad_cigar, bases, phred)

    self.assertEqual(expected_message, caught.exception.args[0])
def testUnsupportedCigarToken(self):
    # The '3X' token is expected to be rejected with a RuntimeError whose
    # first argument names the offending token.
    bad_cigar = '3M3X6M'
    bases = 'AAACAACCACCC'
    phred = 'BBBDDDEEEFFF'
    expected_message = "Unsupported CIGAR token: '3X'."

    with self.assertRaises(RuntimeError) as caught:
        apply_cigar(bad_cigar, bases, phred)

    self.assertEqual(expected_message, caught.exception.args[0])
def testLongCigar(self):
    # A CIGAR string that asks for 10 bases from a 9-base read is expected
    # to raise a RuntimeError naming both the CIGAR string and the sequence.
    long_cigar = '10M'
    bases = 'AAACAACCA'
    phred = 'BBBDDDEEE'
    expected_message = (
        "CIGAR string '10M' is too long for sequence 'AAACAACCA'.")

    with self.assertRaises(RuntimeError) as caught:
        apply_cigar(long_cigar, bases, phred)

    self.assertEqual(expected_message, caught.exception.args[0])
def testInvalidCigar(self):
    # The malformed CIGAR string is expected to raise a RuntimeError whose
    # message quotes the whole string.
    cigar = '3M...6M'
    seq = 'AAACAACCACCC'
    quality = 'BBBDDDEEEFFF'

    with self.assertRaises(RuntimeError) as result:
        apply_cigar(cigar, seq, quality)

    # args[0] is used instead of ``.message`` because exceptions have no
    # ``message`` attribute on Python 3.
    self.assertEqual(
        "Invalid CIGAR string: '3M...6M'.",
        result.exception.args[0])
def merge_reads(quality_cutoff, read_pair):
    """ Combine the two reads of a pair into one merged sequence.

    Pairs whose reads name different regions, and pairs with no mapped
    read, are skipped.

    @param quality_cutoff: minimum quality score for a base to be counted
    @param read_pair: a sequence of two sequences, each with fields from a
        SAM file record; the second one may be empty or None
    @return: (rname, mseq, merged_inserts, qual1, qual2) or None to skip the
        pair
    """
    first_read, second_read = read_pair
    if second_read and first_read[2] != second_read[2]:
        # The two reads name different regions, so the pair is ignored.
        return None

    rname = None
    cigar_args = []  # one (cigar, seq, qual, zero-based pos) per mapped read
    for fields in read_pair:
        if not fields:
            continue
        # Only the first eleven fields are used; optional ones are ignored.
        (_qname, flag, rname, refpos_str, _mapq, cigar,
         _rnext, _pnext, _tlen, seq, qual) = fields[:11]
        if is_unmapped_read(flag):
            continue
        cigar_args.append((cigar, seq, qual, int(refpos_str) - 1))

    if not cigar_args:
        return None

    seq1, qual1, ins1 = apply_cigar(*cigar_args[0])
    if len(cigar_args) == 1:
        # Only one usable read: merge it with an empty partner.
        seq2, qual2, ins2 = '', '', None
    else:
        seq2, qual2, ins2 = apply_cigar(*cigar_args[1])

    mseq = merge_pairs(seq1, seq2, qual1, qual2, q_cutoff=quality_cutoff)
    merged_inserts = merge_inserts(ins1, ins2, quality_cutoff)
    return rname, mseq, merged_inserts, qual1, qual2
def merge_reads(quality_cutoff, read_pair):
    """ Merge the reads of a pair, or skip the pair.

    A pair is skipped when its two reads name different regions or when
    neither read is mapped.

    @param quality_cutoff: minimum quality score for a base to be counted
    @param read_pair: a sequence of two sequences, each with fields from a
        SAM file record; the second one may be empty or None
    @return: (rname, mseq, merged_inserts, qual1, qual2) or None to skip the
        pair
    """
    forward, reverse = read_pair
    if reverse and forward[2] != reverse[2]:
        # Region names disagree between the two reads: ignore the pair.
        return None

    usable = []
    for record in read_pair:
        if not record:
            continue
        # Optional fields after the eleventh are ignored.
        (_qname, flag, rname, refpos_str, _mapq, cigar,
         _rnext, _pnext, _tlen, seq, qual) = record[:11]
        if not is_unmapped_read(flag):
            usable.append((cigar, seq, qual, int(refpos_str)))

    if not usable:
        return None

    # apply_cigar is given the position minus one.
    cigar, seq, qual, pos = usable[0]
    seq1, qual1, ins1 = apply_cigar(cigar, seq, qual, pos - 1)
    if len(usable) != 1:
        cigar, seq, qual, pos = usable[1]
        seq2, qual2, ins2 = apply_cigar(cigar, seq, qual, pos - 1)
    else:
        # A lone read is merged against an empty partner.
        seq2 = qual2 = ''
        ins2 = None

    mseq = merge_pairs(seq1, seq2, qual1, qual2, q_cutoff=quality_cutoff)
    merged_inserts = merge_inserts(ins1, ins2, quality_cutoff)
    return rname, mseq, merged_inserts, qual1, qual2
def testSoftClipPositions(self):
    # When position-tracking sets are supplied, the soft-clipped and mapped
    # positions are expected to be recorded in them.
    start = 4
    read_bases = 'AAACAACCA'
    read_phred = 'BBBDDDEEE'
    mapped_positions = set()
    clipped_positions = set()

    got_seq, got_quality, got_inserts = apply_cigar(
        '3S6M',
        read_bases,
        read_phred,
        pos=start,
        mapped=mapped_positions,
        soft_clipped=clipped_positions)

    self.assertEqual('----CAACCA', got_seq)
    self.assertEqual('!!!!DDDEEE', got_quality)
    self.assertEqual({}, got_inserts)
    self.assertEqual({4, 5, 6, 7, 8, 9}, mapped_positions)
    self.assertEqual({1, 2, 3}, clipped_positions)
def testLargeToken(self):
    # A two-digit match count ('12M') is expected to return the read and
    # its quality unchanged.
    read_seq = 'AAACAACCACCC'
    read_qual = 'BBBBBBBBBBBB'

    got_seq, got_qual, got_inserts = apply_cigar('12M', read_seq, read_qual)

    self.assertEqual(read_seq, got_seq)
    self.assertEqual(read_qual, got_qual)
    self.assertEqual({}, got_inserts)
def testInsertionLowQuality(self):
    # The inserted bases are expected to be removed from the sequence and
    # reported in the inserts, including the base with quality '*'.
    read_seq = 'AAACAACCACCC'
    read_qual = 'BBBD*DEEEFFF'
    want_inserts = {3: ('CAA', 'D*D')}

    got_seq, got_qual, got_inserts = apply_cigar('3M3I6M',
                                                 read_seq,
                                                 read_qual)

    self.assertEqual('AAACCACCC', got_seq)
    self.assertEqual('BBBEEEFFF', got_qual)
    self.assertEqual(want_inserts, got_inserts)
def testSoftClip(self):
    # The three soft-clipped bases at the start of the read are expected to
    # be dropped from both the sequence and the quality.
    read_seq = 'AAACAACCA'
    read_qual = 'BBBDDDEEE'

    got_seq, got_qual, got_inserts = apply_cigar('3S6M', read_seq, read_qual)

    self.assertEqual('CAACCA', got_seq)
    self.assertEqual('DDDEEE', got_qual)
    self.assertEqual({}, got_inserts)
def testDeletion(self):
    # A 3-base deletion ('3D') is expected to show up as three '-'
    # characters in the sequence.
    cigar = '6M3D3M'
    seq = 'AAACAACCA'
    quality = 'BBBDDDEEE'
    expected_seq = 'AAACAA---CCA'
    # One space per deleted base keeps the expected quality the same length
    # as the expected sequence (12 characters). The earlier single-space
    # literal was only 10 characters long.
    # NOTE(review): confirm apply_cigar pads deletions with spaces.
    expected_quality = 'BBBDDD   EEE'

    clipped_seq, clipped_quality, inserts = apply_cigar(cigar, seq, quality)

    self.assertEqual(expected_seq, clipped_seq)
    self.assertEqual(expected_quality, clipped_quality)
    self.assertEqual({}, inserts)
def testTrivial(self):
    # A CIGAR string that matches the whole read is expected to return the
    # read and its quality unchanged, with no inserts.
    read_seq = 'AAACAACCA'
    read_qual = 'BBBBBBBBB'

    got_seq, got_qual, got_inserts = apply_cigar('9M', read_seq, read_qual)

    self.assertEqual(read_seq, got_seq)
    self.assertEqual(read_qual, got_qual)
    self.assertEqual({}, got_inserts)
def testDeletion(self):
    # A 3-base deletion ('3D') is expected to show up as three '-'
    # characters in the sequence.
    cigar = '6M3D3M'
    inp_sequence = 'AAACAACCA'
    inp__quality = 'BBBDDDEEE'
    exp_sequence = 'AAACAA---CCA'
    # One space per deleted base keeps the expected quality the same length
    # as the expected sequence (12 characters). The earlier single-space
    # literal was only 10 characters long.
    # NOTE(review): confirm apply_cigar pads deletions with spaces.
    exp__quality = 'BBBDDD   EEE'

    clipped_seq, clipped_quality, inserts = apply_cigar(cigar,
                                                        inp_sequence,
                                                        inp__quality)

    self.assertEqual(exp_sequence, clipped_seq)
    self.assertEqual(exp__quality, clipped_quality)
    self.assertEqual({}, inserts)
def testSoftClip(self):
    # The three soft-clipped bases at the start of the read are expected to
    # be dropped from both the sequence and the quality.
    bases = 'AAACAACCA'
    phred = 'BBBDDDEEE'

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '3S6M', bases, phred)

    self.assertEqual('CAACCA', actual_bases)
    self.assertEqual('DDDEEE', actual_phred)
    self.assertEqual({}, actual_inserts)
def testInsertionLowQuality(self):
    # The inserted bases are expected to be removed from the sequence and
    # reported in the inserts, including the base with quality '*'.
    bases = 'AAACAACCACCC'
    phred = 'BBBD*DEEEFFF'
    expected_inserts = {3: ('CAA', 'D*D')}

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '3M3I6M', bases, phred)

    self.assertEqual('AAACCACCC', actual_bases)
    self.assertEqual('BBBEEEFFF', actual_phred)
    self.assertEqual(expected_inserts, actual_inserts)
def testLargeToken(self):
    # A two-digit match count ('12M') is expected to return the read and
    # its quality unchanged.
    bases = 'AAACAACCACCC'
    phred = 'BBBBBBBBBBBB'

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '12M', bases, phred)

    self.assertEqual(bases, actual_bases)
    self.assertEqual(phred, actual_phred)
    self.assertEqual({}, actual_inserts)
def testTrivial(self):
    # A CIGAR string that matches the whole read is expected to return the
    # read and its quality unchanged, with no inserts.
    bases = 'AAACAACCA'
    phred = 'BBBBBBBBB'

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '9M', bases, phred)

    self.assertEqual(bases, actual_bases)
    self.assertEqual(phred, actual_phred)
    self.assertEqual({}, actual_inserts)
def testPadding(self):
    # A read starting at position 3 is expected to be left-padded with
    # three '-' in the sequence and three '!' in the quality.
    offset = 3
    read_seq = 'AAACAACCACCC'
    read_qual = 'BBBDDDEEEFFF'

    got_seq, got_qual, got_inserts = apply_cigar('12M',
                                                 read_seq,
                                                 read_qual,
                                                 offset)

    self.assertEqual('---AAACAACCACCC', got_seq)
    self.assertEqual('!!!BBBDDDEEEFFF', got_qual)
    self.assertEqual({}, got_inserts)
def testPadding(self):
    # A read starting at position 3 is expected to be left-padded with
    # three '-' in the sequence and three '!' in the quality.
    start = 3
    bases = 'AAACAACCACCC'
    phred = 'BBBDDDEEEFFF'

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '12M', bases, phred, start)

    self.assertEqual('---AAACAACCACCC', actual_bases)
    self.assertEqual('!!!BBBDDDEEEFFF', actual_phred)
    self.assertEqual({}, actual_inserts)
def testInsertionLowQuality(self):
    # The inserted bases are expected to be removed from the sequence and
    # reported in the inserts, including the base with quality '*'.
    raw_seq, raw_qual = 'AAACAACCACCC', 'BBBD*DEEEFFF'
    want_seq, want_qual = 'AAACCACCC', 'BBBEEEFFF'

    result = apply_cigar('3M3I6M', raw_seq, raw_qual)
    out_seq, out_qual, out_inserts = result

    self.assertEqual(want_seq, out_seq)
    self.assertEqual(want_qual, out_qual)
    self.assertEqual({3: ('CAA', 'D*D')}, out_inserts)
def testTrivial(self):
    # A CIGAR string that matches the whole read is expected to return the
    # read and its quality unchanged, with no inserts.
    raw_seq, raw_qual = 'AAACAACCA', 'BBBBBBBBB'

    result = apply_cigar('9M', raw_seq, raw_qual)
    out_seq, out_qual, out_inserts = result

    self.assertEqual(raw_seq, out_seq)
    self.assertEqual(raw_qual, out_qual)
    self.assertEqual({}, out_inserts)
def testLargeToken(self):
    # A two-digit match count ('12M') is expected to return the read and
    # its quality unchanged.
    raw_seq, raw_qual = 'AAACAACCACCC', 'BBBBBBBBBBBB'

    result = apply_cigar('12M', raw_seq, raw_qual)
    out_seq, out_qual, out_inserts = result

    self.assertEqual(raw_seq, out_seq)
    self.assertEqual(raw_qual, out_qual)
    self.assertEqual({}, out_inserts)
def testSoftClip(self):
    # The three soft-clipped bases at the start of the read are expected to
    # be dropped from both the sequence and the quality.
    raw_seq, raw_qual = 'AAACAACCA', 'BBBDDDEEE'
    want_seq, want_qual = 'CAACCA', 'DDDEEE'

    result = apply_cigar('3S6M', raw_seq, raw_qual)
    out_seq, out_qual, out_inserts = result

    self.assertEqual(want_seq, out_seq)
    self.assertEqual(want_qual, out_qual)
    self.assertEqual({}, out_inserts)
def testDeletion(self):
    # A 3-base deletion ('3D') is expected to show up as three '-'
    # characters in the sequence.
    cigar = '6M3D3M'
    seq = 'AAACAACCA'
    quality = 'BBBDDDEEE'
    expected_seq = 'AAACAA---CCA'
    # One space per deleted base keeps the expected quality the same length
    # as the expected sequence (12 characters). The earlier single-space
    # literal was only 10 characters long.
    # NOTE(review): confirm apply_cigar pads deletions with spaces.
    expected_quality = 'BBBDDD   EEE'

    clipped_seq, clipped_quality, inserts = apply_cigar(
        cigar,
        seq,
        quality)

    self.assertEqual(expected_seq, clipped_seq)
    self.assertEqual(expected_quality, clipped_quality)
    self.assertEqual({}, inserts)
def testPadding(self):
    # A read starting at position 3 is expected to be left-padded with
    # three '-' in the sequence and three '!' in the quality.
    raw_seq, raw_qual = 'AAACAACCACCC', 'BBBDDDEEEFFF'
    want_seq, want_qual = '---AAACAACCACCC', '!!!BBBDDDEEEFFF'
    first_pos = 3

    result = apply_cigar('12M', raw_seq, raw_qual, first_pos)
    out_seq, out_qual, out_inserts = result

    self.assertEqual(want_seq, out_seq)
    self.assertEqual(want_qual, out_qual)
    self.assertEqual({}, out_inserts)
def testInsertionInsideClipRegionWithOffset(self):
    # The read starts at position 3 and the clip window is 4..20; the
    # insertion is expected to be reported at key 1.
    raw_seq, raw_qual = 'TAGCT', 'AABCC'
    first_pos = 3
    window = (4, 20)  # (clip_from, clip_to)

    result = apply_cigar('2M1I2M', raw_seq, raw_qual, first_pos, *window)
    out_seq, out_qual, out_inserts = result

    self.assertEqual('ACT', out_seq)
    self.assertEqual('ACC', out_qual)
    self.assertEqual({1: ('G', 'B')}, out_inserts)
def testInsertionBeforeClip(self):
    # The insertion sits right where the clip window (3..8) begins and is
    # expected to be reported at key 0.
    raw_seq, raw_qual = 'AAAGGGCAACCACCC', 'BBBHHHDDDEEEFFF'
    first_pos = 0
    window = (3, 8)  # (clip_from, clip_to)

    result = apply_cigar('3M3I9M', raw_seq, raw_qual, first_pos, *window)
    out_seq, out_qual, out_inserts = result

    self.assertEqual('CAACCA', out_seq)
    self.assertEqual('DDDEEE', out_qual)
    self.assertEqual({0: ('GGG', 'HHH')}, out_inserts)
def testInsertionAfterClipping(self):
    # With the clip window ending at 2, the insertion that follows the
    # first three bases is expected to be left out of the inserts.
    raw_seq, raw_qual = "ACTTAGAAA", 'AAABBBDDD'
    first_pos = 0
    window = (0, 2)  # (clip_from, clip_to)

    result = apply_cigar('3M3I3M', raw_seq, raw_qual, first_pos, *window)
    out_seq, out_qual, out_inserts = result

    self.assertEqual('ACT', out_seq)
    self.assertEqual('AAA', out_qual)
    self.assertEqual({}, out_inserts)
def testInsertionAtEndOfClipping(self):
    # With the clip window ending at 3, the insertion after the first three
    # bases is expected to be reported at key 3.
    raw_seq, raw_qual = "ACTTAGAAA", 'AAABBBDDD'
    first_pos = 0
    window = (0, 3)  # (clip_from, clip_to)

    result = apply_cigar('3M3I3M', raw_seq, raw_qual, first_pos, *window)
    out_seq, out_qual, out_inserts = result

    self.assertEqual('ACTA', out_seq)
    self.assertEqual('AAAD', out_qual)
    self.assertEqual({3: ('TAG', 'BBB')}, out_inserts)
def testClipInsertionLowQuality(self):
    # An insertion inside the clip window (3..8) is expected to be reported
    # at key 3 with its qualities, including the '*' base.
    raw_seq, raw_qual = 'AAACAAGGGCCACCC', 'BBBDDDHH*EEEFFF'
    first_pos = 0
    window = (3, 8)  # (clip_from, clip_to)

    result = apply_cigar('6M3I6M', raw_seq, raw_qual, first_pos, *window)
    out_seq, out_qual, out_inserts = result

    self.assertEqual('CAACCA', out_seq)
    self.assertEqual('DDDEEE', out_qual)
    self.assertEqual({3: ('GGG', 'HH*')}, out_inserts)
def testClippingEverything(self):
    # A clip window (100..108) that lies entirely past the read is expected
    # to produce empty strings and no inserts.
    raw_seq, raw_qual = 'AAACAACCACCC', 'BBBDDDEEEFFF'
    first_pos = 0
    window = (100, 108)  # (clip_from, clip_to)

    result = apply_cigar('12M', raw_seq, raw_qual, first_pos, *window)
    out_seq, out_qual, out_inserts = result

    self.assertEqual('', out_seq)
    self.assertEqual('', out_qual)
    self.assertEqual({}, out_inserts)
def testInsertionAfterClipRegionWithOffset(self):
    # The read starts at position 10 and the clip window is 10..13; the
    # insertion falls after the window and is expected to be left out.
    raw_seq, raw_qual = 'TAGCTCAG', 'AAAAABCC'
    first_pos = 10
    window = (10, 13)  # (clip_from, clip_to)

    result = apply_cigar('5M1I2M', raw_seq, raw_qual, first_pos, *window)
    out_seq, out_qual, out_inserts = result

    self.assertEqual('TAGC', out_seq)
    self.assertEqual('AAAA', out_qual)
    self.assertEqual({}, out_inserts)
def testInsertionAfterDeletion(self):
    # A deletion followed by an insertion: the insertion is expected to be
    # reported at key 9, after the three bases and the three '-' characters.
    cigar = '3M3D3M3I3M'
    seq = 'TTTCCCAAATTT'
    quality = '111222333444'
    expected_seq = 'TTT---CCCTTT'
    # One space per deleted base keeps the expected quality the same length
    # as the expected sequence (12 characters). The earlier single-space
    # literal was only 10 characters long.
    # NOTE(review): confirm apply_cigar pads deletions with spaces.
    expected_quality = '111   222444'
    expected_inserts = {9: ('AAA', '333')}

    # Results get their own names instead of overwriting the inputs.
    clipped_seq, clipped_quality, inserts = apply_cigar(
        cigar,
        seq,
        quality)

    self.assertEqual(expected_seq, clipped_seq)
    self.assertEqual(expected_quality, clipped_quality)
    self.assertEqual(expected_inserts, inserts)
def testInsertionAfterInsertion(self):
    # Two insertions in one read are expected to be reported at keys 3 and
    # 6 and removed from the returned sequence and quality.
    raw_seq = 'TTTGGGCCCAAATTT'
    raw_qual = '111222333444555'
    want_seq = 'TTTCCCTTT'
    want_qual = '111333555'
    want_inserts = {3: ('GGG', '222'), 6: ('AAA', '444')}

    out_seq, out_qual, out_inserts = apply_cigar('3M3I3M3I3M',
                                                 raw_seq,
                                                 raw_qual)

    self.assertEqual(want_seq, out_seq)
    self.assertEqual(want_qual, out_qual)
    self.assertEqual(want_inserts, out_inserts)
def testInsertionAtEndOfClipping(self):
    # With the clip window ending at 3, the insertion after the first three
    # bases is expected to be reported at key 3.
    bases = "ACTTAGAAA"
    phred = 'AAABBBDDD'
    start, first_kept, last_kept = 0, 0, 3

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '3M3I3M', bases, phred, start, first_kept, last_kept)

    self.assertEqual('ACTA', actual_bases)
    self.assertEqual('AAAD', actual_phred)
    self.assertEqual({3: ('TAG', 'BBB')}, actual_inserts)
def testInsertionAfterClipping(self):
    # With the clip window ending at 2, the insertion that follows the
    # first three bases is expected to be left out of the inserts.
    bases = "ACTTAGAAA"
    phred = 'AAABBBDDD'
    start, first_kept, last_kept = 0, 0, 2

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '3M3I3M', bases, phred, start, first_kept, last_kept)

    self.assertEqual('ACT', actual_bases)
    self.assertEqual('AAA', actual_phred)
    self.assertEqual({}, actual_inserts)
def testClipInsertionLowQuality(self):
    # An insertion inside the clip window (3..8) is expected to be reported
    # at key 3 with its qualities, including the '*' base.
    bases = 'AAACAAGGGCCACCC'
    phred = 'BBBDDDHH*EEEFFF'
    start, first_kept, last_kept = 0, 3, 8

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '6M3I6M', bases, phred, start, first_kept, last_kept)

    self.assertEqual('CAACCA', actual_bases)
    self.assertEqual('DDDEEE', actual_phred)
    self.assertEqual({3: ('GGG', 'HH*')}, actual_inserts)
def testClippingEverything(self):
    # A clip window (100..108) that lies entirely past the read is expected
    # to produce empty strings and no inserts.
    bases = 'AAACAACCACCC'
    phred = 'BBBDDDEEEFFF'
    start, first_kept, last_kept = 0, 100, 108

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '12M', bases, phred, start, first_kept, last_kept)

    self.assertEqual('', actual_bases)
    self.assertEqual('', actual_phred)
    self.assertEqual({}, actual_inserts)
def testInsertionInsideClipRegionWithOffset(self):
    # The read starts at position 3 and the clip window is 4..20; the
    # insertion is expected to be reported at key 1.
    bases = 'TAGCT'
    phred = 'AABCC'
    start, first_kept, last_kept = 3, 4, 20

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '2M1I2M', bases, phred, start, first_kept, last_kept)

    self.assertEqual('ACT', actual_bases)
    self.assertEqual('ACC', actual_phred)
    self.assertEqual({1: ('G', 'B')}, actual_inserts)
def testInsertionBeforeClip(self):
    # The insertion sits right where the clip window (3..8) begins and is
    # expected to be reported at key 0.
    bases = 'AAAGGGCAACCACCC'
    phred = 'BBBHHHDDDEEEFFF'
    start, first_kept, last_kept = 0, 3, 8

    actual_bases, actual_phred, actual_inserts = apply_cigar(
        '3M3I9M', bases, phred, start, first_kept, last_kept)

    self.assertEqual('CAACCA', actual_bases)
    self.assertEqual('DDDEEE', actual_phred)
    self.assertEqual({0: ('GGG', 'HHH')}, actual_inserts)