def test_inversion_and_deletion(self):
    """Feed one forward and one reverse alignment of a contig to
    ``align.select_contig_alignments`` and expect two alignments to be
    recorded on that contig."""
    contig_seq = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'

    # Stand-in for an evidence object: same-chromosome breakpoints on
    # opposing strands, plus the contig-alignment thresholds.
    evidence = MockObject(
        interchromosomal=False,
        opposing_strands=True,
        break1=MockObject(orient=ORIENT.RIGHT, chr='3'),
        break2=MockObject(orient=ORIENT.RIGHT, chr='3'),
        contigs=[MockObject(seq=contig_seq, alignments=set())],
        standardize_read=lambda x: x,
        contig_aln_max_event_size=DEFAULTS.contig_aln_max_event_size,
        contig_aln_merge_inner_anchor=5,
        contig_aln_merge_outer_anchor=DEFAULTS.contig_aln_merge_outer_anchor,
        contig_aln_min_query_consumption=0.9,
        contig_aln_min_extend_overlap=DEFAULTS.contig_aln_min_extend_overlap,
        contig_aln_min_anchor_size=DEFAULTS.contig_aln_min_anchor_size,
        contig_aln_min_score=DEFAULTS.contig_aln_min_score,
        outer_window1=Interval(1000, 1200),
        outer_window2=Interval(2000, 2200),
        reference_genome=None,
        bam_cache=mock.Mock(stranded=False),
    )

    # Forward-strand alignment: soft-clipped head, 120 matching bases.
    forward_read = SamRead(
        reference_id=3,
        reference_start=1114,
        cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)],
        query_sequence=contig_seq,
        is_reverse=False,
        reference_name='3',
        alignment_rank=0,
    )
    # Reverse-strand alignment of the reverse complement, carrying a
    # single-base deletion.
    reverse_read = SamRead(
        reference_id=3,
        reference_start=2187,
        cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.EQ, 120)],
        query_sequence=reverse_complement(contig_seq),
        is_reverse=True,
        reference_name='3',
        alignment_rank=1,
    )

    alignments_by_seq = {contig_seq: [forward_read, reverse_read]}
    align.select_contig_alignments(evidence, alignments_by_seq)

    selected = list(evidence.contigs[0].alignments)
    self.assertEqual(2, len(selected))
def test_deletion_repeat(self):
    """A 27-base deletion is expected to be moved 30 positions to the right
    by ``hgvs_standardize_cigar``; the surrounding cigar operations stay
    untouched."""
    shift = 30
    qseq = (
        'GAGT'                    # 4S
        'GAGACTCTGT'              # 10=
        'GAA'                     # 3X
        'AAAGAAAAAAAAAA'          # 14=
        'A'                       # 1X
        'ATATATATATATATAAATATA'   # 21=
        'C'                       # 1X
        'ATATTATGTATCAAATATATAT'  # 22=
        # bases aligned after the deletion
        'TATGTGTAATATACATCATGTATCAAATATATATTATGTATAATATACATCATATATCAAATATATATTATGTG'
    )
    # deleted reference: TATGTGTAATATACATCATGTATCAAA
    print(qseq[:76], qseq[76:])

    input_cigar = [
        (CIGAR.S, 4),
        (CIGAR.EQ, 10),
        (CIGAR.X, 3),
        (CIGAR.EQ, 14),
        (CIGAR.X, 1),
        (CIGAR.EQ, 21),
        (CIGAR.X, 1),
        (CIGAR.EQ, 22),
        (CIGAR.D, 27),
        (CIGAR.EQ, 74),
    ]
    read = MockRead(
        'name',
        reference_name='11_86018001-86018500',
        reference_start=28,
        cigar=input_cigar,
        query_sequence=qseq,
    )

    # Same operations, with the deletion pushed `shift` bases rightwards.
    expected_cigar = [
        (CIGAR.S, 4),
        (CIGAR.EQ, 10),
        (CIGAR.X, 3),
        (CIGAR.EQ, 14),
        (CIGAR.X, 1),
        (CIGAR.EQ, 21),
        (CIGAR.X, 1),
        (CIGAR.EQ, 22 + shift),
        (CIGAR.D, 27),
        (CIGAR.EQ, 74 - shift),
    ]

    reference_seq = REFERENCE_GENOME[read.reference_name].seq
    std_cigar = hgvs_standardize_cigar(read, reference_seq)

    # Show the deleted sequence before and after standardization.
    print(SamRead.deletion_sequences(read, REFERENCE_GENOME))
    read.cigar = std_cigar
    print(SamRead.deletion_sequences(read, REFERENCE_GENOME))

    self.assertEqual(expected_cigar, std_cigar)
def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self):
    """A hard-clipped read with a mismatch next to interleaved D/I runs is
    expected to standardize to one insertion followed by one deletion.

    Input after the 10H clip:  4= 1X 2D 3I 2D 2I 6=
    Expected after the clip:   4= 6I 5D 6=
    """
    ref = (
        'ATAGGC'
        'ATCT'
        'ACGA'
        'ACGA'
        'ACGA'
        'GATCGCTACG'
    )
    # Alignment sketch (column alignment was lost in this copy of the file):
    # original
    #   ATAGGCATCTACG AA CGAACGAGATCGCTACG
    #   ATCTC TTT TTCGAACG
    # expected
    #   ATAGGCATCT ACGAACGAACGAGATCGCTACG
    #   ATCTCTTTTT CGAACG
    fragmented_cigar = [
        (CIGAR.H, 10),
        (CIGAR.EQ, 4),
        (CIGAR.X, 1),
        (CIGAR.D, 2),
        (CIGAR.I, 3),
        (CIGAR.D, 2),
        (CIGAR.I, 2),
        (CIGAR.EQ, 6),
    ]
    read = MockRead(
        'name',
        1,
        6,
        reference_name='1',
        query_sequence='ATCTCTTTTTCGAACG',
        cigar=fragmented_cigar,
    )

    # Debug output: deleted/inserted sequences and the query itself.
    print(SamRead.deletion_sequences(read, {'1': MockObject(seq=ref)}))
    print(SamRead.insertion_sequences(read))
    print(read.query_sequence, len(read.query_sequence))

    merged_cigar = [
        (CIGAR.H, 10),
        (CIGAR.EQ, 4),
        (CIGAR.I, 6),
        (CIGAR.D, 5),
        (CIGAR.EQ, 6),
    ]
    self.assertEqual(merged_cigar, hgvs_standardize_cigar(read, ref))
def test_shift_overaligned(self):
    """A read aligned as ``14=7D12=`` is expected to come back from
    ``TranscriptomeEvidence.standardize_read`` as ``12=7N14=`` once a
    transcript with exons (1, 12) and (20, 28) overlaps the evidence."""
    # qwertyuiopas---kkkkk------dfghjklzxcvbnm
    # .......... ................
    gene = Gene('1', 1, 1000, strand='+')
    pre_transcript = PreTranscript(exons=[(1, 12), (20, 28)], gene=gene, strand='+')
    for pattern in pre_transcript.generate_splicing_patterns():
        spliced = Transcript(pre_transcript, pattern)
        pre_transcript.transcripts.append(spliced)
    gene.transcripts.append(pre_transcript)

    read = SamRead(
        reference_name='1',
        reference_start=0,
        cigar=_cigar.convert_string_to_cigar('14=7D12='),
        query_sequence='qwertyuiopasdfghjklzxcvbnm',
    )

    mock_reference = {'1': MockObject(seq='qwertyuiopasdfkkkkkdfghjklzxcvbnm')}
    evidence = TranscriptomeEvidence(
        annotations={},
        reference_genome=mock_reference,
        bam_cache=MockObject(get_read_reference_name=lambda r: r.reference_name),
        break1=Breakpoint('1', 1, orient='L', strand='+'),
        break2=Breakpoint('1', 10, orient='R', strand='+'),
        read_length=75,
        stdev_fragment_size=75,
        median_fragment_size=220,
    )
    evidence.overlapping_transcripts.add(pre_transcript)

    shifted = evidence.standardize_read(read)
    expected = _cigar.convert_string_to_cigar('12=7N14=')
    assert shifted.cigar == expected
def test_insertions(self):
    """``SamRead.insertion_sequences`` should return the inserted query
    substrings in read order for a cigar containing two insertions."""
    read = MockRead(
        reference_start=0,
        reference_name='1',
        query_sequence='abcdekkkfghijklmnopqkkkkrstuvwxyz',
        cigar=convert_string_to_cigar('5=3I12=4I9='),
    )
    expected_inserts = ['kkk', 'kkkk']
    self.assertEqual(expected_inserts, SamRead.insertion_sequences(read))
def test_deletions(self):
    """``SamRead.deletion_sequences`` should return the deleted reference
    substrings for a cigar containing two deletions, using the fixture
    genome stored on ``self.reference_genome``."""
    read = MockRead(
        reference_start=0,
        reference_name='1',
        query_sequence='',
        cigar=convert_string_to_cigar('2=3D8=4D9='),
    )
    expected_deletions = ['cde', 'nopq']
    observed = SamRead.deletion_sequences(read, self.reference_genome)
    self.assertEqual(expected_deletions, observed)
def test_odd_deletion_in_repeat(self):
    """A 3-base deletion reported as ``4S13=3D63=`` is expected to be
    standardized to ``4S24=3D52=`` by ``hgvs_standardize_cigar``."""
    rseq = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    qseq = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATA'
        'ATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    print(len(qseq) - 28)

    read = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=convert_string_to_cigar('4S13=3D63='),
        query_sequence=qseq,
    )
    genome = {'1': MockObject(seq=rseq)}
    expected_cigar = convert_string_to_cigar('4S24=3D52=')

    standardized = hgvs_standardize_cigar(read, rseq)

    # Show the deleted sequence before and after standardization.
    print(SamRead.deletion_sequences(read, genome))
    read.cigar = standardized
    print(SamRead.deletion_sequences(read, genome))

    self.assertEqual(expected_cigar, standardized)
def test_complex(self):
    """A 26-base insertion is expected to move three bases to the right
    (``49=`` / ``18=`` becoming ``52=`` / ``15=``) while every other cigar
    operation stays as it was."""
    qseq = (
        'TATTTGGAAATATTTGTAAGATAGATGTCTCTG'
        'C'
        'CTCCTTCTGTTTCTGTCTCTGTCTCTTGCACTCTCTCTCTCCCTCTCTT'
        'TCTCTCTCTCTCTCTCTCTCTCTCTC'
        'TCTATATATATATATATA'
        'T'
        'A'
        'T'
        'C'
        'T'
        'ACACACACACACACACAC'
    )
    rseq = (
        'TATTTGGAAATATTTGTAAGATAGATGTCTCTG'
        'T'
        'CTCCTTCTGTTTCTGTCTCTGTCTCTTGCACTCTCTCTCTCCCTCTCTT'
        'TCTATATATATATATATA'
        'C'
        'A'
        'C'
        'ACACACACACACACACAC'
    )
    input_cigar = [
        (CIGAR.EQ, 33),
        (CIGAR.X, 1),
        (CIGAR.EQ, 49),
        (CIGAR.I, 26),
        (CIGAR.EQ, 18),
        (CIGAR.X, 1),
        (CIGAR.EQ, 1),
        (CIGAR.I, 1),
        (CIGAR.EQ, 1),
        (CIGAR.I, 1),
        (CIGAR.EQ, 18),
    ]
    read = MockRead(
        'name',
        reference_name='mock',
        reference_start=0,
        query_sequence=qseq,
        cigar=input_cigar,
    )

    # Debug output: reference, then the query with the large insertion and
    # the following two bases cut out, then the raw query and its inserts.
    print(rseq)
    query = read.query_sequence
    print(query[:83], query[83 + 26: 83 + 26 + 20], query[83 + 26 + 22:])
    print(read.query_sequence)
    print(SamRead.insertion_sequences(read))

    new_cigar = [
        (CIGAR.EQ, 33),
        (CIGAR.X, 1),
        (CIGAR.EQ, 52),
        (CIGAR.I, 26),
        (CIGAR.EQ, 15),
        (CIGAR.X, 1),
        (CIGAR.EQ, 1),
        (CIGAR.I, 1),
        (CIGAR.EQ, 1),
        (CIGAR.I, 1),
        (CIGAR.EQ, 18),
    ]
    std_cigar = hgvs_standardize_cigar(read, rseq)
    print(new_cigar)
    print(std_cigar)
    self.assertEqual(new_cigar, std_cigar)
def test_bwa_mem(self):
    """Standardizing a ``183=12D19=`` read through
    ``Evidence.standardize_read`` is expected to give ``186=12D16=`` and
    leave the reference start unchanged."""
    # Trace recorded with the original case:
    #   SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
    #   std SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
    #   > BPP(Breakpoint(1:224646893L-), Breakpoint(1:224646906R-), opposing=False, seq='')
    cigar_string = '183=12D19='
    query = 'TCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGTGTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCTTTCTTCCTTCTACTGCTTCAGCAGACATCATGTG'

    read = SamRead(reference_name='1')
    read.query_sequence = query
    read.reference_start = 224646710
    read.reference_id = 0
    print(_cigar.convert_string_to_cigar(cigar_string))
    read.cigar = _cigar.join(_cigar.convert_string_to_cigar(cigar_string))
    read.query_name = 'name'
    read.mapping_quality = NA_MAPPING_QUALITY

    # Call the method unbound so the mock fixture can stand in for `self`.
    std_read = Evidence.standardize_read(self.mock_evidence, read)
    print(SamRead.__repr__(read))
    print(SamRead.__repr__(std_read))

    expected_cigar = _cigar.convert_string_to_cigar('186=12D16=')
    self.assertEqual(expected_cigar, std_read.cigar)
    self.assertEqual(read.reference_start, std_read.reference_start)
def test_deletions(self):
    """``SamRead.deletion_sequences`` should return the deleted reference
    substrings for a cigar containing two deletions over an alphabet
    reference."""
    read = MockRead(
        reference_start=0,
        reference_name='1',
        query_sequence='',
        cigar=convert_string_to_cigar('2=3D8=4D9='),
    )
    genome = {'1': MockObject(seq='abcdefghijklmnopqrstuvwxyz')}
    exp = ['cde', 'nopq']

    observed = SamRead.deletion_sequences(read, genome)
    assert observed == exp
def test_even_deletion_in_repeat(self):
    """A 2-base deletion reported as ``4S13=2D64=`` is expected to be
    standardized to ``4S24=2D53=`` by ``hgvs_standardize_cigar``."""
    rseq = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    qseq = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATA'
        'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    print(len(qseq) - 28)

    read = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=convert_string_to_cigar('4S13=2D64='),
        query_sequence=qseq,
    )
    genome = {'1': MockObject(seq=rseq)}
    expected_cigar = convert_string_to_cigar('4S24=2D53=')

    standardized = hgvs_standardize_cigar(read, rseq)

    # Show the deleted sequence before and after standardization.
    print(SamRead.deletion_sequences(read, genome))
    read.cigar = standardized
    print(SamRead.deletion_sequences(read, genome))

    assert standardized == expected_cigar
def test_shift_no_transcripts(self):
    """No transcript is registered on the evidence here, so
    ``exon_boundary_shift_cigar`` is expected to return the read's cigar
    unchanged."""
    cigar_string = '14=7D18='
    read = SamRead(
        reference_name='1',
        reference_start=0,
        cigar=_cigar.convert_string_to_cigar(cigar_string),
        query_sequence='qwertyuiopasdfdfghjklzxcvbnm',
    )
    mock_reference = {'1': MockObject(seq='qwertyuiopasdfkkkkkdfghjklzxcvbnm')}
    evidence = TranscriptomeEvidence(
        annotations={},
        reference_genome=mock_reference,
        bam_cache=None,
        break1=Breakpoint('1', 1, orient='L', strand='+'),
        break2=Breakpoint('1', 10, orient='R', strand='+'),
        read_length=75,
        stdev_fragment_size=75,
        median_fragment_size=220,
    )

    shifted_cigar = evidence.exon_boundary_shift_cigar(read)
    self.assertEqual(_cigar.convert_string_to_cigar(cigar_string), shifted_cigar)
def test_bwa_mem(self):
    """Standardizing a ``183=12D19=`` read through
    ``Evidence.standardize_read`` is expected to give ``186=12D16=`` and
    leave the reference start unchanged.

    The reference is a ``MockLongString`` built with ``offset=224646450``
    so the read can keep its genomic ``reference_start``.
    """
    mock_evidence = MockObject(
        reference_genome={
            '1': MockObject(
                seq=MockLongString(
                    'TGGGTATCAGACACACTGGGTAGCTGAGTGCTCAGAGGAAGATGCGAGGTATTCAGGGAAAGTGTCAGTGGGGTCTCCCAGTGCCTGTTTGGTCCACAGTTAGGAGA'
                    'GGCCCTGCTTGCACTTCTAATACAGTCCCGGAAAGACGGGGCCAGAACTTAGGAGGGGAGCGCTTTGCAGCAACTTTTCAAGAAAAGGGGAAAATTTAAGCACCATA'
                    'CTGTTATGTGGTCCTTGTACCCAGAGGCCCTGTTCAGCTCCAGTGATCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGT'
                    'GTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCT'
                    'TTCTTCCTTCTACTGCTTAGATCAAGTCTTCAGCAGACATCATGTGACCTTGAGGATGGATGTCACATGCTGGAGGAAACAGAAGGCCGAAACCCTGATGACTTCAC'
                    'AGAGCTGCCAAAACAGTTCCTGACTGTTTATTCCGGGTCTTTAACAAAGTGATGAAAAGAAATCCTTGCAGTATGAAAACAACTTTTCTATTCCATGGAGCCAAACC'
                    'TCATTATAACAGATAACGTGACCCTCAGCGATATCCCAAGTATTTTCCTGTTCTCATCTATACTATGGCAAAGGGGCAAATACCTCTCAGTAAAGAAAGAAATAACA'
                    'ACTTCTATCTTGGGCGAGGCATTTCTTCTGTTAGAACTTTGTACACGGAATAAAATAGATCTGTTTGTGCTTATCTTTCTCCTTAGAATTATTGAATTTGAAGTCTT'
                    'TCCCAGGGTGGGGGTGGAGTGAAGCTGGGGTTTCATAAGCACATAGATAGTAGTG',
                    offset=224646450,
                )
            )
        },
        bam_cache=MockObject(get_read_reference_name=lambda x: x.reference_name),
        config={
            # DEFAULTS must be unpacked first: in a dict display later keys
            # win, so unpacking it last would overwrite the two
            # test-specific anchor values whenever DEFAULTS has those keys.
            **DEFAULTS,
            'validate.contig_aln_merge_inner_anchor': 10,
            'validate.contig_aln_merge_outer_anchor': 20,
        },
    )
    # Trace recorded with the original case:
    #   SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
    #   std SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
    #   > BPP(Breakpoint(1:224646893L-), Breakpoint(1:224646906R-), opposing=False, seq='')
    read = SamRead(reference_name='1')
    read.query_sequence = 'TCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGTGTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCTTTCTTCCTTCTACTGCTTCAGCAGACATCATGTG'
    read.reference_start = 224646710
    read.reference_id = 0
    print(_cigar.convert_string_to_cigar('183=12D19='))
    read.cigar = _cigar.join(_cigar.convert_string_to_cigar('183=12D19='))
    read.query_name = 'name'
    read.mapping_quality = NA_MAPPING_QUALITY

    # Call the method unbound so the mock object can stand in for `self`.
    std_read = Evidence.standardize_read(mock_evidence, read)

    assert std_read.cigar == _cigar.convert_string_to_cigar('186=12D16=')
    assert std_read.reference_start == read.reference_start
def test_hardclipping(self):
    """A reverse-strand read with cigar ``12=1D25=113H`` should yield one
    event from ``align.call_read_events``, with breakpoints matching the
    expected pair on chromosome 15."""
    read = SamRead(reference_name='15')
    read.reference_start = 71491944
    read.cigar = _cigar.convert_string_to_cigar('12=1D25=113H')
    read.query_sequence = 'GTGTGTGGTGTGGGGTGTGTGGTGTGTGTGGTGTGTG'
    read.is_reverse = True

    left_break = Breakpoint('15', 71491956, orient='L', strand='-')
    right_break = Breakpoint('15', 71491958, orient='R', strand='-')
    expected_bpp = BreakpointPair(left_break, right_break, untemplated_seq='')

    called = align.call_read_events(read, is_stranded=True)

    self.assertEqual(1, len(called))
    self.assertEqual(expected_bpp.break1, called[0].break1)
    self.assertEqual(expected_bpp.break2, called[0].break2)