def test_shift_overaligned(self):
    """Standardizing an over-aligned read rewrites its cigar around the intron.

    The read is aligned as '14=7D12=' against a two-exon transcript
    ((1, 12) and (20, 28)); after ``standardize_read`` the cigar is
    expected to be '12=7N14='.
    """
    # qwertyuiopas---kkkkk------dfghjklzxcvbnm
    # .......... ................
    parent_gene = Gene('1', 1, 1000, strand='+')
    pre_transcript = PreTranscript(exons=[(1, 12), (20, 28)], gene=parent_gene, strand='+')
    splicing_patterns = pre_transcript.generate_splicing_patterns()
    for pattern in splicing_patterns:
        spliced = Transcript(pre_transcript, pattern)
        pre_transcript.transcripts.append(spliced)
    parent_gene.transcripts.append(pre_transcript)

    input_cigar = _cigar.convert_string_to_cigar('14=7D12=')
    read = SamRead(
        reference_name='1',
        reference_start=0,
        cigar=input_cigar,
        query_sequence='qwertyuiopasdfghjklzxcvbnm',
    )

    # Minimal stand-ins for the reference genome and the bam cache.
    mock_genome = {'1': MockObject(seq='qwertyuiopasdfkkkkkdfghjklzxcvbnm')}
    mock_cache = MockObject(get_read_reference_name=lambda r: r.reference_name)
    first_break = Breakpoint('1', 1, orient='L', strand='+')
    second_break = Breakpoint('1', 10, orient='R', strand='+')
    evidence = TranscriptomeEvidence(
        annotations={},
        reference_genome=mock_genome,
        bam_cache=mock_cache,
        break1=first_break,
        break2=second_break,
        read_length=75,
        stdev_fragment_size=75,
        median_fragment_size=220,
    )
    evidence.overlapping_transcripts.add(pre_transcript)

    standardized = evidence.standardize_read(read)
    assert standardized.cigar == _cigar.convert_string_to_cigar('12=7N14=')
def build_transcript(gene, exons, cds_start, cds_end, domains, strand=None, is_best_transcript=False, name=None):
    """Build a pre-transcript with one spliced transcript and translation per splicing pattern.

    Args:
        gene: the parent gene, or None. When given, the new pre-transcript is
            appended to ``gene.unspliced_transcripts``.
        exons: exons passed straight through to ``PreTranscript``.
        cds_start: start of the coding sequence, passed to ``Translation``.
        cds_end: end of the coding sequence, passed to ``Translation``.
        domains: domains passed to ``Translation``.
        strand: explicit strand. When None, the strand is taken from
            ``gene.get_strand()`` if a gene was given.
        is_best_transcript: passed through to ``PreTranscript``.
        name: passed through to ``PreTranscript``.

    Returns:
        The newly created ``PreTranscript``.
    """
    pre_transcript_kwargs = {
        'gene': gene,
        'is_best_transcript': is_best_transcript,
        'name': name,
    }
    # Resolve the strand only from sources that exist. Previously a call with
    # gene=None and strand=None raised AttributeError on gene.get_strand(),
    # even though gene=None is explicitly tolerated just below.
    if strand is not None:
        pre_transcript_kwargs['strand'] = strand
    elif gene is not None:
        pre_transcript_kwargs['strand'] = gene.get_strand()
    # Otherwise leave 'strand' out so PreTranscript applies its own default.

    pre_transcript = PreTranscript(exons, **pre_transcript_kwargs)
    if gene is not None:
        gene.unspliced_transcripts.append(pre_transcript)

    for spl in pre_transcript.generate_splicing_patterns():
        spliced = Transcript(pre_transcript, spl)
        pre_transcript.spliced_transcripts.append(spliced)

        translation = Translation(cds_start, cds_end, spliced, domains=domains)
        spliced.translations.append(translation)

    return pre_transcript
def test_net_zero(self):
    """Net size is -200 without a distance function and 0 with the transcriptome distance."""
    exons = [(1001, 1100), (1301, 1400), (1701, 1800)]
    pre_transcript = PreTranscript(exons, strand=STRAND.POS)
    splicing_patterns = pre_transcript.generate_splicing_patterns()
    for pattern in splicing_patterns:
        pre_transcript.transcripts.append(Transcript(pre_transcript, pattern))

    # Mock carrying only the attributes set here, standing in for a TranscriptomeEvidence.
    mock_evidence = MockObject(
        annotations={},
        read_length=100,
        max_expected_fragment_size=550,
        call_error=11,
        overlapping_transcripts={pre_transcript},
    )
    mock_evidence._select_transcripts = lambda *_positions: mock_evidence.overlapping_transcripts
    mock_evidence.distance = partial(TranscriptomeEvidence.distance, mock_evidence)

    left_break = Breakpoint('1', 1099, orient=ORIENT.LEFT)
    right_break = Breakpoint('1', 1302, orient=ORIENT.RIGHT)
    bpp = BreakpointPair(left_break, right_break, untemplated_seq='TT')

    transcriptome_distance = partial(TranscriptomeEvidence.distance, mock_evidence)
    assert bpp.net_size() == Interval(-200)
    assert bpp.net_size(transcriptome_distance) == Interval(0)
class TestNetSizeTrans(unittest.TestCase):
    """Checks BreakpointPair.net_size with and without the transcriptome distance function."""

    def setUp(self):
        # Three-exon transcript on the positive strand, spliced with every generated pattern.
        exons = [(1001, 1100), (1301, 1400), (1701, 1800)]
        self.transcript = PreTranscript(exons, strand=STRAND.POS)
        splicing_patterns = self.transcript.generate_splicing_patterns()
        for pattern in splicing_patterns:
            spliced = Transcript(self.transcript, pattern)
            self.transcript.transcripts.append(spliced)

        # Mock evidence object exposing only the attributes set below.
        self.trans_evidence = MockObject(
            annotations={},
            read_length=100,
            max_expected_fragment_size=550,
            call_error=11,
            overlapping_transcripts={self.transcript},
        )
        self.trans_evidence._select_transcripts = (
            lambda *_positions: self.trans_evidence.overlapping_transcripts
        )
        self.trans_evidence.distance = partial(
            TranscriptomeEvidence.distance, self.trans_evidence
        )

    def test_net_zero(self):
        # Net size is -200 by default and 0 when measured with the transcriptome distance.
        left_break = Breakpoint('1', 1099, orient=ORIENT.LEFT)
        right_break = Breakpoint('1', 1302, orient=ORIENT.RIGHT)
        bpp = BreakpointPair(left_break, right_break, untemplated_seq='TT')
        transcriptome_distance = partial(TranscriptomeEvidence.distance, self.trans_evidence)
        self.assertEqual(Interval(-200), bpp.net_size())
        self.assertEqual(Interval(0), bpp.net_size(transcriptome_distance))
def test_empty_intron(self, distance_setup):
    """Distance from 1001 to 2301 is Interval(400, 400) once a transcript with abutting exons is added."""
    evidence = distance_setup.trans_evidence

    # The last two exons abut (2200 / 2201), leaving no bases between them.
    exons = [(1001, 1100), (1501, 1600), (2001, 2200), (2201, 2300)]
    t2 = PreTranscript(exons, strand='+')
    splicing_patterns = t2.generate_splicing_patterns()
    for pattern in splicing_patterns:
        t2.transcripts.append(Transcript(t2, pattern))

    print(t2)
    print(evidence.overlapping_transcripts)
    evidence.overlapping_transcripts.add(t2)

    observed = evidence.distance(1001, 2301)
    assert observed == Interval(400, 400)
class TestDistance(unittest.TestCase):
    """Checks TranscriptomeEvidence.distance against a mock evidence object with one four-exon transcript."""

    def setUp(self):
        # Four-exon transcript on the '+' strand, spliced with every generated pattern.
        exons = [(1001, 1100), (1501, 1600), (2001, 2100), (2201, 2300)]
        self.transcript = PreTranscript(exons, strand='+')
        splicing_patterns = self.transcript.generate_splicing_patterns()
        for pattern in splicing_patterns:
            spliced = Transcript(self.transcript, pattern)
            self.transcript.transcripts.append(spliced)

        # Mock evidence object exposing only the attributes set below.
        self.trans_evidence = MockObject(
            annotations={},
            read_length=100,
            max_expected_fragment_size=550,
            call_error=11,
            overlapping_transcripts={self.transcript},
        )
        self.trans_evidence._select_transcripts = (
            lambda *_positions: self.trans_evidence.overlapping_transcripts
        )
        self.trans_evidence.distance = partial(
            TranscriptomeEvidence.distance, self.trans_evidence
        )

    def test_exonic(self):
        observed = self.trans_evidence.distance(1001, 1550)
        self.assertEqual(Interval(149), observed)

    def test_intergenic_exonic(self):
        observed = self.trans_evidence.distance(101, 1550)
        self.assertEqual(Interval(1049, 1049), observed)

    def test_intergenic_intergenic(self):
        observed = self.trans_evidence.distance(101, 300)
        self.assertEqual(Interval(199), observed)

    def test_aligned_intronic(self):
        observed = self.trans_evidence.distance(1102, 1499)
        self.assertEqual(Interval(5), observed)

    def test_indel_at_exon_boundary(self):
        observed = self.trans_evidence.distance(1101, 1501)
        self.assertEqual(Interval(2), observed)

    def test_no_annotations(self):
        # An explicit empty list is passed as the third positional argument.
        observed = self.trans_evidence.distance(101, 300, [])
        self.assertEqual(Interval(199), observed)

    def test_intergenic_intronic(self):
        observed = self.trans_evidence.distance(101, 1400)
        self.assertEqual(Interval(1101), observed)

    def test_empty_intron(self):
        # The last two exons abut (2200 / 2201), leaving no bases between them.
        exons = [(1001, 1100), (1501, 1600), (2001, 2200), (2201, 2300)]
        t2 = PreTranscript(exons, strand='+')
        splicing_patterns = t2.generate_splicing_patterns()
        for pattern in splicing_patterns:
            t2.transcripts.append(Transcript(t2, pattern))

        print(t2)
        print(self.trans_evidence.overlapping_transcripts)
        self.trans_evidence.overlapping_transcripts.add(t2)

        observed = self.trans_evidence.distance(1001, 2301)
        self.assertEqual(Interval(400, 400), observed)
def test_multiple_transcripts(self, trans_window_setup):
    """Window for a breakpoint at 1150 spans Interval(1040, 3160) with two transcripts."""
    # [(1001, 1100), (1401, 1500), (1701, 1750), (3001, 4000)])
    right_break = Breakpoint(chr='1', start=1150, orient=ORIENT.RIGHT)

    # Attach a second pre-transcript to the first annotated gene on chromosome '1'.
    gene = trans_window_setup.annotations['1'][0]
    second_exons = [(1001, 1100), (1200, 1300), (2100, 2200)]
    t2 = PreTranscript(gene=gene, exons=second_exons)
    splicing_patterns = t2.generate_splicing_patterns()
    for pattern in splicing_patterns:
        t2.transcripts.append(Transcript(t2, pattern))
    gene.transcripts.append(t2)

    # 989 - 2561
    # 989 - 3411
    candidates = [trans_window_setup.pre_transcript, t2]
    window = transcriptome_window(trans_window_setup.trans_evidence, right_break, candidates)
    assert window == Interval(1040, 3160)
def test_many_small_exons(self):
    # Window for a left-oriented breakpoint in the last of nine exons.
    exons = [
        (17271277, 17271984),
        (17272649, 17272709),
        (17275586, 17275681),
        (17275769, 17275930),
        (17276692, 17276817),
        (17277168, 17277388),  # 220
        (17277845, 17277888),  # 44
        (17278293, 17278378),  # 86
        (17279229, 17279592),  # 364
    ]
    g = Gene('fake', 17271277, 17279592, strand='+')
    pre_transcript = PreTranscript(gene=g, exons=exons)
    g.transcripts.append(pre_transcript)

    splicing_patterns = pre_transcript.generate_splicing_patterns()
    for pattern in splicing_patterns:
        pre_transcript.transcripts.append(Transcript(pre_transcript, pattern))

    left_break = Breakpoint(chr='fake', start=17279591, orient=ORIENT.LEFT)
    window = self.transcriptome_window(left_break, [pre_transcript])
    self.assertEqual(Interval(17277321, 17279701), window)
def test_single_exon(self):
    """A single-exon transcript yields exactly one empty splicing pattern of NORMAL type."""
    single_exon_transcript = PreTranscript([(3, 4)], strand=STRAND.POS)
    patterns = single_exon_transcript.generate_splicing_patterns()
    assert len(patterns) == 1
    only_pattern = patterns[0]
    assert len(only_pattern) == 0
    assert patterns[0].splice_type == SPLICE_TYPE.NORMAL
class TestTranscriptomeEvidenceWindow(unittest.TestCase):
    """Compares TranscriptomeEvidence.generate_window with GenomeEvidence.generate_window on mock evidence."""

    def setUp(self):
        gene = Gene('1', 1, 9999, name='KRAS', strand=STRAND.POS)
        exons = [(1001, 1100), (1401, 1500), (1701, 1750), (3001, 4000)]
        self.pre_transcript = PreTranscript(gene=gene, exons=exons)
        gene.unspliced_transcripts.append(self.pre_transcript)
        splicing_patterns = self.pre_transcript.generate_splicing_patterns()
        for pattern in splicing_patterns:
            spliced = Transcript(self.pre_transcript, pattern)
            self.pre_transcript.transcripts.append(spliced)

        self.annotations = {gene.chr: [gene]}

        # Mock evidence objects exposing only the attributes set below.
        self.genome_evidence = MockObject(
            annotations={},
            read_length=100,
            max_expected_fragment_size=550,
            call_error=11,
        )
        self.trans_evidence = MockObject(
            annotations={},
            read_length=100,
            max_expected_fragment_size=550,
            call_error=11,
            overlapping_transcripts={self.pre_transcript},
        )
        self.trans_evidence._select_transcripts = (
            lambda *_positions: self.trans_evidence.overlapping_transcripts
        )
        self.trans_evidence.traverse = partial(
            TranscriptomeEvidence.traverse, self.trans_evidence
        )

    def transcriptome_window(self, breakpoint, transcripts=None):
        # Optionally register extra transcripts on the mock before computing the window.
        if transcripts:
            self.trans_evidence.overlapping_transcripts.update(transcripts)
        window = TranscriptomeEvidence.generate_window(self.trans_evidence, breakpoint)
        return window

    def genome_window(self, breakpoint):
        window = GenomeEvidence.generate_window(self.genome_evidence, breakpoint)
        return window

    def test_before_start(self):
        # Both breakpoints are expected to give the same window as the genome evidence.
        for start in (100, 500):
            b = Breakpoint(chr='1', start=start, orient=ORIENT.RIGHT)
            self.assertEqual(self.genome_window(b), self.transcriptome_window(b))

    def test_after_end(self):
        b = Breakpoint(chr='1', start=6000, orient=ORIENT.RIGHT)
        genome = self.genome_window(b)
        self.assertEqual(genome, self.transcriptome_window(b))

    def test_exonic_long_exon(self):
        b = Breakpoint(chr='1', start=3200, orient=ORIENT.RIGHT)
        genome = self.genome_window(b)
        self.assertEqual(genome, self.transcriptome_window(b))

    def test_intronic_long_exon(self):
        b = Breakpoint(chr='1', start=2970, orient=ORIENT.RIGHT)
        genome = self.genome_window(b)
        self.assertEqual(genome, self.transcriptome_window(b))

    def test_intronic_long_intron(self):
        b = Breakpoint(chr='1', start=1800, orient=ORIENT.RIGHT)
        print(self.genome_window(b))
        window = self.transcriptome_window(b)
        self.assertEqual(Interval(1490, 2360), window)

    def test_intronic_short_exon_right(self):
        b = Breakpoint(chr='1', start=1690, orient=ORIENT.RIGHT)
        print(self.genome_window(b))
        window = self.transcriptome_window(b)
        self.assertEqual(Interval(1580, 3500), window)

    def test_intronic_short_exon_left(self):
        b = Breakpoint(chr='1', start=2200, orient=ORIENT.LEFT)
        window = self.transcriptome_window(b)
        self.assertEqual(Interval(1440, 2310), window)

    def test_multiple_transcripts(self):
        # [(1001, 1100), (1401, 1500), (1701, 1750), (3001, 4000)])
        b = Breakpoint(chr='1', start=1150, orient=ORIENT.RIGHT)
        gene = self.annotations['1'][0]
        second_exons = [(1001, 1100), (1200, 1300), (2100, 2200)]
        t2 = PreTranscript(gene=gene, exons=second_exons)
        splicing_patterns = t2.generate_splicing_patterns()
        for pattern in splicing_patterns:
            t2.transcripts.append(Transcript(t2, pattern))
        gene.transcripts.append(t2)
        # 989 - 2561
        # 989 - 3411
        window = self.transcriptome_window(b, [self.pre_transcript, t2])
        self.assertEqual(Interval(1040, 3160), window)

    def test_many_small_exons(self):
        exons = [
            (17271277, 17271984),
            (17272649, 17272709),
            (17275586, 17275681),
            (17275769, 17275930),
            (17276692, 17276817),
            (17277168, 17277388),  # 220
            (17277845, 17277888),  # 44
            (17278293, 17278378),  # 86
            (17279229, 17279592),  # 364
        ]
        g = Gene('fake', 17271277, 17279592, strand='+')
        pre_transcript = PreTranscript(gene=g, exons=exons)
        g.transcripts.append(pre_transcript)
        splicing_patterns = pre_transcript.generate_splicing_patterns()
        for pattern in splicing_patterns:
            pre_transcript.transcripts.append(Transcript(pre_transcript, pattern))
        b = Breakpoint(chr='fake', start=17279591, orient=ORIENT.LEFT)
        window = self.transcriptome_window(b, [pre_transcript])
        self.assertEqual(Interval(17277321, 17279701), window)
class TestTraverseTransRev(unittest.TestCase):
    """Checks TranscriptomeEvidence.traverse on a three-exon transcript on the negative strand."""

    def setUp(self):
        exons = [(1001, 1100), (1301, 1400), (1701, 1800)]
        self.transcript = PreTranscript(exons, strand=STRAND.NEG)
        splicing_patterns = self.transcript.generate_splicing_patterns()
        for pattern in splicing_patterns:
            spliced = Transcript(self.transcript, pattern)
            self.transcript.transcripts.append(spliced)

        # Mock evidence object exposing only the attributes set below.
        self.trans_evidence = MockObject(
            annotations={},
            read_length=100,
            max_expected_fragment_size=550,
            call_error=11,
            overlapping_transcripts={self.transcript},
        )
        self.trans_evidence._select_transcripts = (
            lambda *_positions: self.trans_evidence.overlapping_transcripts
        )
        self.trans_evidence.traverse = partial(
            TranscriptomeEvidence.traverse, self.trans_evidence
        )

    def test_left_before_transcript(self):
        # The result is also compared with the genome traversal for the same arguments.
        gpos = self.trans_evidence.traverse(900, 500 - 1, ORIENT.LEFT)
        self.assertEqual(Interval(401), gpos)
        genome_pos = GenomeEvidence.traverse(900, 500 - 1, ORIENT.LEFT)
        self.assertEqual(gpos, genome_pos)

    def test_left_after_transcript(self):
        gpos = self.trans_evidence.traverse(2200, 100, ORIENT.LEFT)
        genome_pos = GenomeEvidence.traverse(2200, 100, ORIENT.LEFT)
        self.assertEqual(gpos, genome_pos)
        self.assertEqual(Interval(2100), gpos)

    def test_left_after_transcript2(self):
        gpos = self.trans_evidence.traverse(1900, 500 - 1, ORIENT.LEFT)
        expected = Interval(901)
        self.assertEqual(expected, gpos)

    def test_left_within_transcript_exonic(self):
        gpos = self.trans_evidence.traverse(1750, 200 - 1, ORIENT.LEFT)
        expected = Interval(1051)
        self.assertEqual(expected, gpos)

    def test_left_within_exon(self):
        gpos = self.trans_evidence.traverse(1750, 20 - 1, ORIENT.LEFT)
        self.assertEqual(1731, gpos.start)
        self.assertEqual(1731, gpos.end)

    def test_left_within_transcript_intronic(self):
        gpos = self.trans_evidence.traverse(1600, 150 - 1, ORIENT.LEFT)
        expected = Interval(1451)
        self.assertEqual(expected, gpos)

    def test_right_before_transcript(self):
        gpos = self.trans_evidence.traverse(500, 100 - 1, ORIENT.RIGHT)
        expected = Interval(599)
        self.assertEqual(expected, gpos)

    def test_right_before_transcript2(self):
        gpos = self.trans_evidence.traverse(901, 500 - 1, ORIENT.RIGHT)
        expected = Interval(1900)
        self.assertEqual(expected, gpos)

    def test_right_after_transcript(self):
        gpos = self.trans_evidence.traverse(2201, 100 - 1, ORIENT.RIGHT)
        expected = Interval(2300)
        self.assertEqual(expected, gpos)

    def test_right_within_transcript(self):
        gpos = self.trans_evidence.traverse(1351, 100 - 1, ORIENT.RIGHT)
        expected = Interval(1750)
        self.assertEqual(expected, gpos)

    def test_right_within_exon(self):
        gpos = self.trans_evidence.traverse(1351, 10 - 1, ORIENT.RIGHT)
        expected = Interval(1360)
        self.assertEqual(expected, gpos)
def test_single_exon(self):
    # A single-exon transcript yields exactly one empty splicing pattern of NORMAL type.
    single_exon_transcript = PreTranscript([(3, 4)], strand=STRAND.POS)
    patterns = single_exon_transcript.generate_splicing_patterns()
    self.assertEqual(1, len(patterns))
    only_pattern = patterns[0]
    self.assertEqual(0, len(only_pattern))
    self.assertEqual(SPLICE_TYPE.NORMAL, patterns[0].splice_type)
class TestSplicingPatterns(unittest.TestCase):
    """Checks PreTranscript.generate_splicing_patterns when selected splice sites are marked not intact."""

    def setUp(self):
        self.setup_by_strand(STRAND.POS)

    def setup_by_strand(self, strand):
        # Build six exons and the pre-transcript on the requested strand.
        self.ex1 = Exon(100, 199, strand=strand)  # C
        self.ex2 = Exon(500, 599, strand=strand)  # G
        self.ex3 = Exon(1200, 1299, strand=strand)  # T
        self.ex4 = Exon(1500, 1599, strand=strand)  # C
        self.ex5 = Exon(1700, 1799, strand=strand)  # G
        self.ex6 = Exon(2000, 2099, strand=strand)  # C
        # introns: 99, 300, 600, 200, 100, ...
        segments = [
            'a' * 99, 'C' * 100,
            'a' * 300, 'G' * 100,
            'a' * 600, 'T' * 100,
            'a' * 200, 'C' * 100,
            'a' * 100, 'G' * 100,
            'a' * 200, 'C' * 100,
        ]
        self.reference_sequence = ''.join(segments)
        all_exons = [self.ex1, self.ex2, self.ex3, self.ex4, self.ex5, self.ex6]
        self.pre_transcript = PreTranscript(exons=all_exons, strand=strand)

    def test_single_exon(self):
        single_exon_transcript = PreTranscript([(3, 4)], strand=STRAND.POS)
        patt = single_exon_transcript.generate_splicing_patterns()
        self.assertEqual(1, len(patt))
        only_pattern = patt[0]
        self.assertEqual(0, len(only_pattern))
        self.assertEqual(SPLICE_TYPE.NORMAL, patt[0].splice_type)

    def test_normal_pattern_pos(self):
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(1, len(patt))
        expected = [
            self.ex1.end,
            self.ex2.start, self.ex2.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        observed = [site.pos for site in patt[0]]
        self.assertEqual(expected, observed)
        self.assertEqual(SPLICE_TYPE.NORMAL, patt[0].splice_type)

    def test_normal_pattern_neg(self):
        self.setup_by_strand(STRAND.NEG)
        self.assertTrue(self.pre_transcript.is_reverse)
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(1, len(patt))
        expected = [
            self.ex1.end,
            self.ex2.start, self.ex2.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        # Positions are sorted before comparison on the negative strand.
        observed = sorted(site.pos for site in patt[0])
        self.assertEqual(expected, observed)
        self.assertEqual(SPLICE_TYPE.NORMAL, patt[0].splice_type)

    def test_abrogate_a_pos(self):
        self.ex2.start_splice_site.intact = False
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(2, len(patt))

        expected_skip = [
            self.ex1.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_skip, [site.pos for site in patt[0]])
        self.assertEqual(SPLICE_TYPE.SKIP, patt[0].splice_type)

        expected_retain = [
            self.ex2.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_retain, [site.pos for site in patt[1]])
        self.assertEqual(SPLICE_TYPE.RETAIN, patt[1].splice_type)

    def test_abrogate_a_neg(self):
        self.setup_by_strand(STRAND.NEG)
        self.ex2.start_splice_site.intact = False
        # The patterns themselves are sorted before indexing into them.
        patt = sorted(self.pre_transcript.generate_splicing_patterns())
        self.assertEqual(2, len(patt))

        expected_skip = [
            self.ex1.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_skip, sorted(site.pos for site in patt[0]))
        self.assertEqual(SPLICE_TYPE.SKIP, patt[0].splice_type)

        expected_retain = [
            self.ex2.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_retain, sorted(site.pos for site in patt[1]))
        self.assertEqual(SPLICE_TYPE.RETAIN, patt[1].splice_type)

    def test_abrogate_a_last_exon(self):
        self.ex6.start_splice_site.intact = False
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(1, len(patt))
        expected = [
            self.ex1.end,
            self.ex2.start, self.ex2.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start,
        ]
        self.assertEqual(expected, [site.pos for site in patt[0]])
        self.assertEqual(SPLICE_TYPE.RETAIN, patt[0].splice_type)

    def test_abrogate_d_first_exon(self):
        self.ex1.end_splice_site.intact = False
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(1, len(patt))
        expected = [
            self.ex2.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected, [site.pos for site in patt[0]])
        self.assertEqual(SPLICE_TYPE.RETAIN, patt[0].splice_type)

    def test_abrogate_ad(self):
        # NOTE(review): the setup and expectations here are the same as in
        # test_abrogate_a_pos; the name suggests a second splice site of ex2
        # may have been meant to be abrogated as well -- confirm.
        self.ex2.start_splice_site.intact = False
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(2, len(patt))

        expected_skip = [
            self.ex1.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_skip, [site.pos for site in patt[0]])
        self.assertEqual(SPLICE_TYPE.SKIP, patt[0].splice_type)

        expected_retain = [
            self.ex2.end,
            self.ex3.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_retain, [site.pos for site in patt[1]])
        self.assertEqual(SPLICE_TYPE.RETAIN, patt[1].splice_type)

    def test_abrogate_da(self):
        self.ex2.end_splice_site.intact = False
        self.ex3.start_splice_site.intact = False
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(1, len(patt))
        expected = [
            self.ex1.end,
            self.ex2.start, self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected, [site.pos for site in patt[0]])
        self.assertEqual(SPLICE_TYPE.RETAIN, patt[0].splice_type)

    def test_multiple_exons_or_multiple_introns_abrogate_ada(self):
        self.ex2.start_splice_site.intact = False
        self.ex2.end_splice_site.intact = False
        self.ex3.start_splice_site.intact = False
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(2, len(patt))

        expected_multi_skip = [
            self.ex1.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_multi_skip, [site.pos for site in patt[0]])
        self.assertEqual(SPLICE_TYPE.MULTI_SKIP, patt[0].splice_type)

        expected_multi_retain = [
            self.ex3.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_multi_retain, [site.pos for site in patt[1]])
        self.assertEqual(SPLICE_TYPE.MULTI_RETAIN, patt[1].splice_type)

    def test_multiple_exons_or_multiple_introns_abrogate_dad(self):
        self.ex2.end_splice_site.intact = False
        self.ex3.start_splice_site.intact = False
        self.ex3.end_splice_site.intact = False
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(2, len(patt))

        expected_multi_retain = [
            self.ex1.end,
            self.ex2.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_multi_retain, [site.pos for site in patt[0]])
        self.assertEqual(SPLICE_TYPE.MULTI_RETAIN, patt[0].splice_type)

        expected_multi_skip = [
            self.ex1.end,
            self.ex4.start, self.ex4.end,
            self.ex5.start, self.ex5.end,
            self.ex6.start,
        ]
        self.assertEqual(expected_multi_skip, [site.pos for site in patt[1]])
        self.assertEqual(SPLICE_TYPE.MULTI_SKIP, patt[1].splice_type)

    def test_complex(self):
        self.ex2.end_splice_site.intact = False
        self.ex4.end_splice_site.intact = False
        patt = self.pre_transcript.generate_splicing_patterns()
        self.assertEqual(4, len(patt))
        splice_types = [pattern.splice_type for pattern in patt]
        self.assertTrue(SPLICE_TYPE.COMPLEX in splice_types)