def test_only_CDS_overlap(self):
    """Check that a transcript spanning 1250-2000 is not intersecting ``self.t1``.

    The check is made twice: once with the CDS of the second transcript
    starting at 1401 and once, after stripping and re-adding the CDS,
    with the CDS starting at 1461. Both calls to
    ``MonosublocusHolder.is_intersecting`` are expected to return False.
    """
    t2 = Transcript()
    # Attributes are assigned in this exact order.
    settings = (
        ("chrom", "Chr1"),
        ("strand", "+"),
        ("score", 1),
        ("id", "G2.1"),
        ("parent", "G2"),
        ("start", 1250),
        ("end", 2000),
    )
    for attribute, value in settings:
        setattr(t2, attribute, value)

    exons = [(1250, 1560), (1801, 2000)]
    t2.add_exons(exons)
    first_cds = [(1401, 1560), (1801, 1850)]
    t2.add_exons(first_cds, "CDS")
    t2.finalize()

    intersecting = MonosublocusHolder.is_intersecting(self.t1, t2)
    self.assertFalse(intersecting)

    # Replace the CDS with one that starts further downstream.
    t2.strip_cds()
    t2.finalized = False
    second_cds = [(1461, 1560), (1801, 1850)]
    t2.add_exons(second_cds, "CDS")
    # No CDS overlap this time
    # NOTE(review): t2 is not explicitly finalized again before this check -
    # confirm that is_intersecting copes with an unfinalized transcript.
    intersecting = MonosublocusHolder.is_intersecting(self.t1, t2)
    self.assertFalse(intersecting)
class TranscriptTester(unittest.TestCase):
    """Tests for a monoexonic, positive-strand transcript (AT1G01020.1).

    The class-level fixtures parse a two-line GFF snippet; ``setUp`` builds
    the same transcript by hand, with a CDS at 8571-8666, plus a matching
    transcriptomic BED12 ORF.
    """

    # NOTE(review): a second ``TranscriptTester`` class is defined in the
    # same source; the later definition shadows the earlier one - confirm
    # which one is meant to survive.

    # The two records must stay on separate lines starting at column 0:
    # the text is split on "\n" and every run of whitespace becomes a tab.
    tr_gff = """Chr1 TAIR10 mRNA 5928 8737 . . . ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1 TAIR10 exon 5928 8737 . . . Parent=AT1G01020.1"""

    tr_lines = tr_gff.split("\n")
    for pos, line in enumerate(tr_lines):
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]

    for l in tr_gff_lines:
        assert l.header is False

    logger = create_null_logger("null")

    def setUp(self):
        """Build the reference transcript and a matching BED12 ORF."""
        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.chrom = "Chr1"
        self.tr.source = "TAIR10"
        self.tr.feature = "mRNA"
        self.tr.start = 5928
        self.tr.end = 8737
        self.tr.strand = "+"
        self.tr.add_exon((5928, 8737))
        self.tr.score = None
        self.tr.id, self.tr.parent, self.tr.name = "AT1G01020.1", "AT1G01020", "AT1G01020.1"
        self.tr.add_exon((8571, 8666), "CDS")
        self.tr.finalize()

        self.orf = Mikado.parsers.bed12.BED12()
        self.orf.chrom = self.tr.id
        self.orf.start = 1
        self.orf.end = self.tr.cdna_length
        self.orf.name = self.tr.id
        self.orf.strand = "+"
        self.orf.score = 0
        # Genomic CDS coordinates translated to 1-based transcript coordinates.
        self.orf.thick_start = 8571 - 5928 + 1
        self.orf.thick_end = 8666 - 5928 + 1
        self.orf.block_count = 1
        self.orf.blockSize = self.tr.cdna_length
        # NOTE(review): block_starts is a scalar here but a list ([0]) for the
        # ORFs built by _build_orf - confirm this difference is intended.
        self.orf.block_starts = 0
        self.orf.has_start_codon = True
        self.orf.has_stop_codon = True
        self.orf.transcriptomic = True
        self.assertFalse(self.orf.invalid, self.orf.invalid_reason)
        self.assertEqual((self.orf.thick_end - self.orf.thick_start + 1) % 3, 0)

    def _build_orf(self, name, strand, thick_start, thick_end, end=None):
        """Return a single-block transcriptomic BED12 ORF on ``self.tr``.

        :param name: value assigned to the ``name`` attribute of the ORF.
        :param strand: "+" or "-".
        :param thick_start: first coding position, in transcript coordinates.
        :param thick_end: last coding position, in transcript coordinates.
        :param end: end of the BED12 record; defaults to the cDNA length.
        """
        orf = Mikado.parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = 1
        orf.end = self.tr.cdna_length if end is None else end
        orf.name = name
        orf.strand = strand
        orf.score = 0
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = self.tr.cdna_length
        orf.block_sizes = [self.tr.cdna_length]
        orf.block_starts = [0]
        orf.rgb = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    def test_invalid_inizialization(self):
        """Creating a Transcript from the exon GFF line must raise TypeError."""
        with self.assertRaises(TypeError):
            _ = Mikado.loci.Transcript(self.tr_gff_lines[1])

    def test_basics(self):
        """Check chromosome, boundaries and exon structure of the transcript."""
        self.assertEqual(self.tr.chrom, "Chr1")
        self.assertEqual(self.tr.exon_num, 1)
        self.assertEqual(self.tr.monoexonic, True)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        self.assertEqual(self.tr.exons, [tuple([5928, 8737])], self.tr.exons)

    def test_cds(self):
        """Test the CDS features after loading the ORF built in setUp.

        Note that in a single-exon transcript with no strand, start_codon
        and stop_codon are defined as False. The transcript used here has
        strand "+" and the ORF declares both codons, so both flags are
        expected to be True.
        """
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.combined_cds, self.tr.selected_cds)
        self.assertEqual(self.tr.combined_cds, [tuple([8571, 8666])], self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 8571)
        self.assertEqual(self.tr.selected_cds_end, 8666)
        self.assertEqual(self.tr.has_start_codon, True)
        self.assertEqual(self.tr.has_stop_codon, True)

    def test_equality(self):
        """Exercise ``==`` / ``!=`` between a transcript and modified copies."""
        new_transcript = self.tr.deepcopy()
        self.assertTrue(new_transcript == self.tr)

        new_transcript.strand = None
        self.assertFalse(new_transcript == self.tr)  # They have now a different strand

        new_transcript.unfinalize()
        new_transcript.strand = "+"  # It becomes a multiexonic transcript, so it must have a strand
        new_transcript.end = 9737

        new_exon = Mikado.parsers.GFF.GffLine(self.tr_lines[-1])
        new_exon.strand = "+"
        new_exon.start = 9000
        new_exon.end = 9737
        new_transcript.add_exon(new_exon)
        new_transcript.finalize()
        self.assertTrue(new_transcript != self.tr)

    def test_mono_finalising(self):
        """Build the transcript from the parsed GFF lines and check both UTRs exist."""
        transcript_line = [line for line in self.tr_gff_lines if line.feature == "mRNA"]
        self.assertEqual(len(transcript_line), 1,
                         "\n".join([str(line) for line in self.tr_gff_lines]))

        tr = Mikado.loci.Transcript(transcript_line[0])
        exon_lines = [line for line in self.tr_gff_lines
                      if line.is_exon is True and "UTR" not in line.feature.upper()]
        tr.add_exons(exon_lines)
        tr.add_exon((8571, 8666), "CDS")
        tr.finalize()
        self.assertGreater(tr.three_utr_length, 0)
        self.assertGreater(tr.five_utr_length, 0)

    def test_invalid_transcript(self):
        """A transcript with the CDS lines below must raise InvalidCDS."""
        # One GFF record per line; the first CDS record has start (8571)
        # greater than end (7500).
        lines = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t.\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1\tTAIR10\tCDS\t8571\t7500\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\tCDS\t7503\t8666\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\texon\t5928\t8737\t.\t.\t.\tParent=AT1G01020.1"""

        gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in lines.split("\n")]
        self.assertIsInstance(gff_lines[0], Mikado.parsers.GFF.GffLine)

        checker = (gff_lines[0].feature.endswith("transcript")
                   or "RNA" in gff_lines[0].feature.upper())
        self.assertTrue(checker)
        self.assertTrue(gff_lines[0].is_transcript)

        transcript = Mikado.loci.Transcript(gff_lines[0])
        transcript.logger = self.logger
        transcript.add_exons(gff_lines[1:])

        with self.assertRaises(Mikado.exceptions.InvalidCDS):
            Mikado.transcripts.transcript_methods.finalizing._check_cdna_vs_utr(transcript)

    def test_utr(self):
        """Check the selected internal ORF and the derived UTR segments."""
        self.assertEqual(
            self.tr.selected_internal_orf,
            [("UTR", tuple([5928, 8570])),
             ("exon", tuple([5928, 8737])),
             ("CDS", tuple([8571, 8666]), 0),
             ("UTR", tuple([8667, 8737]))],
            "Right: {0}\nFound{1}".format(
                [("UTR", 5928, 8570), ("CDS", 8571, 8666), ("UTR", 8667, 8737)],
                self.tr.selected_internal_orf))
        self.assertEqual(self.tr.combined_utr, [tuple([5928, 8570]), tuple([8667, 8737])])
        self.assertEqual(self.tr.five_utr, [tuple([5928, 8570])], self.tr.five_utr)
        self.assertEqual(self.tr.three_utr, [tuple([8667, 8737])])

    def test_utr_metrics(self):
        """Test for UTR exon num, start distance, etc."""
        self.assertEqual(self.tr.five_utr_num, 1)
        self.assertEqual(self.tr.three_utr_num, 1)

        self.assertEqual(self.tr.five_utr_length, 8570 + 1 - 5928)
        self.assertEqual(self.tr.three_utr_length, 8737 + 1 - 8667)

        # The failure message reports the attribute actually under test.
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         8571 - 5928,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         8737 - 8666,
                         (self.tr.selected_end_distance_from_tes, self.tr.strand))

    def test_strip_cds(self):
        """After strip_cds no CDS or UTR information must be left."""
        self.tr.strip_cds()
        self.assertEqual(self.tr.selected_cds_length, 0)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.selected_cds, [])
        self.assertEqual(self.tr.selected_cds_start, None)
        self.assertEqual(self.tr.selected_cds_end, None)

    def test_remove_utr(self):
        """Test for UTR removal.

        We remove the UTRs and verify that start/end have moved onto the
        CDS boundaries, no UTR is present, etc.
        """
        self.tr.remove_utrs()
        self.assertEqual(self.tr.selected_cds_start, self.tr.start)
        self.assertEqual(self.tr.selected_cds_end, self.tr.end)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.combined_cds, [tuple([8571, 8666])], self.tr.combined_cds)
        self.assertEqual(self.tr.combined_utr, [], self.tr.combined_utr)

    def test_negative_orf(self):
        """Test loading a negative strand ORF onto a monoexonic transcript.

        This should reverse the ORF.
        """
        self.orf.strand = "-"
        self.tr.strip_cds(strand_specific=False)
        self.orf.has_stop_codon = False
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 8571)

    def test_introns(self):
        """A monoexonic transcript must report no introns of any kind."""
        self.assertEqual(self.tr.introns, set(), self.tr.introns)
        self.assertEqual(self.tr.combined_cds_introns, set(), self.tr.combined_cds_introns)
        self.assertEqual(self.tr.selected_cds_introns, set(), self.tr.selected_cds_introns)

    def testDoubleOrf(self):
        """Test to verify the introduction of multiple ORFs."""
        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = self._build_orf(self.tr.id, "+", 51, 398)
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._build_orf("second", "+", 201, 410)
        self.assertFalse(second_orf.invalid)
        self.assertTrue(Mikado.loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._build_orf("third", "+", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        self.tr.logger = self.logger

        # The ORFs are loaded one at a time and then all together.
        self.tr.load_orfs([first_orf])
        self.tr.load_orfs([second_orf])
        self.tr.load_orfs([third_orf])
        self.tr.load_orfs([first_orf, second_orf, third_orf])

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         (self.tr.cdna_length,
                          self.tr.selected_start_distance_from_tss,
                          self.tr.selected_end_distance_from_tes))

        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)

    def testDoubleOrf_negative(self):
        """Test to verify the introduction of multiple negative-strand ORFs."""
        self.tr.strip_cds(strand_specific=False)
        self.tr.finalized = False

        first_orf = self._build_orf(self.tr.id, "-", 51, 398)
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._build_orf("second", "-", 201, 410)
        self.assertFalse(second_orf.invalid)
        # Disabled in this variant:
        # self.assertTrue(Mikado.loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._build_orf("third", "-", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.logger = self.logger
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         (self.tr.cdna_length,
                          self.tr.selected_start_distance_from_tss,
                          self.tr.selected_end_distance_from_tes))

        # Disabled in this variant:
        # self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].five_utr_length, 0)
        self.assertEqual(new_transcripts[1].three_utr_length, 0)

    def test_wrong_orf(self):
        """An ORF ending one base past the cDNA must not make the transcript coding."""
        # The BED12 end is deliberately one base longer than the transcript.
        orf = self._build_orf("third", "-", 501, 800, end=self.tr.cdna_length + 1)
        self.assertFalse(orf.invalid)

        self.tr.logger = self.logger
        self.tr.strip_cds()
        self.tr.strand = "+"
        self.logger.setLevel("WARNING")
        # Loading the ORF is expected to emit at least one record on the
        # "null" logger; the captured records themselves are not inspected.
        with self.assertLogs("null", level="DEBUG"):
            self.tr.load_orfs([orf])

        self.assertFalse(self.tr.is_coding)
class TranscriptTester(unittest.TestCase):
    """Tests for a monoexonic, positive-strand transcript (AT1G01020.1).

    The class-level fixtures parse a two-line GFF snippet; ``setUp`` builds
    the same transcript by hand, with a CDS at 8571-8666, plus a matching
    transcriptomic BED12 ORF.
    """

    # NOTE(review): another ``TranscriptTester`` class is defined earlier in
    # the same source; this later definition shadows it - confirm which one
    # is meant to survive.

    # The two records must stay on separate lines starting at column 0:
    # the text is split on "\n" and every run of whitespace becomes a tab.
    tr_gff = """Chr1 TAIR10 mRNA 5928 8737 . . . ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1 TAIR10 exon 5928 8737 . . . Parent=AT1G01020.1"""

    tr_lines = tr_gff.split("\n")
    for pos, line in enumerate(tr_lines):
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]

    for l in tr_gff_lines:
        assert l.header is False

    logger = create_null_logger("null")

    def setUp(self):
        """Build the reference transcript and a matching BED12 ORF."""
        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.chrom = "Chr1"
        self.tr.source = "TAIR10"
        self.tr.feature = "mRNA"
        self.tr.start = 5928
        self.tr.end = 8737
        self.tr.strand = "+"
        self.tr.add_exon((5928, 8737))
        self.tr.score = None
        self.tr.id, self.tr.parent, self.tr.name = "AT1G01020.1", "AT1G01020", "AT1G01020.1"
        self.tr.add_exon((8571, 8666), "CDS")
        self.tr.finalize()

        self.orf = Mikado.parsers.bed12.BED12()
        self.orf.chrom = self.tr.id
        self.orf.start = 1
        self.orf.end = self.tr.cdna_length
        self.orf.name = self.tr.id
        self.orf.strand = "+"
        self.orf.score = 0
        # Genomic CDS coordinates translated to 1-based transcript coordinates.
        self.orf.thick_start = 8571 - 5928 + 1
        self.orf.thick_end = 8666 - 5928 + 1
        self.orf.block_count = 1
        self.orf.blockSize = self.tr.cdna_length
        # NOTE(review): block_starts is a scalar here but a list ([0]) for the
        # ORFs built by _build_orf - confirm this difference is intended.
        self.orf.block_starts = 0
        self.orf.has_start_codon = True
        self.orf.has_stop_codon = True
        self.orf.transcriptomic = True
        self.assertFalse(self.orf.invalid, self.orf.invalid_reason)
        self.assertEqual((self.orf.thick_end - self.orf.thick_start + 1) % 3, 0)

    def _build_orf(self, name, strand, thick_start, thick_end, end=None):
        """Return a single-block transcriptomic BED12 ORF on ``self.tr``.

        :param name: value assigned to the ``name`` attribute of the ORF.
        :param strand: "+" or "-".
        :param thick_start: first coding position, in transcript coordinates.
        :param thick_end: last coding position, in transcript coordinates.
        :param end: end of the BED12 record; defaults to the cDNA length.
        """
        orf = Mikado.parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = 1
        orf.end = self.tr.cdna_length if end is None else end
        orf.name = name
        orf.strand = strand
        orf.score = 0
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = self.tr.cdna_length
        orf.block_sizes = [self.tr.cdna_length]
        orf.block_starts = [0]
        orf.rgb = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    def test_invalid_inizialization(self):
        """Creating a Transcript from the exon GFF line must raise TypeError."""
        with self.assertRaises(TypeError):
            _ = Mikado.loci.Transcript(self.tr_gff_lines[1])

    def test_basics(self):
        """Check chromosome, boundaries and exon structure of the transcript."""
        self.assertEqual(self.tr.chrom, "Chr1")
        self.assertEqual(self.tr.exon_num, 1)
        self.assertEqual(self.tr.monoexonic, True)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        self.assertEqual(self.tr.exons, [tuple([5928, 8737])], self.tr.exons)

    def test_cds(self):
        """Test the CDS features after loading the ORF built in setUp.

        Note that in a single-exon transcript with no strand, start_codon
        and stop_codon are defined as False. The transcript used here has
        strand "+" and the ORF declares both codons, so both flags are
        expected to be True.
        """
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.combined_cds, self.tr.selected_cds)
        self.assertEqual(self.tr.combined_cds, [tuple([8571, 8666])], self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 8571)
        self.assertEqual(self.tr.selected_cds_end, 8666)
        self.assertEqual(self.tr.has_start_codon, True)
        self.assertEqual(self.tr.has_stop_codon, True)

    def test_equality(self):
        """Exercise ``==`` / ``!=`` between a transcript and modified copies."""
        new_transcript = self.tr.deepcopy()
        self.assertTrue(new_transcript == self.tr)

        new_transcript.strand = None
        self.assertFalse(new_transcript == self.tr)  # They have now a different strand

        new_transcript.unfinalize()
        new_transcript.strand = "+"  # It becomes a multiexonic transcript, so it must have a strand
        new_transcript.end = 9737

        new_exon = Mikado.parsers.GFF.GffLine(self.tr_lines[-1])
        new_exon.strand = "+"
        new_exon.start = 9000
        new_exon.end = 9737
        new_transcript.add_exon(new_exon)
        new_transcript.finalize()
        self.assertTrue(new_transcript != self.tr)

    def test_mono_finalising(self):
        """Build the transcript from the parsed GFF lines and check both UTRs exist."""
        transcript_line = [line for line in self.tr_gff_lines if line.feature == "mRNA"]
        self.assertEqual(len(transcript_line), 1,
                         "\n".join([str(line) for line in self.tr_gff_lines]))

        tr = Mikado.loci.Transcript(transcript_line[0])
        exon_lines = [line for line in self.tr_gff_lines
                      if line.is_exon is True and "UTR" not in line.feature.upper()]
        tr.add_exons(exon_lines)
        tr.add_exon((8571, 8666), "CDS")
        tr.finalize()
        self.assertGreater(tr.three_utr_length, 0)
        self.assertGreater(tr.five_utr_length, 0)

    def test_invalid_transcript(self):
        """A transcript with the CDS lines below must raise InvalidCDS."""
        # One GFF record per line; the first CDS record has start (8571)
        # greater than end (7500).
        lines = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t.\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1\tTAIR10\tCDS\t8571\t7500\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\tCDS\t7503\t8666\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\texon\t5928\t8737\t.\t.\t.\tParent=AT1G01020.1"""

        gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in lines.split("\n")]
        self.assertIsInstance(gff_lines[0], Mikado.parsers.GFF.GffLine)

        checker = (gff_lines[0].feature.endswith("transcript")
                   or "RNA" in gff_lines[0].feature.upper())
        self.assertTrue(checker)
        self.assertTrue(gff_lines[0].is_transcript)

        transcript = Mikado.loci.Transcript(gff_lines[0])
        transcript.logger = self.logger
        transcript.add_exons(gff_lines[1:])

        with self.assertRaises(Mikado.exceptions.InvalidCDS):
            Mikado.loci.transcript_methods.finalizing._check_cdna_vs_utr(transcript)

    def test_utr(self):
        """Check the selected internal ORF and the derived UTR segments."""
        self.assertEqual(
            self.tr.selected_internal_orf,
            [("UTR", tuple([5928, 8570])),
             ("exon", tuple([5928, 8737])),
             ("CDS", tuple([8571, 8666]), 0),
             ("UTR", tuple([8667, 8737]))],
            "Right: {0}\nFound{1}".format(
                [("UTR", 5928, 8570), ("CDS", 8571, 8666), ("UTR", 8667, 8737)],
                self.tr.selected_internal_orf))
        self.assertEqual(self.tr.combined_utr, [tuple([5928, 8570]), tuple([8667, 8737])])
        self.assertEqual(self.tr.five_utr, [tuple([5928, 8570])], self.tr.five_utr)
        self.assertEqual(self.tr.three_utr, [tuple([8667, 8737])])

    def test_utr_metrics(self):
        """Test for UTR exon num, start distance, etc."""
        self.assertEqual(self.tr.five_utr_num, 1)
        self.assertEqual(self.tr.three_utr_num, 1)

        self.assertEqual(self.tr.five_utr_length, 8570 + 1 - 5928)
        self.assertEqual(self.tr.three_utr_length, 8737 + 1 - 8667)

        # The failure message reports the attribute actually under test.
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         8571 - 5928,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         8737 - 8666,
                         (self.tr.selected_end_distance_from_tes, self.tr.strand))

    def test_strip_cds(self):
        """After strip_cds no CDS or UTR information must be left."""
        self.tr.strip_cds()
        self.assertEqual(self.tr.selected_cds_length, 0)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.selected_cds, [])
        self.assertEqual(self.tr.selected_cds_start, None)
        self.assertEqual(self.tr.selected_cds_end, None)

    def test_remove_utr(self):
        """Test for UTR removal.

        We remove the UTRs and verify that start/end have moved onto the
        CDS boundaries, no UTR is present, etc.
        """
        self.tr.remove_utrs()
        self.assertEqual(self.tr.selected_cds_start, self.tr.start)
        self.assertEqual(self.tr.selected_cds_end, self.tr.end)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.combined_cds, [tuple([8571, 8666])], self.tr.combined_cds)
        self.assertEqual(self.tr.combined_utr, [], self.tr.combined_utr)

    def test_negative_orf(self):
        """Test loading a negative strand ORF onto a monoexonic transcript.

        This should reverse the ORF.
        """
        self.orf.strand = "-"
        self.tr.strip_cds(strand_specific=False)
        self.orf.has_stop_codon = False
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 8571)

    def test_introns(self):
        """A monoexonic transcript must report no introns of any kind."""
        self.assertEqual(self.tr.introns, set(), self.tr.introns)
        self.assertEqual(self.tr.combined_cds_introns, set(), self.tr.combined_cds_introns)
        self.assertEqual(self.tr.selected_cds_introns, set(), self.tr.selected_cds_introns)

    def testDoubleOrf(self):
        """Test to verify the introduction of multiple ORFs."""
        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = self._build_orf(self.tr.id, "+", 51, 398)
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._build_orf("second", "+", 201, 410)
        self.assertFalse(second_orf.invalid)
        self.assertTrue(Mikado.loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._build_orf("third", "+", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        self.tr.logger = self.logger

        # The ORFs are loaded one at a time and then all together.
        self.tr.load_orfs([first_orf])
        self.tr.load_orfs([second_orf])
        self.tr.load_orfs([third_orf])
        self.tr.load_orfs([first_orf, second_orf, third_orf])

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         (self.tr.cdna_length,
                          self.tr.selected_start_distance_from_tss,
                          self.tr.selected_end_distance_from_tes))

        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)

    def testDoubleOrf_negative(self):
        """Test to verify the introduction of multiple negative-strand ORFs."""
        self.tr.strip_cds(strand_specific=False)
        self.tr.finalized = False

        first_orf = self._build_orf(self.tr.id, "-", 51, 398)
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._build_orf("second", "-", 201, 410)
        self.assertFalse(second_orf.invalid)
        # Disabled in this variant:
        # self.assertTrue(Mikado.loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._build_orf("third", "-", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.logger = self.logger
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         (self.tr.cdna_length,
                          self.tr.selected_start_distance_from_tss,
                          self.tr.selected_end_distance_from_tes))

        # Disabled in this variant:
        # self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].five_utr_length, 0)
        self.assertEqual(new_transcripts[1].three_utr_length, 0)

    def test_wrong_orf(self):
        """An ORF ending one base past the cDNA must not make the transcript coding."""
        # The BED12 end is deliberately one base longer than the transcript.
        orf = self._build_orf("third", "-", 501, 800, end=self.tr.cdna_length + 1)
        self.assertFalse(orf.invalid)

        self.tr.logger = self.logger
        self.tr.strip_cds()
        self.tr.strand = "+"
        self.logger.setLevel("WARNING")
        # Loading the ORF is expected to emit at least one record on the
        # "null" logger; the captured records themselves are not inspected.
        with self.assertLogs("null", level="DEBUG"):
            self.tr.load_orfs([orf])

        self.assertFalse(self.tr.is_coding)
class TestMetricsEndDistances(unittest.TestCase):
    """Check the distance of the CDS end from the last junction and from the TES.

    Every test works on the same twelve-exon transcript (101-10000) built
    in ``setUp``; only the strand and the CDS segments change.
    """

    logger = create_default_logger("End")
    logger.setLevel("ERROR")

    def setUp(self):
        """Create the unstranded, non-coding twelve-exon transcript."""
        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.start = 101
        self.tr.end = 10000
        exons = [
            (101, 300),
            (501, 800),
            (1001, 1200),
            (1301, 2000),
            (3501, 5000),
            (5501, 6000),
            (6201, 7000),
            (7301, 7700),
            (8201, 9000),
            (9101, 9300),
            (9501, 9700),
            (9801, 10000),
        ]
        self.tr.add_exons(exons)
        self.tr.id = "test1"
        self.tr.parent = "test1.gene"

    def test_end_positive(self):
        """Positive strand: move the CDS end across the last three exons."""
        tr = self.tr
        tr.strand = "+"

        # CDS segments shared by all three scenarios (segment lengths:
        # 40, 700, 1500, 500, 800, 400 and 800 bp).
        shared = [
            (1161, 1200),
            (1301, 2000),
            (3501, 5000),
            (5501, 6000),
            (6201, 7000),
            (7301, 7700),
            (8201, 9000),
        ]

        # Scenario 1: the CDS ends at 9130, three exons from the end.
        cds = shared + [(9101, 9130)]
        tr.add_exons(cds, features="CDS")
        tr.finalize()
        self.assertEqual(tr.selected_cds_end, tr.combined_cds_end)
        self.assertEqual(tr.selected_cds_end, 9130)
        to_junction = (9300 - 9131 + 1) + (9700 - 9501 + 1)
        self.assertEqual(tr.end_distance_from_junction, to_junction)
        to_tes = (9300 - 9131 + 1) + (9700 - 9501 + 1) + (10000 - 9801 + 1)
        self.assertEqual(tr.end_distance_from_tes, to_tes)

        tr.strip_cds()
        self.assertEqual(len(tr.internal_orfs), 0, tr.internal_orfs)
        tr.finalized = False

        # Scenario 2: the CDS ends at 9690, in the second-to-last exon.
        cds = shared + [(9101, 9300), (9501, 9690)]
        tr.add_exons(cds, features="CDS")
        tr.finalize()
        self.assertEqual(tr.combined_cds_end, 9690)
        self.assertEqual(tr.selected_cds_end, tr.combined_cds_end)
        to_junction = 9700 - 9691 + 1
        self.assertEqual(tr.end_distance_from_junction, to_junction)
        to_tes = (9700 - 9691 + 1) + (10000 - 9801 + 1)
        self.assertEqual(tr.end_distance_from_tes, to_tes)

        tr.strip_cds()
        self.assertEqual(tr.combined_cds_end, tr.selected_cds_end, tr.combined_cds)
        self.assertEqual(tr.combined_cds_end, None, tr.combined_cds_end)
        tr.finalized = False

        # Scenario 3: the CDS ends at 9820, inside the last exon.
        cds = shared + [(9101, 9300), (9501, 9700), (9801, 9820)]
        tr.add_exons(cds, features="CDS")
        tr.finalize()
        self.assertEqual(tr.combined_cds_end, 9820)
        self.assertEqual(tr.selected_cds_end, tr.combined_cds_end)
        self.assertEqual(tr.end_distance_from_tes, 180)
        self.assertEqual(tr.end_distance_from_junction, 0)

    def test_end_negative(self):
        """Negative strand: move the CDS end across the first three exons."""
        tr = self.tr
        tr.strand = "-"

        # CDS segments shared by all three scenarios (segment lengths:
        # 700, 1500, 500, 800, 400, 800 and 30 bp).
        shared = [
            (1301, 2000),
            (3501, 5000),
            (5501, 6000),
            (6201, 7000),
            (7301, 7700),
            (8201, 9000),
            (9101, 9130),
        ]
        first_exon_length = 300 - 101 + 1

        # Scenario 1: the CDS ends at 1161, in the third exon.
        cds = [(1161, 1200)] + shared
        total_length = sum(segment[1] - segment[0] + 1 for segment in cds)
        self.assertEqual(total_length % 3, 0)
        tr.add_exons(cds, features="CDS")
        tr.finalize()
        self.assertTrue(tr.is_coding)
        self.assertEqual(tr.selected_cds_end, tr.combined_cds_end)
        self.assertEqual(tr.selected_cds_end, 1161)
        to_junction = (1161 - 1001) + (800 - 501 + 1)
        self.assertEqual(tr.end_distance_from_junction,
                         to_junction,
                         (tr.end_distance_from_junction, to_junction))
        self.assertEqual(tr.end_distance_from_tes,
                         tr.end_distance_from_junction + first_exon_length,
                         (tr.end_distance_from_tes,
                          tr.end_distance_from_junction + first_exon_length))

        tr.strip_cds()
        self.assertEqual(len(tr.internal_orfs), 0, tr.internal_orfs)
        tr.finalized = False

        # Scenario 2: the CDS ends at 721, in the second exon.
        cds = [(721, 800), (1001, 1200)] + shared
        tr.add_exons(cds, features="CDS")
        tr.finalize()
        self.assertEqual(tr.combined_cds_end, 721)
        self.assertEqual(tr.selected_cds_end, tr.combined_cds_end)
        to_junction = 721 - 501
        self.assertEqual(tr.end_distance_from_junction,
                         to_junction,
                         (tr.end_distance_from_junction, to_junction))
        self.assertEqual(tr.end_distance_from_tes,
                         tr.end_distance_from_junction + first_exon_length,
                         (tr.end_distance_from_tes,
                          tr.end_distance_from_junction + first_exon_length))

        tr.strip_cds()
        self.assertEqual(tr.combined_cds_end, tr.selected_cds_end, tr.combined_cds)
        self.assertEqual(tr.combined_cds_end, None, tr.combined_cds_end)
        tr.finalized = False

        # Scenario 3: the CDS ends at 161, inside the first exon.
        cds = [(161, 300), (501, 800), (1001, 1200)] + shared
        remainders = sum((segment[1] - segment[0] + 1) % 3 for segment in cds)
        self.assertEqual(remainders % 3, 0)
        tr.logger = self.logger
        tr.add_exons(cds, features="CDS")
        tr.finalize()
        self.assertEqual(tr.combined_cds_end, 161)
        self.assertEqual(tr.selected_cds_end, tr.combined_cds_end)
        self.assertEqual(tr.end_distance_from_tes, 60)
        self.assertEqual(tr.end_distance_from_junction, 0)
class MonoHolderTester(unittest.TestCase):
    """Exercise ``MonosublocusHolder.is_intersecting`` on transcript pairs.

    Every test compares the reference transcript ``self.t1`` built in
    ``setUp`` against a second transcript tailored to the scenario.
    """

    logger = create_default_logger("MonoHolderTester")

    @staticmethod
    def _blank_transcript(tid, parent, score, start, end, logger=None):
        """Return an exon-less "+" strand transcript on Chr1.

        :param tid: value for the transcript ``id`` attribute.
        :param parent: value for the ``parent`` attribute.
        :param score: value for the ``score`` attribute.
        :param start: transcript start coordinate.
        :param end: transcript end coordinate.
        :param logger: optional logger; when given it is attached before any
            other attribute is set.
        """
        transcript = Transcript()
        if logger is not None:
            transcript.logger = logger
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.score = score
        transcript.id = tid
        transcript.parent = parent
        transcript.start = start
        transcript.end = end
        return transcript

    def setUp(self):
        """Build the reference coding transcript G1.1 (101-1500, 4 exons)."""
        self.conf = {}

        self.t1 = self._blank_transcript("G1.1", "G1", 20, 101, 1500)
        self.t1.add_exons(
            [(101, 500), (601, 700), (1001, 1300), (1401, 1500)],
            "exon")
        self.t1.add_exons(
            [(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
            "CDS")
        self.t1.finalize()

    def testCdsOverlap(self):
        """A transcript carrying the same CDS as t1 must intersect it."""
        other = self._blank_transcript("G2.1", "G2", 1, 101, 1600)
        other.add_exons(
            [(101, 500), (601, 700), (1001, 1300), (1401, 1460), (1501, 1600)],
            "exon")
        other.add_exons(
            [(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
            "CDS")
        other.finalize()

        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_intronMatch(self):
        """Coding transcripts sharing introns intersect, also with cds_only.

        The second transcript has the same first two introns as t1 and
        different downstream exons.
        """
        other = self._blank_transcript("G2.1", "G2", 1, 101, 1600)
        other.add_exons(
            [(101, 500), (601, 700), (1001, 1320), (1451, 1460), (1501, 1600)],
            "exon")
        other.add_exons(
            [(401, 500), (601, 700), (1001, 1320), (1451, 1460), (1501, 1510)],
            "CDS")
        other.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(other.is_coding)

        default_mode = MonosublocusHolder.is_intersecting(
            self.t1, other, logger=self.logger)
        self.assertTrue(default_mode)
        cds_mode = MonosublocusHolder.is_intersecting(
            self.t1, other, cds_only=True, logger=self.logger)
        self.assertTrue(cds_mode)

    def test_intronOverlap(self):
        """Two non-coding transcripts with overlapping introns intersect."""
        self.t1.strip_cds()

        other = self._blank_transcript("G2.1", "G2", 1, 101, 1470)
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1420, 1470)])
        other.finalize()

        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_noIntronOverlap(self):
        """Overlapping exons but no overlapping intron: no intersection.

        The only intron of the second transcript (between 1560 and 1800)
        lies beyond the end of t1 (1500).
        """
        self.t1.strip_cds()

        other = self._blank_transcript("G2.1", "G2", 1, 1250, 2000)
        other.add_exons([(1250, 1560), (1800, 2000)])
        other.finalize()

        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_noCDSOverlap(self):
        """Disjoint CDS: intersecting by default, not with ``cds_only``."""
        # Replace the CDS of t1 with a shorter one ending at 1100.
        self.t1.strip_cds()
        self.assertEqual(self.t1.combined_cds_introns, set())
        self.t1.finalized = False
        self.t1.add_exons([(401, 500), (601, 700), (1001, 1100)], "CDS")
        self.t1.finalize()

        # The CDS of the second transcript only starts at 1201.
        other = self._blank_transcript(
            "G2.1", "G2", 1, 101, 1470, logger=self.logger)
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
        other.add_exons([(1201, 1350), (1421, 1450)], "CDS")
        other.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(other.is_coding)

        # Sanity check of the fixture: the two CDS spans must not overlap.
        reference_span = (self.t1.combined_cds_start,
                          self.t1.combined_cds_end)
        other_span = (other.combined_cds_start, other.combined_cds_end)
        self.assertGreaterEqual(
            0,
            overlap(reference_span, other_span),
            [reference_span, other_span])

        default_mode = MonosublocusHolder.is_intersecting(
            self.t1, other, logger=self.logger)
        self.assertTrue(default_mode)
        cds_mode = MonosublocusHolder.is_intersecting(
            self.t1, other, cds_only=True, logger=self.logger)
        self.assertFalse(cds_mode)

    def test_only_CDS_overlap(self):
        """No intersection is expected with or without overlapping CDS."""
        other = self._blank_transcript("G2.1", "G2", 1, 1250, 2000)
        other.add_exons([(1250, 1560), (1801, 2000)])
        # First round: this CDS starts at 1401, inside the CDS of t1.
        other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        other.finalize()

        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

        # Second round: swap in a CDS starting after the CDS of t1 (which
        # ends at 1440).  As in the original, the transcript is not
        # explicitly re-finalized before the comparison.
        other.strip_cds()
        other.finalized = False
        other.add_exons([(1461, 1560), (1801, 1850)], "CDS")
        # No CDS overlap this time
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_no_overlap(self):
        """A transcript starting after the end of t1 does not intersect."""
        other = self._blank_transcript("G2.1", "G2", 1, 1600, 2000)
        other.add_exons([(1600, 1700), (1801, 2000)])
        other.add_exons([(1661, 1700), (1801, 1850)], "CDS")
        other.finalize()

        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_same_id(self):
        """A transcript reusing the ID of t1 is not reported as intersecting."""
        other = self._blank_transcript("G1.1", "G1", 1, 1250, 2000)
        other.add_exons([(1250, 1560), (1801, 2000)])
        other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        other.finalize()

        # Original author's note: the intersection is rejected because both
        # transcripts have the same ID.  NOTE(review): test_only_CDS_overlap
        # expects False for these same coordinates with a different ID, so
        # this test alone does not isolate the ID check -- confirm.
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))