def test_noncoding(self):
    """Check ``trim_noncoding`` on a four-exon transcript without CDS.

    The model spans 10000-20000 on the plus strand; its terminal exons are
    (10000, 11500) and (19000, 20000). For each ``max_length`` the expected
    boundaries are ``11500 - max_length`` and ``19000 + max_length``.
    Every case runs on a fresh deep copy so that the cases are independent.
    """
    transcript = Transcript()
    transcript.chrom = "Chr1"
    transcript.source = "test"
    transcript.start = 10000
    transcript.end = 20000
    exons = [(10000, 11500), (12000, 13000), (15000, 18000), (19000, 20000)]
    transcript.add_exons(exons)
    transcript.strand = "+"
    transcript.finalize()

    # (max_length, expected start, expected end)
    cases = [
        (50, 11450, 19050),
        (200, 11300, 19200),
    ]
    for max_length, expected_start, expected_end in cases:
        with self.subTest(max_length=max_length):
            trimmed = trim_noncoding(transcript.deepcopy(), max_length=max_length)
            self.assertEqual(trimmed.start, expected_start)
            self.assertEqual(trimmed.end, expected_end)
class ExternalTester(unittest.TestCase):
    """Tests for the ``external_scores`` attribute of a transcript."""

    def setUp(self):
        """Create the six-exon plus-strand transcript ENST00000560636."""
        model = Transcript()
        model.chrom = "15"
        model.source = "protein_coding"
        model.start = 47631264
        model.end = 48051999
        exon_coords = [
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ]
        # The strand is assigned before the exons, as in the original fixture.
        model.strand = "+"
        model.add_exons(exon_coords)
        model.id = "ENST00000560636"
        model.parent = "ENSG00000137872"
        self.transcript = model

    def _assert_expected_scores(self, model):
        """Assert that *model* exposes the two scores stored by test_copying."""
        self.assertEqual(model.external_scores.test, 0)
        self.assertEqual(model.external_scores.test1, 1)

    def test_copying(self):
        """External scores must be readable on the source and on its deep copy."""
        self.transcript.external_scores.update({"test": 0, "test1": 1})
        self._assert_expected_scores(self.transcript)
        duplicate = self.transcript.deepcopy()
        self._assert_expected_scores(duplicate)
def test_correct_cds(self):
    """Check ``trim_coding`` on a four-exon transcript carrying a CDS.

    With ``max_length=50`` the expected start coincides with the first CDS
    base (11400); with ``max_length=200`` it is 11300. The expected end is
    ``19000 + max_length`` in both cases. The second deep copy is verified
    to still start at 10000 before it is trimmed.
    """
    exon_coords = [(10000, 11500), (12000, 13000), (15000, 18000), (19000, 20000)]
    cds_coords = [
        (11400, 11500),  # 101 bp
        (12000, 13000),  # 1001 bp -> 1102 cumulative
        (15000, 17998),  # 2999 bp -> 4101 cumulative (divisible by 3)
    ]

    model = Transcript()
    model.chrom = "Chr1"
    model.source = "test"
    model.start = 10000
    model.end = 20000
    model.add_exons(exon_coords)
    model.add_exons(cds_coords, features="CDS")
    model.strand = "+"
    model.finalize()
    null_log = Mikado.utilities.log_utils.create_null_logger("correct_cds")

    narrow = trim_coding(model.deepcopy(), null_log, max_length=50)
    self.assertEqual(narrow.start, 11400)
    self.assertEqual(narrow.end, 19050)

    fresh = model.deepcopy()
    self.assertEqual(fresh.start, 10000)
    wide = trim_coding(fresh, null_log, max_length=200)
    self.assertEqual(wide.start, 11300)
    self.assertEqual(wide.end, 19200)
class TranscriptTester(unittest.TestCase):
    """Tests on the monoexonic transcript AT1G01020.1 (Chr1:5928-8737, CDS 8571-8666)."""

    # Two GFF records (mRNA + exon). Fields are whitespace-separated here and
    # converted to tab-separated lines below.
    tr_gff = (
        "Chr1 TAIR10 mRNA 5928 8737 . . . "
        "ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1\n"
        "Chr1 TAIR10 exon 5928 8737 . . . Parent=AT1G01020.1"
    )
    tr_lines = tr_gff.split("\n")
    for pos, line in enumerate(tr_lines):
        # Raw string: "\s" inside a plain literal is an invalid escape sequence.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")
    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]
    for gff_line in tr_gff_lines:
        assert gff_line.header is False
    logger = create_null_logger("null")

    def setUp(self):
        """Create the finalized transcript ``self.tr`` and a matching ORF ``self.orf``."""
        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.chrom = "Chr1"
        self.tr.source = "TAIR10"
        self.tr.feature = "mRNA"
        self.tr.start = 5928
        self.tr.end = 8737
        self.tr.strand = "+"
        self.tr.add_exon((5928, 8737))
        self.tr.score = None
        self.tr.id, self.tr.parent, self.tr.name = "AT1G01020.1", "AT1G01020", "AT1G01020.1"
        self.tr.add_exon((8571, 8666), "CDS")
        self.tr.finalize()

        # ORF in transcript coordinates, covering the same span as the CDS above.
        self.orf = Mikado.parsers.bed12.BED12()
        self.orf.chrom = self.tr.id
        self.orf.start = 1
        self.orf.end = self.tr.cdna_length
        self.orf.name = self.tr.id
        self.orf.strand = "+"
        self.orf.score = 0
        self.orf.thick_start = 8571 - 5928 + 1
        self.orf.thick_end = 8666 - 5928 + 1
        self.orf.block_count = 1
        self.orf.blockSize = self.tr.cdna_length
        self.orf.block_starts = 0
        self.orf.has_start_codon = True
        self.orf.has_stop_codon = True
        self.orf.transcriptomic = True
        self.assertFalse(self.orf.invalid, self.orf.invalid_reason)
        self.assertEqual((self.orf.thick_end - self.orf.thick_start + 1) % 3, 0)

    def _make_orf(self, name, thick_start, thick_end, strand, end=None):
        """Return a single-block transcriptomic BED12 ORF located on ``self.tr``.

        :param name: value for the ``name`` field of the ORF.
        :param thick_start: first CDS position, in transcript coordinates.
        :param thick_end: last CDS position, in transcript coordinates.
        :param strand: "+" or "-".
        :param end: ORF end; defaults to the cDNA length of ``self.tr``.
        """
        orf = Mikado.parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = 1
        orf.end = self.tr.cdna_length if end is None else end
        orf.name = name
        orf.strand = strand
        orf.score = 0
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = self.tr.cdna_length
        orf.block_sizes = [self.tr.cdna_length]
        orf.block_starts = [0]
        orf.rgb = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    def test_invalid_inizialization(self):
        """Building a Transcript from the exon record must raise TypeError."""
        with self.assertRaises(TypeError):
            _ = Mikado.loci.Transcript(self.tr_gff_lines[1])

    def test_basics(self):
        """Check coordinates and exon bookkeeping of the fixture transcript."""
        self.assertEqual(self.tr.chrom, "Chr1")
        self.assertEqual(self.tr.exon_num, 1)
        self.assertEqual(self.tr.monoexonic, True)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        self.assertEqual(self.tr.exons, [(5928, 8737)], self.tr.exons)

    def test_cds(self):
        """Load the fixture ORF and check CDS coordinates and codon flags."""
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.combined_cds, self.tr.selected_cds)
        self.assertEqual(self.tr.combined_cds, [(8571, 8666)], self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 8571)
        self.assertEqual(self.tr.selected_cds_end, 8666)
        self.assertEqual(self.tr.has_start_codon, True)
        self.assertEqual(self.tr.has_stop_codon, True)

    def test_equality(self):
        """A deep copy is equal; changing strand or adding an exon breaks equality."""
        new_transcript = self.tr.deepcopy()
        self.assertTrue(new_transcript == self.tr)
        new_transcript.strand = None
        # They have now a different strand
        self.assertFalse(new_transcript == self.tr)
        new_transcript.unfinalize()
        # It becomes a multiexonic transcript, so it must have a strand
        new_transcript.strand = "+"
        new_transcript.end = 9737
        new_exon = Mikado.parsers.GFF.GffLine(self.tr_lines[-1])
        new_exon.strand = "+"
        new_exon.start = 9000
        new_exon.end = 9737
        new_transcript.add_exon(new_exon)
        new_transcript.finalize()
        self.assertTrue(new_transcript != self.tr)

    def test_mono_finalising(self):
        """Build the transcript from GFF records and check both UTRs are non-empty."""
        transcript_line = [line for line in self.tr_gff_lines if line.feature == "mRNA"]
        self.assertEqual(len(transcript_line), 1,
                         "\n".join([str(line) for line in self.tr_gff_lines]))
        tr = Mikado.loci.Transcript(transcript_line[0])
        exon_lines = [line for line in self.tr_gff_lines
                      if line.is_exon is True and "UTR" not in line.feature.upper()]
        tr.add_exons(exon_lines)
        tr.add_exon((8571, 8666), "CDS")
        tr.finalize()
        self.assertGreater(tr.three_utr_length, 0)
        self.assertGreater(tr.five_utr_length, 0)

    def test_invalid_transcript(self):
        """A transcript with inconsistent CDS records must raise InvalidCDS.

        The first CDS record below has start (8571) greater than end (7500).
        """
        lines = (
            "Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t.\t.\t"
            "ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1\n"
            "Chr1\tTAIR10\tCDS\t8571\t7500\t.\t.\t0\tParent=AT1G01020.1;\n"
            "Chr1\tTAIR10\tCDS\t7503\t8666\t.\t.\t0\tParent=AT1G01020.1;\n"
            "Chr1\tTAIR10\texon\t5928\t8737\t.\t.\t.\tParent=AT1G01020.1"
        )
        gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in lines.split("\n")]
        self.assertIsInstance(gff_lines[0], Mikado.parsers.GFF.GffLine)
        feature = gff_lines[0].feature
        self.assertTrue(feature.endswith("transcript") or "RNA" in feature.upper())
        self.assertTrue(gff_lines[0].is_transcript)
        transcript = Mikado.loci.Transcript(gff_lines[0])
        transcript.logger = self.logger
        transcript.add_exons(gff_lines[1:])
        with self.assertRaises(Mikado.exceptions.InvalidCDS):
            Mikado.transcripts.transcript_methods.finalizing._check_cdna_vs_utr(transcript)

    def test_utr(self):
        """Check the UTR segments derived from the exon and the CDS."""
        self.assertEqual(
            self.tr.selected_internal_orf,
            [("UTR", (5928, 8570)),
             ("exon", (5928, 8737)),
             ("CDS", (8571, 8666), 0),
             ("UTR", (8667, 8737))],
            "Right: {0}\nFound{1}".format(
                [("UTR", 5928, 8570), ("CDS", 8571, 8666), ("UTR", 8667, 8737)],
                self.tr.selected_internal_orf))
        self.assertEqual(self.tr.combined_utr, [(5928, 8570), (8667, 8737)])
        self.assertEqual(self.tr.five_utr, [(5928, 8570)], self.tr.five_utr)
        self.assertEqual(self.tr.three_utr, [(8667, 8737)])

    def test_utr_metrics(self):
        """Test for UTR exon num, start distance, etc."""
        self.assertEqual(self.tr.five_utr_num, 1)
        self.assertEqual(self.tr.three_utr_num, 1)
        self.assertEqual(self.tr.five_utr_length, 8570 + 1 - 5928)
        self.assertEqual(self.tr.three_utr_length, 8737 + 1 - 8667)
        # On failure, report the value under test (the TSS distance).
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         8571 - 5928,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         8737 - 8666,
                         (self.tr.selected_end_distance_from_tes, self.tr.strand))

    def test_strip_cds(self):
        """After ``strip_cds`` no CDS or UTR information must be left."""
        self.tr.strip_cds()
        self.assertEqual(self.tr.selected_cds_length, 0)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.selected_cds, [])
        self.assertEqual(self.tr.selected_cds_start, None)
        self.assertEqual(self.tr.selected_cds_end, None)

    def test_remove_utr(self):
        """Test for UTR removal.

        We remove the UTRs and verify that start/end have moved onto the CDS
        boundaries, that no UTR is present and that the CDS is unchanged.
        """
        self.tr.remove_utrs()
        self.assertEqual(self.tr.selected_cds_start, self.tr.start)
        self.assertEqual(self.tr.selected_cds_end, self.tr.end)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.combined_cds, [(8571, 8666)], self.tr.combined_cds)
        self.assertEqual(self.tr.combined_utr, [], self.tr.combined_utr)

    def test_negative_orf(self):
        """Test loading a negative strand ORF onto a monoexonic transcript.

        This should reverse the ORF: the transcript strand becomes "-" and
        the CDS start/end are swapped.
        """
        self.orf.strand = "-"
        self.tr.strip_cds(strand_specific=False)
        self.orf.has_stop_codon = False
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 8571)

    def test_introns(self):
        """A monoexonic transcript has no introns of any kind."""
        self.assertEqual(self.tr.introns, set(), self.tr.introns)
        self.assertEqual(self.tr.combined_cds_introns, set(),
                         self.tr.combined_cds_introns)
        self.assertEqual(self.tr.selected_cds_introns, set(),
                         self.tr.selected_cds_introns)

    def testDoubleOrf(self):
        """Test to verify the introduction of multiple ORFs (plus strand)."""
        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = self._make_orf(self.tr.id, 51, 398, "+")
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated: it overlaps the first ORF.
        second_orf = self._make_orf("second", 201, 410, "+")
        self.assertFalse(second_orf.invalid)
        self.assertTrue(Mikado.loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._make_orf("third", 501, 800, "+")
        self.assertFalse(third_orf.invalid)
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]
        self.tr.logger = self.logger
        # Load each ORF on its own first, then all of them together.
        self.tr.load_orfs([first_orf])
        self.tr.load_orfs([second_orf])
        self.tr.load_orfs([third_orf])
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         (self.tr.cdna_length,
                          self.tr.selected_start_distance_from_tss,
                          self.tr.selected_end_distance_from_tes))
        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())
        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)

    def testDoubleOrf_negative(self):
        """Test to verify the introduction of multiple ORFs (minus strand).

        NOTE: unlike ``testDoubleOrf``, the overlap between the first and
        second ORF and the combined CDS length are not asserted here; those
        checks were already disabled in this test.
        """
        self.tr.strip_cds(strand_specific=False)
        self.tr.finalized = False

        first_orf = self._make_orf(self.tr.id, 51, 398, "-")
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._make_orf("second", 201, 410, "-")
        self.assertFalse(second_orf.invalid)

        # This should be added
        third_orf = self._make_orf("third", 501, 800, "-")
        self.assertFalse(third_orf.invalid)
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]
        self.tr.logger = self.logger
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         (self.tr.cdna_length,
                          self.tr.selected_start_distance_from_tss,
                          self.tr.selected_end_distance_from_tes))
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())
        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].five_utr_length, 0)
        self.assertEqual(new_transcripts[1].three_utr_length, 0)

    def test_wrong_orf(self):
        """An ORF ending past the cDNA must be logged and leave the transcript non-coding."""
        # The ORF end lies one base beyond the cDNA length of the transcript.
        orf = self._make_orf("third", 501, 800, "-", end=self.tr.cdna_length + 1)
        self.assertFalse(orf.invalid)

        self.tr.logger = self.logger
        self.tr.strip_cds()
        self.tr.strand = "+"
        self.logger.setLevel("WARNING")
        # assertLogs fails the test unless something is logged on "null".
        with self.assertLogs("null", level="DEBUG"):
            self.tr.load_orfs([orf])

        self.assertFalse(self.tr.is_coding)
class TranscriptTester(unittest.TestCase):
    """Tests on the monoexonic transcript AT1G01020.1 (Chr1:5928-8737, CDS 8571-8666)."""

    # Two GFF records (mRNA + exon). Fields are whitespace-separated here and
    # converted to tab-separated lines below.
    tr_gff = (
        "Chr1 TAIR10 mRNA 5928 8737 . . . "
        "ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1\n"
        "Chr1 TAIR10 exon 5928 8737 . . . Parent=AT1G01020.1"
    )
    tr_lines = tr_gff.split("\n")
    for pos, line in enumerate(tr_lines):
        # Raw string: "\s" inside a plain literal is an invalid escape sequence.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")
    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]
    for gff_line in tr_gff_lines:
        assert gff_line.header is False
    logger = create_null_logger("null")

    def setUp(self):
        """Create the finalized transcript ``self.tr`` and a matching ORF ``self.orf``."""
        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.chrom = "Chr1"
        self.tr.source = "TAIR10"
        self.tr.feature = "mRNA"
        self.tr.start = 5928
        self.tr.end = 8737
        self.tr.strand = "+"
        self.tr.add_exon((5928, 8737))
        self.tr.score = None
        self.tr.id, self.tr.parent, self.tr.name = "AT1G01020.1", "AT1G01020", "AT1G01020.1"
        self.tr.add_exon((8571, 8666), "CDS")
        self.tr.finalize()

        # ORF in transcript coordinates, covering the same span as the CDS above.
        self.orf = Mikado.parsers.bed12.BED12()
        self.orf.chrom = self.tr.id
        self.orf.start = 1
        self.orf.end = self.tr.cdna_length
        self.orf.name = self.tr.id
        self.orf.strand = "+"
        self.orf.score = 0
        self.orf.thick_start = 8571 - 5928 + 1
        self.orf.thick_end = 8666 - 5928 + 1
        self.orf.block_count = 1
        self.orf.blockSize = self.tr.cdna_length
        self.orf.block_starts = 0
        self.orf.has_start_codon = True
        self.orf.has_stop_codon = True
        self.orf.transcriptomic = True
        self.assertFalse(self.orf.invalid, self.orf.invalid_reason)
        self.assertEqual((self.orf.thick_end - self.orf.thick_start + 1) % 3, 0)

    def _make_orf(self, name, thick_start, thick_end, strand, end=None):
        """Return a single-block transcriptomic BED12 ORF located on ``self.tr``.

        :param name: value for the ``name`` field of the ORF.
        :param thick_start: first CDS position, in transcript coordinates.
        :param thick_end: last CDS position, in transcript coordinates.
        :param strand: "+" or "-".
        :param end: ORF end; defaults to the cDNA length of ``self.tr``.
        """
        orf = Mikado.parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = 1
        orf.end = self.tr.cdna_length if end is None else end
        orf.name = name
        orf.strand = strand
        orf.score = 0
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = self.tr.cdna_length
        orf.block_sizes = [self.tr.cdna_length]
        orf.block_starts = [0]
        orf.rgb = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    def test_invalid_inizialization(self):
        """Building a Transcript from the exon record must raise TypeError."""
        with self.assertRaises(TypeError):
            _ = Mikado.loci.Transcript(self.tr_gff_lines[1])

    def test_basics(self):
        """Check coordinates and exon bookkeeping of the fixture transcript."""
        self.assertEqual(self.tr.chrom, "Chr1")
        self.assertEqual(self.tr.exon_num, 1)
        self.assertEqual(self.tr.monoexonic, True)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        self.assertEqual(self.tr.exons, [(5928, 8737)], self.tr.exons)

    def test_cds(self):
        """Load the fixture ORF and check CDS coordinates and codon flags."""
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.combined_cds, self.tr.selected_cds)
        self.assertEqual(self.tr.combined_cds, [(8571, 8666)], self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 8571)
        self.assertEqual(self.tr.selected_cds_end, 8666)
        self.assertEqual(self.tr.has_start_codon, True)
        self.assertEqual(self.tr.has_stop_codon, True)

    def test_equality(self):
        """A deep copy is equal; changing strand or adding an exon breaks equality."""
        new_transcript = self.tr.deepcopy()
        self.assertTrue(new_transcript == self.tr)
        new_transcript.strand = None
        # They have now a different strand
        self.assertFalse(new_transcript == self.tr)
        new_transcript.unfinalize()
        # It becomes a multiexonic transcript, so it must have a strand
        new_transcript.strand = "+"
        new_transcript.end = 9737
        new_exon = Mikado.parsers.GFF.GffLine(self.tr_lines[-1])
        new_exon.strand = "+"
        new_exon.start = 9000
        new_exon.end = 9737
        new_transcript.add_exon(new_exon)
        new_transcript.finalize()
        self.assertTrue(new_transcript != self.tr)

    def test_mono_finalising(self):
        """Build the transcript from GFF records and check both UTRs are non-empty."""
        transcript_line = [line for line in self.tr_gff_lines if line.feature == "mRNA"]
        self.assertEqual(len(transcript_line), 1,
                         "\n".join([str(line) for line in self.tr_gff_lines]))
        tr = Mikado.loci.Transcript(transcript_line[0])
        exon_lines = [line for line in self.tr_gff_lines
                      if line.is_exon is True and "UTR" not in line.feature.upper()]
        tr.add_exons(exon_lines)
        tr.add_exon((8571, 8666), "CDS")
        tr.finalize()
        self.assertGreater(tr.three_utr_length, 0)
        self.assertGreater(tr.five_utr_length, 0)

    def test_invalid_transcript(self):
        """A transcript with inconsistent CDS records must raise InvalidCDS.

        The first CDS record below has start (8571) greater than end (7500).
        """
        lines = (
            "Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t.\t.\t"
            "ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1\n"
            "Chr1\tTAIR10\tCDS\t8571\t7500\t.\t.\t0\tParent=AT1G01020.1;\n"
            "Chr1\tTAIR10\tCDS\t7503\t8666\t.\t.\t0\tParent=AT1G01020.1;\n"
            "Chr1\tTAIR10\texon\t5928\t8737\t.\t.\t.\tParent=AT1G01020.1"
        )
        gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in lines.split("\n")]
        self.assertIsInstance(gff_lines[0], Mikado.parsers.GFF.GffLine)
        feature = gff_lines[0].feature
        self.assertTrue(feature.endswith("transcript") or "RNA" in feature.upper())
        self.assertTrue(gff_lines[0].is_transcript)
        transcript = Mikado.loci.Transcript(gff_lines[0])
        transcript.logger = self.logger
        transcript.add_exons(gff_lines[1:])
        with self.assertRaises(Mikado.exceptions.InvalidCDS):
            Mikado.loci.transcript_methods.finalizing._check_cdna_vs_utr(transcript)

    def test_utr(self):
        """Check the UTR segments derived from the exon and the CDS."""
        self.assertEqual(
            self.tr.selected_internal_orf,
            [("UTR", (5928, 8570)),
             ("exon", (5928, 8737)),
             ("CDS", (8571, 8666), 0),
             ("UTR", (8667, 8737))],
            "Right: {0}\nFound{1}".format(
                [("UTR", 5928, 8570), ("CDS", 8571, 8666), ("UTR", 8667, 8737)],
                self.tr.selected_internal_orf))
        self.assertEqual(self.tr.combined_utr, [(5928, 8570), (8667, 8737)])
        self.assertEqual(self.tr.five_utr, [(5928, 8570)], self.tr.five_utr)
        self.assertEqual(self.tr.three_utr, [(8667, 8737)])

    def test_utr_metrics(self):
        """Test for UTR exon num, start distance, etc."""
        self.assertEqual(self.tr.five_utr_num, 1)
        self.assertEqual(self.tr.three_utr_num, 1)
        self.assertEqual(self.tr.five_utr_length, 8570 + 1 - 5928)
        self.assertEqual(self.tr.three_utr_length, 8737 + 1 - 8667)
        # On failure, report the value under test (the TSS distance).
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         8571 - 5928,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         8737 - 8666,
                         (self.tr.selected_end_distance_from_tes, self.tr.strand))

    def test_strip_cds(self):
        """After ``strip_cds`` no CDS or UTR information must be left."""
        self.tr.strip_cds()
        self.assertEqual(self.tr.selected_cds_length, 0)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.selected_cds, [])
        self.assertEqual(self.tr.selected_cds_start, None)
        self.assertEqual(self.tr.selected_cds_end, None)

    def test_remove_utr(self):
        """Test for UTR removal.

        We remove the UTRs and verify that start/end have moved onto the CDS
        boundaries, that no UTR is present and that the CDS is unchanged.
        """
        self.tr.remove_utrs()
        self.assertEqual(self.tr.selected_cds_start, self.tr.start)
        self.assertEqual(self.tr.selected_cds_end, self.tr.end)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.combined_cds, [(8571, 8666)], self.tr.combined_cds)
        self.assertEqual(self.tr.combined_utr, [], self.tr.combined_utr)

    def test_negative_orf(self):
        """Test loading a negative strand ORF onto a monoexonic transcript.

        This should reverse the ORF: the transcript strand becomes "-" and
        the CDS start/end are swapped.
        """
        self.orf.strand = "-"
        self.tr.strip_cds(strand_specific=False)
        self.orf.has_stop_codon = False
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 8571)

    def test_introns(self):
        """A monoexonic transcript has no introns of any kind."""
        self.assertEqual(self.tr.introns, set(), self.tr.introns)
        self.assertEqual(self.tr.combined_cds_introns, set(),
                         self.tr.combined_cds_introns)
        self.assertEqual(self.tr.selected_cds_introns, set(),
                         self.tr.selected_cds_introns)

    def testDoubleOrf(self):
        """Test to verify the introduction of multiple ORFs (plus strand)."""
        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = self._make_orf(self.tr.id, 51, 398, "+")
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated: it overlaps the first ORF.
        second_orf = self._make_orf("second", 201, 410, "+")
        self.assertFalse(second_orf.invalid)
        self.assertTrue(Mikado.loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._make_orf("third", 501, 800, "+")
        self.assertFalse(third_orf.invalid)
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]
        self.tr.logger = self.logger
        # Load each ORF on its own first, then all of them together.
        self.tr.load_orfs([first_orf])
        self.tr.load_orfs([second_orf])
        self.tr.load_orfs([third_orf])
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         (self.tr.cdna_length,
                          self.tr.selected_start_distance_from_tss,
                          self.tr.selected_end_distance_from_tes))
        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())
        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)

    def testDoubleOrf_negative(self):
        """Test to verify the introduction of multiple ORFs (minus strand).

        NOTE: unlike ``testDoubleOrf``, the overlap between the first and
        second ORF and the combined CDS length are not asserted here; those
        checks were already disabled in this test.
        """
        self.tr.strip_cds(strand_specific=False)
        self.tr.finalized = False

        first_orf = self._make_orf(self.tr.id, 51, 398, "-")
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._make_orf("second", 201, 410, "-")
        self.assertFalse(second_orf.invalid)

        # This should be added
        third_orf = self._make_orf("third", 501, 800, "-")
        self.assertFalse(third_orf.invalid)
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]
        self.tr.logger = self.logger
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         (self.tr.cdna_length,
                          self.tr.selected_start_distance_from_tss,
                          self.tr.selected_end_distance_from_tes))
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())
        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].five_utr_length, 0)
        self.assertEqual(new_transcripts[1].three_utr_length, 0)

    def test_wrong_orf(self):
        """An ORF ending past the cDNA must be logged and leave the transcript non-coding."""
        # The ORF end lies one base beyond the cDNA length of the transcript.
        orf = self._make_orf("third", 501, 800, "-", end=self.tr.cdna_length + 1)
        self.assertFalse(orf.invalid)

        self.tr.logger = self.logger
        self.tr.strip_cds()
        self.tr.strand = "+"
        self.logger.setLevel("WARNING")
        # assertLogs fails the test unless something is logged on "null".
        with self.assertLogs("null", level="DEBUG"):
            self.tr.load_orfs([orf])

        self.assertFalse(self.tr.is_coding)
class TestPadding(unittest.TestCase):
    """Tests for ``pad_transcript`` and for padding performed by ``Locus``
    while finalising alternative splicing, run against the ``chr5.fas.gz``
    resource shipped in ``Mikado.tests``."""

    @classmethod
    def setUpClass(cls):
        # Path (not an open handle) of the genome FASTA used by the tests.
        cls.fai = pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")

    def setUp(self):
        bed_line = (
            "Chr5\t26574999\t26578625\tID=AT5G66600.3;coding=True;phase=0\t0\t-\t"
            "26575104\t26578315\t0\t11\t"
            "411,126,87,60,100,809,126,72,82,188,107\t"
            "0,495,711,885,1035,1261,2163,2378,2856,3239,3519"
        )
        self.reference = Transcript(BED12(bed_line), source="TAIR10", is_reference=True)

    def test_basic_padding(self):
        """Pad the reference with a template that has extra exons at both
        ends: first on one side, then on the other, then on both."""
        logger = create_null_logger("test_basic_padding")
        logger.setLevel("INFO")
        template = self.reference.copy()
        template.id = "AT5G66600.3_exp"
        template.strip_cds()
        template.unfinalize()

        template.remove_exon((26575000, 26575410))  # First exon
        template.start = 26574650
        template.add_exon((26574970, 26575410))  # New exon, template at 5'
        template.add_exon((26574650, 26574820))  # New UTR exon

        template.remove_exon((26578519, 26578625))  # Last exon
        template.end = 26579700
        template.add_exon((26578519, 26578725))
        template.add_exon((26579325, 26579700))

        template.finalize()

        fai = pysam.FastaFile(self.fai)
        # Make sure the FASTA handle is released even if an assertion fails.
        self.addCleanup(fai.close)

        new5 = pad_transcript(self.reference, self.reference.deepcopy(),
                              None, template, fai, logger)
        self.assertIn((26574970, 26575410), new5.exons)
        self.assertIn((26574650, 26574820), new5.exons)
        self.assertEqual(template.start, new5.start)
        self.assertEqual(self.reference.end, new5.end)

        new3 = pad_transcript(self.reference, self.reference.deepcopy(),
                              template, None, fai, logger)
        self.assertIn((26578519, 26578725), new3.exons)
        self.assertIn((26579325, 26579700), new3.exons)
        self.assertEqual(self.reference.start, new3.start)
        # The result of *this* padding call is the one under test here.
        self.assertEqual(template.end, new3.end)

        new53 = pad_transcript(self.reference, self.reference.deepcopy(),
                               template, template, fai, logger)
        self.assertIn((26574970, 26575410), new53.exons)
        self.assertIn((26574650, 26574820), new53.exons)
        self.assertIn((26578519, 26578725), new53.exons)
        self.assertIn((26579325, 26579700), new53.exons)
        self.assertEqual(template.start, new53.start)
        self.assertEqual(template.end, new53.end)

    def test_locus_padding_equal_or_n(self):
        """For two alternative exon sets, with padding switched off and on,
        check that the template is dropped from the locus and that the
        reference boundaries change only when padding is enabled."""
        exon_sets = [
            ((26574970, 26575410), (26578519, 26578725)),
            ((26574970, 26575410), (26574650, 26574820),
             (26578519, 26578725), (26579325, 26579700)),
        ]
        for num, exons_to_add in enumerate(exon_sets):
            for num2, pad_transcripts in enumerate((False, True)):
                with self.subTest(exons_to_add=exons_to_add,
                                  pad_transcripts=pad_transcripts):
                    logger = create_null_logger(
                        "test_locus_padding_equal_or_n_" + str(num + num2 * 2))
                    logger.setLevel("DEBUG")
                    template = self.reference.copy()
                    del template.is_reference
                    template.id = "AT5G66600.3_exp"
                    template.unfinalize()
                    template.remove_exon((26575000, 26575410))  # First exon
                    template.remove_exon((26578519, 26578625))  # Last exon
                    template.start = min(exon[0] for exon in exons_to_add)
                    template.end = max(exon[1] for exon in exons_to_add)
                    template.add_exons(exons_to_add)  # New exon, template at 5'
                    template.finalize()

                    json_conf = load_and_validate_config(None)
                    json_conf.reference.genome = self.fai
                    json_conf.pick.alternative_splicing.only_confirmed_introns = False
                    json_conf.pick.run_options.only_reference_update = True
                    json_conf.pick.alternative_splicing.pad = pad_transcripts

                    locus = Locus(self.reference.copy(), logger=logger,
                                  configuration=json_conf)
                    self.assertTrue(locus[self.reference.id].is_reference)
                    self.assertEqual(locus.perform_padding, pad_transcripts)
                    locus.add_transcript_to_locus(template)
                    if pad_transcripts is True:
                        self.assertIn(template.id, locus)
                    locus.finalize_alternative_splicing()
                    self.assertNotIn(template.id, locus)

                    if pad_transcripts is False:
                        self.assertEqual(locus[self.reference.id].start,
                                         self.reference.start)
                        self.assertEqual(locus[self.reference.id].end,
                                         self.reference.end)
                    else:
                        self.assertTrue(locus.perform_padding)
                        self.assertEqual(
                            locus[self.reference.id].start, template.start,
                            (locus[self.reference.id].exons[0], template.exons[0]))
                        self.assertEqual(
                            locus[self.reference.id].end, template.end,
                            (locus[self.reference.id].end, template.end))
                    self.assertNotIn(template.id, locus)

    def test_removal_after_padding(self):
        """Here we test that, given three transcripts, the first one will be
        expanded to be identical to the second; the second will be removed as
        redundant; the third will be expanded to be compatible with the
        padded first.
        """
        logger = create_default_logger("test_removal_after_padding", "INFO")
        json_conf = load_and_validate_config(None)
        json_conf.reference.genome = self.fai
        json_conf.pick.alternative_splicing.min_cds_overlap = 0.2
        json_conf.pick.alternative_splicing.min_cdna_overlap = 0.2
        json_conf.pick.alternative_splicing.only_confirmed_introns = False
        json_conf.pick.alternative_splicing.keep_retained_introns = True
        json_conf.pick.alternative_splicing.pad = True

        # Accept every transcript with a positive cDNA length, both as a
        # general requirement and as an alternative-splicing requirement.
        requirements = json_conf.scoring.requirements
        requirements.expression = ["cdna_length"]
        requirements.parameters = {
            "cdna_length": SizeFilter(operator="gt", value=0)
        }
        requirements._expression = requirements._create_expression(
            requirements.expression, requirements.parameters)

        as_requirements = json_conf.scoring.as_requirements
        as_requirements.expression = ["cdna_length"]
        as_requirements.parameters = {
            "cdna_length": SizeFilter(operator="gt", value=0)
        }
        # Built from the as_requirements fields themselves (same values as
        # the general requirements above).
        as_requirements._expression = requirements._create_expression(
            as_requirements.expression, as_requirements.parameters)

        t1 = Transcript(
            BED12(
                "Chr5\t26584779\t26587869\tID=AT5G66610.1;coding=True;phase=0\t0\t+\t"
                "26585222\t26587755\t0\t11\t"
                "100,54,545,121,78,105,213,63,119,59,443\t"
                "0,440,565,1202,1437,1640,1858,2154,2304,2507,2647"
            ))
        t2_1 = Transcript(
            BED12(
                "Chr5\t26584773\t26586510\tID=AT5G66610.2_1;coding=True;phase=0\t0\t+\t"
                "26585222\t26586510\t0\t"
                "6\t177,54,545,121,78,85\t0,446,571,1208,1443,1652"
            ))
        t2_2 = Transcript(
            BED12(
                "Chr5\t26584873\t26587782\tID=AT5G66610.2_2;coding=True;phase=0\t0\t+\t"
                "26585222\t"
                "26587755\t0\t10\t77,54,545,121,78,99,213,63,119,496\t"
                "0,346,471,1108,1343,1552,1764,2060,2210,2413"
            ))

        t1.finalize()
        t2_1.finalize()
        t2_2.finalize()
        t1.is_reference = True

        self.assertEqual(t1.start, 26584780)

        locus = Locus(t1, logger=logger, configuration=json_conf)
        locus.add_transcript_to_locus(t2_1, check_in_locus=False)
        locus.add_transcript_to_locus(t2_2, check_in_locus=False)
        self.assertTrue(locus.primary_transcript_id == t1.id)
        locus.logger.setLevel("INFO")
        locus.finalize_alternative_splicing(_scores={
            t1.id: 20,
            t2_1.id: 15,
            t2_2.id: 10
        })
        self.assertIn(t1.id, locus.transcripts)
        if t2_1.id in locus.transcripts:
            # Diagnostic output only: show how the surviving transcripts
            # compare before the assertion below fails.
            for tid1, tid2 in itertools.combinations(locus.transcripts.keys(), 2):
                res, _ = Assigner.compare(locus[tid1], locus[tid2])
                print(tid1, tid2, res.ccode)

        self.assertNotIn(t2_1.id, locus.transcripts.keys(),
                         [(key, val.start, val.end)
                          for key, val in locus.transcripts.items()])
        self.assertIn(t2_2.id, locus.transcripts,
                      "\n".join(tr.format("bed12") for tr in locus))
        self.assertTrue(locus[t2_2.id].attributes["padded"])
        # self.assertTrue(locus[t1.id].attributes["padded"])
        self.assertGreaterEqual(t1.start, locus[t1.id].start,
                                locus[t1.id].format("bed12"))
        self.assertEqual(
            locus[t2_2.id].start,
            locus[t1.id].start,
            ((locus[t2_2.id].start, t1.start, t2_1.start, t2_2.start),
             (locus[t2_2.id].end, t1.end, t2_1.end, t2_2.end)),
        )
        self.assertEqual(locus[t1.id].end, locus[t2_2.id].end)

    def test_add_two_partials(self):
        """Add two partial coding templates to a mono-exonic reference locus
        with padding enabled, then check that both templates are gone after
        finalisation and that the reference end matches the second template's
        end."""
        logger = create_null_logger("test_add_two_partials")
        logger.setLevel("INFO")
        json_conf = load_and_validate_config(None)
        json_conf.reference.genome = self.fai
        json_conf.pick.alternative_splicing.only_confirmed_introns = False
        json_conf.pick.run_options.only_reference_update = True

        ref = Transcript(is_reference=True)
        ref.chrom, ref.strand, ref.id = "Chr5", "-", "AT5G66670.2"
        ref.add_exons([(26611258, 26612889)])
        ref.add_exons([(26611474, 26612700)], features=["CDS"])
        ref.finalize()
        self.assertTrue(ref.is_coding)

        # Chr5 TAIR10 mRNA 26611258 26612889 . - . ID=AT5G66670.2;Parent=AT5G66670;Name=AT5G66670.2;index=1
        # Chr5 TAIR10 protein 26611474 26612700 . - . ID=AT5G66670.2-Protein;Parent=AT5G66670.2;Name=AT5G66670.2;derives_from=AT5G66670.2
        # Chr5 TAIR10 three_prime_UTR 26611258 26611473 . - . Parent=AT5G66670.2
        # Chr5 TAIR10 CDS 26611474 26612700 . - 0 Parent=AT5G66670.2
        # Chr5 TAIR10 five_prime_UTR 26612701 26612889 . - . Parent=AT5G66670.2
        # Chr5 TAIR10 exon 26611258 26612889 . - . Parent=AT5G66670.2

        template1 = Transcript(is_reference=False)
        template1.chrom, template1.strand, template1.id = ref.chrom, ref.strand, ref.id + "_frag1"
        template1.add_exons(((26611116, 26611157), (26611258, 26612670)))
        template1.add_exons(((26611474, 26612670), ), features=["CDS"])
        template1.finalize()
        self.assertTrue(template1.is_coding)

        template2 = Transcript(is_reference=False)
        template2.chrom, template2.strand, template2.id = ref.chrom, ref.strand, ref.id + "_frag2"
        template2.add_exons(((26611574, 26612889), (26613007, 26613403)))
        template2.add_exons(((26611574, 26612700), ), features=["CDS"])
        template2.finalize()
        self.assertTrue(template2.is_coding)

        logger.setLevel("INFO")
        json_conf.pick.alternative_splicing.pad = True
        locus = Locus(ref, configuration=json_conf, logger=logger)
        locus.add_transcript_to_locus(template1)
        locus.add_transcript_to_locus(template2)
        self.assertIn(template2.id, locus)
        # self.assertIn(template1.id, locus)
        # locus.logger.setLevel("DEBUG")
        # for tid in locus:
        #     locus[tid].logger.setLevel("DEBUG")

        locus.finalize_alternative_splicing(check_requirements=False)
        self.assertTrue(locus._finalized)
        self.assertNotIn(template1.id, locus, "\n" + str(locus))
        self.assertNotIn(template2.id, locus, "\n" + str(locus))
        self.assertEqual(
            locus[ref.id].end, template2.end,
            ((locus[ref.id].end, ref.end, template2.end, template1.end),
             (locus[ref.id].start, ref.start, template2.start, template1.start)))

    @unittest.skip("Failing expansion case; originally skipped without a stated reason")
    def test_failed_expansion(self):
        """Replay a padding case stored under ``test_pick_pad``; the test only
        calls ``pad_transcript`` and makes no assertions of its own."""
        logger = create_default_logger("test_failed_expansion", level="WARNING")
        bed_path = pkg_resources.resource_filename(
            "Mikado.tests", os.path.join("test_pick_pad", "fail.bed12"))
        # Read every record while the file is open, then release the handle.
        with open(bed_path) as bed_handle:
            raw = [Transcript(line, logger=logger)
                   for line in Bed12Parser(bed_handle)]
        transcripts = dict((_.id, _) for _ in raw)
        for transcript in transcripts.values():
            transcript.finalize()
        template = transcripts["template"]  # 4535908 4540293
        candidate = transcripts["candidate"]  # 4536444 4540027
        backup = candidate.copy()
        fai = pysam.FastaFile(
            pkg_resources.resource_filename(
                "Mikado.tests",
                os.path.join("test_pick_pad", "failing_seq.fa.gz")))
        self.addCleanup(fai.close)
        logger.setLevel("DEBUG")
        candidate.logger.setLevel("DEBUG")
        pad_transcript(candidate, backup,
                       start_transcript=template,
                       end_transcript=template,
                       fai=fai, logger=logger)