def test_index(self):
    """Check that ``compare`` builds a loadable index for every reference format.

    Each bundled Trinity annotation is copied into the system temporary
    directory and ``compare`` is called with ``namespace.index`` set.
    The test then verifies that the log file and a non-empty ``.midx``
    file exist and that ``load_index`` returns two dictionaries, the
    first one holding 38 entries.

    Files left over from an earlier failed sub-test or run are deleted
    before ``compare`` is invoked, so the existence checks cannot be
    satisfied by stale artifacts, and everything created here is removed
    in a ``finally`` clause whether or not the assertions pass.
    """
    # Create the list of files
    files = [
        "trinity.gtf",
        "trinity.gff3",
        "trinity.cDNA_match.gff3",
        "trinity.match_matchpart.gff3",
    ]

    namespace = Namespace(default=False)
    namespace.distance = 2000
    namespace.index = True
    namespace.prediction = None
    namespace.log = os.path.join(tempfile.gettempdir(), "index.log")
    logger = create_null_logger("null")

    def discard(path):
        """Delete *path* when it is present; a missing file is not an error."""
        if os.path.exists(path):
            os.remove(path)

    for ref in files:
        with self.subTest(ref=ref):
            temp_ref = os.path.join(tempfile.gettempdir(), ref)
            # Everything this sub-test may create; removed in ``finally``.
            created = [temp_ref, namespace.log]
            # A stale log would make the existence check below pass vacuously.
            discard(namespace.log)
            try:
                with pkg_resources.resource_stream("Mikado.tests", ref) as ref_handle, \
                        open(temp_ref, "wb") as out_handle:
                    out_handle.write(ref_handle.read())
                namespace.reference = to_gff(temp_ref)
                index_name = "{}.midx".format(namespace.reference.name)
                created.extend([namespace.reference.name, index_name])
                # Same reasoning as for the log: start without a stale index.
                discard(index_name)

                compare(namespace)

                self.assertTrue(os.path.exists(namespace.log))
                self.assertTrue(os.path.exists(index_name))
                self.assertGreater(os.stat(index_name).st_size, 0)
                genes, positions = load_index(namespace, logger)
                self.assertIsInstance(genes, dict)
                self.assertIsInstance(positions, dict)
                self.assertEqual(len(genes), 38)
            finally:
                # Runs on success and on failure alike, so one failing format
                # cannot pollute the next sub-test or the next test run.
                for path in created:
                    discard(path)
class DrosoTester(unittest.TestCase):
    """Checks on a multi-exon coding reference transcript given as inline GTF.

    ``setUp`` builds two transcripts from GTF text embedded in the test:
    ``self.ref`` (exon and CDS features) and ``self.pred`` (exon features
    only).
    """

    logger = create_null_logger("droso")

    @staticmethod
    def _gtf_records(gtf_text):
        """Turn every non-empty line of *gtf_text* into a ``GtfLine``."""
        return [parsers.GTF.GtfLine(row)
                for row in gtf_text.split("\n") if row != '']

    def setUp(self):
        """Create ``self.ref`` and ``self.pred`` from the embedded GTF text."""
        # NOTE: the records must start at column 0; the text is split on
        # newlines and each line is handed to the GTF parser unchanged.
        ref_gtf = """2L\tprotein_coding\ttranscript\t523736\t540560\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "1"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:5"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t523736\t524059\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "1"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:5"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t525392\t525436\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "2"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:677"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t536023\t536966\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "3"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:7"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t537037\t537431\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "4"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:8"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t537549\t537749\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "5"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:9"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t537863\t539249\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "6"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:10"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t539310\t539452\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "7"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:11"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t539518\t540560\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "8"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:13"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t524038\t524059\t.\t+\t0\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "1"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t525392\t525436\t.\t+\t2\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "2"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t536023\t536966\t.\t+\t2\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "3"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t537037\t537431\t.\t+\t0\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "4"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t537549\t537749\t.\t+\t1\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "5"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t537863\t539249\t.\t+\t1\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "6"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t539310\t539452\t.\t+\t0\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "7"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t539518\t540016\t.\t+\t1\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "8"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
"""

        pred_gtf = """2L\tStringTie\ttranscript\t476445\t479670\t1000\t-\t.\tgene_id "Stringtie.63"; transcript_id "Stringtie.63.1"; cov "141.769424"; FPKM "inf";
2L\tStringTie\texon\t476445\t478204\t1000\t-\t.\tgene_id "Stringtie.63"; transcript_id "Stringtie.63.1"; exon_number "1"; cov "149.294586";
2L\tStringTie\texon\t479407\t479670\t1000\t-\t.\tgene_id "Stringtie.63"; transcript_id "Stringtie.63.1"; exon_number "2"; cov "91.601692";"""

        # The first record describes the transcript; the rest are its features.
        ref_header, *ref_features = self._gtf_records(ref_gtf)
        self.ref = loci.Transcript(ref_header)
        self.ref.logger = self.logger
        for feature in ref_features:
            self.ref.add_exon(feature)
        self.ref.finalize()

        # The prediction is built the same way but is given no logger.
        pred_header, *pred_features = self._gtf_records(pred_gtf)
        self.pred = loci.Transcript(pred_header)
        for feature in pred_features:
            self.pred.add_exon(feature)
        self.pred.finalize()

    def test_code(self):
        """The reference must carry a CDS with seven CDS introns."""
        reference = self.ref
        reference.finalize()
        self.assertGreater(len(reference.combined_cds), 0)
        self.assertEqual(len(reference.selected_cds_introns), 7)
        self.assertEqual(len(reference.combined_cds_introns), 7)
def test_locus(self):
    """Walk a two-transcript Superlocus through every definition stage.

    A superlocus is created from ``self.transcript1`` and extended with
    ``self.transcript2``; its strand and span are checked, then the
    sub-locus, mono-sub-locus and locus stages are run and their sizes
    verified (2, 2 and 1). The first transcript of the single locus must
    be ``"t0"``. Finally a single-exon minus-strand transcript is built
    from inline GFF and must produce exactly one locus of its own.
    """
    null_logger = create_null_logger("null")
    null_logger.setLevel("WARNING")
    null_logger.info("Started")

    superlocus = Superlocus(self.transcript1,
                            json_conf=self.my_json,
                            logger=null_logger)
    superlocus.add_transcript_to_locus(self.transcript2)

    # The superlocus must span both of its transcripts.
    self.assertEqual(superlocus.strand, self.transcript1.strand)
    expected_start = min(self.transcript1.start, self.transcript2.start)
    self.assertEqual(superlocus.start, expected_start)
    expected_end = max(self.transcript1.end, self.transcript2.end)
    self.assertEqual(superlocus.end, expected_end)
    null_logger.info(superlocus.transcripts)

    superlocus.define_subloci()
    null_logger.info(superlocus.subloci)
    null_logger.info(superlocus.transcripts)
    self.assertEqual(len(superlocus.transcripts), 2)
    self.assertEqual(len(superlocus.subloci), 2)

    superlocus.define_monosubloci()
    self.assertEqual(len(superlocus.monosubloci), 2)

    superlocus.define_loci()
    self.assertEqual(len(superlocus.loci), 1)
    first_locus_id = list(superlocus.loci.keys())[0]
    locus_transcript_ids = list(
        superlocus.loci[first_locus_id].transcripts.keys())
    self.assertEqual(locus_transcript_ids[0], "t0")

    # A lone transcript on the minus strand.
    minus_records = [
        GFF.GffLine(record)
        for record in (
            "Chr1\tfoo\ttranscript\t101\t200\t.\t-\t.\tID=tminus0",
            "Chr1\tfoo\texon\t101\t200\t.\t-\t.\tID=tminus0:exon1;Parent=tminus0",
        )
    ]
    minus_header, *minus_exons = minus_records
    transcript3 = Transcript(minus_header)
    for exon_record in minus_exons:
        transcript3.add_exon(exon_record)
    transcript3.finalize()

    minusuperlocus = Superlocus(transcript3, json_conf=self.my_json)
    minusuperlocus.define_loci()
    self.assertEqual(len(minusuperlocus.loci), 1)
    self.assertTrue(transcript3.strand != self.transcript1.strand)
def test_index(self):
    """Check that ``compare`` builds a loadable index for every reference format.

    Each bundled Trinity annotation is copied into the system temporary
    directory and ``compare`` is called with ``namespace.index`` set.
    The test then verifies that the log file and a non-empty ``.midx``
    file exist and that ``load_index`` returns two dictionaries, the
    first one holding 38 entries.

    Files left over from an earlier failed sub-test or run are deleted
    before ``compare`` is invoked, so the existence checks cannot be
    satisfied by stale artifacts, and everything created here is removed
    in a ``finally`` clause whether or not the assertions pass.
    """
    # Create the list of files
    files = [
        "trinity.gtf",
        "trinity.gff3",
        "trinity.cDNA_match.gff3",
        "trinity.match_matchpart.gff3",
    ]

    namespace = Namespace(default=False)
    namespace.distance = 2000
    namespace.index = True
    namespace.prediction = None
    namespace.log = os.path.join(tempfile.gettempdir(), "index.log")
    logger = create_null_logger("null")

    def discard(path):
        """Delete *path* when it is present; a missing file is not an error."""
        if os.path.exists(path):
            os.remove(path)

    for ref in files:
        with self.subTest(ref=ref):
            temp_ref = os.path.join(tempfile.gettempdir(), ref)
            # Everything this sub-test may create; removed in ``finally``.
            created = [temp_ref, namespace.log]
            # A stale log would make the existence check below pass vacuously.
            discard(namespace.log)
            try:
                with pkg_resources.resource_stream("Mikado.tests", ref) as ref_handle, \
                        open(temp_ref, "wb") as out_handle:
                    out_handle.write(ref_handle.read())
                namespace.reference = to_gff(temp_ref)
                index_name = "{}.midx".format(namespace.reference.name)
                created.extend([namespace.reference.name, index_name])
                # Same reasoning as for the log: start without a stale index.
                discard(index_name)

                compare(namespace)

                self.assertTrue(os.path.exists(namespace.log))
                self.assertTrue(os.path.exists(index_name))
                self.assertGreater(os.stat(index_name).st_size, 0)
                genes, positions = load_index(namespace, logger)
                self.assertIsInstance(genes, dict)
                self.assertIsInstance(positions, dict)
                self.assertEqual(len(genes), 38)
            finally:
                # Runs on success and on failure alike, so one failing format
                # cannot pollute the next sub-test or the next test run.
                for path in created:
                    discard(path)
def testDoubleOrf(self):
    """Load three candidate ORFs and check that only two are retained.

    The first and second ORFs overlap each other, the third overlaps
    neither. After loading, the transcript must report two internal
    ORFs and must split into two transcripts by CDS.
    """

    self.tr.strip_cds()
    self.tr.finalized = False

    def build_orf(name, thick_start, thick_end):
        """Return a transcriptomic plus-strand BED12 spanning the whole cDNA."""
        orf = parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = 1
        orf.end = self.tr.cdna_length
        orf.name = name
        orf.strand = "+"
        orf.score = 0
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = self.tr.cdna_length
        orf.block_sizes = [self.tr.cdna_length]
        orf.block_starts = [0]
        orf.rgb = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    first_orf = build_orf("first", 51, 398)
    self.assertFalse(first_orf.invalid)

    # This should not be incorporated
    second_orf = build_orf("second", 201, 410)
    self.assertFalse(second_orf.invalid)
    self.assertTrue(
        loci.Transcript.is_overlapping_cds(first_orf, second_orf))

    # This should be added
    third_orf = build_orf("third", 501, 800)
    self.assertFalse(third_orf.invalid)
    self.assertFalse(
        loci.Transcript.is_overlapping_cds(first_orf, third_orf))
    self.assertFalse(
        loci.Transcript.is_overlapping_cds(second_orf, third_orf))

    # No two candidates may compare equal.
    for left, right in ((third_orf, second_orf),
                        (first_orf, second_orf),
                        (first_orf, third_orf)):
        self.assertFalse(left == right)

    candidates = [first_orf, second_orf, third_orf]

    self.tr.logger = create_null_logger("null")
    self.tr.load_orfs(candidates)

    self.assertTrue(self.tr.is_complete)
    self.tr.finalize()
    distances = (self.tr.cdna_length,
                 self.tr.selected_start_distance_from_tss,
                 self.tr.selected_end_distance_from_tes)
    self.assertEqual(self.tr.number_internal_orfs, 2, distances)

    self.assertEqual(self.tr.combined_cds_length, 648)
    self.assertEqual(self.tr.selected_cds_length, 348)
    self.assertEqual(self.tr.number_internal_orfs, 2,
                     "\n".join([str(x) for x in self.tr.internal_orfs]))

    new_transcripts = sorted(self.tr.split_by_cds())

    self.assertEqual(len(new_transcripts), 2)
    upstream, downstream = new_transcripts[0], new_transcripts[1]
    self.assertEqual(upstream.three_utr_length, 0)
    self.assertEqual(downstream.five_utr_length, 0)
class TranscriptTesterPositive(unittest.TestCase):
    """Tests on a plus-strand, twelve-exon coding transcript (AT2G02380.1).

    The transcript is described by the GFF3 text in ``tr_gff``. The text
    is parsed once, at class-definition time, into ``tr_gff_lines``;
    ``setUp`` then builds a fresh ``Transcript`` (``self.tr``) and a BED12
    ORF (``self.orf``) matching its CDS for every test.
    """

    logger = create_null_logger("test_at")

    # Whitespace-separated GFF3 records; the whitespace is normalised to
    # tabs below, so the records must not carry any leading indentation.
    tr_gff = """Chr2 TAIR10 mRNA 626642 629176 . + . ID=AT2G02380.1;Parent=AT2G02380
Chr2 TAIR10 exon 626642 626780 . + . Parent=AT2G02380.1
Chr2 TAIR10 five_prime_UTR 626642 626780 . + . Parent=AT2G02380.1
Chr2 TAIR10 exon 626842 626880 . + . Parent=AT2G02380.1
Chr2 TAIR10 five_prime_UTR 626842 626877 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 626878 626880 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 626963 627059 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 626963 627059 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 627137 627193 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627137 627193 . + 2 Parent=AT2G02380.1
Chr2 TAIR10 exon 627312 627397 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627312 627397 . + 2 Parent=AT2G02380.1
Chr2 TAIR10 exon 627488 627559 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627488 627559 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 627696 627749 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627696 627749 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 627840 627915 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627840 627915 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 628044 628105 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 628044 628105 . + 2 Parent=AT2G02380.1
Chr2 TAIR10 exon 628182 628241 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 628182 628241 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 628465 628676 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 628465 628569 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 three_prime_UTR 628570 628676 . + . Parent=AT2G02380.1
Chr2 TAIR10 exon 629070 629176 . + . Parent=AT2G02380.1
Chr2 TAIR10 three_prime_UTR 629070 629176 . + . Parent=AT2G02380.1"""

    tr_lines = tr_gff.split("\n")
    for pos, line in enumerate(tr_lines):
        # Collapse every whitespace run into one tab, then make sure the
        # record has the nine columns of a GFF line.
        tr_lines[pos] = re.sub(r"\s+", r"\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [parsers.GFF.GffLine(line) for line in tr_lines]

    for l in tr_gff_lines:
        assert l.header is False

    # Expected exon and CDS intervals, shared by several tests below.
    # Tuples, so no test can alter them; tests compare against list copies.
    _EXPECTED_EXONS = (
        (626642, 626780), (626842, 626880), (626963, 627059),
        (627137, 627193), (627312, 627397), (627488, 627559),
        (627696, 627749), (627840, 627915), (628044, 628105),
        (628182, 628241), (628465, 628676), (629070, 629176),
    )
    _EXPECTED_CDS = (
        (626878, 626880), (626963, 627059), (627137, 627193),
        (627312, 627397), (627488, 627559), (627696, 627749),
        (627840, 627915), (628044, 628105), (628182, 628241),
        (628465, 628569),
    )

    def setUp(self):
        """Basic creation test."""

        self.tr = loci.Transcript(self.tr_gff_lines[0])
        for line in self.tr_gff_lines[1:]:
            self.tr.add_exon(line)
        self.tr.finalize()
        self.tr.logger = self.logger

        # An ORF in transcript coordinates covering the transcript's own CDS.
        self.orf = parsers.bed12.BED12()
        self.orf.chrom = self.tr.id
        self.orf.start = 1
        self.orf.end = self.tr.cdna_length
        self.orf.name = self.tr.id
        self.orf.strand = "+"
        self.orf.score = 0
        self.orf.thick_start = self.tr.selected_start_distance_from_tss + 1
        self.orf.thick_end = self.tr.cdna_length - self.tr.selected_end_distance_from_tes
        self.orf.block_count = 1
        self.orf.blockSize = self.tr.cdna_length
        # NOTE(review): testDoubleOrf sets block_starts to [0] and also sets
        # block_sizes; here a bare 0 is used and block_sizes is left unset.
        # Kept as found - confirm against BED12 before harmonising.
        self.orf.block_starts = 0
        self.orf.has_start_codon = True
        self.orf.has_stop_codon = True
        self.orf.transcriptomic = True

    def _make_orf(self, name, thick_start, thick_end):
        """Return a transcriptomic plus-strand BED12 spanning the whole cDNA.

        :param name: value for the ``name`` field of the ORF.
        :param thick_start: first coding position, in transcript coordinates.
        :param thick_end: last coding position, in transcript coordinates.
        """
        orf = parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = 1
        orf.end = self.tr.cdna_length
        orf.name = name
        orf.strand = "+"
        orf.score = 0
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = self.tr.cdna_length
        orf.block_sizes = [self.tr.cdna_length]
        orf.block_starts = [0]
        orf.rgb = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    def _check_basic_layout(self):
        """Assert chromosome, strand, span and the twelve expected exons."""
        self.assertEqual(self.tr.chrom, "Chr2")
        self.assertEqual(self.tr.strand, "+")
        self.assertEqual(self.tr.exon_num, 12)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 626642)
        self.assertEqual(self.tr.end, 629176)
        self.assertEqual(self.tr.exons, list(self._EXPECTED_EXONS), self.tr.exons)

    def test_basics(self):
        """The parsed transcript has the expected location and exons."""
        self._check_basic_layout()

    def test_no_exons(self):
        """Same layout checks after emptying ``exons`` and re-finalising."""
        self.tr.finalized = False
        self.tr.exons = []
        self.tr.finalize()
        self._check_basic_layout()

    def test_cds(self):
        """Combined and selected CDS coincide and match the GFF."""
        self.assertEqual(self.tr.combined_cds, self.tr.selected_cds)
        self.assertEqual(self.tr.combined_cds, list(self._EXPECTED_CDS),
                         self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 626878)
        self.assertEqual(self.tr.selected_cds_end, 628569)

    def test_secondary_orf(self):
        """With a single ORF nothing lies outside the maximal CDS."""
        self.assertEqual(self.tr.cds_not_maximal, 0)
        self.assertEqual(self.tr.cds_not_maximal_fraction, 0)

    def test_utr(self):
        """Both UTRs are split across two segments."""
        self.assertEqual(self.tr.five_utr, [(626642, 626780), (626842, 626877)])
        self.assertEqual(self.tr.three_utr, [(628570, 628676), (629070, 629176)])

    def test_introns(self):
        """Check the full intron set and the CDS-internal subset."""
        all_introns = {(626781, 626841), (626881, 626962), (627060, 627136),
                       (627194, 627311), (627398, 627487), (627560, 627695),
                       (627750, 627839), (627916, 628043), (628106, 628181),
                       (628242, 628464), (628677, 629069)}
        self.assertEqual(self.tr.introns, all_introns, self.tr.introns)

        # The first and the last intron lie in the UTRs.
        coding_introns = {(626881, 626962), (627060, 627136), (627194, 627311),
                          (627398, 627487), (627560, 627695), (627750, 627839),
                          (627916, 628043), (628106, 628181), (628242, 628464)}
        self.assertEqual(
            self.tr.combined_cds_introns, coding_introns,
            (sorted(self.tr.combined_cds_introns), sorted(coding_introns)))
        self.assertEqual(self.tr.selected_cds_introns, coding_introns,
                         self.tr.selected_cds_introns)

    def test_utr_metrics(self):
        """Test for UTR exon num, start distance, etc."""
        self.assertEqual(self.tr.five_utr_num, 2)
        self.assertEqual(self.tr.three_utr_num, 2)
        self.assertEqual(self.tr.five_utr_num_complete, 1)
        self.assertEqual(self.tr.three_utr_num_complete, 1)

        self.assertEqual(self.tr.five_utr_length,
                         626780 + 1 - 626642 + 626877 + 1 - 626842)
        self.assertEqual(self.tr.three_utr_length,
                         628676 + 1 - 628570 + 629176 + 1 - 629070)

        # The failure message reports the value under test (the start
        # distance), not the unrelated end distance.
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         626780 + 1 - 626642 + 626878 - 626842,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         self.tr.start_distance_from_tss)

        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         628676 - 628569 + 629176 + 1 - 629070,
                         self.tr.selected_end_distance_from_tes)
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         self.tr.end_distance_from_tes)

        self.assertEqual(self.tr.selected_end_distance_from_junction,
                         628676 - 628569)

    def test_strip_cds(self):
        """Stripping the CDS logs a DEBUG message and clears CDS and UTRs."""
        with self.assertLogs(logger=self.logger, level="DEBUG") as log_split:
            self.tr.strip_cds()

        self.assertIn(
            "DEBUG:{}:Stripping CDS from AT2G02380.1".format(self.logger.name),
            log_split.output)

        self.assertEqual(self.tr.selected_cds_length, 0)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.selected_cds, [])
        self.assertEqual(self.tr.selected_cds_start, None)
        self.assertEqual(self.tr.selected_cds_end, None)

    def test_with_no_gff_utr(self):
        """
        Test the creation of the transcript without the UTR lines, verify that everything is still alright
        :return:
        """
        tr_gff = """Chr2 TAIR10 mRNA 626642 629176 . + . ID=AT2G02380.1;Parent=AT2G02380
Chr2 TAIR10 exon 626642 626780 . + . Parent=AT2G02380.1
Chr2 TAIR10 exon 626842 626880 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 626878 626880 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 626963 627059 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 626963 627059 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 627137 627193 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627137 627193 . + 2 Parent=AT2G02380.1
Chr2 TAIR10 exon 627312 627397 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627312 627397 . + 2 Parent=AT2G02380.1
Chr2 TAIR10 exon 627488 627559 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627488 627559 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 627696 627749 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627696 627749 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 627840 627915 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627840 627915 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 628044 628105 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 628044 628105 . + 2 Parent=AT2G02380.1
Chr2 TAIR10 exon 628182 628241 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 628182 628241 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 628465 628676 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 628465 628569 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 629070 629176 . + . Parent=AT2G02380.1"""

        tr_lines = tr_gff.split("\n")
        logger = create_default_logger("test")
        logger.setLevel("INFO")
        for pos, line in enumerate(tr_lines):
            tr_lines[pos] = re.sub(r"\s+", "\t", line)
            assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

        tr_gff_lines = [parsers.GFF.GffLine(line) for line in tr_lines]

        transcript = loci.Transcript(tr_gff_lines[0], logger=logger)
        for line in tr_gff_lines[1:]:
            transcript.add_exon(line)

        self.assertEqual(transcript.exons, self.tr.exons)
        self.assertNotEqual([], transcript.combined_cds)
        transcript.finalize()
        self.assertTrue(transcript.is_coding)
        # The UTRs must match those of the fully annotated transcript.
        self.assertEqual(transcript.five_utr, self.tr.five_utr)
        self.assertEqual(transcript.three_utr, self.tr.three_utr)

    def test_remove_utr(self):
        """Test for CDS stripping. We remove the UTRs and verify that start/end have moved, no UTR is present, etc."""

        self.tr.remove_utrs()
        self.assertEqual(self.tr.selected_cds_start, self.tr.start)
        self.assertEqual(self.tr.selected_cds_end, self.tr.end)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.combined_cds, list(self._EXPECTED_CDS),
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_utr, [], self.tr.combined_utr)

    def test_load_orf(self):
        """Test for loading a single ORF. We strip the CDS and reload it."""

        with self.assertLogs(logger=self.logger, level="DEBUG") as cm_out:
            self.tr.strip_cds()
        self.assertIn("Stripping CDS", cm_out.output[0])

        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.combined_cds, list(self._EXPECTED_CDS),
                         self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 626878)
        self.assertEqual(self.tr.selected_cds_end, 628569)

    def test_negative_orf(self):
        """Test loading a negative strand ORF onto a multiexonic transcript.
        This should have no effect."""

        self.orf.strand = "-"
        self.tr.strip_cds()
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.selected_cds_start, None)

    def test_raises_invalid(self):
        """``finalize`` rejects a missing strand and an inconsistent exon."""
        self.tr.finalized = False
        self.tr.strand = None
        # NOTE(review): the copy is never inspected (the assertion that used
        # it is disabled); the call is kept so it is still exercised.
        _snapshot = self.tr.deepcopy()
        self.assertRaises(exceptions.InvalidTranscript, self.tr.finalize)
        self.assertFalse(self.tr.finalized)

        # With the strand restored the transcript finalises again ...
        self.tr.strand = "+"
        self.tr.finalize()
        # ... until an exon lying before the transcript start is appended.
        self.tr.finalized = False
        self.tr.exons += [(625878, 625880)]
        self.assertRaises(exceptions.InvalidTranscript, self.tr.finalize)

    def test_complete(self):
        """The annotated CDS has both a start and a stop codon."""
        self.assertTrue(self.tr.has_stop_codon)
        self.assertTrue(self.tr.has_start_codon)
        self.assertTrue(self.tr.is_complete)

    def test_lengths(self):
        """Check cDNA and CDS lengths and the resulting CDS fractions."""
        self.assertEqual(self.tr.cdna_length, 1061)
        self.assertEqual(self.tr.selected_cds_length, 672)
        self.assertAlmostEqual(self.tr.combined_cds_fraction, 672 / 1061, delta=0.01)
        self.assertAlmostEqual(self.tr.selected_cds_fraction, 672 / 1061, delta=0.01)

    def testSegments(self):
        """Check CDS segment counts, longest intron and ORF count."""
        self.assertEqual(self.tr.combined_cds_num, 10)
        self.assertEqual(self.tr.selected_cds_num, 10)
        self.assertEqual(self.tr.highest_cds_exon_number, 10)
        self.assertEqual(self.tr.max_intron_length, 393)
        self.assertEqual(self.tr.number_internal_orfs, 1)

    def testDoubleOrf(self):
        """Test to verify the introduction of multiple ORFs."""

        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = self._make_orf("first", 51, 398)
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._make_orf("second", 201, 410)
        self.assertFalse(second_orf.invalid)
        self.assertTrue(
            loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._make_orf("third", 501, 800)
        self.assertFalse(third_orf.invalid)
        self.assertFalse(
            loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(
            loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        logger = create_null_logger("null")
        self.tr.logger = logger
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(
            self.tr.number_internal_orfs, 2,
            (self.tr.cdna_length, self.tr.selected_start_distance_from_tss,
             self.tr.selected_end_distance_from_tes))

        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        # Each piece ends (or begins) exactly at its ORF boundary.
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)
class MonoBaseTester(unittest.TestCase):

    """
    This test verifies the correct ORF loading and splitting
    in the case where the transcript has multiple ORFs and
    in one case it starts exactly at the terminal point of
    a previous exon.
    """

    logger = create_null_logger("null")

    def setUp(self):
        """Create a three-exon transcript on Chr5 and three candidate ORFs.

        ``bed1`` is built from scratch; ``bed2`` and ``bed3`` are deep copies
        of it that only differ in name and CDS (thick) coordinates, with
        ``bed2`` additionally lacking a start codon.
        """
        self.tr = loci.Transcript()
        self.tr.chrom = "Chr5"
        self.tr.start = 22597965
        self.tr.end = 22602701
        self.tr.strand = "+"
        self.tr.score = 1000
        self.tr.parent = "StringTie_DN.70115"
        self.tr.id = "StringTie_DN.70115.4"
        self.tr.source = "StringTie"
        self.tr.feature = "transcript"
        self.tr.add_exons([(22597965, 22601782),
                           (22601862, 22601957),
                           (22602039, 22602701)])

        self.tr.logger = self.logger

        # First ORF
        self.bed1 = parsers.bed12.BED12()
        self.bed1.chrom = self.tr.id
        self.bed1.start = 1
        self.bed1.end = 4577
        self.bed1.name = "{0}.1".format(self.tr.id)
        self.bed1.strand = "+"
        self.bed1.score = 0
        self.bed1.thick_start = 434
        self.bed1.thick_end = 3736
        self.bed1.has_start_codon = True
        self.bed1.transcriptomic = True
        self.bed1.has_stop_codon = True
        self.bed1.block_count = 1
        self.bed1.block_sizes = [len(self.bed1)]
        self.bed1.block_starts = [0]

        # Second ORF
        self.bed2 = copy.deepcopy(self.bed1)
        self.bed2.name = "{0}.2".format(self.tr.id)
        self.bed2.thick_start = 2
        self.bed2.thick_end = 388
        self.bed2.has_start_codon = False

        # Third ORF
        self.bed3 = copy.deepcopy(self.bed1)
        self.bed3.name = "{0}.3".format(self.tr.id)
        self.bed3.thick_start = 3914
        self.bed3.thick_end = 4393

    def test_finalise(self):
        """Finalising must set the flag and yield the expected exon/intron extremes."""
        self.tr.finalize()
        self.assertTrue(self.tr.finalized)
        self.assertEqual(self.tr.max_exon_length, 3818)
        self.assertEqual(self.tr.min_exon_length, 96)
        self.assertEqual(self.tr.max_intron_length, 81, self.tr.introns)
        self.assertEqual(self.tr.min_intron_length, 79, self.tr.introns)

    def test_load_orfs(self):
        """All three ORFs are valid and loaded; the selected CDS has the length of bed1."""
        self.assertFalse(self.bed1.invalid)
        self.assertFalse(self.bed2.invalid)
        self.assertFalse(self.bed3.invalid)
        self.assertEqual(self.bed3.cds_len,
                         self.bed3.thick_end - self.bed3.thick_start + 1)
        self.tr.load_orfs([self.bed1, self.bed2, self.bed3])
        self.assertEqual(self.tr.number_internal_orfs, 3)
        self.assertEqual(self.tr.selected_cds_length, self.bed1.cds_len)

    def test_split(self):
        """Loading two ORFs and splitting by CDS must yield two transcripts."""
        self.tr.load_orfs([self.bed3, self.bed1])
        splitted_transcripts = list(self.tr.split_by_cds())
        self.assertEqual(len(splitted_transcripts), 2)

    def test_print(self):
        """Check the GFF3 and GTF renderings of the transcript without any ORF loaded."""
        self.tr.logger = self.logger
        self.tr.finalize()
        self.maxDiff = None

        real_printed = """Chr5\tStringTie\ttranscript\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4;Parent=StringTie_DN.70115
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.exon1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.exon2;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.exon3;Parent=StringTie_DN.70115.4"""

        self.assertEqual(str(self.tr.format("gff3")), real_printed)

        real_printed_gtf = """Chr5\tStringTie\ttranscript\t22597965\t22602701\t1000\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";"""

        self.assertEqual(self.tr.__str__(to_gtf=True), real_printed_gtf)

    def test_print_cds(self):
        """Check the GFF3 and GTF renderings after loading the first ORF only."""
        self.tr.load_orfs([self.bed1])
        self.maxDiff = None

        real_printed = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4;Parent=StringTie_DN.70115;Name=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.exon1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tfive_prime_UTR\t22597965\t22598397\t.\t+\t.\tID=StringTie_DN.70115.4.five_prime_UTR1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tCDS\t22598398\t22601700\t.\t+\t0\tID=StringTie_DN.70115.4.CDS1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tthree_prime_UTR\t22601701\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.three_prime_UTR1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.exon2;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tthree_prime_UTR\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.three_prime_UTR2;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.exon3;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tthree_prime_UTR\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.three_prime_UTR3;Parent=StringTie_DN.70115.4"""

        self.assertEqual(str(self.tr), real_printed)

        real_printed_gtf = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4"; Name "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\t5UTR\t22597965\t22598397\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\tCDS\t22598398\t22601700\t.\t+\t0\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\t3UTR\t22601701\t22601782\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\t3UTR\t22601862\t22601957\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\t3UTR\t22602039\t22602701\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";"""

        import itertools

        # Compare line by line so that a failure points at the offending
        # record; zip_longest also catches a differing number of lines.
        for lines in itertools.zip_longest(
                self.tr.__str__(to_gtf=True).split("\n"),
                real_printed_gtf.split("\n")):
            self.assertEqual(lines[0], lines[1])

    def test_print_multiple_orfs(self):
        """Check the GFF rendering with ``all_orfs=True`` after loading two ORFs."""
        self.maxDiff = None
        self.tr.load_orfs([self.bed1, self.bed3])

        real_printed = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4.orf1;Parent=StringTie_DN.70115;Name=StringTie_DN.70115.4;maximal=True
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.exon1;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tfive_prime_UTR\t22597965\t22598397\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.five_prime_UTR1;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tCDS\t22598398\t22601700\t.\t+\t0\tID=StringTie_DN.70115.4.orf1.CDS1;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tthree_prime_UTR\t22601701\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.three_prime_UTR1;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.exon2;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tthree_prime_UTR\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.three_prime_UTR2;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.exon3;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tthree_prime_UTR\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.three_prime_UTR3;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4.orf2;Parent=StringTie_DN.70115;Name=StringTie_DN.70115.4;maximal=False
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.exon1;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tfive_prime_UTR\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.five_prime_UTR1;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.exon2;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tfive_prime_UTR\t22601862\t22601956\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.five_prime_UTR2;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tCDS\t22601957\t22601957\t.\t+\t0\tID=StringTie_DN.70115.4.orf2.CDS1;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tCDS\t22602039\t22602517\t.\t+\t2\tID=StringTie_DN.70115.4.orf2.CDS2;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.exon3;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tthree_prime_UTR\t22602518\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.three_prime_UTR1;Parent=StringTie_DN.70115.4.orf2"""

        self.assertEqual(self.tr.format("gff", all_orfs=True), real_printed)

    def test_print_without_cds(self):
        """With ``with_cds=False`` only transcript and exon records must be printed."""
        self.maxDiff = None
        self.tr.load_orfs([self.bed1, self.bed3])

        real_printed = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4;Parent=StringTie_DN.70115
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.exon1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.exon2;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.exon3;Parent=StringTie_DN.70115.4"""

        self.assertEqual(self.tr.format("gff3", with_cds=False), real_printed)

        real_printed_gtf = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";"""

        self.assertEqual(self.tr.format("gtf", with_cds=False), real_printed_gtf)
class TranscriptTesterNegative(unittest.TestCase):

    """Tests for a multiexonic, coding transcript on the negative strand.

    The transcript is built from the GFF3 records in ``tr_gff``, which are
    parsed once at class-creation time into ``tr_gff_lines``.
    """

    logger = create_null_logger("null")
    logger.setLevel(logging.WARNING)

    tr_gff = """Chr1 TAIR10 mRNA 5928 8737 . - . ID=AT1G01020.1;Parent=AT1G01020
Chr1 TAIR10 five_prime_UTR 8667 8737 . - . Parent=AT1G01020.1
Chr1 TAIR10 CDS 8571 8666 . - 0 Parent=AT1G01020.1;
Chr1 TAIR10 exon 8571 8737 . - . Parent=AT1G01020.1
Chr1 TAIR10 CDS 8417 8464 . - 0 Parent=AT1G01020.1;
Chr1 TAIR10 exon 8417 8464 . - . Parent=AT1G01020.1
Chr1 TAIR10 CDS 8236 8325 . - 0 Parent=AT1G01020.1;
Chr1 TAIR10 exon 8236 8325 . - . Parent=AT1G01020.1
Chr1 TAIR10 CDS 7942 7987 . - 0 Parent=AT1G01020.1;
Chr1 TAIR10 exon 7942 7987 . - . Parent=AT1G01020.1
Chr1 TAIR10 CDS 7762 7835 . - 2 Parent=AT1G01020.1;
Chr1 TAIR10 exon 7762 7835 . - . Parent=AT1G01020.1
Chr1 TAIR10 CDS 7564 7649 . - 0 Parent=AT1G01020.1;
Chr1 TAIR10 exon 7564 7649 . - . Parent=AT1G01020.1
Chr1 TAIR10 CDS 7384 7450 . - 1 Parent=AT1G01020.1;
Chr1 TAIR10 exon 7384 7450 . - . Parent=AT1G01020.1
Chr1 TAIR10 CDS 7157 7232 . - 0 Parent=AT1G01020.1;
Chr1 TAIR10 exon 7157 7232 . - . Parent=AT1G01020.1
Chr1 TAIR10 CDS 6915 7069 . - 2 Parent=AT1G01020.1;
Chr1 TAIR10 three_prime_UTR 6437 6914 . - . Parent=AT1G01020.1
Chr1 TAIR10 exon 6437 7069 . - . Parent=AT1G01020.1
Chr1 TAIR10 three_prime_UTR 5928 6263 . - . Parent=AT1G01020.1
Chr1 TAIR10 exon 5928 6263 . - . Parent=AT1G01020.1"""

    # Exon and CDS segments described by ``tr_gff``, in ascending genomic order.
    expected_exons = [(5928, 6263), (6437, 7069), (7157, 7232),
                      (7384, 7450), (7564, 7649), (7762, 7835),
                      (7942, 7987), (8236, 8325), (8417, 8464),
                      (8571, 8737)]
    expected_cds = [(6915, 7069), (7157, 7232), (7384, 7450),
                    (7564, 7649), (7762, 7835), (7942, 7987),
                    (8236, 8325), (8417, 8464), (8571, 8666)]

    tr_lines = [line for line in tr_gff.split("\n") if line]
    for pos, line in enumerate(tr_lines):
        # Normalise every run of whitespace to a single tab. The pattern is a
        # raw string: "\s" in a plain literal is an invalid escape sequence.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]

    for gff_line in tr_gff_lines:
        assert gff_line.header is False

    def _build_orf(self, name, thick_start, thick_end, start=1, score=0):
        """Return a transcriptomic, plus-stranded, complete BED12 ORF on ``self.tr``.

        The ORF spans the whole cDNA of the transcript (``end`` is the cDNA
        length); only the name, the CDS boundaries, the start and the score
        vary between the ORFs used in this class.
        """
        orf = Mikado.parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = start
        orf.end = self.tr.cdna_length
        orf.name = name
        orf.strand = "+"
        orf.score = score
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = self.tr.cdna_length
        orf.block_starts = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    def setUp(self):
        """Basic creation test."""

        self.tr = Mikado.loci.Transcript(self.tr_gff_lines[0], logger=self.logger)
        for line in self.tr_gff_lines[1:]:
            self.tr.add_exon(line)
        self.tr.name = self.tr.id
        self.tr.finalize()
        self.tr.logger = self.logger

        # ORF in transcript coordinates matching the CDS of the transcript.
        self.orf = self._build_orf(
            self.tr.id,
            self.tr.selected_start_distance_from_tss + 1,
            self.tr.cdna_length - self.tr.selected_end_distance_from_tes)
        self.assertFalse(self.orf.invalid)
        self.assertEqual(len(self.tr), self.tr.end - self.tr.start + 1)

    def test_print(self):
        """Check the default string rendering of the transcript (GFF3 with CDS/UTR)."""

        self.maxDiff = None

        real_printed = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t-\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1
Chr1\tTAIR10\texon\t5928\t6263\t.\t-\t.\tID=AT1G01020.1.exon1;Parent=AT1G01020.1
Chr1\tTAIR10\tthree_prime_UTR\t5928\t6263\t.\t-\t.\tID=AT1G01020.1.three_prime_UTR1;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t6437\t7069\t.\t-\t.\tID=AT1G01020.1.exon2;Parent=AT1G01020.1
Chr1\tTAIR10\tthree_prime_UTR\t6437\t6914\t.\t-\t.\tID=AT1G01020.1.three_prime_UTR2;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t6915\t7069\t.\t-\t2\tID=AT1G01020.1.CDS1;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7157\t7232\t.\t-\t0\tID=AT1G01020.1.CDS2;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7157\t7232\t.\t-\t.\tID=AT1G01020.1.exon3;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7384\t7450\t.\t-\t1\tID=AT1G01020.1.CDS3;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7384\t7450\t.\t-\t.\tID=AT1G01020.1.exon4;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7564\t7649\t.\t-\t0\tID=AT1G01020.1.CDS4;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7564\t7649\t.\t-\t.\tID=AT1G01020.1.exon5;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7762\t7835\t.\t-\t2\tID=AT1G01020.1.CDS5;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7762\t7835\t.\t-\t.\tID=AT1G01020.1.exon6;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7942\t7987\t.\t-\t0\tID=AT1G01020.1.CDS6;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7942\t7987\t.\t-\t.\tID=AT1G01020.1.exon7;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t8236\t8325\t.\t-\t0\tID=AT1G01020.1.CDS7;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8236\t8325\t.\t-\t.\tID=AT1G01020.1.exon8;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t8417\t8464\t.\t-\t0\tID=AT1G01020.1.CDS8;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8417\t8464\t.\t-\t.\tID=AT1G01020.1.exon9;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t8571\t8666\t.\t-\t0\tID=AT1G01020.1.CDS9;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8571\t8737\t.\t-\t.\tID=AT1G01020.1.exon10;Parent=AT1G01020.1
Chr1\tTAIR10\tfive_prime_UTR\t8667\t8737\t.\t-\t.\tID=AT1G01020.1.five_prime_UTR1;Parent=AT1G01020.1"""

        rp = set(real_printed.split("\n"))
        fp = set(str(self.tr).split("\n"))

        # Failure message: expected-only lines, a separator, found-only lines.
        diff = "\n====\n".join(["\n".join(sorted(rp - fp)),
                                "\n".join(sorted(fp - rp))])

        self.assertEqual(real_printed, str(self.tr), diff)

    def test_empty(self):

        """
        Test that the inference of exons is valid: after emptying the exon
        list and finalising again, the transcript must expose the same ten
        exons as before.
        :return:
        """

        self.tr.exons = []
        self.tr.finalized = False
        self.tr.finalize()
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(self.tr.exon_num, 10)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        self.assertEqual(self.tr.exons,
                         self.expected_exons,
                         self.tr.exons)

    def test_invalid_utr(self):

        """
        Test that a transcript with UTR but no CDS defined will raise an exception.
        :return:
        """

        self.tr.combined_cds = []
        self.tr.finalized = False
        self.assertRaises(Mikado.exceptions.InvalidTranscript,
                          self.tr.finalize)

    def test_basics(self):

        """
        Test basic assertions about the transcript:

        - chromosome (.chrom) should be Chr1
        - strand should be -
        - number of internal orfs should be 1
        - number of exons should be 10
        - the metric "exon_num" should be 10 as well
        - start should be 5928 (1-based offset)
        - end should be 8737
        - the exons should correspond to those in the original strings
          (compared here as a list of (start, end) tuples)
        :return:
        """

        self.assertEqual(self.tr.chrom, "Chr1")
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(self.tr.exon_num, 10)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        self.assertEqual(self.tr.exons,
                         self.expected_exons,
                         self.tr.exons)

    def test_cds(self):
        """Check the CDS segments and the strand-aware CDS start/end."""
        self.assertEqual(sorted(self.tr.combined_cds),
                         sorted(self.tr.selected_cds))
        self.assertEqual(self.tr.combined_cds,
                         self.expected_cds,
                         self.tr.combined_cds)
        # On the negative strand the CDS starts at the highest coordinate.
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 6915)

    def test_utr(self):
        """Check the 5' and 3' UTR segments."""
        self.assertEqual(self.tr.five_utr, [(8667, 8737)])
        self.assertEqual(self.tr.three_utr, [(5928, 6263), (6437, 6914)])

    def test_utr_metrics(self):

        """Test for UTR exon num, start distance, etc."""

        self.assertEqual(self.tr.five_utr_num, 1)
        self.assertEqual(self.tr.five_utr_num_complete, 0)
        self.assertEqual(self.tr.three_utr_num, 2)
        self.assertEqual(self.tr.three_utr_num_complete, 1)

        self.assertEqual(self.tr.five_utr_length, 8737 + 1 - 8667)
        self.assertEqual(self.tr.three_utr_length,
                         6263 + 1 - 5928 + 6914 + 1 - 6437)

        # The failure message reports the attribute actually being checked.
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         8738 - 8667,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         6263 + 1 - 5928 + 6915 - 6437,
                         self.tr.selected_end_distance_from_tes)

        self.assertEqual(self.tr.selected_end_distance_from_junction,
                         6915 - 6437,
                         self.tr.selected_cds_end)
        self.assertEqual(self.tr.end_distance_from_junction,
                         self.tr.selected_end_distance_from_junction)

    def test_introns(self):
        """Check the introns of the transcript and of its combined/selected CDS."""

        introns = {(8465, 8570), (8326, 8416), (7988, 8235),
                   (7836, 7941), (7650, 7761), (7451, 7563),
                   (7233, 7383), (7070, 7156), (6264, 6436)}

        self.assertEqual(self.tr.introns,
                         introns,
                         self.tr.introns)

        cds_introns = {(8465, 8570), (8326, 8416), (7988, 8235),
                       (7836, 7941), (7650, 7761), (7451, 7563),
                       (7233, 7383), (7070, 7156)}

        self.assertEqual(self.tr.combined_cds_introns,
                         cds_introns,
                         self.tr.combined_cds_introns)

        selected_cds_introns = {(8465, 8570), (8326, 8416), (7988, 8235),
                                (7836, 7941), (7650, 7761), (7451, 7563),
                                (7233, 7383), (7070, 7156)}

        self.assertEqual(self.tr.selected_cds_introns,
                         selected_cds_introns,
                         self.tr.selected_cds_introns)

    def test_strip_cds(self):
        """
        Test the "strip_cds" function which (as the name implies) removes
        completely the CDS from a transcript.
        :return:
        """

        with self.assertLogs("null", level="DEBUG") as log_split:
            self.tr.strip_cds()

        self.assertIn("DEBUG:null:Stripping CDS from AT1G01020.1", log_split.output)

        self.assertEqual(self.tr.selected_cds_length, 0)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.selected_cds, [])
        self.assertEqual(self.tr.selected_cds_start, None)
        self.assertEqual(self.tr.selected_cds_end, None)

    def test_remove_utr(self):
        """Test for UTR removal. We remove the UTRs and verify that start/end
        have moved, no UTR is present, and the CDS is unchanged."""

        self.tr.remove_utrs()
        self.assertEqual(self.tr.selected_cds_start, self.tr.end,
                         ((self.tr.selected_cds_start, self.tr.selected_cds_end),
                          (self.tr.start, self.tr.end)))
        self.assertEqual(self.tr.selected_cds_end, self.tr.start)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.combined_cds,
                         self.expected_cds,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_utr, [], self.tr.combined_utr)

    def test_load_orf(self):

        """Test for loading a single ORF. We strip the CDS and reload it."""

        self.tr.strip_cds()
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.combined_cds,
                         self.expected_cds,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 6915)

    def test_negative_orf(self):
        """Test loading a negative strand ORF onto a multiexonic transcript.
        This should have no effect.
        """

        self.orf.strand = "-"
        self.tr.strip_cds()
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.selected_cds_start, None)

    def testSegments(self):
        """Check the segment-count metrics and the maximum intron length."""

        self.assertEqual(self.tr.combined_cds_num, 9)
        self.assertEqual(self.tr.selected_cds_num, 9)
        self.assertEqual(self.tr.highest_cds_exon_number, 9)
        self.assertEqual(self.tr.max_intron_length, 248)
        self.assertEqual(self.tr.number_internal_orfs, 1)

    def test_lengths(self):
        """Check the cDNA/CDS lengths and the CDS fractions."""

        self.assertEqual(self.tr.cdna_length, 1623)
        self.assertEqual(self.tr.selected_cds_length, 738)
        self.assertAlmostEqual(self.tr.combined_cds_fraction, 738 / 1623, delta=0.01)
        self.assertAlmostEqual(self.tr.selected_cds_fraction, 738 / 1623, delta=0.01)

    def test_print_no_cds(self):
        """Check the GFF and GTF renderings when CDS printing is disabled."""

        self.maxDiff = None

        real_printed = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t-\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1
Chr1\tTAIR10\texon\t5928\t6263\t.\t-\t.\tID=AT1G01020.1.exon1;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t6437\t7069\t.\t-\t.\tID=AT1G01020.1.exon2;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7157\t7232\t.\t-\t.\tID=AT1G01020.1.exon3;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7384\t7450\t.\t-\t.\tID=AT1G01020.1.exon4;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7564\t7649\t.\t-\t.\tID=AT1G01020.1.exon5;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7762\t7835\t.\t-\t.\tID=AT1G01020.1.exon6;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7942\t7987\t.\t-\t.\tID=AT1G01020.1.exon7;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8236\t8325\t.\t-\t.\tID=AT1G01020.1.exon8;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8417\t8464\t.\t-\t.\tID=AT1G01020.1.exon9;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8571\t8737\t.\t-\t.\tID=AT1G01020.1.exon10;Parent=AT1G01020.1"""

        self.assertEqual(real_printed, self.tr.format("gff", with_cds=False))

        real_printed_gtf = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1"; Name "AT1G01020.1";
Chr1\tTAIR10\texon\t5928\t6263\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t6437\t7069\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7157\t7232\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7384\t7450\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7564\t7649\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7762\t7835\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7942\t7987\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t8236\t8325\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t8417\t8464\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t8571\t8737\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";"""

        self.assertEqual(real_printed_gtf, self.tr.__str__(print_cds=False, to_gtf=True))

    def testDoubleOrf(self):

        """Test to verify the introduction of multiple ORFs."""

        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = self._build_orf(self.tr.id, 100, 501)
        self.assertFalse(first_orf.invalid, (len(first_orf), first_orf.cds_len))

        # This should not be incorporated
        second_orf = self._build_orf("second", 300, 401, start=0, score=1)
        self.assertFalse(second_orf.invalid, (len(second_orf), second_orf.cds_len))

        self.assertTrue(Mikado.loci.Transcript.is_overlapping_cds(first_orf,
                                                                  second_orf))

        # This should be added
        third_orf = self._build_orf("third", 1000, 1602)
        self.assertFalse(third_orf.invalid, (len(third_orf), third_orf.cds_len))

        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(
                first_orf, third_orf))
        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(
                second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.start, 5928)

        self.assertEqual(self.tr.number_internal_orfs, 2, "\n".join(
            [str(x) for x in self.tr.internal_orfs]))

        self.assertEqual(self.tr.combined_cds_length, 1005)
        self.assertEqual(self.tr.selected_cds_length, 603)

        new_transcripts = sorted(self.tr.split_by_cds(),
                                 key=operator.attrgetter("start"))

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].five_utr_length, 0)
        self.assertNotEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[0].cdna_length, 624,
                         msg="{0}-{1}{2}".format(
                             new_transcripts[0].start,
                             new_transcripts[0].end,
                             new_transcripts[0].strand,
                         ))
        self.assertEqual(new_transcripts[0].start, self.tr.start)
        self.assertEqual(new_transcripts[0].end, 6724)

        self.assertEqual(new_transcripts[1].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].end, 8737)
def setUp(self):
    """Prepare a fresh configuration and logger for each test.

    The configuration comes from ``configurator.to_json(None)``; its
    reference genome is set to the name of ``self.__genomefile__`` and
    ``keep_redundant`` is switched on in the "prepare" section.
    """
    settings = self.conf = configurator.to_json(None)
    settings["reference"]["genome"] = self.__genomefile__.name
    self.logger = create_null_logger("prepare")
    settings["prepare"]["keep_redundant"] = True
class TranscriptTester(unittest.TestCase):
    """Tests for a monoexonic, plus-strand coding transcript (AT1G01020.1).

    The fixture transcript spans Chr1:5928-8737 with a single exon and a
    CDS at 8571-8666; ``self.orf`` is the transcriptomic BED12 equivalent
    of that CDS.
    """

    tr_gff = """Chr1 TAIR10 mRNA 5928 8737 . . . ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1 TAIR10 exon 5928 8737 . . . Parent=AT1G01020.1"""

    tr_lines = tr_gff.split("\n")
    for pos, line in enumerate(tr_lines):
        # Normalise any run of whitespace to a single tab so that the
        # records are valid 9-column GFF3. The pattern is a raw string:
        # "\s" in a plain literal is an invalid escape sequence.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]

    for l in tr_gff_lines:
        assert l.header is False

    logger = create_null_logger("null")

    def _build_orf(self, name, strand, thick_start, thick_end, end=None):
        """Return a transcriptomic BED12 ORF laid over ``self.tr``.

        :param name: value for the BED12 ``name`` field.
        :param strand: "+" or "-".
        :param thick_start: 1-based CDS start on the cDNA.
        :param thick_end: 1-based CDS end on the cDNA.
        :param end: BED12 end; defaults to the cDNA length of ``self.tr``.
        """
        orf = Mikado.parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = 1
        orf.end = self.tr.cdna_length if end is None else end
        orf.name = name
        orf.strand = strand
        orf.score = 0
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = self.tr.cdna_length
        orf.block_sizes = [self.tr.cdna_length]
        orf.block_starts = [0]
        orf.rgb = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    def setUp(self):
        """Build the fixture transcript and its matching BED12 ORF."""

        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.chrom = "Chr1"
        self.tr.source = "TAIR10"
        self.tr.feature = "mRNA"
        self.tr.start = 5928
        self.tr.end = 8737
        self.tr.strand = "+"
        self.tr.add_exon((5928, 8737))
        self.tr.score = None
        self.tr.id, self.tr.parent, self.tr.name = "AT1G01020.1", "AT1G01020", "AT1G01020.1"
        self.tr.add_exon((8571, 8666), "CDS")
        self.tr.finalize()

        self.orf = Mikado.parsers.bed12.BED12()
        self.orf.chrom = self.tr.id
        self.orf.start = 1
        self.orf.end = self.tr.cdna_length
        self.orf.name = self.tr.id
        self.orf.strand = "+"
        self.orf.score = 0
        # Genomic CDS coordinates converted to 1-based cDNA coordinates.
        self.orf.thick_start = 8571 - 5928 + 1
        self.orf.thick_end = 8666 - 5928 + 1
        self.orf.block_count = 1
        self.orf.blockSize = self.tr.cdna_length
        self.orf.block_starts = 0
        self.orf.has_start_codon = True
        self.orf.has_stop_codon = True
        self.orf.transcriptomic = True
        self.assertFalse(self.orf.invalid, self.orf.invalid_reason)
        self.assertEqual((self.orf.thick_end - self.orf.thick_start + 1) % 3, 0)

    def test_invalid_inizialization(self):
        """A Transcript cannot be initialised from an exon line."""

        with self.assertRaises(TypeError):
            _ = Mikado.loci.Transcript(self.tr_gff_lines[1])

    def test_basics(self):
        """Check the basic coordinates and exon structure of the fixture."""

        self.assertEqual(self.tr.chrom, "Chr1")
        self.assertEqual(self.tr.exon_num, 1)
        self.assertEqual(self.tr.monoexonic, True)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        self.assertEqual(self.tr.exons, [(5928, 8737)], self.tr.exons)

    def test_cds(self):
        """Test the CDS features after (re)loading the fixture ORF.

        The ORF declares both a start and a stop codon, and the test
        expects the transcript to report both as present.
        """

        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.combined_cds, self.tr.selected_cds)
        self.assertEqual(self.tr.combined_cds, [(8571, 8666)], self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 8571)
        self.assertEqual(self.tr.selected_cds_end, 8666)
        self.assertEqual(self.tr.has_start_codon, True)
        self.assertEqual(self.tr.has_stop_codon, True)

    def test_equality(self):
        """Transcripts differing in strand or exon structure are not equal."""

        new_transcript = self.tr.deepcopy()
        # "==" / "!=" are used explicitly: the operators themselves are under test.
        self.assertTrue(new_transcript == self.tr)

        new_transcript.strand = None
        self.assertFalse(new_transcript == self.tr)  # They have now a different strand

        new_transcript.unfinalize()
        new_transcript.strand = "+"  # It becomes a multiexonic transcript, so it must have a strand
        new_transcript.end = 9737

        new_exon = Mikado.parsers.GFF.GffLine(self.tr_lines[-1])
        new_exon.strand = "+"
        new_exon.start = 9000
        new_exon.end = 9737
        new_transcript.add_exon(new_exon)

        new_transcript.finalize()
        self.assertTrue(new_transcript != self.tr)

    def test_mono_finalising(self):
        """Finalising a transcript built from GFF lines yields both UTRs."""

        transcript_line = [line for line in self.tr_gff_lines if line.feature == "mRNA"]
        self.assertEqual(len(transcript_line), 1,
                         "\n".join([str(line) for line in self.tr_gff_lines]))

        tr = Mikado.loci.Transcript(transcript_line[0])
        exon_lines = [line for line in self.tr_gff_lines
                      if line.is_exon is True and "UTR" not in line.feature.upper()]
        tr.add_exons(exon_lines)
        tr.add_exon((8571, 8666), "CDS")
        tr.finalize()
        self.assertGreater(tr.three_utr_length, 0)
        self.assertGreater(tr.five_utr_length, 0)

    def test_invalid_transcript(self):
        """A transcript with inconsistent CDS segments raises InvalidCDS."""

        lines = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t.\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1\tTAIR10\tCDS\t8571\t7500\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\tCDS\t7503\t8666\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\texon\t5928\t8737\t.\t.\t.\tParent=AT1G01020.1"""

        gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in lines.split("\n")]
        self.assertIsInstance(gff_lines[0], Mikado.parsers.GFF.GffLine)
        checker = False
        if gff_lines[0].feature.endswith("transcript") or "RNA" in gff_lines[0].feature.upper():
            checker = True
        self.assertTrue(checker)
        self.assertTrue(gff_lines[0].is_transcript)
        transcript = Mikado.loci.Transcript(gff_lines[0])

        transcript.logger = self.logger
        transcript.add_exons(gff_lines[1:])

        with self.assertRaises(Mikado.exceptions.InvalidCDS):
            Mikado.loci.transcript_methods.finalizing._check_cdna_vs_utr(transcript)

    def test_utr(self):
        """Check the UTR segments derived from the exon and the CDS."""

        self.assertEqual(
            self.tr.selected_internal_orf,
            [("UTR", (5928, 8570)),
             ("exon", (5928, 8737)),
             ("CDS", (8571, 8666), 0),
             ("UTR", (8667, 8737))],
            "Right: {0}\nFound{1}".format(
                [("UTR", 5928, 8570), ("CDS", 8571, 8666), ("UTR", 8667, 8737)],
                self.tr.selected_internal_orf))
        self.assertEqual(self.tr.combined_utr, [(5928, 8570), (8667, 8737)])
        self.assertEqual(self.tr.five_utr, [(5928, 8570)], self.tr.five_utr)
        self.assertEqual(self.tr.three_utr, [(8667, 8737)])

    def test_utr_metrics(self):
        """Test for UTR exon num, start distance, etc."""

        self.assertEqual(self.tr.five_utr_num, 1)
        self.assertEqual(self.tr.three_utr_num, 1)

        self.assertEqual(self.tr.five_utr_length, 8570 + 1 - 5928)
        self.assertEqual(self.tr.three_utr_length, 8737 + 1 - 8667)

        # The failure message reports the metric actually being checked.
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         8571 - 5928,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         8737 - 8666,
                         (self.tr.selected_end_distance_from_tes, self.tr.strand))

    def test_strip_cds(self):
        """Stripping the CDS leaves no CDS and no UTR behind."""

        self.tr.strip_cds()
        self.assertEqual(self.tr.selected_cds_length, 0)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.selected_cds, [])
        self.assertEqual(self.tr.selected_cds_start, None)
        self.assertEqual(self.tr.selected_cds_end, None)

    def test_remove_utr(self):
        """Test for UTR removal.

        We remove the UTRs and verify that start/end have moved, no
        UTR is present, etc.
        """

        self.tr.remove_utrs()
        self.assertEqual(self.tr.selected_cds_start, self.tr.start)
        self.assertEqual(self.tr.selected_cds_end, self.tr.end)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.combined_cds, [(8571, 8666)], self.tr.combined_cds)
        self.assertEqual(self.tr.combined_utr, [], self.tr.combined_utr)

    def test_negative_orf(self):
        """Test loading a negative strand ORF onto a monoexonic transcript.

        This should reverse the ORF.
        """

        self.orf.strand = "-"
        self.tr.strip_cds(strand_specific=False)
        self.orf.has_stop_codon = False
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 8571)

    def test_introns(self):
        """A monoexonic transcript has no introns of any kind."""

        self.assertEqual(self.tr.introns, set(), self.tr.introns)
        self.assertEqual(self.tr.combined_cds_introns, set(), self.tr.combined_cds_introns)
        self.assertEqual(self.tr.selected_cds_introns, set(), self.tr.selected_cds_introns)

    def testDoubleOrf(self):
        """Test to verify the introduction of multiple ORFs."""

        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = self._build_orf(self.tr.id, "+", 51, 398)
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._build_orf("second", "+", 201, 410)
        self.assertFalse(second_orf.invalid)
        self.assertTrue(
            Mikado.loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._build_orf("third", "+", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.logger = self.logger

        # Load each ORF on its own first, then all of them together.
        self.tr.load_orfs([first_orf])
        self.tr.load_orfs([second_orf])
        self.tr.load_orfs([third_orf])
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(
            self.tr.number_internal_orfs, 2,
            (self.tr.cdna_length,
             self.tr.selected_start_distance_from_tss,
             self.tr.selected_end_distance_from_tes))

        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)

    def testDoubleOrf_negative(self):
        """Test to verify the introduction of multiple minus-strand ORFs."""

        self.tr.strip_cds(strand_specific=False)
        self.tr.finalized = False

        first_orf = self._build_orf(self.tr.id, "-", 51, 398)
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated.
        # NOTE: the first/second overlap is not asserted here; that
        # check was already disabled in this test.
        second_orf = self._build_orf("second", "-", 201, 410)
        self.assertFalse(second_orf.invalid)

        # This should be added
        third_orf = self._build_orf("third", "-", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.logger = self.logger

        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(
            self.tr.number_internal_orfs, 2,
            (self.tr.cdna_length,
             self.tr.selected_start_distance_from_tss,
             self.tr.selected_end_distance_from_tes))

        # NOTE: combined_cds_length is not asserted for the minus-strand
        # case; that check was already disabled in this test.
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].five_utr_length, 0)
        self.assertEqual(new_transcripts[1].three_utr_length, 0)

    def test_wrong_orf(self):
        """An ORF whose end exceeds the cDNA length must not be loaded."""

        # The BED12 end is one base past the end of the cDNA; the
        # transcript is expected to stay non-coding after load_orfs.
        orf = self._build_orf("third", "-", 501, 800,
                              end=self.tr.cdna_length + 1)
        self.assertFalse(orf.invalid)

        self.tr.logger = self.logger
        self.tr.strip_cds()
        self.tr.strand = "+"
        self.logger.setLevel("WARNING")
        # Loading is expected to emit at least one record on the "null" logger.
        with self.assertLogs("null", level="DEBUG"):
            self.tr.load_orfs([orf])

        self.assertFalse(self.tr.is_coding)
def transfer_cds(transcript: Transcript,
                 ref_cdna: str,
                 ref_bed: BED12,
                 target_cdna: str,
                 target_bed: BED12,
                 logger=create_null_logger()):
    """Transfer the CDS of a reference transcript onto a target transcript.

    :param transcript: target transcript to annotate; if ``None`` the
        function returns immediately.
    :param ref_cdna: cDNA sequence of the reference transcript.
    :param ref_bed: transcriptomic BED12 describing the reference CDS.
    :param target_cdna: cDNA sequence of the target transcript.
    :param target_bed: transcriptomic BED12 of the target; it is modified
        in place and/or replaced by the one returned by
        ``transfer_by_alignment``.
    :param logger: logger used for diagnostics.

    :returns: ``(transcript, target_bed, pep_coords)`` where ``pep_coords``
        is ``(start, end, is_complete)``: the peptide coordinates
        recovered on the reference protein and whether they span it fully.
        The transcript attributes ``aligner_cds`` and (in the coding
        branch) ``original_cds`` are set as a side effect.

    :raises AssertionError: if ``target_bed`` is not transcriptomic, or if
        the two cDNAs are identical but the CDS could not be transferred
        unchanged.
    """

    if transcript is None:
        return transcript, target_bed, (None, None, False)

    transcript.finalize()
    assert target_bed.transcriptomic is True

    logger.debug("Starting with %s, phases: %s (BED %s)",
                 transcript.id, transcript.phases, target_bed.phase)

    if ref_bed.coding is False:
        # Exactly one argument for the single placeholder: passing an
        # extra one makes the logging module report a formatting error.
        logger.debug("%s is non coding, returning immediately.", transcript.id)
        transcript.attributes["aligner_cds"] = False
        transcript.attributes["was_coding"] = transcript.is_coding
        target_bed.coding = False
        transcript.strip_cds()
        pep_coords = (None, None, True)
    else:
        original_start, original_end = target_bed.thick_start, target_bed.thick_end
        ref_pep = str(
            Seq.Seq(str(ref_cdna[ref_bed.thick_start - 1:ref_bed.thick_end])).translate(
                to_stop=False))

        if ref_pep.count("*") == 0:
            pass
        elif abs(ref_pep.index("*") * 3 - ref_bed.cds_len) in (0, 3):
            # This is the "good" case: the CDS is correct and the only
            # stop codon is the terminal one.
            ref_pep = ref_pep[:ref_pep.index("*")]
        else:
            logger.warning(
                "The sequence of %s has in frame stop codons. "
                "Adjusting the program to take this into account.",
                ref_bed.name)

        logger.debug("%s now has phases: %s (%s)",
                     transcript.id, transcript.phases, target_bed.phase)
        target_bed, pep_coords = transfer_by_alignment(ref_pep, target_cdna, target_bed,
                                                       logger=logger)
        logger.debug("%s now has phases: %s; target bed: %s",
                     transcript.id, transcript.phases, target_bed.phase)
        # The third element states whether the whole reference peptide was recovered.
        pep_coords = (pep_coords[0], pep_coords[1],
                      (pep_coords[0] == 1 and pep_coords[1] == len(ref_pep)))

        if target_bed.thick_start == original_start and target_bed.thick_end == original_end:
            transcript.attributes["aligner_cds"] = True
            logger.debug("%s now has phases: %s", transcript.id, transcript.phases)
        else:
            transcript.attributes["aligner_cds"] = False
            transcript.strip_cds()
            if target_bed.coding is True:
                transcript.load_orfs([target_bed])
            logger.debug("%s now has phases: %s", transcript.id, transcript.phases)

        # Now we have to decide whether the transcript has the "original" CDS or not
        _, cigar = transfer.get_and_prepare_cigar(str(ref_cdna), str(target_cdna))
        ref_array, target_array = transfer.create_translation_array(cigar)
        # ``.index`` raises ValueError when the position is absent from the
        # translation array, so it is caught alongside IndexError; otherwise
        # the fallback to the BED boundaries could never be reached.
        try:
            target_start = target_array[ref_array.index(ref_bed.thick_start)]
        except (IndexError, ValueError):
            target_start = target_bed.start
        try:
            target_end = target_array[ref_array.index(ref_bed.thick_end)]
        except (IndexError, ValueError):
            target_end = target_bed.end

        transcript.attributes["original_cds"] = (
            target_start == target_bed.thick_start and target_end == target_bed.thick_end)

        if ref_cdna == target_cdna:
            # Identical sequences: the CDS must have been transferred
            # verbatim, anything else is an internal error.
            logger.debug("%s now has phases: %s", transcript.id, transcript.phases)
            if transcript.is_coding is False:
                raise AssertionError("{} not coding".format(transcript.id))
            elif transcript.attributes["original_cds"] is False:
                raise AssertionError("\n".join([
                    str(_) for _ in [
                        transcript.id,
                        (target_bed.thick_start, target_start,
                         target_bed.thick_start == target_start),
                        (target_bed.thick_end, target_end,
                         target_bed.thick_end == target_end),
                        target_bed.thick_start == target_start
                        and target_bed.thick_end == target_end
                    ]
                ]))

    return transcript, target_bed, pep_coords
def transfer_by_alignment(ref_pep, target_cdna, target_bed, logger=create_null_logger()):
    """Locate the reference peptide on the target cDNA by protein alignment.

    The target cDNA is translated in the three forward frames, each
    translation is aligned against ``ref_pep``, and the best-scoring frame
    is used to set the thick (CDS) coordinates of ``target_bed``.

    :param ref_pep: reference peptide sequence.
    :param target_cdna: target cDNA sequence.
    :param target_bed: BED12 object of the target; modified in place.
    :param logger: logger used for diagnostics.

    :returns: ``(target_bed, coords)``. On success ``coords`` is
        ``(pep_start, pep_end)``, the 1-based span of aligned positions;
        on failure ``target_bed.coding`` is set to False and ``coords``
        is ``(None, None, False)``.
    """

    logger.debug("Phase for %s: %s", target_bed.name, target_bed.phase)

    # Three-frame translation of the target cDNA.
    translations = {
        offset: str(Seq.Seq(str(target_cdna[offset:])).translate(to_stop=False))
        for offset in range(3)
    }

    # Align the reference peptide against every frame.
    frame_res = {}
    for offset, peptide in translations.items():
        alignment, frame_cigar = transfer.get_and_prepare_cigar(
            ref_pep, peptide, open=3, extend=1, matrix=parasail.blosum85)
        frame_res[offset] = (alignment, frame_cigar)

    logger.debug("Frames for %s (phase %s): %s", target_bed.name, target_bed.phase, frame_res)
    # On ties the first frame wins, as with a stable reverse sort.
    best_frame = max(frame_res, key=lambda offset: frame_res[offset][0].score)
    best_cigar = frame_res[best_frame][1]
    logger.debug("Best frame for %s: %s (cigar: %s)", target_bed.name, best_frame, best_cigar)

    # Walk the leading CIGAR operations: lengths of operations whose first
    # ``op_consumes`` flag is unset are accumulated (the "deletions" at
    # the beginning), and the walk stops at the first operation with both
    # flags set.
    translation_start = 0
    for length, op in best_cigar:
        flags = transfer.op_consumes[op]
        if not flags[0]:
            translation_start += length
        elif flags[1]:
            break

    # This is 0-based; we have to add 1 because we start 1 base after the gap at the beginning
    logger.debug("Translation start for %s: %s; phase: %s",
                 target_bed.name, translation_start, target_bed.phase)

    # Convert from amino acids to nucleotides and account for the frame.
    # With no leading gap this reduces to the frame offset alone.
    translation_start = 3 * translation_start + best_frame

    stop_at_first = ref_pep.count("*") <= 1
    translated = str(
        Seq.Seq(str(target_cdna[translation_start:])).translate(to_stop=stop_at_first))

    # BED12 thick coordinates are 1-based, hence the +1.
    target_bed.thick_start = translation_start + 1
    end = target_bed.thick_start + len(translated) * 3 - 1
    logger.debug("Phase for %s: %s", target_bed.name, target_bed.phase)

    if not translated:
        target_bed.coding = False
        return target_bed, (None, None, False)

    if translated[0] != ref_pep[0]:
        # The CDS is broken at the start: acceptable only when it begins
        # within the first codon, in which case it is recorded as a phase.
        if translation_start not in (0, 1, 2):
            target_bed.coding = False
            return target_bed, (None, None, False)
        target_bed.phase = translation_start
        target_bed.thick_start = 1

    # Get the coordinates on the original protein
    _, pep_cigar = transfer.get_and_prepare_cigar(
        ref_pep, translated, open=3, extend=1, matrix=parasail.blosum85)
    pep_ref_array, pep_target_array = transfer.create_translation_array(pep_cigar)

    pep_start, pep_end = None, None
    for position in range(1, len(pep_ref_array) + 1):
        if pep_ref_array[position - 1] and pep_target_array[position - 1]:
            if not pep_start:
                pep_start = position
            pep_end = position

    # Now check whether we can add the stop codon
    if end + 3 < len(target_cdna):
        end += 3
    else:
        # Here we have to presume that it is open.
        end = len(target_cdna)

    target_bed.thick_end = end
    target_bed.coding = True
    target_bed.transcriptomic = True
    logger.debug("Phase for %s: %s", target_bed.name, target_bed.phase)
    return target_bed, (pep_start, pep_end)
class LocusTester(unittest.TestCase):
    """Tests for building loci from a monoexonic and a multiexonic transcript."""

    logger = create_null_logger("locus_tester")

    def setUp(self):
        """Create the two fixture transcripts and load the configuration."""

        mono_text = (
            "Chr1\tfoo\ttranscript\t101\t300\t.\t+\t.\tID=t0\n"
            "Chr1\tfoo\texon\t101\t300\t.\t+\t.\tID=t0:exon1;Parent=t0\n"
            "Chr1\tfoo\tCDS\t101\t250\t.\t+\t.\tID=t0:exon1;Parent=t0"
        )
        mono_records = [GFF.GffLine(row) for row in mono_text.split("\n")]
        mono_header = mono_records[0]

        self.assertEqual(mono_header.chrom, "Chr1", mono_header)
        self.transcript1 = Transcript(mono_header)
        for feature in mono_records[1:]:
            self.transcript1.add_exon(feature)
        self.transcript1.finalize()
        self.assertTrue(self.transcript1.monoexonic)
        self.assertEqual(self.transcript1.chrom, mono_header.chrom)

        multi_text = (
            "Chr1\tfoo\ttranscript\t101\t600\t.\t+\t.\tID=t1\n"
            "Chr1\tfoo\texon\t101\t200\t.\t+\t.\tID=t1:exon1;Parent=t1\n"
            "Chr1\tfoo\texon\t301\t400\t.\t+\t.\tID=t1:exon2;Parent=t1\n"
            "Chr1\tfoo\texon\t501\t600\t.\t+\t.\tID=t1:exon3;Parent=t1"
        )
        multi_records = [GFF.GffLine(row) for row in multi_text.split("\n")]
        last_exon = multi_records[-1]

        self.transcript2 = Transcript(multi_records[0], logger=self.logger)
        # Deliberately leave the last exon out at first.
        for feature in multi_records[1:-1]:
            self.transcript2.add_exon(feature)

        # Test that a transcript cannot be finalized if
        # the exons do not define the external boundaries
        with self.assertLogs("null", level="WARNING") as _:
            self.transcript2.finalize()
        with self.assertRaises(exceptions.ModificationError):
            self.transcript2.add_exon(last_exon)

        # Re-open the transcript, restore its boundaries and complete it.
        self.transcript2.finalized = False
        self.transcript2.start = 101
        self.transcript2.end = 600
        self.transcript2.add_exon(last_exon)
        self.transcript2.finalize()
        self.assertFalse(self.transcript2.monoexonic)
        self.assertEqual(self.transcript2.exon_num, len(multi_records) - 1)

        # Test that trying to modify a transcript after it has been finalized causes errors
        with self.assertRaises(exceptions.ModificationError):
            for feature in multi_records[1:]:
                self.transcript2.add_exon(feature)

        # Test that creating a superlocus without configuration fails
        with self.assertRaises(exceptions.NoJsonConfigError):
            _ = Superlocus(self.transcript1)

        config_path = os.path.join(os.path.dirname(__file__), "configuration.yaml")
        self.my_json = configurator.to_json(config_path)
        self.assertIn("scoring", self.my_json, self.my_json.keys())

    def test_locus(self):
        """Basic testing of the Locus functionality."""

        logger = create_null_logger("null")
        logger.setLevel("WARNING")
        logger.info("Started")

        slocus = Superlocus(self.transcript1, json_conf=self.my_json, logger=logger)
        slocus.add_transcript_to_locus(self.transcript2)

        expected_start = min(self.transcript1.start, self.transcript2.start)
        expected_end = max(self.transcript1.end, self.transcript2.end)
        self.assertEqual(slocus.strand, self.transcript1.strand)
        self.assertEqual(slocus.start, expected_start)
        self.assertEqual(slocus.end, expected_end)

        logger.info(slocus.transcripts)
        slocus.define_subloci()
        logger.info(slocus.subloci)
        logger.info(slocus.transcripts)
        self.assertEqual(len(slocus.transcripts), 2)
        self.assertEqual(len(slocus.subloci), 2)

        slocus.define_monosubloci()
        self.assertEqual(len(slocus.monosubloci), 2)

        slocus.define_loci()
        self.assertEqual(len(slocus.loci), 1)
        first_locus_id = list(slocus.loci.keys())[0]
        first_transcript_id = list(slocus.loci[first_locus_id].transcripts.keys())[0]
        self.assertEqual(first_transcript_id, "t0")

        minus_text = (
            "Chr1\tfoo\ttranscript\t101\t200\t.\t-\t.\tID=tminus0\n"
            "Chr1\tfoo\texon\t101\t200\t.\t-\t.\tID=tminus0:exon1;Parent=tminus0"
        )
        minus_records = [GFF.GffLine(row) for row in minus_text.split("\n")]

        transcript3 = Transcript(minus_records[0])
        for feature in minus_records[1:]:
            transcript3.add_exon(feature)
        transcript3.finalize()

        minusuperlocus = Superlocus(transcript3, json_conf=self.my_json)
        minusuperlocus.define_loci()
        self.assertEqual(len(minusuperlocus.loci), 1)
        self.assertTrue(transcript3.strand != self.transcript1.strand)
class ASeventsTester(unittest.TestCase):
    """Tests for the alternative-splicing checks performed by a Locus."""

    logger = create_null_logger("ASevents")

    # Exon/CDS layout shared by the "G2.1" candidate used in several tests.
    _G2_EXONS = [(101, 500), (601, 700), (1001, 1300), (1401, 1460), (1501, 1600)]
    _G2_CDS = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]

    @staticmethod
    def _make_transcript(tid, parent, start, end, exons, cds, score=20, logger=None):
        """Build and finalize a plus-strand Chr1 transcript.

        Attributes are assigned in the same order the tests need them:
        coordinates first, then exons, then CDS, then the optional logger.
        """
        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.score = score
        transcript.id = tid
        transcript.parent = parent
        transcript.start = start
        transcript.end = end
        transcript.add_exons(list(exons), "exon")
        transcript.add_exons(list(cds), "CDS")
        if logger is not None:
            transcript.logger = logger
        transcript.finalize()
        return transcript

    def _make_g2(self, score=20):
        """Return the "G2.1" candidate isoform (101-1600, five exons)."""
        return self._make_transcript("G2.1", "G2", 101, 1600,
                                     self._G2_EXONS, self._G2_CDS, score=score)

    def setUp(self):
        """Create the configuration and a locus seeded with transcript G1.1."""

        self.conf = {
            "pick": {
                "alternative_splicing": {
                    "max_utr_length": 10000,
                    "max_fiveutr_length": 10000,
                    "max_threeutr_length": 10000,
                    "valid_ccodes": ["j", "J", "O", "mo"],
                    "redundant_ccodes": ["c", "=", "_", "m"],
                    "only_confirmed_introns": False,
                    "min_score_perc": 0.5,
                    "keep_retained_introns": True,
                    "min_cdna_overlap": 0.2,
                    "min_cds_overlap": 0.2,
                    "max_isoforms": 3,
                }
            }
        }

        self.t1 = self._make_transcript(
            "G1.1", "G1", 101, 1500,
            exons=[(101, 500), (601, 700), (1001, 1300), (1401, 1500)],
            cds=[(401, 500), (601, 700), (1001, 1300), (1401, 1440)])

        self.locus = Locus(self.t1)
        self.locus.logger = self.logger
        self.locus.json_conf = self.conf

    def test_not_intersecting(self):
        """A transcript contained in the primary one is rejected (code "c")."""

        # This one is contained and should be rejected
        contained = self._make_transcript(
            "G1.1", "G1", 601, 1420,
            exons=[(601, 700), (1001, 1300), (1401, 1420)],
            cds=[(601, 700), (1001, 1300), (1401, 1420)])

        outcome = self.locus.is_alternative_splicing(contained)[:2]
        self.assertEqual(outcome, (False, "c"))

    def test_valid_as(self):
        """An isoform with an extra terminal exon is accepted (code "J")."""

        candidate = self._make_g2()

        outcome = self.locus.is_alternative_splicing(candidate)[:2]
        self.assertEqual(outcome, (True, "J"))

        self.locus.add_transcript_to_locus(candidate)
        self.assertEqual(len(self.locus.transcripts), 2, self.locus.transcripts)

    def test_redundant_as(self):
        """An isoform redundant with one already accepted is not added."""

        accepted = self._make_g2()
        self.locus.add_transcript_to_locus(accepted)
        self.assertEqual(len(self.locus.transcripts), 2, self.locus.transcripts)

        redundant = self._make_transcript(
            "G3.1", "G3", 201, 1630,
            exons=[(201, 500), (601, 700), (1001, 1300), (1401, 1460), (1501, 1630)],
            cds=[(401, 500), (601, 700), (1001, 1300), (1401, 1440)])

        outcome = self.locus.is_alternative_splicing(redundant)[:2]
        self.assertEqual(outcome, (False, "J"))

        self.locus.add_transcript_to_locus(redundant)
        self.assertEqual(len(self.locus.transcripts), 2, self.locus.transcripts)

    def test_non_redundant_as(self):
        """An isoform with different internal junctions is added (code "j")."""

        accepted = self._make_g2()
        self.locus.add_transcript_to_locus(accepted)
        self.assertEqual(len(self.locus.transcripts), 2, self.locus.transcripts)

        novel = self._make_transcript(
            "G3.1", "G3", 201, 1630,
            exons=[(201, 500), (601, 670), (1031, 1300), (1401, 1460), (1501, 1630)],
            cds=[(401, 500), (601, 670), (1031, 1300), (1401, 1440)],
            logger=self.logger)

        outcome = self.locus.is_alternative_splicing(novel)[:2]
        self.assertEqual(outcome, (True, "j"))

        self.locus.add_transcript_to_locus(novel)
        self.assertEqual(len(self.locus.transcripts), 3, self.locus.transcripts)

    def test_lowscore(self):
        """Adding a candidate with score 1 leaves two transcripts in the locus."""

        low_scoring = self._make_g2(score=1)

        self.locus.add_transcript_to_locus(low_scoring)
        self.assertEqual(len(self.locus.transcripts), 2, self.locus.transcripts)