class TestRetrieval(unittest.TestCase):
    """ORF-retrieval checks on a single-exon transcript whose strand is None."""

    def setUp(self):
        """Build the transcript under test and attach the parsed configuration."""
        self.tr = transcript = Transcript()
        transcript.chrom, transcript.start, transcript.end, transcript.strand = (
            "Chr1", 101, 2000, None)
        transcript.add_exons([(101, 2000)])
        transcript.id, transcript.parent = "test1", "gene1"
        transcript.finalize()

        config_path = os.path.join(os.path.dirname(__file__),
                                   "configuration.yaml")
        conf = to_json(config_path)
        # Sanity-check the chimera-split settings the test relies on.
        chimera_split = conf["pick"]["chimera_split"]
        self.assertTrue(chimera_split["blast_check"])
        self.assertTrue(chimera_split["execute"])
        self.assertEqual(chimera_split["blast_params"]["leniency"], "LENIENT")
        conf["pick"]["orf_loading"]["minimal_secondary_orf_length"] = 50
        transcript.json_conf = conf

    def test_load_pos_and_neg(self):
        """Give the transcript a "+" and a "-" ORF and check what gets loaded.

        Both ORFs must be returned by the overlap check, while after
        loading a single internal ORF of length 90 is expected.
        """
        plus_orf = BED12(transcriptomic=True)
        plus_orf.chrom = self.tr.id
        plus_orf.start, plus_orf.end = 0, self.tr.cdna_length - 1
        plus_orf.strand, plus_orf.name = "+", "first"
        plus_orf.thick_start, plus_orf.thick_end = 101, 190
        self.assertFalse(plus_orf.invalid)

        minus_orf = plus_orf.copy()
        minus_orf.strand = "-"
        minus_orf.thick_start, minus_orf.thick_end = 1, 87
        minus_orf.name = "second"
        self.assertFalse(minus_orf.invalid)

        candidates = [plus_orf, minus_orf]
        # The overlap check is expected to log on the "null" logger.
        with self.assertLogs("null", "DEBUG"):
            after_overlap_check = retrieval.find_overlapping_cds(
                self.tr, candidates)

        self.assertEqual(len(after_overlap_check), 2,
                         self.tr.json_conf["pick"]["orf_loading"])
        self.assertEqual(after_overlap_check,
                         [plus_orf, minus_orf],
                         [orf.name for orf in after_overlap_check])

        retrieval.load_orfs(self.tr, [plus_orf, minus_orf])
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(self.tr.combined_cds_start, 201,
                         self.tr.combined_cds_start)
        self.assertEqual(self.tr.combined_cds_length, 90)

    def test_connect(self):
        """Connect the transcript to its database and build an inspector.

        The visible code makes no assertion on the inspector itself.
        """
        retrieval._connect_to_db(self.tr)
        engine = self.tr.engine
        reflector = reflection.Inspector.from_engine(engine)
class ExternalTester(unittest.TestCase):
    """Check that external scores survive a deep copy of the transcript."""

    def setUp(self):
        """Create the six-exon, plus-strand transcript used by the tests."""
        self.transcript = tr = Transcript()
        tr.chrom, tr.source = "15", "protein_coding"
        tr.start, tr.end = 47631264, 48051999
        exon_coords = [
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ]
        tr.strand = "+"
        tr.add_exons(exon_coords)
        tr.id, tr.parent = "ENST00000560636", "ENSG00000137872"

    def test_copying(self):
        """Scores set on the original must be readable on the copy too."""
        expected = {"test": 0, "test1": 1}
        self.transcript.external_scores.update(dict(expected))
        for name, value in expected.items():
            self.assertEqual(
                getattr(self.transcript.external_scores, name), value)

        duplicate = self.transcript.deepcopy()
        for name, value in expected.items():
            self.assertEqual(getattr(duplicate.external_scores, name), value)
def test_intronMatch(self):
    """self.t1 and a five-exon coding transcript must be reported as
    intersecting, both by default and with cds_only=True."""
    other = Transcript()
    other.chrom, other.strand, other.score = "Chr1", "+", 1
    other.id, other.parent = "G2.1", "G2"
    other.start, other.end = 101, 1600
    exon_coords = [(101, 500), (601, 700), (1001, 1320),
                   (1451, 1460), (1501, 1600)]
    cds_coords = [(401, 500), (601, 700), (1001, 1320),
                  (1451, 1460), (1501, 1510)]
    other.add_exons(exon_coords, "exon")
    other.add_exons(cds_coords, "CDS")
    other.finalize()

    for candidate in (self.t1, other):
        self.assertTrue(candidate.is_coding)

    for extra in ({}, {"cds_only": True}):
        self.assertTrue(
            MonosublocusHolder.is_intersecting(
                self.t1, other, logger=self.logger, **extra))
def test_regression(self):
    """Check start-codon regression on a BED12 ORF and its loading.

    A transcriptomic BED12 line whose thick start is 2 is parsed with
    ``max_regression=0.2`` against an in-memory FASTA index; the test
    expects the thick start to move to 27, with a start codon and no
    stop codon.  The ORF is then loaded onto a transcript built from GTF
    lines and the resulting CDS boundaries are checked.
    """
    # NOTE(review): the pieces of this literal are separated by single
    # spaces in this copy of the file, while the code below strips "\n"
    # only - presumably the separators were newlines; confirm upstream.
    sequence = """TC CTCACAGTTACTATAAGCTCGTCT ATGGCCAGAGACGGTGGTGTTTCTTGTTTACGAA GGTCGGAGATGATGAGCGTCGGTGGTATCGGAGGAATTGAATCTGCGCCGTTGGATTTAG ATGAAGTTCATGTCTTAGCCGTTGATGACAGTCTCGTTGATCGTATTGTCATCGAGAGAT TGCTTCGTATTACTTCCTGCAAAGTTACGGCGGTAGATAGTGGATGGCGTGCTCTGGAAT TTCTAGGGTTAGATAATGAGAAAGCTTCTGCTGAATTCGATAGATTGAAAGTTGATTTGA TCATCACTGATTACTGTATGCCTGGAATGACTGGTTATGAGCTTCTCAAGAAGATTAAGG AATCGTCCAATTTCAGAGAAGTTCCGGTTGTAATCATGTCGTCGGAGAATGTATTGACCA GAATCGACAGATGCCTTGAGGAAGGTGCTCAAGATTTCTTATTGAAACCGGTGAAACTCG CCGACGTGAAACGTCTGAGAAGTCATTTAACTAAAGACGTTAAACTTTCCAACGGAAACA AACGGAAGCTTCCGGAAGATTCTAGTTCCGTTAACTCTTCGCTTCCTCCACCGTCACCTC CGTTGACTATCTCGCCTGA"""
    # Index the record by its id so that BED12 can look the sequence up.
    record = SeqRecord.SeqRecord(Seq.Seq(sub("\n", "", sequence)), id="class_Chr1.1006.0")
    index = {record.id: record}
    line = "\t".join(
        ['class_Chr1.1006.0', '0', '619', 'ID=class_Chr1.1006.0|m.22308;class_Chr1.1006.0|g.22308;ORF_class_Chr1.1006.0|g.22308_class_Chr1.1006.0|m.22308_type:internal_len:206_(+)', '0', '+', '2', '617', '0', '1', '619', '0'])
    # Now we are going back to find the start codon
    bed_line = bed12.BED12(line, transcriptomic=True, fasta_index=index, max_regression=0.2)
    self.assertFalse(bed_line.invalid, bed_line.invalid_reason)
    self.assertEqual(bed_line.phase, 0)
    # Start codon in frame found at location 27
    self.assertEqual(bed_line.thick_start, 27)
    self.assertTrue(bed_line.has_start_codon)
    self.assertFalse(bed_line.has_stop_codon)
    # NOTE(review): the GTF records below are consumed via split("\n"),
    # yet appear space-separated here (field separators included) -
    # whitespace looks collapsed in this copy; confirm upstream.
    lines = """Chr1 CLASS transcript 3442811 3443785 1000 - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; exon_number "1"; Abundance "22.601495"; canonical_proportion "1.0"; Chr1 CLASS exon 3442811 3442999 . - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; Chr1 CLASS exon 3443099 3443169 . - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; Chr1 CLASS exon 3443252 3443329 . - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; Chr1 CLASS exon 3443417 3443493 . - . 
gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; Chr1 CLASS exon 3443582 3443785 . - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";"""
    # The first parsed line seeds the transcript, the rest are its exons.
    lines = [GTF.GtfLine(_) for _ in lines.split("\n") if _]
    transcript = Transcript(lines[0])
    transcript.add_exons(lines[1:])
    transcript.finalize()
    transcript.load_orfs([bed_line])
    self.assertTrue(transcript.is_coding)
    self.assertTrue(transcript.has_start_codon)
    self.assertFalse(transcript.has_stop_codon)
    # The CDS is expected to run from 26 bases inside the transcript end
    # down to the transcript start.
    self.assertEqual(transcript.selected_cds_end, transcript.start)
    self.assertEqual(transcript.selected_cds_start, transcript.end - 26)
def test_wrong_cds(self):
    """Trim a transcript that carries a single-base CDS feature.

    Finalising is expected to log at WARNING level on the "null" logger;
    trimming with max_length=50 must then yield the asserted boundaries.
    """
    transcript = Transcript()
    transcript.chrom, transcript.source = "15", "protein_coding"
    transcript.start, transcript.end = 47631264, 48051999
    exon_coords = [
        (47631264, 47631416),
        (47704590, 47704669),
        (47762671, 47762742),
        (47893062, 47893093),
        (47895572, 47895655),
        (48051942, 48051999),
    ]
    transcript.strand = "+"
    transcript.add_exons(exon_coords)
    transcript.id, transcript.parent = "ENST00000560636", "ENSG00000137872"

    cds_fields = (
        "15", "protein_coding", "CDS", "48051996", "48051996",
        ".", "+", "0",
        "ID=ENST00000560636.cds1;Parent=ENST00000560636",
    )
    transcript.add_exon(GffLine("\t".join(cds_fields)))

    null_logger = Mikado.utilities.log_utils.create_null_logger()
    transcript.logger = null_logger
    with self.assertLogs("null", level="WARNING"):
        transcript.finalize()

    trimmed = trim_coding(transcript, null_logger, max_length=50)
    self.assertEqual(trimmed.start, 47631366)
    self.assertEqual(trimmed.end, 48051992)
def test_non_coding_negative(self):
    """Compare the GFF and GTF output (introns included) of a two-exon,
    minus-strand transcript without CDS against literal expected text."""
    tr = Transcript()
    tr.chrom = "Chr1"
    tr.start = 101
    tr.end = 2000
    tr.strand = "-"
    tr.add_exons([(101, 300), (1701, 2000)])
    tr.id = "test1"
    tr.parent = "gene1"
    tr.finalize()
    gff = tr.format("gff", with_introns=True)
    # Show the full diff when the comparison fails.
    self.maxDiff = None
    # NOTE(review): the records in the expected literals below are
    # separated by single spaces in this copy of the file; presumably
    # they were newline-separated - confirm upstream.
    res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1 Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1 Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1 Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
    self.assertEqual(gff, res, "++++\n\n"+"\n+++\n".join([gff, res]))
    gtf = tr.format("gtf", with_introns=True)
    res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
    self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
def test_coding_negative(self):
    """Compare the GFF and GTF output (introns included) of a two-exon,
    fully coding, minus-strand transcript against literal expected text."""
    tr = Transcript()
    tr.chrom = "Chr1"
    tr.start = 101
    tr.end = 2000
    tr.strand = "-"
    tr.add_exons([(101, 300), (1701, 2000)])
    tr.add_exons([(101, 300), (1701, 2000)], features="CDS")
    tr.id = "test1"
    tr.parent = "gene1"
    # Phase 0, 0 because the first CDS exon is 300bp
    gff = tr.format("gff", with_introns=True)
    # Show the full diff when the comparison fails.
    self.maxDiff = None
    # NOTE(review): the records in the expected literals below are
    # separated by single spaces in this copy of the file; presumably
    # they were newline-separated - confirm upstream.
    res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1;Name=test1 Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tID=test1.CDS1;Parent=test1 Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1 Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1 Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tID=test1.CDS2;Parent=test1 Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
    # On failure the message also lists the transcript's internal ORFs.
    self.assertEqual(
        gff, res,
        "++++\n\n" + "\n+++\n".join(
            [gff, res, ",\t".join([str(_) for _ in tr.internal_orfs])]))
    gtf = tr.format("gtf", with_introns=True)
    res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1"; Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
    self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
def test_non_coding_negative(self):
    """Compare the GFF and GTF output (introns included) of a two-exon,
    minus-strand transcript without CDS against literal expected text."""
    tr = Transcript()
    tr.chrom = "Chr1"
    tr.start = 101
    tr.end = 2000
    tr.strand = "-"
    tr.add_exons([(101, 300), (1701, 2000)])
    tr.id = "test1"
    tr.parent = "gene1"
    tr.finalize()
    gff = tr.format("gff", with_introns=True)
    # Show the full diff when the comparison fails.
    self.maxDiff = None
    # NOTE(review): the records in the expected literals below are
    # separated by single spaces in this copy of the file; presumably
    # they were newline-separated - confirm upstream.
    res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1 Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1 Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1 Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
    self.assertEqual(gff, res, "++++\n\n" + "\n+++\n".join([gff, res]))
    gtf = tr.format("gtf", with_introns=True)
    res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
    self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
class TestRetrieval(unittest.TestCase):
    """ORF-retrieval checks on a single-exon transcript whose strand is None."""

    def setUp(self):
        """Build the transcript under test and attach the parsed configuration."""
        self.tr = transcript = Transcript()
        transcript.chrom, transcript.start, transcript.end, transcript.strand = (
            "Chr1", 101, 2000, None)
        transcript.add_exons([(101, 2000)])
        transcript.id, transcript.parent = "test1", "gene1"
        transcript.finalize()

        config_path = os.path.join(os.path.dirname(__file__),
                                   "configuration.yaml")
        conf = to_json(config_path)
        # Sanity-check the chimera-split settings the test relies on.
        chimera_split = conf["pick"]["chimera_split"]
        self.assertTrue(chimera_split["blast_check"])
        self.assertTrue(chimera_split["execute"])
        self.assertEqual(chimera_split["blast_params"]["leniency"], "LENIENT")
        conf["pick"]["orf_loading"]["minimal_secondary_orf_length"] = 50
        transcript.json_conf = conf

    def test_load_pos_and_neg(self):
        """Give the transcript a "+" and a "-" ORF and check what gets loaded.

        Both ORFs must be returned by the overlap check, while after
        loading a single internal ORF of length 90 is expected.
        """
        plus_orf = BED12(transcriptomic=True)
        plus_orf.chrom = self.tr.id
        plus_orf.start, plus_orf.end = 0, self.tr.cdna_length - 1
        plus_orf.strand, plus_orf.name = "+", "first"
        plus_orf.thick_start, plus_orf.thick_end = 101, 190
        self.assertFalse(plus_orf.invalid)

        minus_orf = plus_orf.copy()
        minus_orf.strand = "-"
        minus_orf.thick_start, minus_orf.thick_end = 1, 87
        minus_orf.name = "second"
        self.assertFalse(minus_orf.invalid)

        candidates = [plus_orf, minus_orf]
        # The overlap check is expected to log on the "null" logger.
        with self.assertLogs("null", "DEBUG"):
            after_overlap_check = retrieval.find_overlapping_cds(
                self.tr, candidates)

        self.assertEqual(len(after_overlap_check), 2,
                         self.tr.json_conf["pick"]["orf_loading"])
        self.assertEqual(after_overlap_check,
                         [plus_orf, minus_orf],
                         [orf.name for orf in after_overlap_check])

        retrieval.load_orfs(self.tr, [plus_orf, minus_orf])
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(self.tr.combined_cds_start, 201,
                         self.tr.combined_cds_start)
        self.assertEqual(self.tr.combined_cds_length, 90)

    def test_connect(self):
        """Connect the transcript to its database and build an inspector.

        The visible code makes no assertion on the inspector itself.
        """
        retrieval._connect_to_db(self.tr)
        engine = self.tr.engine
        reflector = reflection.Inspector.from_engine(engine)
def test_coding_positive(self):
    """Compare the GFF and GTF output (introns included) of a two-exon,
    fully coding, plus-strand transcript against literal expected text.

    The expected value in the phase column of the second CDS differs
    between the two formats (1 in the GFF text, 2 in the GTF text).
    """
    tr = Transcript()
    tr.chrom = "Chr1"
    tr.start = 101
    tr.end = 2000
    tr.strand = "+"
    tr.add_exons([(101, 300), (1701, 2000)])
    tr.add_exons([(101, 300), (1701, 2000)], features="CDS")
    tr.id = "test1"
    tr.parent = "gene1"
    gff = tr.format("gff", with_introns=True)
    # Show the full diff when the comparison fails.
    self.maxDiff = None
    # NOTE(review): the records in the expected literals below are
    # separated by single spaces in this copy of the file; presumably
    # they were newline-separated - confirm upstream.
    res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tID=test1;Parent=gene1;Name=test1 Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tID=test1.CDS1;Parent=test1 Chr1\tMikado\texon\t101\t300\t.\t+\t.\tID=test1.exon1;Parent=test1 Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tID=test1.intron1;Parent=test1 Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t1\tID=test1.CDS2;Parent=test1 Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tID=test1.exon2;Parent=test1"""
    self.assertEqual(gff, res, "++++\n\n" + "\n+++\n".join([gff, res]))
    gtf = tr.format("gtf", with_introns=True)
    res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1"; Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t101\t300\t.\t+\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t2\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";"""
    self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
def test_noncoding(self):
    """Trim a four-exon non-coding transcript with two different limits
    and check the resulting start/end coordinates."""
    transcript = Transcript()
    transcript.chrom, transcript.source = "Chr1", "test"
    transcript.start, transcript.end = 10000, 20000
    exon_coords = [(10000, 11500), (12000, 13000),
                   (15000, 18000), (19000, 20000)]
    transcript.add_exons(exon_coords)
    transcript.strand = "+"
    transcript.finalize()
    logger = Mikado.utilities.log_utils.create_null_logger("correct_cds2")

    # (max_length, expected start, expected end)
    scenarios = ((50, 11450, 19050), (200, 11300, 19200))
    for limit, expected_start, expected_end in scenarios:
        # Always trim a fresh copy of the untouched transcript.
        trimmed = trim_noncoding(transcript.deepcopy(), max_length=limit)
        self.assertEqual(trimmed.start, expected_start)
        self.assertEqual(trimmed.end, expected_end)
def test_regression(self):
    """Check start-codon regression on a BED12 ORF and its loading.

    A transcriptomic BED12 line whose thick start is 2 is parsed with
    ``max_regression=0.2`` against an in-memory FASTA index; the test
    expects the thick start to move to 27, with a start codon and no
    stop codon.  The ORF is then loaded onto a transcript built from GTF
    lines and the resulting CDS boundaries are checked.
    """
    # NOTE(review): the pieces of this literal are separated by single
    # spaces in this copy of the file, while the code below strips "\n"
    # only - presumably the separators were newlines; confirm upstream.
    sequence = """TC CTCACAGTTACTATAAGCTCGTCT ATGGCCAGAGACGGTGGTGTTTCTTGTTTACGAA GGTCGGAGATGATGAGCGTCGGTGGTATCGGAGGAATTGAATCTGCGCCGTTGGATTTAG ATGAAGTTCATGTCTTAGCCGTTGATGACAGTCTCGTTGATCGTATTGTCATCGAGAGAT TGCTTCGTATTACTTCCTGCAAAGTTACGGCGGTAGATAGTGGATGGCGTGCTCTGGAAT TTCTAGGGTTAGATAATGAGAAAGCTTCTGCTGAATTCGATAGATTGAAAGTTGATTTGA TCATCACTGATTACTGTATGCCTGGAATGACTGGTTATGAGCTTCTCAAGAAGATTAAGG AATCGTCCAATTTCAGAGAAGTTCCGGTTGTAATCATGTCGTCGGAGAATGTATTGACCA GAATCGACAGATGCCTTGAGGAAGGTGCTCAAGATTTCTTATTGAAACCGGTGAAACTCG CCGACGTGAAACGTCTGAGAAGTCATTTAACTAAAGACGTTAAACTTTCCAACGGAAACA AACGGAAGCTTCCGGAAGATTCTAGTTCCGTTAACTCTTCGCTTCCTCCACCGTCACCTC CGTTGACTATCTCGCCTGA"""
    # Index the record by its id so that BED12 can look the sequence up.
    record = SeqRecord.SeqRecord(Seq.Seq(sub("\n", "", sequence)), id="class_Chr1.1006.0")
    index = {record.id: record}
    line = "\t".join([
        'class_Chr1.1006.0', '0', '619', 'ID=class_Chr1.1006.0|m.22308;class_Chr1.1006.0|g.22308;ORF_class_Chr1.1006.0|g.22308_class_Chr1.1006.0|m.22308_type:internal_len:206_(+)', '0', '+', '2', '617', '0', '1', '619', '0'
    ])
    # Now we are going back to find the start codon
    bed_line = bed12.BED12(line, transcriptomic=True, fasta_index=index, max_regression=0.2)
    self.assertFalse(bed_line.invalid, bed_line.invalid_reason)
    self.assertEqual(bed_line.phase, 0)
    # Start codon in frame found at location 27
    self.assertEqual(bed_line.thick_start, 27)
    self.assertTrue(bed_line.has_start_codon)
    self.assertFalse(bed_line.has_stop_codon)
    # NOTE(review): the GTF records below are consumed via split("\n"),
    # yet appear space-separated here (field separators included) -
    # whitespace looks collapsed in this copy; confirm upstream.
    lines = """Chr1 CLASS transcript 3442811 3443785 1000 - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; exon_number "1"; Abundance "22.601495"; canonical_proportion "1.0"; Chr1 CLASS exon 3442811 3442999 . - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; Chr1 CLASS exon 3443099 3443169 . - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; Chr1 CLASS exon 3443252 3443329 . - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; Chr1 CLASS exon 3443417 3443493 . - . 
gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; Chr1 CLASS exon 3443582 3443785 . - . gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";"""
    # The first parsed line seeds the transcript, the rest are its exons.
    lines = [GTF.GtfLine(_) for _ in lines.split("\n") if _]
    transcript = Transcript(lines[0])
    transcript.add_exons(lines[1:])
    transcript.finalize()
    transcript.load_orfs([bed_line])
    self.assertTrue(transcript.is_coding)
    self.assertTrue(transcript.has_start_codon)
    self.assertFalse(transcript.has_stop_codon)
    # The CDS is expected to run from 26 bases inside the transcript end
    # down to the transcript start.
    self.assertEqual(transcript.selected_cds_end, transcript.start)
    self.assertEqual(transcript.selected_cds_start, transcript.end - 26)
def test_mixed_strands(self):
    """Verify that no retained intron is called if the strands are mixed."""
    forward = Transcript()
    forward.chrom, forward.strand, forward.id = 1, "+", "t1"
    forward.add_exons([(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
    forward_cds = [
        (201, 500),    # 300 bp
        (801, 1000),   # 200 bp
        (1201, 1300),  # 100 bp
        (1501, 1530),  # 30 bp
    ]
    forward.add_exons(forward_cds, features="CDS")
    forward.finalize()

    reverse = Transcript()
    reverse.chrom, reverse.strand, reverse.id = 1, "-", "t2"
    reverse.add_exons([(601, 1000), (1201, 1300), (1501, 1800)])
    reverse_cds = [
        (1501, 1530),  # 30 bp
        (1201, 1300),  # 100 bp
        (771, 1000),   # 230 bp
    ]
    reverse.add_exons(reverse_cds, features="CDS")
    reverse.finalize()

    # The superlocus is built unstranded so that both transcripts fit.
    locus = Superlocus(forward, json_conf=self.my_json, stranded=False)
    locus.add_transcript_to_locus(reverse)
    locus.find_retained_introns(reverse)

    self.assertEqual(locus.transcripts["t2"].retained_intron_num, 0)
def test_noIntronOverlap(self):
    """After stripping the CDS from self.t1, a two-exon non-coding
    transcript spanning 1250-2000 must not be reported as intersecting."""
    self.t1.strip_cds()

    other = Transcript()
    other.chrom, other.strand, other.score = "Chr1", "+", 1
    other.id, other.parent = "G2.1", "G2"
    other.start, other.end = 1250, 2000
    other.add_exons([(1250, 1560), (1800, 2000)])
    other.finalize()

    intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertFalse(intersecting)
def test_not_retained_neg(self):
    """Here we verify that a false retained intron is not called as such"""
    reference = Transcript()
    reference.chrom, reference.strand, reference.id = 1, "-", "t1"
    reference.add_exons(
        [(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
    reference_cds = [
        (201, 500),    # 300 bp
        (801, 1000),   # 200 bp
        (1201, 1300),  # 100 bp
        (1501, 1530),  # 30 bp
    ]
    reference.add_exons(reference_cds, features="CDS")
    reference.finalize()

    candidate = Transcript()
    candidate.chrom, candidate.strand, candidate.id = 1, "-", "t2"
    candidate.add_exons([(301, 1000), (1201, 1300), (1501, 1800)])
    candidate_cds = [
        (1501, 1530),  # 30 bp
        (1201, 1300),  # 100 bp
        (471, 1000),   # 530 bp
    ]
    candidate.add_exons(candidate_cds, features="CDS")
    candidate.finalize()

    locus = Superlocus(reference, json_conf=self.my_json)
    locus.add_transcript_to_locus(candidate)
    locus.find_retained_introns(candidate)

    self.assertEqual(locus.transcripts["t2"].retained_intron_num, 0)
def test_exon_switching_pos(self):
    """Checking that an exon switching is treated correctly as a NON-retained intron. Positive strand case"""
    # Both transcripts share their first three exons and differ only in
    # the position of the last one.
    shared_exons = [(101, 500), (801, 1000), (1201, 1300)]
    shared_cds = [
        (201, 500),    # 300 bp
        (801, 1000),   # 200 bp
        (1201, 1300),  # 100 bp
    ]

    reference = Transcript()
    reference.chrom, reference.strand, reference.id = 1, "+", "t1"
    reference.add_exons(shared_exons + [(2501, 2800)])
    reference.add_exons(shared_cds + [(2501, 2530)],  # 30 bp
                        features="CDS")
    reference.finalize()

    candidate = Transcript()
    candidate.chrom, candidate.strand, candidate.id = 1, "+", "t2"
    candidate.add_exons(shared_exons + [(1501, 1800)])
    candidate.add_exons(shared_cds + [(1501, 1530)],  # 30 bp
                        features="CDS")
    candidate.finalize()

    locus = Superlocus(reference, json_conf=self.my_json)
    locus.add_transcript_to_locus(candidate)
    locus.find_retained_introns(candidate)

    self.assertEqual(locus.transcripts["t2"].retained_intron_num, 0)
def test_no_overlap(self):
    """A coding transcript spanning 1600-2000 must not be reported as
    intersecting self.t1."""
    other = Transcript()
    other.chrom, other.strand, other.score = "Chr1", "+", 1
    other.id, other.parent = "G2.1", "G2"
    other.start, other.end = 1600, 2000
    exon_coords = [(1600, 1700), (1801, 2000)]
    cds_coords = [(1661, 1700), (1801, 1850)]
    other.add_exons(exon_coords)
    other.add_exons(cds_coords, "CDS")
    other.finalize()

    intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertFalse(intersecting)
def test_intronOverlap(self):
    """After stripping the CDS from self.t1, a four-exon non-coding
    transcript spanning 101-1470 must be reported as intersecting."""
    self.t1.strip_cds()

    other = Transcript()
    other.chrom, other.strand, other.score = "Chr1", "+", 1
    other.id, other.parent = "G2.1", "G2"
    other.start, other.end = 101, 1470
    other.add_exons([(101, 510), (601, 700), (960, 1350), (1420, 1470)])
    other.finalize()

    intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertTrue(intersecting)
def test_same_id(self):
    """A transcript carrying the ID "G1.1" must not be reported as
    intersecting self.t1."""
    other = Transcript()
    other.chrom, other.strand, other.score = "Chr1", "+", 1
    other.id, other.parent = "G1.1", "G1"
    other.start, other.end = 1250, 2000
    exon_coords = [(1250, 1560), (1801, 2000)]
    cds_coords = [(1401, 1560), (1801, 1850)]
    other.add_exons(exon_coords)
    other.add_exons(cds_coords, "CDS")
    other.finalize()

    # This fails because they have the same ID
    intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertFalse(intersecting)
def test_not_intersecting(self):
    """The locus must refuse this transcript as alternative splicing,
    answering (False, "c")."""
    # This one is contained and should be rejected
    candidate = Transcript()
    candidate.chrom, candidate.strand, candidate.score = "Chr1", "+", 20
    candidate.id, candidate.parent = "G1.1", "G1"
    candidate.start, candidate.end = 601, 1420
    # Exons and CDS segments have the same coordinates.
    coords = ((601, 700), (1001, 1300), (1401, 1420))
    candidate.add_exons(list(coords), "exon")
    candidate.add_exons(list(coords), "CDS")
    candidate.finalize()

    verdict = self.locus.is_alternative_splicing(candidate)[:2]
    self.assertEqual(verdict, (False, "c"))
def testCdsOverlap(self):
    """A five-exon coding transcript spanning 101-1600 must be reported
    as intersecting self.t1."""
    other = Transcript()
    other.chrom, other.strand, other.score = "Chr1", "+", 1
    other.id, other.parent = "G2.1", "G2"
    other.start, other.end = 101, 1600
    exon_coords = [(101, 500), (601, 700), (1001, 1300),
                   (1401, 1460), (1501, 1600)]
    cds_coords = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]
    other.add_exons(exon_coords, "exon")
    other.add_exons(cds_coords, "CDS")
    other.finalize()

    intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertTrue(intersecting)
def test_lowscore(self):
    """Add a transcript with score 1 to the locus; two transcripts are
    expected in the locus afterwards."""
    candidate = Transcript()
    candidate.chrom, candidate.strand, candidate.score = "Chr1", "+", 1
    candidate.id, candidate.parent = "G2.1", "G2"
    candidate.start, candidate.end = 101, 1600
    exon_coords = [(101, 500), (601, 700), (1001, 1300),
                   (1401, 1460), (1501, 1600)]
    cds_coords = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]
    candidate.add_exons(exon_coords, "exon")
    candidate.add_exons(cds_coords, "CDS")
    candidate.finalize()

    self.locus.add_transcript_to_locus(candidate)
    self.assertEqual(len(self.locus.transcripts), 2,
                     self.locus.transcripts)
def test_real_retained_pos_noCDS(self):
    """Here we verify that a real retained intron is called as such, even when the transcript lacks a CDS"""
    reference = Transcript()
    reference.chrom, reference.strand, reference.id = 1, "+", "t1"
    reference.add_exons(
        [(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
    reference_cds = [
        (201, 500),    # 300 bp
        (801, 1000),   # 200 bp
        (1201, 1300),  # 100 bp
        (1501, 1530),  # 30 bp
    ]
    reference.add_exons(reference_cds, features="CDS")
    reference.finalize()

    # The candidate has exons only: no CDS feature is added to it.
    candidate = Transcript()
    candidate.chrom, candidate.strand, candidate.id = 1, "+", "t2"
    candidate.add_exons([(101, 500), (801, 1000), (1201, 1600)])
    candidate.finalize()

    locus = Superlocus(reference, json_conf=self.my_json)
    locus.add_transcript_to_locus(candidate)
    locus.find_retained_introns(candidate)

    self.assertEqual(locus.transcripts["t2"].retained_introns,
                     ((1201, 1600), ))
def test_caseNegative(self):
    """Convert a coding minus-strand transcript to BED12 and check every
    field of the result, including its string rendering."""
    tr = Transcript()
    tr.chrom, tr.start, tr.end, tr.strand = "Chr1", 101, 3000, "-"
    tr.id = "test1"
    tr.add_exons([(101, 300), (401, 600), (801, 1200), (2501, 3000)])
    cds_coords = [
        (421, 600),    # 180 bp
        (801, 1200),   # 400 bp
        (2501, 2700),  # 200 bp; 780 in total, a multiple of 3
    ]
    tr.add_exons(cds_coords, features="CDS")
    with self.assertLogs("null", "DEBUG"):
        tr.finalize()
    self.assertTrue(tr.is_coding)

    expected_sizes = [200, 200, 400, 500]
    expected_starts = [0, 300, 700, 2400]

    b12 = tr.as_bed12()
    # On the minus strand the thick interval is the reversed CDS interval.
    self.assertEqual(b12.thick_start, tr.combined_cds_end)
    self.assertEqual(b12.thick_end, tr.combined_cds_start)
    self.assertEqual(len(b12.block_sizes), tr.exon_num)
    self.assertEqual(b12.block_sizes, expected_sizes, b12.block_sizes)
    self.assertEqual(b12.strand, "-")
    self.assertEqual(b12.block_starts, expected_starts, b12.block_starts)
    self.assertEqual(tr.format("bed12"), str(b12))

    rendered = str(b12)
    expected_fields = [
        "Chr1", 100, 3000, tr.id, 0, tr.strand,
        b12.thick_start - 1, b12.thick_end, 0, 4,
        ",".join(str(size) for size in expected_sizes),
        ",".join(str(offset) for offset in expected_starts),
    ]
    self.assertEqual(rendered,
                     "\t".join(str(field) for field in expected_fields))
def test_noCDSOverlap(self):
    """Two coding transcripts whose CDS spans do not overlap must
    intersect by default but not when cds_only=True."""
    # Replace the CDS of self.t1 with a shorter one.
    self.t1.strip_cds()
    self.assertEqual(self.t1.combined_cds_introns, set())
    self.t1.finalized = False
    self.t1.add_exons([(401, 500), (601, 700), (1001, 1100)], "CDS")
    self.t1.finalize()

    other = Transcript()
    other.logger = self.logger
    other.chrom, other.strand, other.score = "Chr1", "+", 1
    other.id, other.parent = "G2.1", "G2"
    other.start, other.end = 101, 1470
    other.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
    other.add_exons([(1201, 1350), (1421, 1450)], "CDS")
    other.finalize()

    self.assertTrue(self.t1.is_coding)
    self.assertTrue(other.is_coding)

    first_span = (self.t1.combined_cds_start, self.t1.combined_cds_end)
    second_span = (other.combined_cds_start, other.combined_cds_end)
    # The overlap between the two CDS spans must not be positive.
    self.assertGreaterEqual(0,
                            overlap(first_span, second_span),
                            [first_span, second_span])

    whole_transcript = MonosublocusHolder.is_intersecting(
        self.t1, other, logger=self.logger)
    self.assertTrue(whole_transcript)
    coding_only = MonosublocusHolder.is_intersecting(
        self.t1, other, cds_only=True, logger=self.logger)
    self.assertFalse(coding_only)
def test_valid_as(self):
    """The locus must accept this transcript as alternative splicing,
    answering (True, "J"), and hold two transcripts once it is added."""
    candidate = Transcript()
    candidate.chrom, candidate.strand, candidate.score = "Chr1", "+", 20
    candidate.id, candidate.parent = "G2.1", "G2"
    candidate.start, candidate.end = 101, 1600
    exon_coords = [(101, 500), (601, 700), (1001, 1300),
                   (1401, 1460), (1501, 1600)]
    cds_coords = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]
    candidate.add_exons(exon_coords, "exon")
    candidate.add_exons(cds_coords, "CDS")
    candidate.finalize()

    verdict = self.locus.is_alternative_splicing(candidate)[:2]
    self.assertEqual(verdict, (True, "J"))

    self.locus.add_transcript_to_locus(candidate)
    self.assertEqual(len(self.locus.transcripts), 2,
                     self.locus.transcripts)
def test_correct_cds(self):
    """Trim a coding transcript with two different limits and check the
    resulting start/end coordinates."""
    transcript = Transcript()
    transcript.chrom, transcript.source = "Chr1", "test"
    transcript.start, transcript.end = 10000, 20000
    exon_coords = [(10000, 11500), (12000, 13000),
                   (15000, 18000), (19000, 20000)]
    cds_coords = [
        (11400, 11500),  # 101 bp
        (12000, 13000),  # 1001 bp, running total 1102
        (15000, 17998),  # 2999 bp, running total 4101 (a multiple of 3)
    ]
    transcript.add_exons(exon_coords)
    transcript.add_exons(cds_coords, features="CDS")
    transcript.strand = "+"
    transcript.finalize()
    null_logger = Mikado.utilities.log_utils.create_null_logger("correct_cds")

    short_trim = trim_coding(transcript.deepcopy(), null_logger,
                             max_length=50)
    self.assertEqual(short_trim.start, 11400)
    self.assertEqual(short_trim.end, 19050)

    fresh_copy = transcript.deepcopy()
    # The first trimming must not have altered the source transcript.
    self.assertEqual(fresh_copy.start, 10000)
    long_trim = trim_coding(fresh_copy, null_logger, max_length=200)
    self.assertEqual(long_trim.start, 11300)
    self.assertEqual(long_trim.end, 19200)
def test_non_redundant_as(self):
    """Add two further transcripts to the locus one after the other.

    The second one must be judged alternative splicing with
    (True, "j"), leaving three transcripts in the locus.
    """
    second = Transcript()
    second.chrom, second.strand, second.score = "Chr1", "+", 20
    second.id, second.parent = "G2.1", "G2"
    second.start, second.end = 101, 1600
    second.add_exons(
        [(101, 500), (601, 700), (1001, 1300), (1401, 1460), (1501, 1600)],
        "exon")
    second.add_exons(
        [(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
        "CDS")
    second.finalize()

    self.locus.add_transcript_to_locus(second)
    self.assertEqual(len(self.locus.transcripts), 2,
                     self.locus.transcripts)

    third = Transcript()
    third.chrom, third.strand, third.score = "Chr1", "+", 20
    third.id, third.parent = "G3.1", "G3"
    third.start, third.end = 201, 1630
    third.add_exons(
        [(201, 500), (601, 670), (1031, 1300), (1401, 1460), (1501, 1630)],
        "exon")
    third.add_exons(
        [(401, 500), (601, 670), (1031, 1300), (1401, 1440)],
        "CDS")
    third.logger = self.logger
    third.finalize()

    verdict = self.locus.is_alternative_splicing(third)[:2]
    self.assertEqual(verdict, (True, "j"))

    self.locus.add_transcript_to_locus(third)
    self.assertEqual(len(self.locus.transcripts), 3,
                     self.locus.transcripts)
def test_only_CDS_overlap(self):
    """A transcript spanning 1250-2000 must not be reported as
    intersecting self.t1, with either of two CDS definitions."""
    other = Transcript()
    other.chrom, other.strand, other.score = "Chr1", "+", 1
    other.id, other.parent = "G2.1", "G2"
    other.start, other.end = 1250, 2000
    other.add_exons([(1250, 1560), (1801, 2000)])
    other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
    other.finalize()

    first_check = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertFalse(first_check)

    # Swap the CDS for one starting further downstream.
    other.strip_cds()
    other.finalized = False
    other.add_exons([(1461, 1560), (1801, 1850)], "CDS")
    # No CDS overlap this time
    # NOTE(review): the transcript is not explicitly finalized again
    # before this second check - confirm that this is intended.
    second_check = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertFalse(second_check)
class WrongLoadedOrf(unittest.TestCase):
    """Check how invalid or non-transcriptomic ORFs are handled on load."""

    def setUp(self):
        """Create the two-exon, plus-strand transcript used by the tests."""
        transcript = Transcript()
        (transcript.start, transcript.end,
         transcript.chrom, transcript.strand) = (101, 1000, "Chr1", "+")
        transcript.id = "test1"
        transcript.add_exons([(101, 400), (701, 1000)])
        transcript.finalize()
        self.tr = transcript

    def test_load_invalid_length(self):
        """An ORF ending beyond the cDNA must trigger a "Wrong ORF" warning."""
        too_long = BED12(transcriptomic=True)
        too_long.chrom = self.tr.id
        self.assertTrue(too_long.transcriptomic)
        too_long.start = 0
        too_long.strand = "+"
        # Ten bases past the end of the cDNA.
        too_long.end = self.tr.cdna_length + 10
        too_long.thick_start, too_long.thick_end = 101, 190
        self.assertEqual(too_long.chrom, too_long.id, too_long.id)

        with self.assertLogs("null", "WARNING") as captured:
            retrieval.load_orfs(self.tr, [too_long])

        needle = "Wrong ORF for {}:".format(self.tr.id)
        self.assertTrue(any(needle in message for message in captured.output),
                        captured.output)

    def test_load_invalid_multiple(self):
        """Of one valid and one invalid ORF, only one must be loaded."""
        b_valid = BED12(transcriptomic=True)
        b_valid.chrom = self.tr.id
        b_valid.name = "valid"
        b_valid.start, b_valid.end, b_valid.strand = (
            0, self.tr.cdna_length - 1, "+")
        b_valid.thick_start, b_valid.thick_end = 101, 190

        b_invalid = b_valid.copy()
        b_invalid.name = "invalid"
        b_invalid.thick_start, b_invalid.thick_end = 1, 89
        b_invalid.phase = 0

        self.assertTrue(b_invalid.invalid)
        self.assertFalse(b_valid.invalid, b_valid.invalid_reason)

        with self.assertLogs("null", "DEBUG"):
            retrieval.load_orfs(self.tr, [b_valid, b_invalid])

        self.assertEqual(self.tr.number_internal_orfs, 1)

    def test_filter_non_transcriptomic(self):
        """A non-transcriptomic ORF must be dropped by the overlap check."""
        b_valid = BED12(transcriptomic=True)
        b_valid.chrom = self.tr.id
        b_valid.name = "valid"
        b_valid.start, b_valid.end, b_valid.strand = (
            0, self.tr.cdna_length - 1, "+")
        b_valid.thick_start, b_valid.thick_end = 101, 190

        genomic = b_valid.copy()
        genomic.name = "non-transcriptomic"
        genomic.transcriptomic = False

        retained = retrieval.find_overlapping_cds(self.tr,
                                                  [genomic, b_valid])
        self.assertEqual(retained, [b_valid])
class PhaseChecker(unittest.TestCase):
    """Check the CDS phases computed for a ten-exon minus-strand model.

    The transcript is built from GFF3 records embedded in ``setUp``; the
    phases expected after finalisation are kept in ``correct_phases``,
    keyed by CDS segment coordinates.
    """

    logger = create_default_logger("pcheck")
    logger.setLevel("DEBUG")

    def setUp(self):
        """Parse the embedded GFF3 records and build the transcript.

        The first record seeds the transcript and the remaining ones are
        added as its features.  The transcript is not finalized here.
        """
        # NOTE(review): the records are consumed via split("\n"), yet in
        # this copy of the file most of them are separated by single
        # spaces - whitespace looks collapsed; confirm against upstream.
        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 mRNA 40282 46004 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;Name=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;aed=0.0;note=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;confidence=High;has_start=True;has_stop=True;original_stop=True;protein_rank=P1;transcript_rank=T2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 40282 40933 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 three_prime_UTR 40282 40720 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.three_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 40721 40933 . - 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 41018 41111 . - 1 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 41018 41111 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 41227 41468 . - 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 41227 41468 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 41673 41831 . 
- 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 41673 41831 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 41946 42820 . - 2 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 41946 42820 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 42905 42913 . - 2 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 42905 42913 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 45373 45496 . - 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 45373 45496 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 45600 45651 . - 1 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 45600 45651 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 45726 45726 . - 2 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 45726 45726 . - . 
ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 45875 45893 . - 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 45875 46004 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2 Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 five_prime_UTR 45894 46004 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.five_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2"""
        # Re-join the whitespace-separated columns with tabs before
        # handing each record to the GFF parser.
        lines = [GffLine("\t".join(_.split())) for _ in lines.split("\n") if _]
        self.transcript = Transcript(lines[0], logger=self.logger)
        self.transcript.add_exons(lines[1:])
        # Expected phase of every CDS segment, keyed by (start, end).
        self.correct_phases = {(40721, 40933): 2,
                               (41018, 41111): 0,
                               (41227, 41468): 2,
                               (41673, 41831): 2,
                               (41946, 42820): 1,
                               (42905, 42913): 1,
                               (45373, 45496): 2,
                               (45600, 45651): 0,
                               (45726, 45726): 2,
                               (45875, 45893): 0}

    # unittest.skip() is documented as taking a reason; passing one
    # explicitly avoids relying on how the bare decorator form is handled.
    @unittest.skip("Disabled in the original source (no reason recorded)")
    def test_check_phases(self):
        """Finalize the transcript and compare its CDS phases with the
        expected ones, segment by segment."""
        self.transcript.finalize()
        # Map each CDS segment of the first internal ORF to its phase.
        phases = {segment[1]: segment[2]
                  for segment in self.transcript.internal_orfs[0]
                  if segment[0] == "CDS"}
        self.assertEqual(self.transcript.combined_cds_start, 45893)

        self.assertEqual(phases.keys(),
                         self.correct_phases.keys(),
                         list(zip(sorted(phases.keys()),
                                  sorted(self.correct_phases.keys()))))

        if self.correct_phases != phases:
            # Walk the segments from the highest coordinate downwards so
            # that the first mismatching one is the one reported.
            for key in sorted(phases.keys(), reverse=True):
                self.assertEqual(
                    phases[key], self.correct_phases[key],
                    (key, phases[key], self.correct_phases[key]))

        self.assertEqual(self.correct_phases, phases,
                         (self.correct_phases, phases))
class MonoHolderTester(unittest.TestCase):
    """Check MonosublocusHolder.is_intersecting on pairs of transcripts.

    Each test compares the fixture ``self.t1`` (created in ``setUp``) with a
    second transcript that is assembled inside the test itself.
    """

    logger = create_default_logger("MonoHolderTester")

    @staticmethod
    def _blank_transcript(tid, parent, score, start, end, logger=None):
        """Return a Transcript on Chr1 (+ strand) that has no exons yet.

        The logger, when given, is attached before any other field is set.
        """
        model = Transcript()
        if logger is not None:
            model.logger = logger
        model.chrom = "Chr1"
        model.strand = "+"
        model.score = score
        model.id = tid
        model.parent = parent
        model.start = start
        model.end = end
        return model

    def setUp(self):
        """Build the coding, four-exon fixture transcript G1.1."""
        self.conf = {}
        fixture = self._blank_transcript("G1.1", "G1", 20, 101, 1500)
        fixture.add_exons(
            [(101, 500), (601, 700), (1001, 1300), (1401, 1500)],
            "exon")
        fixture.add_exons(
            [(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
            "CDS")
        fixture.finalize()
        self.t1 = fixture

    def testCdsOverlap(self):
        """A model carrying the same CDS segments as t1 intersects it."""
        other = self._blank_transcript("G2.1", "G2", 1, 101, 1600)
        other.add_exons(
            [(101, 500), (601, 700), (1001, 1300), (1401, 1460),
             (1501, 1600)],
            "exon")
        other.add_exons(
            [(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
            "CDS")
        other.finalize()

        intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertTrue(intersecting)

    def test_intronMatch(self):
        """Two coding models sharing their first two introns intersect,
        both on the whole transcript and when only the CDS is considered."""
        other = self._blank_transcript("G2.1", "G2", 1, 101, 1600)
        other.add_exons(
            [(101, 500), (601, 700), (1001, 1320), (1451, 1460),
             (1501, 1600)],
            "exon")
        other.add_exons(
            [(401, 500), (601, 700), (1001, 1320), (1451, 1460),
             (1501, 1510)],
            "CDS")
        other.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(other.is_coding)
        self.assertTrue(
            MonosublocusHolder.is_intersecting(
                self.t1, other, logger=self.logger))
        self.assertTrue(
            MonosublocusHolder.is_intersecting(
                self.t1, other, cds_only=True, logger=self.logger))

    def test_intronOverlap(self):
        """With the CDS stripped from t1, a non-coding model whose introns
        overlap those of t1 is reported as intersecting."""
        self.t1.strip_cds()
        other = self._blank_transcript("G2.1", "G2", 1, 101, 1470)
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1420, 1470)])
        other.finalize()

        intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertTrue(intersecting)

    def test_noIntronOverlap(self):
        """With the CDS stripped from t1, a model whose only intron lies
        beyond the end of t1 is not reported as intersecting."""
        self.t1.strip_cds()
        other = self._blank_transcript("G2.1", "G2", 1, 1250, 2000)
        other.add_exons([(1250, 1560), (1800, 2000)])
        other.finalize()

        intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertFalse(intersecting)

    def test_noCDSOverlap(self):
        """Models with disjoint CDS spans intersect as transcripts but not
        when the comparison is restricted to the CDS."""
        # Replace the CDS of the fixture with a shorter one.
        self.t1.strip_cds()
        self.assertEqual(self.t1.combined_cds_introns, set())
        self.t1.finalized = False
        self.t1.add_exons([(401, 500), (601, 700), (1001, 1100)], "CDS")
        self.t1.finalize()

        other = self._blank_transcript(
            "G2.1", "G2", 1, 101, 1470, logger=self.logger)
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
        other.add_exons([(1201, 1350), (1421, 1450)], "CDS")
        other.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(other.is_coding)

        # The two CDS spans must not overlap at all.
        t1_cds_span = (self.t1.combined_cds_start, self.t1.combined_cds_end)
        other_cds_span = (other.combined_cds_start, other.combined_cds_end)
        self.assertGreaterEqual(
            0,
            overlap(t1_cds_span, other_cds_span),
            [t1_cds_span, other_cds_span])

        self.assertTrue(
            MonosublocusHolder.is_intersecting(
                self.t1, other, logger=self.logger))
        self.assertFalse(
            MonosublocusHolder.is_intersecting(
                self.t1, other, cds_only=True, logger=self.logger))

    def test_only_CDS_overlap(self):
        """A model overlapping only the last exon of t1 is not intersecting,
        whether or not its CDS overlaps the CDS of t1."""
        other = self._blank_transcript("G2.1", "G2", 1, 1250, 2000)
        other.add_exons([(1250, 1560), (1801, 2000)])
        other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        other.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

        # Second round: the new CDS starts after the CDS of t1 ends.
        other.strip_cds()
        other.finalized = False
        other.add_exons([(1461, 1560), (1801, 1850)], "CDS")
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_no_overlap(self):
        """A model starting after the end of t1 is not intersecting."""
        other = self._blank_transcript("G2.1", "G2", 1, 1600, 2000)
        other.add_exons([(1600, 1700), (1801, 2000)])
        other.add_exons([(1661, 1700), (1801, 1850)], "CDS")
        other.finalize()

        intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertFalse(intersecting)

    def test_same_id(self):
        """A model that reuses the ID of t1 is not reported as intersecting."""
        other = self._blank_transcript("G1.1", "G1", 1, 1250, 2000)
        other.add_exons([(1250, 1560), (1801, 2000)])
        other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        other.finalize()

        # Expected to be False because both transcripts carry the same ID.
        intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertFalse(intersecting)
def test_add_two_partials(self):
    """Add two partial models to a reference-only locus with padding on.

    After ``finalize_alternative_splicing`` neither fragment is expected to
    remain in the locus, and the reference transcript stored in the locus
    is expected to end where the second fragment ends.
    """
    log = create_null_logger("test_add_two_partials")
    log.setLevel("INFO")

    configuration = load_and_validate_config(None)
    configuration.reference.genome = self.fai
    configuration.pick.alternative_splicing.only_confirmed_introns = False
    configuration.pick.run_options.only_reference_update = True

    # Reference model, taken from the following TAIR10 records:
    # Chr5  TAIR10  mRNA             26611258  26612889  .  -  .  ID=AT5G66670.2;Parent=AT5G66670;Name=AT5G66670.2;index=1
    # Chr5  TAIR10  protein          26611474  26612700  .  -  .  ID=AT5G66670.2-Protein;Parent=AT5G66670.2;Name=AT5G66670.2;derives_from=AT5G66670.2
    # Chr5  TAIR10  three_prime_UTR  26611258  26611473  .  -  .  Parent=AT5G66670.2
    # Chr5  TAIR10  CDS              26611474  26612700  .  -  0  Parent=AT5G66670.2
    # Chr5  TAIR10  five_prime_UTR   26612701  26612889  .  -  .  Parent=AT5G66670.2
    # Chr5  TAIR10  exon             26611258  26612889  .  -  .  Parent=AT5G66670.2
    ref = Transcript(is_reference=True)
    ref.chrom = "Chr5"
    ref.strand = "-"
    ref.id = "AT5G66670.2"
    ref.add_exons([(26611258, 26612889)])
    ref.add_exons([(26611474, 26612700)], features=["CDS"])
    ref.finalize()
    self.assertTrue(ref.is_coding)

    # First fragment: adds an exon before the start of the reference and
    # stops short of the reference end.
    template1 = Transcript(is_reference=False)
    template1.chrom = ref.chrom
    template1.strand = ref.strand
    template1.id = ref.id + "_frag1"
    template1.add_exons(((26611116, 26611157), (26611258, 26612670)))
    template1.add_exons(((26611474, 26612670), ), features=["CDS"])
    template1.finalize()
    self.assertTrue(template1.is_coding)

    # Second fragment: starts inside the reference and adds an exon past
    # the reference end.
    template2 = Transcript(is_reference=False)
    template2.chrom = ref.chrom
    template2.strand = ref.strand
    template2.id = ref.id + "_frag2"
    template2.add_exons(((26611574, 26612889), (26613007, 26613403)))
    template2.add_exons(((26611574, 26612700), ), features=["CDS"])
    template2.finalize()
    self.assertTrue(template2.is_coding)

    log.setLevel("INFO")
    configuration.pick.alternative_splicing.pad = True
    locus = Locus(ref, configuration=configuration, logger=log)
    for fragment in (template1, template2):
        locus.add_transcript_to_locus(fragment)
    self.assertIn(template2.id, locus)

    locus.finalize_alternative_splicing(check_requirements=False)
    self.assertTrue(locus._finalized)
    for fragment in (template1, template2):
        self.assertNotIn(fragment.id, locus, "\n" + str(locus))

    padded = locus[ref.id]
    self.assertEqual(
        padded.end,
        template2.end,
        ((locus[ref.id].end, ref.end, template2.end, template1.end),
         (locus[ref.id].start, ref.start, template2.start, template1.start)))
class TestMetricsEndDistances(unittest.TestCase):
    """Check ``end_distance_from_junction`` and ``end_distance_from_tes``.

    The fixture is a twelve-exon transcript spanning 101-10000. Each test
    loads three successive CDS layouts on it and checks the two distances
    from the CDS end.
    """

    logger = create_default_logger("End")
    logger.setLevel("ERROR")

    # CDS segments shared by every layout used below.
    # Lengths modulo 3: 700 -> 1, 1500 -> 0, 500 -> 2, 800 -> 2,
    # 400 -> 1, 800 -> 2.
    _CORE_CDS = [(1301, 2000), (3501, 5000), (5501, 6000),
                 (6201, 7000), (7301, 7700), (8201, 9000)]

    def setUp(self):
        """Create the non-finalized, strandless twelve-exon transcript."""
        exons = [(101, 300), (501, 800), (1001, 1200), (1301, 2000),
                 (3501, 5000), (5501, 6000), (6201, 7000), (7301, 7700),
                 (8201, 9000), (9101, 9300), (9501, 9700), (9801, 10000)]
        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.start = 101
        self.tr.end = 10000
        self.tr.add_exons(exons)
        self.tr.id = "test1"
        self.tr.parent = "test1.gene"

    def _load_cds(self, segments):
        """Add ``segments`` as CDS to the fixture and finalize it."""
        self.tr.add_exons(segments, features="CDS")
        self.tr.finalize()

    def test_end_positive(self):
        """Plus strand: the CDS end is the rightmost CDS base."""
        self.tr.strand = "+"
        last_exon_length = 10000 - 9801 + 1

        # Layout 1: the CDS stops at 9130, inside the exon 9101-9300.
        # Lengths of the flanking segments: 40 % 3 == 1 and 30 % 3 == 0.
        cds = [(1161, 1200)] + self._CORE_CDS + [(9101, 9130)]
        self._load_cds(cds)
        to_junction = (9300 - 9131 + 1) + (9700 - 9501 + 1)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end, 9130)
        self.assertEqual(self.tr.end_distance_from_junction, to_junction)
        self.assertEqual(self.tr.end_distance_from_tes,
                         to_junction + last_exon_length)

        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0,
                         self.tr.internal_orfs)
        self.tr.finalized = False

        # Layout 2: the CDS stops at 9690, inside the exon 9501-9700.
        # Added segments: 200 % 3 == 2 and 190 % 3 == 1.
        cds = ([(1161, 1200)] + self._CORE_CDS
               + [(9101, 9300), (9501, 9690)])
        self._load_cds(cds)
        to_junction = 9700 - 9691 + 1
        self.assertEqual(self.tr.combined_cds_end, 9690)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_junction, to_junction)
        self.assertEqual(self.tr.end_distance_from_tes,
                         to_junction + last_exon_length)

        self.tr.strip_cds()
        self.assertEqual(self.tr.combined_cds_end, self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end, None,
                         self.tr.combined_cds_end)
        self.tr.finalized = False

        # Layout 3: the CDS stops at 9820, inside the last exon.
        # Added segments: 200 % 3 == 2, 200 % 3 == 2 and 20 % 3 == 2.
        cds = ([(1161, 1200)] + self._CORE_CDS
               + [(9101, 9300), (9501, 9700), (9801, 9820)])
        self._load_cds(cds)
        self.assertEqual(self.tr.combined_cds_end, 9820)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_tes, 180)
        self.assertEqual(self.tr.end_distance_from_junction, 0)

    def test_end_negative(self):
        """Minus strand: the CDS end is the leftmost CDS base."""
        self.tr.strand = "-"
        first_exon_length = 300 - 101 + 1

        # Layout 1: the CDS stops at 1161, inside the exon 1001-1200.
        # Lengths of the flanking segments: 40 % 3 == 1 and 30 % 3 == 0.
        cds = [(1161, 1200)] + self._CORE_CDS + [(9101, 9130)]
        self.assertEqual(sum(stop - start + 1 for start, stop in cds) % 3, 0)
        self._load_cds(cds)
        self.assertTrue(self.tr.is_coding)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end, 1161)
        to_junction = (1161 - 1001) + (800 - 501 + 1)
        self.assertEqual(
            self.tr.end_distance_from_junction,
            to_junction,
            (self.tr.end_distance_from_junction, to_junction))
        self.assertEqual(
            self.tr.end_distance_from_tes,
            self.tr.end_distance_from_junction + first_exon_length,
            (self.tr.end_distance_from_tes,
             self.tr.end_distance_from_junction + first_exon_length))

        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0,
                         self.tr.internal_orfs)
        self.tr.finalized = False

        # Layout 2: the CDS stops at 721, inside the exon 501-800.
        # Lengths of the flanking segments: 80 % 3 == 2, 200 % 3 == 2
        # and 30 % 3 == 0.
        cds = [(721, 800), (1001, 1200)] + self._CORE_CDS + [(9101, 9130)]
        self._load_cds(cds)
        self.assertEqual(self.tr.combined_cds_end, 721)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        to_junction = 721 - 501
        self.assertEqual(
            self.tr.end_distance_from_junction,
            to_junction,
            (self.tr.end_distance_from_junction, to_junction))
        self.assertEqual(
            self.tr.end_distance_from_tes,
            self.tr.end_distance_from_junction + first_exon_length,
            (self.tr.end_distance_from_tes,
             self.tr.end_distance_from_junction + first_exon_length))

        self.tr.strip_cds()
        self.assertEqual(self.tr.combined_cds_end, self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end, None,
                         self.tr.combined_cds_end)
        self.tr.finalized = False

        # Layout 3: the CDS stops at 161, inside the first exon.
        # Lengths of the flanking segments: 140 % 3 == 2, 300 % 3 == 0,
        # 200 % 3 == 2 and 30 % 3 == 0.
        cds = ([(161, 300), (501, 800), (1001, 1200)] + self._CORE_CDS
               + [(9101, 9130)])
        self.assertEqual(
            sum((stop - start + 1) % 3 for start, stop in cds) % 3, 0)
        self.tr.logger = self.logger
        self._load_cds(cds)
        self.assertEqual(self.tr.combined_cds_end, 161)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_tes, 60)
        self.assertEqual(self.tr.end_distance_from_junction, 0)