def test_exon_switching_pos_noCDS(self):
    """Plus-strand exon switching must not be reported as a retained intron.

    ``t1`` (coding) and ``t2`` (no CDS) share their first three exons and
    differ only in the last one: (2501, 2800) versus (1501, 1800). After
    ``find_retained_introns`` is run on ``t2``, the test expects its
    ``retained_intron_num`` to be 0.
    """
    common_exons = [(101, 500), (801, 1000), (1201, 1300)]

    coding = Transcript()
    coding.chrom = 1
    coding.strand = "+"
    coding.id = "t1"
    coding.add_exons(common_exons + [(2501, 2800)])
    coding.add_exons(
        [
            (201, 500),    # 300
            (801, 1000),   # 200
            (1201, 1300),  # 100
            (2501, 2530),  # 30
        ],
        features="CDS",
    )
    coding.finalize()

    # The alternative model is deliberately left without any CDS segment.
    noncoding = Transcript()
    noncoding.chrom = 1
    noncoding.strand = "+"
    noncoding.id = "t2"
    noncoding.add_exons(common_exons + [(1501, 1800)])
    noncoding.finalize()

    superlocus = Superlocus(coding, json_conf=self.my_json)
    superlocus.add_transcript_to_locus(noncoding)
    superlocus.find_retained_introns(noncoding)

    analysed = superlocus.transcripts["t2"]
    # The retained introns (if any) are used as the failure message.
    self.assertEqual(analysed.retained_intron_num, 0,
                     analysed.retained_introns)
def test_not_retained_neg(self):
    """A false retained intron on the minus strand must not be called.

    ``t2`` starts with a long exon (301, 1000) that spans the gap between
    the first two exons of ``t1``; its CDS begins at 471. The test expects
    ``t2`` to end up with a ``retained_intron_num`` of 0.
    """
    reference = Transcript()
    reference.chrom = 1
    reference.strand = "-"
    reference.id = "t1"
    reference.add_exons([(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
    reference.add_exons(
        [
            (201, 500),    # 300
            (801, 1000),   # 200
            (1201, 1300),  # 100
            (1501, 1530),  # 30
        ],
        features="CDS",
    )
    reference.finalize()

    candidate = Transcript()
    candidate.chrom = 1
    candidate.strand = "-"
    candidate.id = "t2"
    candidate.add_exons([(301, 1000), (1201, 1300), (1501, 1800)])
    candidate.add_exons(
        [
            (1501, 1530),  # 30
            (1201, 1300),  # 100
            (471, 1000),   # 230
        ],
        features="CDS",
    )
    candidate.finalize()

    superlocus = Superlocus(reference, json_conf=self.my_json)
    superlocus.add_transcript_to_locus(candidate)
    superlocus.find_retained_introns(candidate)

    self.assertEqual(superlocus.transcripts["t2"].retained_intron_num, 0)
def test_real_retained_pos_noCDS(self):
    """A genuine retained intron must be called even without a CDS.

    ``t2`` carries no CDS and its last exon (1201, 1600) runs from the
    third exon of ``t1`` into the fourth one. The test expects exactly
    that exon to be listed in ``retained_introns``.
    """
    reference = Transcript()
    reference.chrom = 1
    reference.strand = "+"
    reference.id = "t1"
    reference.add_exons([(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
    reference.add_exons(
        [
            (201, 500),    # 300
            (801, 1000),   # 200
            (1201, 1300),  # 100
            (1501, 1530),  # 30
        ],
        features="CDS",
    )
    reference.finalize()

    # No CDS is attached to the candidate: that is the point of the test.
    candidate = Transcript()
    candidate.chrom = 1
    candidate.strand = "+"
    candidate.id = "t2"
    candidate.add_exons([(101, 500), (801, 1000), (1201, 1600)])
    candidate.finalize()

    superlocus = Superlocus(reference, json_conf=self.my_json)
    superlocus.add_transcript_to_locus(candidate)
    superlocus.find_retained_introns(candidate)

    expected = ((1201, 1600), )
    self.assertEqual(superlocus.transcripts["t2"].retained_introns, expected)
def test_mixed_strands(self):
    """No retained intron may be called when the strands are mixed.

    A plus-strand ``t1`` and a minus-strand ``t2`` are put in the same
    unstranded superlocus (``stranded=False``); ``t2`` is expected to
    have a ``retained_intron_num`` of 0.
    """
    plus_model = Transcript()
    plus_model.chrom = 1
    plus_model.strand = "+"
    plus_model.id = "t1"
    plus_model.add_exons([(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
    plus_model.add_exons(
        [
            (201, 500),    # 300
            (801, 1000),   # 200
            (1201, 1300),  # 100
            (1501, 1530),  # 30
        ],
        features="CDS",
    )
    plus_model.finalize()

    minus_model = Transcript()
    minus_model.chrom = 1
    minus_model.strand = "-"
    minus_model.id = "t2"
    minus_model.add_exons([(601, 1000), (1201, 1300), (1501, 1800)])
    minus_model.add_exons(
        [
            (1501, 1530),  # 30
            (1201, 1300),  # 100
            (771, 1000),   # 230
        ],
        features="CDS",
    )
    minus_model.finalize()

    superlocus = Superlocus(plus_model, json_conf=self.my_json, stranded=False)
    superlocus.add_transcript_to_locus(minus_model)
    superlocus.find_retained_introns(minus_model)

    self.assertEqual(superlocus.transcripts["t2"].retained_intron_num, 0)
def test_load_orfs(self):
    """Round-trip an ORF through the database and ``Superlocus.get_orfs``.

    Two transcripts (``foo`` and a renamed copy ``foo.2`` on ``Chr2``) have
    their first ORF serialised into an in-memory SQLite database. Asking
    the superlocus for the ORFs of the first query only must give back a
    single ``BED12`` object, equal to the original ORF, keyed by the
    transcript id.
    """
    bed_line = (
        'Chr1\t100\t2000\tID=foo;coding=True;phase=0'
        '\t0\t+\t300\t1850\t0\t4\t400,400,400,200\t0,500,1100,1700'
    )
    transcript = Transcript(bed_line)
    orf = transcript.orfs[0].to_transcriptomic()

    # Second transcript: same structure, different chromosome and id.
    transcript2 = transcript.copy()
    transcript2.unfinalize()
    transcript2.chrom = "Chr2"
    transcript2.id = "foo.2"
    transcript2.finalize()
    other_orf = transcript2.orfs[0].to_transcriptomic()

    engine = create_engine("sqlite:///:memory:")
    db.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    query = Query(transcript.id, transcript.cdna_length)
    query2 = Query(transcript2.id, transcript2.cdna_length)
    session.add_all([query, query2])
    # Commit before building the Orf rows, which read the query ids.
    session.commit()

    serialized_orf = Orf(orf, query.query_id)
    self.assertEqual(serialized_orf.thick_end, orf.thick_end)
    self.assertEqual(serialized_orf.cds_len, orf.cds_len)
    serialized_other_orf = Orf(other_orf, query2.query_id)
    session.add_all([serialized_orf, serialized_other_orf])
    session.commit()

    sup = Superlocus(transcript)
    sup.session = session
    sup_orfs = asyncio.run(sup.get_orfs([query.query_id]))

    # Only the ORF of the requested query may come back.
    self.assertEqual(len(sup_orfs), 1)
    self.assertIn(transcript.id, sup_orfs)
    self.assertEqual(len(sup_orfs[transcript.id]), 1)

    retrieved = sup_orfs[transcript.id][0]
    self.assertIsInstance(retrieved, BED12, type(retrieved))
    self.assertTrue(
        retrieved == orf,
        "\n" + "\n".join([str(orf), str(retrieved)]))
def setUp(self):
    """Build the fixtures shared by the tests and sanity-check them.

    Creates ``self.transcript1`` (single exon, with CDS) and
    ``self.transcript2`` (three exons) from GFF lines, checking along the
    way how ``Transcript`` reacts to incomplete or late exon additions,
    and loads ``configuration.yaml`` (located next to this file) into
    ``self.my_json``.
    """
    # --- first transcript: one exon plus a CDS segment -------------------
    mono_lines = (
        "Chr1\tfoo\ttranscript\t101\t300\t.\t+\t.\tID=t0\n"
        "Chr1\tfoo\texon\t101\t300\t.\t+\t.\tID=t0:exon1;Parent=t0\n"
        "Chr1\tfoo\tCDS\t101\t250\t.\t+\t.\tID=t0:exon1;Parent=t0"
    ).split("\n")
    mono_records = [GFF.GffLine(line) for line in mono_lines]
    mono_parent = mono_records[0]
    self.assertEqual(mono_parent.chrom, "Chr1", mono_parent)

    self.transcript1 = Transcript(mono_parent)
    for feature in mono_records[1:]:
        self.transcript1.add_exon(feature)
    self.transcript1.finalize()
    self.assertTrue(self.transcript1.monoexonic)
    self.assertEqual(self.transcript1.chrom, mono_parent.chrom)

    # --- second transcript: three exons ----------------------------------
    multi_lines = (
        "Chr1\tfoo\ttranscript\t101\t600\t.\t+\t.\tID=t1\n"
        "Chr1\tfoo\texon\t101\t200\t.\t+\t.\tID=t1:exon1;Parent=t1\n"
        "Chr1\tfoo\texon\t301\t400\t.\t+\t.\tID=t1:exon2;Parent=t1\n"
        "Chr1\tfoo\texon\t501\t600\t.\t+\t.\tID=t1:exon3;Parent=t1"
    ).split("\n")
    multi_records = [GFF.GffLine(line) for line in multi_lines]
    last_exon = multi_records[-1]

    self.transcript2 = Transcript(multi_records[0], logger=self.logger)
    # Leave the last exon out, so that the exons do not reach the declared
    # end of the transcript.
    for feature in multi_records[1:-1]:
        self.transcript2.add_exon(feature)

    # Finalising with exons that do not define the external boundaries
    # is expected to log on "null" at WARNING level or above.
    with self.assertLogs("null", level="WARNING"):
        self.transcript2.finalize()

    # Once finalised, the transcript must refuse the missing exon ...
    with self.assertRaises(exceptions.ModificationError):
        self.transcript2.add_exon(last_exon)

    # ... so reopen it by hand, restore the coordinates and complete it.
    self.transcript2.finalized = False
    self.transcript2.start = 101
    self.transcript2.end = 600
    self.transcript2.add_exon(last_exon)
    self.transcript2.finalize()
    self.assertFalse(self.transcript2.monoexonic)
    self.assertEqual(self.transcript2.exon_num, len(multi_records) - 1)

    # Modifying a transcript after it has been finalised must fail.
    with self.assertRaises(exceptions.ModificationError):
        for feature in multi_records[1:]:
            self.transcript2.add_exon(feature)

    # Creating a superlocus without a configuration must fail.
    with self.assertRaises(exceptions.NoJsonConfigError):
        Superlocus(self.transcript1)

    config_path = os.path.join(os.path.dirname(__file__),
                               "configuration.yaml")
    self.my_json = configurator.to_json(config_path)
    self.assertIn("scoring", self.my_json, self.my_json.keys())
def test_get_external(self):
    """Check which external scores ``Superlocus.get_external`` returns.

    Six external sources, and one score per source for query 1, are stored
    in an in-memory SQLite database. Three situations are then asserted:

    * ``report_all_external_metrics`` set to True: all six scores are
      returned for the transcript;
    * the flag set to False with no ``external.*`` metric referenced in
      the scoring configuration: nothing is returned;
    * ``external.*`` metrics referenced in the scoring and requirement
      sections: only the referenced ones are returned ('float', which is
      referenced nowhere, must be absent).
    """
    from collections import namedtuple

    checked_conf = load_and_validate_config(None).copy()
    checked_conf.pick.output_format.report_all_external_metrics = True

    transcript = Transcript()
    transcript.chrom = "15"
    transcript.source = "protein_coding"
    transcript.start = 47631264
    transcript.end = 48051999
    exons = [(47631264, 47631416), (47704590, 47704669),
             (47762671, 47762742), (47893062, 47893093),
             (47895572, 47895655), (48051942, 48051999)]
    transcript.strand = "+"
    transcript.add_exons(exons)
    transcript.id = "ENST00000560636"
    transcript.parent = "ENSG00000137872"
    transcript2 = transcript.copy()
    transcript2.id = "ENST00000560637"

    checked_conf.scoring.scoring["attributes.tpm"] = MinMaxScore.Schema().load({
        "rescaling": "max",
        "default": 0,
        "rtype": "float",
        'multiplier': 4,
        'use_raw': True,
        'percentage': True
    })
    transcript.attributes["tpm"] = 10

    # The last argument (0 or 1) presumably flags the source as "raw"; it
    # matches the boolean in the expected (value, flag) tuples below.
    int_source = ExternalSource('int', 'int', 0)
    float_source = ExternalSource('float', 'float', 0)
    bool_source = ExternalSource('bool', 'bool', 0)
    raw_int_source = ExternalSource('raw_int', 'int', 1)
    raw_float_source = ExternalSource('raw_float', 'float', 1)
    raw_bool_source = ExternalSource('raw_bool', 'bool', 1)

    int_score = External(1, 1, 10)
    float_score = External(1, 2, 10.0)
    # We cast as int here following external.py serialize function
    bool_score = External(1, 3, int(False))
    raw_int_score = External(1, 4, 8)
    raw_float_score = External(1, 5, 8.0)
    # We cast as int here following external.py serialize function
    raw_bool_score = External(1, 6, int(True))

    query = Query(transcript.id, transcript.cdna_length)
    query2 = Query(transcript2.id, transcript2.cdna_length)

    engine = create_engine("sqlite:///:memory:")
    db.metadata.create_all(engine)
    SessionMaker = sessionmaker(bind=engine)
    session = SessionMaker()
    session.add_all([
        int_source, float_source, bool_source,
        raw_int_source, raw_float_source, raw_bool_source
    ])
    session.add_all([query, query2])
    session.add_all([
        int_score, float_score, bool_score,
        raw_int_score, raw_float_score, raw_bool_score
    ])
    session.commit()

    sup = Superlocus(transcript, configuration=checked_conf)
    sup.session = session
    tid = transcript.id
    self.assertIn(tid, sup.transcripts)

    # ``get_external`` receives a mapping from query id to an object that
    # exposes ``query_name``. Use a real namedtuple *instance* for that,
    # rather than patching an attribute onto the namedtuple class itself.
    query_row = namedtuple('t', field_names=('query_name',))
    qobj = {1: query_row(query_name='ENST00000560636')}

    external = asyncio.run(sup.get_external(qobj, [1]))
    self.assertEqual(
        external, {
            'ENST00000560636': {
                'int': (10, False),
                'float': (10.0, False),
                'bool': (False, False),
                'raw_int': (8, True),
                'raw_float': (8.0, True),
                'raw_bool': (True, True)
            }
        })

    # With the flag off and no external metric referenced, nothing loads.
    sup.configuration.pick.output_format.report_all_external_metrics = False
    external = asyncio.run(sup.get_external(qobj, [1]))
    self.assertEqual(len(external), 0)

    # These settings are meaningless in themselves: they only serve to
    # verify that we load *only* the referenced metrics.
    # We should *NOT* have 'float' as it is not present in any section.
    sup.configuration.scoring.scoring["external.int"] = MinMaxScore(
        rescaling="max", filter=None)
    sup.configuration.scoring.requirements.parameters[
        "external.raw_float"] = SizeFilter(operator="gt", value=100)
    sup.configuration.scoring.cds_requirements.parameters[
        "external.raw_int"] = SizeFilter(operator="lt", value=1)
    sup.configuration.scoring.as_requirements.parameters[
        "external.raw_bool"] = SizeFilter(operator="lt", value=1)
    sup.configuration.scoring.not_fragmentary.parameters[
        "external.bool"] = SizeFilter(operator="ne", value=False)

    external = asyncio.run(sup.get_external(qobj, [1]))
    self.assertEqual(
        external, {
            'ENST00000560636': {
                'int': (10, False),
                'raw_float': (8.0, True),
                'bool': (False, False),
                'raw_int': (8, True),
                'raw_bool': (True, True)
            }
        })
def test_retrieval(self):
    """Check which junctions ``Superlocus._load_introns`` marks as verified.

    Four junctions are stored in an in-memory SQLite database: one on
    another chromosome, one a million bases away, one at the right
    position but on the minus strand, and one matching the second intron
    of the transcript on the plus strand. For every combination of
    transcript strand ("+", "-", None) and ``stranded`` (True, False) the
    content of ``locus_verified_introns`` is compared with the
    expectation.
    """
    engine = create_engine("sqlite:///:memory:")
    db.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    transcript = Transcript(accept_undefined_multi=True)
    transcript.chrom = "15"
    transcript.source = "protein_coding"
    transcript.start = 47631264
    transcript.end = 48051999
    exons = [(47631264, 47631416), (47704590, 47704669),
             (47762671, 47762742), (47893062, 47893093),
             (47895572, 47895655), (48051942, 48051999)]
    transcript.strand = "+"
    transcript.add_exons(exons)
    transcript.id = "ENST00000560636"
    transcript.parent = "ENSG00000137872"
    transcript2 = transcript.copy()
    transcript2.id = "ENST00000560637"

    chrom_one = Chrom("1", 10**8)
    chrom_fifteen = Chrom("15", 5 * 10**8)
    session.add_all([chrom_one, chrom_fifteen])
    # Commit first: the junctions below read the chromosome ids.
    session.commit()

    # Boundaries of the intron between the second and third exon.
    intron_start = 47704669 + 1
    intron_end = 47762671 - 1
    shift = 10**6

    # Argument order:
    # junction_start, junction_end, name, strand, score, chrom_id
    # This junction is on a different chrom
    junction_chrom_one = Junction(
        intron_start, intron_end, "chrom_one", "+", 10, chrom_one.chrom_id)
    # This junction is too far away
    outside_chrom_15 = Junction(
        intron_start - shift, intron_end - shift,
        "chrom_15_outside", "+", 10, chrom_fifteen.chrom_id)
    # This junction is in the right place but wrong strand
    wrong_strand_chrom_15 = Junction(
        intron_start, intron_end,
        "chrom_15_wrong_strand", "-", 10, chrom_fifteen.chrom_id)
    # This one is correct
    chrom_15_junction = Junction(
        intron_start, intron_end, "chrom_15", "+", 10, chrom_fifteen.chrom_id)

    session.add_all([
        junction_chrom_one, outside_chrom_15,
        wrong_strand_chrom_15, chrom_15_junction
    ])
    session.commit()

    self.assertEqual(junction_chrom_one.chrom, "1")
    for junc in (outside_chrom_15, wrong_strand_chrom_15, chrom_15_junction):
        self.assertEqual(junc.chrom, "15")

    for strand in ("+", "-", None):
        for stranded in (True, False):
            transcript.unfinalize()
            transcript.strand = strand
            transcript.finalize()
            sup = Superlocus(transcript, stranded=stranded)

            # NOTE(review): this check reads ``.end`` while the assertions
            # below read ``.junction_end`` - confirm both hold the same value.
            self.assertTrue(
                (chrom_15_junction.junction_start,
                 chrom_15_junction.end) in sup.introns,
                (chrom_15_junction, sup.introns))

            sup.session = session
            asyncio.run(sup._load_introns())

            if stranded is False:
                # Unstranded locus: both junctions at the right position
                # are expected, each with its own strand.
                self.assertEqual(
                    sup.locus_verified_introns,
                    {(chrom_15_junction.junction_start,
                      chrom_15_junction.junction_end,
                      chrom_15_junction.strand),
                     (wrong_strand_chrom_15.junction_start,
                      wrong_strand_chrom_15.junction_end,
                      wrong_strand_chrom_15.strand)},
                    (stranded, strand))
            elif strand is not None:
                # Stranded locus with a defined strand: a single entry
                # carrying the strand of the transcript is expected.
                self.assertEqual(
                    sup.locus_verified_introns,
                    {(chrom_15_junction.junction_start,
                      chrom_15_junction.junction_end,
                      strand)},
                    (stranded, strand))
            else:
                # Stranded locus, transcript without strand: nothing.
                self.assertEqual(sup.locus_verified_introns, set())
def test_locus(self):
    """Basic testing of the Locus functionality.

    The two fixture transcripts are placed in one superlocus, which is
    then taken through ``define_subloci``, ``define_monosubloci`` and
    ``define_loci``; a single locus holding ``t0`` is expected. A second
    superlocus built from a lone minus-strand transcript must also yield
    exactly one locus.
    """
    logger = create_null_logger("null")
    logger.setLevel("WARNING")
    logger.info("Started")

    slocus = Superlocus(self.transcript1,
                        json_conf=self.my_json,
                        logger=logger)
    slocus.add_transcript_to_locus(self.transcript2)

    # The superlocus must span both transcripts.
    self.assertEqual(slocus.strand, self.transcript1.strand)
    expected_start = min(self.transcript1.start, self.transcript2.start)
    self.assertEqual(slocus.start, expected_start)
    expected_end = max(self.transcript1.end, self.transcript2.end)
    self.assertEqual(slocus.end, expected_end)

    logger.info(slocus.transcripts)
    slocus.define_subloci()
    logger.info(slocus.subloci)
    logger.info(slocus.transcripts)
    self.assertEqual(len(slocus.transcripts), 2)
    self.assertEqual(len(slocus.subloci), 2)

    slocus.define_monosubloci()
    self.assertEqual(len(slocus.monosubloci), 2)

    slocus.define_loci()
    self.assertEqual(len(slocus.loci), 1)
    first_locus_id = list(slocus.loci.keys())[0]
    first_locus = slocus.loci[first_locus_id]
    self.assertEqual(list(first_locus.transcripts.keys())[0], "t0")

    # A monoexonic transcript on the opposite strand, on its own.
    minus_lines = (
        "Chr1\tfoo\ttranscript\t101\t200\t.\t-\t.\tID=tminus0\n"
        "Chr1\tfoo\texon\t101\t200\t.\t-\t.\tID=tminus0:exon1;Parent=tminus0"
    ).split("\n")
    minus_records = [GFF.GffLine(line) for line in minus_lines]
    transcript3 = Transcript(minus_records[0])
    for feature in minus_records[1:]:
        transcript3.add_exon(feature)
    transcript3.finalize()

    minusuperlocus = Superlocus(transcript3, json_conf=self.my_json)
    minusuperlocus.define_loci()
    self.assertEqual(len(minusuperlocus.loci), 1)
    self.assertTrue(transcript3.strand != self.transcript1.strand)