def test_retrieval(self):
    """Check which stored junctions are loaded as verified introns of a locus.

    Four junctions are written to an in-memory SQLite database: one on
    chromosome 1, one on chromosome 15 shifted one megabase upstream, one
    with the coordinates of the transcript's second intron but on the minus
    strand, and one with the same coordinates on the plus strand. For every
    combination of transcript strand ("+", "-", None) and ``stranded`` flag,
    a Superlocus is built and ``_load_introns`` is run. The expectations are:

    * stranded locus with a defined strand: only the intron coordinates
      paired with the transcript strand;
    * unstranded locus: both the plus- and minus-strand junctions;
    * stranded locus without a strand: nothing.
    """
    db_engine = create_engine("sqlite:///:memory:")
    db.metadata.create_all(db_engine)
    session = sessionmaker(bind=db_engine)()

    model = Transcript(accept_undefined_multi=True)
    model.chrom = "15"
    model.source = "protein_coding"
    model.start = 47631264
    model.end = 48051999
    model.strand = "+"
    model.add_exons([
        (47631264, 47631416),
        (47704590, 47704669),
        (47762671, 47762742),
        (47893062, 47893093),
        (47895572, 47895655),
        (48051942, 48051999),
    ])
    model.id = "ENST00000560636"
    model.parent = "ENSG00000137872"
    # A renamed copy of the model; it is not referenced again in this test.
    renamed_copy = model.copy()
    renamed_copy.id = "ENST00000560637"

    chrom_one = Chrom("1", 10**8)
    chrom_fifteen = Chrom("15", 5 * 10**8)
    session.add_all([chrom_one, chrom_fifteen])
    # Commit before building the junctions: they read chrom_id from these objects.
    session.commit()

    # Boundaries of the intron between exons (47704590, 47704669) and
    # (47762671, 47762742).
    intron_start = 47704669 + 1
    intron_end = 47762671 - 1
    shift = 10**6

    # Junction arguments: junction_start, junction_end, name, strand, score, chrom_id
    # Same coordinates, but on a different chromosome.
    junction_chrom_one = Junction(
        intron_start, intron_end, "chrom_one", "+", 10, chrom_one.chrom_id)
    # Right chromosome, but one megabase away from the intron.
    outside_chrom_15 = Junction(
        intron_start - shift, intron_end - shift, "chrom_15_outside", "+", 10,
        chrom_fifteen.chrom_id)
    # Right chromosome and coordinates, opposite strand.
    wrong_strand_chrom_15 = Junction(
        intron_start, intron_end, "chrom_15_wrong_strand", "-", 10,
        chrom_fifteen.chrom_id)
    # The junction that matches the transcript's intron exactly.
    chrom_15_junction = Junction(
        intron_start, intron_end, "chrom_15", "+", 10, chrom_fifteen.chrom_id)

    session.add_all([
        junction_chrom_one,
        outside_chrom_15,
        wrong_strand_chrom_15,
        chrom_15_junction,
    ])
    session.commit()

    self.assertEqual(junction_chrom_one.chrom, "1")
    for junc in (outside_chrom_15, wrong_strand_chrom_15, chrom_15_junction):
        self.assertEqual(junc.chrom, "15")

    for strand in ("+", "-", None):
        for stranded in (True, False):
            # Re-open the transcript so that its strand can be changed.
            model.unfinalize()
            model.strand = strand
            model.finalize()

            sup = Superlocus(model, stranded=stranded)
            intron_key = (chrom_15_junction.junction_start, chrom_15_junction.end)
            self.assertTrue(intron_key in sup.introns,
                            (chrom_15_junction, sup.introns))

            sup.session = session
            asyncio.run(sup._load_introns())

            if stranded is False:
                both_strands = {
                    (chrom_15_junction.junction_start,
                     chrom_15_junction.junction_end,
                     chrom_15_junction.strand),
                    (wrong_strand_chrom_15.junction_start,
                     wrong_strand_chrom_15.junction_end,
                     wrong_strand_chrom_15.strand),
                }
                self.assertEqual(sup.locus_verified_introns, both_strands,
                                 (stranded, strand))
            elif strand is None:
                # Stranded locus without a strand: no junction can be verified.
                self.assertEqual(sup.locus_verified_introns, set())
            else:
                same_strand_only = {
                    (chrom_15_junction.junction_start,
                     chrom_15_junction.junction_end,
                     strand),
                }
                self.assertEqual(sup.locus_verified_introns, same_strand_only,
                                 (stranded, strand))
def test_get_external(self):
    """Check which external metrics ``Superlocus.get_external`` returns.

    Six external sources (int/float/bool, each in a variant created with
    third argument 0 and one with third argument 1) and one score per source
    are stored in an in-memory SQLite database. The test then verifies three
    situations:

    1. ``report_all_external_metrics`` is True: every source is returned;
    2. the flag is False and no section mentions an external metric: the
       result is empty;
    3. the flag is False and some ``external.*`` metrics are referenced in
       the scoring / requirements sections: only those are returned
       ("float" is never referenced and must be absent).
    """
    checked_conf = load_and_validate_config(None).copy()
    checked_conf.pick.output_format.report_all_external_metrics = True

    transcript = Transcript()
    transcript.chrom = "15"
    transcript.source = "protein_coding"
    transcript.start = 47631264
    transcript.end = 48051999
    transcript.strand = "+"
    transcript.add_exons([
        (47631264, 47631416),
        (47704590, 47704669),
        (47762671, 47762742),
        (47893062, 47893093),
        (47895572, 47895655),
        (48051942, 48051999),
    ])
    transcript.id = "ENST00000560636"
    transcript.parent = "ENSG00000137872"
    transcript2 = transcript.copy()
    transcript2.id = "ENST00000560637"

    tpm_settings = {
        "rescaling": "max",
        "default": 0,
        "rtype": "float",
        "multiplier": 4,
        "use_raw": True,
        "percentage": True,
    }
    checked_conf.scoring.scoring["attributes.tpm"] = MinMaxScore.Schema().load(tpm_settings)
    transcript.attributes["tpm"] = 10

    # The order matters: the scores below refer to the sources by position (1..6).
    sources = [
        ExternalSource("int", "int", 0),
        ExternalSource("float", "float", 0),
        ExternalSource("bool", "bool", 0),
        ExternalSource("raw_int", "int", 1),
        ExternalSource("raw_float", "float", 1),
        ExternalSource("raw_bool", "bool", 1),
    ]
    # Booleans are cast to int here, following the external.py serialize function.
    score_values = (10, 10.0, int(False), 8, 8.0, int(True))
    scores = [
        External(1, source_id, value)
        for source_id, value in enumerate(score_values, start=1)
    ]
    queries = [
        Query(transcript.id, transcript.cdna_length),
        Query(transcript2.id, transcript2.cdna_length),
    ]

    db_engine = create_engine("sqlite:///:memory:")
    db.metadata.create_all(db_engine)
    session = sessionmaker(bind=db_engine)()
    session.add_all(sources)
    session.add_all(queries)
    session.add_all(scores)
    session.commit()

    sup = Superlocus(transcript, configuration=checked_conf)
    sup.session = session
    tid = transcript.id
    self.assertIn(tid, sup.transcripts)

    from collections import namedtuple
    # Minimal stand-in for a query row: the name is set on the namedtuple class itself.
    query_stub = namedtuple("t", field_names="query_name")
    query_stub.query_name = "ENST00000560636"
    qobj = {1: query_stub}

    # 1. Everything is reported.
    external = asyncio.run(sup.get_external(qobj, [1]))
    everything = {
        "ENST00000560636": {
            "int": (10, False),
            "float": (10.0, False),
            "bool": (False, False),
            "raw_int": (8, True),
            "raw_float": (8.0, True),
            "raw_bool": (True, True),
        }
    }
    self.assertEqual(external, everything)

    # 2. Nothing is requested, nothing is reported.
    sup.configuration.pick.output_format.report_all_external_metrics = False
    external = asyncio.run(sup.get_external(qobj, [1]))
    self.assertEqual(len(external), 0)

    # 3. The filters below carry no meaning of their own: they only make the
    # configuration mention some external metrics, so that we can verify
    # that *only* those are loaded. 'float' appears in no section and must
    # therefore NOT be returned.
    scoring_conf = sup.configuration.scoring
    scoring_conf.scoring["external.int"] = MinMaxScore(rescaling="max", filter=None)
    scoring_conf.requirements.parameters["external.raw_float"] = SizeFilter(
        operator="gt", value=100)
    scoring_conf.cds_requirements.parameters["external.raw_int"] = SizeFilter(
        operator="lt", value=1)
    scoring_conf.as_requirements.parameters["external.raw_bool"] = SizeFilter(
        operator="lt", value=1)
    scoring_conf.not_fragmentary.parameters["external.bool"] = SizeFilter(
        operator="ne", value=False)

    external = asyncio.run(sup.get_external(qobj, [1]))
    referenced_only = {
        "ENST00000560636": {
            "int": (10, False),
            "raw_float": (8.0, True),
            "bool": (False, False),
            "raw_int": (8, True),
            "raw_bool": (True, True),
        }
    }
    self.assertEqual(external, referenced_only)
def create_transcript(tid: str, parent: str, lines: List[GtfLine], args: argparse.Namespace):
    """Yield the finalized transcript(s) built from the GTF lines of one record.

    The lines are first grouped by chromosome. If more than one chromosome
    is present, the function recurses on each group, suffixing ``tid`` and
    ``parent`` with ``"." + chrom``. Within a single chromosome the exon
    lines are sorted and scanned pairwise; for each consecutive pair the
    gap is measured as ``exon.start - current.end + 1`` and:

    * overlapping exons, or exons whose gap is at most ``args.min_intron``
      while ``args.split`` is False, are merged into one exon;
    * exons whose gap is at most ``args.min_intron`` while ``args.split`` is
      True, or whose gap exceeds ``args.max_intron``, cause the transcript
      to be split at that point;
    * any other pair is kept as two exons of the same transcript.

    When a split happens, the resulting pieces are named ``tid + ".A"``,
    ``tid + ".B"``, ... (same suffix on ``parent``); otherwise the single
    transcript keeps ``tid`` and ``parent`` unchanged.

    NOTE: merging rewrites ``end`` on the line objects passed in ``lines``.

    :param tid: identifier to assign to the transcript (or prefix, if split).
    :param parent: parent identifier to assign (or prefix, if split).
    :param lines: GTF lines of the record; only those with ``is_exon`` true
        are used to build the transcripts.
    :param args: namespace providing ``min_intron``, ``max_intron`` and ``split``.
    :return: a generator of finalized Transcript objects. Nothing is yielded
        when ``lines`` contains no exon.
    """
    chroms = defaultdict(list)
    for line in lines:
        chroms[line.chrom].append(line)

    if len(chroms) > 1:
        # Lines on several chromosomes: handle each chromosome on its own.
        for chrom, chrom_lines in chroms.items():
            newtid = tid + "." + chrom
            newparent = parent + "." + chrom
            for transcript in create_transcript(newtid, newparent, chrom_lines, args):
                # The nested call may itself split the transcript, in which case
                # it appends a further ".<letter>" suffix to both identifiers.
                assert (transcript.id == newtid
                        or transcript.id.startswith(newtid + ".")), (newtid, transcript.id)
                # parent is indexed, so it is presumably stored as a list - verify.
                assert (transcript.parent[0] == newparent
                        or transcript.parent[0].startswith(newparent + ".")), (
                    newparent, transcript.parent)
                yield transcript
        return

    # From here on there is at most one chromosome.
    exons = sorted((line for line in lines if line.is_exon),
                   key=operator.attrgetter("chrom", "start", "end"))
    if not exons:
        # Nothing to build a transcript from.
        return

    if len(exons) == 1:
        transcript = Transcript(exons[0])
        transcript.id = tid
        transcript.parent = parent
        transcript.finalize()
        yield transcript
        return

    new_exons = deque()
    # One less than "A": incremented before each use, so the first split piece is ".A".
    identifier = ord("A") - 1
    current = exons[0]
    for exon in exons[1:]:
        if ((overlap((exon.start, exon.end), (current.start, current.end)) > 0)
                or (exon.start - current.end + 1 <= args.min_intron
                    and args.split is False)):
            # Merge the two exons. The next exon may be fully contained in the
            # current one, so never let the merged end move backwards.
            current.end = max(current.end, exon.end)
        elif ((exon.start - current.end + 1 <= args.min_intron and args.split is True)
              or exon.start - current.end + 1 > args.max_intron):
            # Split: close the transcript accumulated so far and start a new one.
            new_exons.append(current)
            transcript = Transcript(new_exons.popleft())
            transcript.add_exons(new_exons)
            transcript.finalize()
            identifier += 1
            transcript.parent = parent + "." + chr(identifier)
            transcript.id = tid + "." + chr(identifier)
            yield transcript
            current = exon
            new_exons = deque()
        else:
            # Regular intron: keep the current exon and move on.
            new_exons.append(current)
            current = exon

    # Emit the last (or only) transcript.
    new_exons.append(current)
    transcript = Transcript(new_exons.popleft())
    transcript.add_exons(new_exons)
    if identifier == ord("A") - 1:
        # No split happened: keep the original identifiers.
        transcript.id = tid
        transcript.parent = parent
    else:
        identifier += 1
        transcript.id = tid + "." + chr(identifier)
        transcript.parent = parent + "." + chr(identifier)
    transcript.finalize()
    yield transcript