def setUp(self):
    """Create the forward and reverse fixtures: three exons, two introns and one transcript on chr1."""

    def build_features(strand):
        # Exons and introns share the same coordinates on both strands.
        return {
            "exon_1": Exon(10, 30, strand, "chr1"),
            "exon_2": Exon(40, 70, strand, "chr1"),
            "exon_3": Exon(80, 100, strand, "chr1"),
            "intron_1": Intron(31, 39, strand, "chr1"),
            "intron_2": Intron(71, 79, strand, "chr1")
        }

    # Forward: the transcript gets explicit coordinates and strand; its
    # exons are passed in the order 1, 3, 2.
    forward = build_features("+")
    forward["transcript"] = Transcript(
        10, 100, "+", "chr1",
        children=[forward["exon_1"], forward["exon_3"], forward["exon_2"]]
    )
    self.fwd = forward

    # Reverse: the transcript only receives its reference and its exons
    # (again in the order 1, 3, 2).
    reverse = build_features("-")
    reverse["transcript"] = Transcript(
        reference="chr1",
        children=[reverse["exon_1"], reverse["exon_3"], reverse["exon_2"]]
    )
    self.rvs = reverse
def setUp(self):
    """Write a temporary FASTA file with its index and build one protein per strand."""
    # Both temporary files share a unique prefix in the system temp folder.
    tmp_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
    self.tmp_fasta = tmp_prefix + ".fasta"
    self.tmp_fasta_idx = tmp_prefix + ".fasta.fai"

    # Sequence file: two records, no trailing newline.
    fasta_lines = [
        ">one",
        "ATGCATGCATGCATGCATGCATGCATGCAT",
        "GCATGCATGCATGCATGCATGCATGCATGC",
        "ATGCAT",
        ">two another chromosome",
        "ATGCATGCATGCAT",
        "GCATGCATGCATGC"
    ]
    with open(self.tmp_fasta, "w") as writer:
        writer.write("\n".join(fasta_lines))

    # Proteins: same exons (10-20, 30-40, 45-50) and same protein
    # boundaries (16-36) on "one", once per strand.
    exon_coords = [(10, 20), (30, 40), (45, 50)]
    forward_tr = Transcript(
        None, None, "+", "one",
        children=[Exon(start, end, "+", "one") for start, end in exon_coords]
    )
    self.prot_1 = Protein(16, 36, "+", "one", transcript=forward_tr)
    reverse_tr = Transcript(
        None, None, "-", "one",
        children=[Exon(start, end, "-", "one") for start, end in exon_coords]
    )
    self.prot_2 = Protein(16, 36, "-", "one", transcript=reverse_tr)

    # Index file: one tab-separated row per sequence of the FASTA.
    idx_rows = [
        ["one", "66", "5", "30", "31"],
        ["two", "28", "98", "14", "15"]
    ]
    with open(self.tmp_fasta_idx, "w") as writer:
        writer.write("\n".join("\t".join(row) for row in idx_rows))
def testSetProteins(self):
    """Check the two-way link between a transcript and its proteins when set at init, by assignment or replaced."""

    def link_at_init(proteins):
        return Transcript(name="tr1", proteins=proteins)

    def link_by_assignment(proteins):
        transcript = Transcript(name="tr1")
        transcript.proteins = proteins
        return transcript

    # By init, then by method: both must give the same links.
    for link in (link_at_init, link_by_assignment):
        first = Protein(10, 30, "+", "chr1", "p1")
        second = Protein(32, 50, "+", "chr1", "p2")
        owner = link([first, second])
        self.assertEqual(
            [prot.name for prot in owner.proteins],
            [first.name, second.name]
        )
        self.assertEqual(
            [first.transcript.name, second.transcript.name],
            [owner.name, owner.name]
        )

    # Replace proteins: the dropped protein must be unlinked, the kept and
    # the new ones must be linked.
    dropped = Protein(10, 30, "+", "chr1", "p1")
    added = Protein(32, 50, "+", "chr1", "p2")
    kept = Protein(54, 70, "+", "chr1", "p3")
    owner = Transcript(name="tr1", proteins=[dropped, kept])
    owner.proteins = [added, kept]
    self.assertIsNone(dropped.transcript)
    self.assertNotIn(dropped, owner.proteins)
    for linked in (added, kept):
        self.assertEqual(linked.transcript, owner)
        self.assertIn(linked, owner.proteins)
def testGetPosOnRegion(self):
    """Check the conversion of reference positions to transcript positions on both strands and the rejected cases."""
    # Forward: (position on reference, expected position on transcript)
    forward_tr = self.fwd["transcript"]
    for ref_pos, expected in [(10, 1), (20, 11), (30, 21), (40, 22), (80, 53)]:
        self.assertEqual(forward_tr.getPosOnRegion(ref_pos), expected)

    # Reverse: positions are counted from the highest coordinate
    reverse_tr = self.rvs["transcript"]
    for ref_pos, expected in [(100, 1), (90, 11), (80, 21), (70, 22), (30, 53)]:
        self.assertEqual(reverse_tr.getPosOnRegion(ref_pos), expected)

    # Exception out of transcript
    self.assertRaises(Exception, forward_tr.getPosOnRegion, 8)

    # Exception in intron
    self.assertRaises(Exception, forward_tr.getPosOnRegion, 31)

    # Exception strand
    without_strand = Transcript(10, 30, None, "chr1")
    self.assertRaises(Exception, without_strand.getPosOnRegion, 20)
def addProtein(self):
    """Check that Transcript.addProtein() appends the protein and links it back to the transcript."""
    # NOTE(review): the name lacks the "test" prefix, so the default
    # unittest loader presumably does not collect it - confirm intent.
    owner = Transcript(name="tr1")
    first = Protein(10, 30, "+", "chr1", "p1")
    second = Protein(32, 50, "+", "chr1", "p2")

    def owned_names():
        return [elt.name for elt in owner.proteins]

    # Empty
    self.assertEqual(owned_names(), [])

    # Add the first protein
    owner.addProtein(first)
    self.assertEqual(owned_names(), ["p1"])  # Check from tr
    self.assertEqual(first.transcript, owner)  # Check from prot

    # Add the second protein
    owner.addProtein(second)
    self.assertEqual(owned_names(), ["p1", "p2"])  # Check from tr
    for linked in (first, second):
        self.assertEqual(linked.transcript, owner)  # Check from prot
def getTranscriptAnnot(in_annot, gene_by_tr):
    """
    Get genomic model (genes, transcripts and exons) for the selected transcripts.

    :param in_annot: Path to the genomic annotations (format: GFF3).
    :type in_annot: str
    :param gene_by_tr: Gene by selected transcripts. Keys are transcript IDs without version.
    :type gene_by_tr: dict
    :return: The list of selected transcripts.
    :rtype: anacore.region.RegionList
    :raises Exception: When at least one selected transcript has no mRNA record in the annotations.
    """
    tr_by_id = dict()
    exons_by_tr = dict()
    with GFF3IO(in_annot) as FH_annot:
        for record in FH_annot:
            if record.type not in ("mRNA", "exon") or "transcript_id" not in record.annot:
                continue
            tr_id = record.annot["transcript_id"].split(".")[0]  # Remove transcript version
            if tr_id not in gene_by_tr:  # Transcript is not in panel
                continue
            if record.type == "mRNA":
                if tr_id not in tr_by_id:  # Only the first record of a transcript is used
                    tr_by_id[tr_id] = Transcript(
                        record.start, record.end, record.strand,
                        record.seq_id, tr_id, {}, gene_by_tr[tr_id]
                    )
            else:
                # Exons are attached after parsing: an exon record listed
                # before (or without) its mRNA record must not raise a
                # KeyError.
                exons_by_tr.setdefault(tr_id, []).append(
                    Exon(record.start, record.end, record.strand, record.seq_id)
                )
    # Attach the exons, in file order, to the transcripts that were found
    for tr_id, exons in exons_by_tr.items():
        if tr_id in tr_by_id:
            for exon in exons:
                tr_by_id[tr_id].addChild(exon)
    # Every selected transcript must have been found
    missing = set(gene_by_tr).difference(tr_by_id)
    if missing:
        # The message uses the function parameter, not the script-level
        # "args", so the function also works when imported.
        raise Exception(
            "The following transcripts are missing in {}: {}".format(
                in_annot, missing
            )
        )
    return RegionList(tr_by_id.values())
def delProtein(self):
    """Check that Transcript.delProtein() removes the protein and clears its link to the transcript."""
    # NOTE(review): the name lacks the "test" prefix, so the default
    # unittest loader presumably does not collect it - confirm intent.
    protein_1 = Protein(10, 30, "+", "chr1", "p1")
    protein_2 = Protein(32, 50, "+", "chr1", "p2")
    transcript_1 = Transcript(name="tr1", proteins=[protein_1, protein_2])
    # Init
    self.assertEqual([elt.name for elt in transcript_1.proteins], ["p1", "p2"])  # Check from tr
    self.assertEqual(protein_1.transcript, transcript_1)  # Check from prot
    self.assertEqual(protein_2.transcript, transcript_1)  # Check from prot
    # Delete protein 2
    transcript_1.delProtein(protein_2)
    self.assertEqual([elt.name for elt in transcript_1.proteins], ["p1"])  # Check from tr
    self.assertEqual(protein_1.transcript, transcript_1)  # Check from prot
    self.assertEqual(protein_2.transcript, None)  # Check from prot
    # Delete protein 1: this step must remove the protein (it previously
    # called addProtein(), which contradicts the assertions below).
    transcript_1.delProtein(protein_1)
    self.assertEqual([elt.name for elt in transcript_1.proteins], [])  # Check from tr
    self.assertEqual(protein_1.transcript, None)  # Check from prot
    self.assertEqual(protein_2.transcript, None)  # Check from prot
def testGetTranscriptsAnnot_withoutUTR_oneExon(self):
    """Check getTranscriptsAnnot() on a transcript with one exon and no UTR, on forward then reverse strand."""
    single_exon = Exon(91, 150, "+", "chr1", "exon_2")
    single_cds = CDS(91, 150, "+", "chr1", "cds_1")
    gene = Gene(10, 350, None, "chr1", "gene_1", {"id": "g_1"})
    transcript = Transcript(
        None, None, None, "chr1", "transcrit_1", {"id": "tr_1"},
        parent=gene, children=[single_exon]
    )
    protein = Protein(
        None, None, None, "chr1", "protein_2",
        children=[single_cds], transcript=transcript
    )
    # (start, end, description) of each query, named query_1 to query_5
    query_specs = [
        (80, 160, "starts before exon_1 ; ends after exon_1."),
        (91, 150, "starts at start of exon_1 ; ends at end of exon_1."),
        (100, 110, "starts in exon_1 ; ends in exon_1."),
        (80, 100, "starts before exon_1 ; ends in exon_1."),
        (110, 200, "starts in exon_1 ; ends after exon_1.")
    ]
    queries = [
        Region(start, end, None, "chr1", "query_{}".format(idx), {"desc": desc})
        for idx, (start, end, desc) in enumerate(query_specs, 1)
    ]

    def check_strand(strand, strand_code, prot_pos_by_query):
        # Every query lies on the only exon: only protein positions vary.
        expected = {
            name: {
                "start_EXON": "1/1",
                "start_INTRON": None,
                "start_Protein_position": start_aa,
                "end_EXON": "1/1",
                "end_INTRON": None,
                "end_Protein_position": end_aa,
                "SYMBOL": "gene_1",
                "Gene": "g_1",
                "Feature": "tr_1",
                "Feature_type": "Transcript",
                "STRAND": strand_code
            }
            for name, (start_aa, end_aa) in prot_pos_by_query.items()
        }
        # Apply the strand on the model
        for exon in transcript.children:
            exon.strand = strand
        for cds in protein.children:
            cds.strand = strand
        transcript.sortChildren()
        protein.sortChildren()
        # Assert
        for query in queries:
            observed = getTranscriptsAnnot(query, [transcript])
            self.assertEqual([expected[query.name]], observed)

    # Expected forward 1 exon: (start_Protein_position, end_Protein_position)
    check_strand("+", "1", {
        "query_1": (1, 20),
        "query_2": (1, 20),
        "query_3": (4, 7),
        "query_4": (1, 4),
        "query_5": (7, 20)
    })

    # Expected reverse 1 exon: (start_Protein_position, end_Protein_position)
    check_strand("-", "-1", {
        "query_1": (1, 20),
        "query_2": (1, 20),
        "query_3": (14, 17),
        "query_4": (17, 20),
        "query_5": (1, 14)
    })
def testGetTranscriptsAnnot_withoutUTR_threeExons(self):
    """Check getTranscriptsAnnot() on a transcript with three exons and no UTR, on forward then reverse strand."""
    exons = [
        Exon(10, 40, "+", "chr1", "fwd_exon_1"),
        Exon(91, 150, "+", "chr1", "fwd_exon_2"),
        Exon(201, 361, "+", "chr1", "fwd_exon_3")
    ]
    coding_seqs = [
        CDS(10, 40, "+", "chr1", "fwd_cds_1"),
        CDS(91, 150, "+", "chr1", "fwd_cds_2"),
        CDS(201, 361, "+", "chr1", "fwd_cds_3")
    ]
    gene = Gene(10, 350, None, "chr1", "gene_1", {"id": "g_1"})
    transcript = Transcript(
        None, None, None, "chr1", "transcrit_1", {"id": "tr_1"},
        parent=gene, children=exons
    )
    protein = Protein(
        None, None, None, "chr1", "protein_1",
        children=coding_seqs, transcript=transcript
    )
    # (start, end, description) of each query, named query_1 to query_9
    query_specs = [
        (80, 100, "starts before exon_2 ; ends in exon_2."),
        (100, 180, "starts in exon_2 ; ends after exon_2."),
        (91, 150, "starts at the start of exon_2 ; ends at the end of exon_2."),
        (80, 170, "starts before exon_2 ; ends after exon_2."),
        (80, 230, "starts before exon_2 ; ends in exon_3."),
        (100, 400, "starts in exon_2 ; ends after exon_3."),
        (100, 250, "starts in exon_2 ; ends in exon_3."),
        (80, 370, "starts before exon_2 ; ends after exon_3."),
        (90, 151, "starts just before exon_2 ; ends just after exon_2.")
    ]
    queries = [
        Region(start, end, None, "chr1", "query_{}".format(idx), {"desc": desc})
        for idx, (start, end, desc) in enumerate(query_specs, 1)
    ]
    position_keys = [
        "start_EXON", "start_INTRON", "start_Protein_position",
        "end_EXON", "end_INTRON", "end_Protein_position"
    ]

    def check_strand(strand, strand_code, positions_by_query):
        # Complete each expected result with the fields shared by all queries
        expected = {}
        for name, positions in positions_by_query.items():
            query_res = dict(zip(position_keys, positions))
            query_res.update({
                "SYMBOL": "gene_1",
                "Gene": "g_1",
                "Feature": "tr_1",
                "Feature_type": "Transcript",
                "STRAND": strand_code
            })
            expected[name] = query_res
        # Apply the strand on the model
        for exon in transcript.children:
            exon.strand = strand
        for cds in protein.children:
            cds.strand = strand
        transcript.sortChildren()
        protein.sortChildren()
        # Assert
        for query in queries:
            observed = getTranscriptsAnnot(query, [transcript])
            self.assertEqual([expected[query.name]], observed)

    # Expected forward 3 exons, values in the order of position_keys
    check_strand("+", "1", {
        "query_1": (None, "1/2", 11, "2/3", None, 14),
        "query_2": ("2/3", None, 14, None, "2/2", 31),
        "query_3": ("2/3", None, 11, "2/3", None, 31),
        "query_4": (None, "1/2", 11, None, "2/2", 31),
        "query_5": (None, "1/2", 11, "3/3", None, 41),
        "query_6": ("2/3", None, 14, "3/3", None, 84),
        "query_7": ("2/3", None, 14, "3/3", None, 47),
        "query_8": (None, "1/2", 11, "3/3", None, 84),
        "query_9": (None, "1/2", 11, None, "2/2", 31)
    })

    # Expected reverse 3 exons, values in the order of position_keys
    check_strand("-", "-1", {
        "query_1": ("2/3", None, 71, None, "2/2", 74),
        "query_2": (None, "1/2", 54, "2/3", None, 71),
        "query_3": ("2/3", None, 54, "2/3", None, 74),
        "query_4": (None, "1/2", 54, None, "2/2", 74),
        "query_5": ("1/3", None, 44, None, "2/2", 74),
        "query_6": ("1/3", None, 1, "2/3", None, 71),
        "query_7": ("1/3", None, 38, "2/3", None, 71),
        "query_8": ("1/3", None, 1, None, "2/2", 74),
        "query_9": (None, "1/2", 54, None, "2/2", 74)
    })
def testGetCodonRefPos(self):
    """Check the reference positions returned for each codon of a protein on forward and reverse strand."""
    exon_coords = [(10, 20), (30, 40), (45, 50)]
    forward_tr = Transcript(
        None, None, "+", "chr1",
        children=[Exon(start, end, "+", "chr1") for start, end in exon_coords]
    )
    forward_prot = Protein(16, 36, "+", "chr1", transcript=forward_tr)
    reverse_tr = Transcript(
        None, None, "-", "chr1",
        children=[Exon(start, end, "-", "chr1") for start, end in exon_coords]
    )
    reverse_prot = Protein(16, 36, "-", "chr1", transcript=reverse_tr)

    # (protein, amino acid position, expected reference positions)
    cases = [
        (forward_prot, 1, [16, 17, 18]),
        (forward_prot, 2, [19, 20, 30]),
        (forward_prot, 3, [31, 32, 33]),
        (forward_prot, 4, [34, 35, 36]),
        (reverse_prot, 1, [36, 35, 34]),
        (reverse_prot, 2, [33, 32, 31]),
        (reverse_prot, 3, [30, 20, 19]),
        (reverse_prot, 4, [18, 17, 16])
    ]
    for prot, aa_pos, expected in cases:
        self.assertEqual(prot.getCodonRefPos(aa_pos), expected)

    # Not in protein
    # NOTE(review): this uses self.prot_1, presumably set by the fixture,
    # and not the local forward protein - confirm intent.
    self.assertRaises(Exception, self.prot_1.getCodonRefPos, 8)
def testGetCDSFromTranscript(self):
    """Check the CDS deduced from the protein boundaries and the exons of its transcript."""
    # Cases shared by the one-exon and three-exons transcripts:
    # (protein start, protein end, expected CDS coordinates)
    central_exon_cases = [
        (100, 150, [(100, 150)]),
        (110, 140, [(110, 140)]),
        (110, 150, [(110, 150)]),
        (100, 140, [(100, 140)])
    ]
    # Each scenario: (strand, exons of the transcript, cases)
    scenarios = [
        # One exon forward
        ("+", [(100, 150)], central_exon_cases),
        # Three exons forward
        ("+", [(30, 80), (100, 150), (170, 200)], central_exon_cases + [
            (30, 150, [(30, 80), (100, 150)]),
            (60, 150, [(60, 80), (100, 150)]),
            (80, 150, [(80, 80), (100, 150)]),
            (80, 170, [(80, 80), (100, 150), (170, 170)]),
            (100, 200, [(100, 150), (170, 200)]),
            (100, 190, [(100, 150), (170, 190)]),
            (110, 200, [(110, 150), (170, 200)]),
            (110, 190, [(110, 150), (170, 190)])
        ]),
        # One exon reverse
        ("-", [(100, 150)], central_exon_cases),
        # Three exons reverse
        ("-", [(170, 200), (100, 150), (30, 80)], central_exon_cases + [
            (30, 150, [(100, 150), (30, 80)]),
            (60, 150, [(100, 150), (60, 80)]),
            (80, 150, [(100, 150), (80, 80)]),
            (80, 170, [(170, 170), (100, 150), (80, 80)]),
            (100, 200, [(170, 200), (100, 150)]),
            (100, 190, [(170, 190), (100, 150)]),
            (110, 200, [(170, 200), (110, 150)]),
            (110, 190, [(170, 190), (110, 150)])
        ])
    ]

    # Build every expected/observed pair before any assertion
    eval_pairs = []
    for strand, exon_coords, cases in scenarios:
        transcript = Transcript(
            None, None, strand, "chr1",
            children=[Exon(start, end, strand, "chr1") for start, end in exon_coords]
        )
        for prot_start, prot_end, cds_coords in cases:
            expected = [CDS(start, end, strand, "chr1") for start, end in cds_coords]
            observed = Protein(
                prot_start, prot_end, strand, "chr1", transcript=transcript
            ).getCDSFromTranscript()
            eval_pairs.append((expected, observed))

    # Launch evaluation: compare the CDS by their coordinates strings
    for expected, observed in eval_pairs:
        self.assertEqual(
            ", ".join([curr_cds.getCoordinatesStr() for curr_cds in expected]),
            ", ".join([curr_cds.getCoordinatesStr() for curr_cds in observed])
        )
def setUp(self): tmp_folder = tempfile.gettempdir() unique_id = str(uuid.uuid1()) # Temporary files self.tmp_ensembl_in_gtf = os.path.join(tmp_folder, unique_id + "_ensembl_in.gtf") self.tmp_ensembl_out_gtf = os.path.join(tmp_folder, unique_id + "_ensembl_out.gtf") self.tmp_ncbi_in_gtf = os.path.join(tmp_folder, unique_id + "_ncbi_in.gtf") self.tmp_ncbi_out_gtf = os.path.join(tmp_folder, unique_id + "_ncbi_out.gtf") # Ensembl GTF self.ensembl_expected = [ Gene(54770583, 54771134, "+", "6", "KRASP1", { "id": "ENSG00000220635" }, None, [ Transcript( 54770583, 54771134, "+", "6", "KRASP1-201", {"id": "ENST00000407852"}, None, [Exon(54770583, 54771134, "+", "6", "ENST00000407852_e1")]) ]), Gene( 25204789, 25250936, "-", "12", "KRAS", {"id": "ENSG00000133703"}, None, [ Transcript( 25204789, 25250931, "-", "12", "KRAS-202", {"id": "ENST00000311936"}, None, [ Exon(25250751, 25250931, "-", "12", "ENST00000311936_e1"), Exon(25245274, 25245395, "-", "12", "ENST00000311936_e2"), Exon(25227234, 25227412, "-", "12", "ENST00000311936_e3"), Exon(25225614, 25225773, "-", "12", "ENST00000311936_e5"), Exon(25204789, 25209911, "-", "12", "ENST00000311936_e6") ], [ Protein(25209798, 25245384, "-", "12", None, None, None, [ CDS(25245274, 25245384, "-", "12", ""), CDS(25227234, 25227412, "-", "12", ""), CDS(25225614, 25225773, "-", "12", ""), CDS(25209798, 25209911, "-", "12", "") ]) ]), Transcript( 25209168, 25250936, "-", "12", "KRAS-204", {"id": "ENST00000557334"}, None, [ Exon(25250751, 25250936, "-", "12", "ENST00000557334_e1"), Exon(25245274, 25245395, "-", "12", "ENST00000557334_e2"), Exon(25209168, 25209911, "-", "12", "ENST00000557334_e3") ], [ Protein(25209798, 25245384, "-", "12", None, None, None, [ CDS(25245274, 25245384, "-", "12", ""), CDS(25209798, 25209911, "-", "12", "") ]) ]), Transcript( 25209431, 25250803, "-", "12", "KRAS-201", {"id": "ENST00000256078"}, None, [ Exon(25250751, 25250803, "-", "12", "ENST00000256078_e1"), Exon(25245274, 25245395, "-", "12", 
"ENST00000256078_e2"), Exon(25227234, 25227412, "-", "12", "ENST00000256078_e3"), Exon(25225614, 25225773, "-", "12", "ENST00000256078_e4"), Exon(25215437, 25215560, "-", "12", "ENST00000256078_e5"), Exon(25209431, 25209911, "-", "12", "ENST00000256078_e6") ], [ Protein(25215444, 25245384, "-", "12", None, None, None, [ CDS(25245274, 25245384, "-", "12", ""), CDS(25227234, 25227412, "-", "12", ""), CDS(25225614, 25225773, "-", "12", ""), CDS(25215444, 25215560, "-", "12", "") ]) ]), Transcript( 25233819, 25250929, "-", "12", "KRAS-203", {"id": "ENST00000556131"}, None, [ Exon(25250764, 25250929, "-", "12", "ENST00000556131_e1"), Exon(25245274, 25245395, "-", "12", "ENST00000556131_e2"), Exon(25233819, 25235226, "-", "12", "ENST00000556131_e3") ], [ Protein(25235209, 25245384, "-", "12", None, None, None, [ CDS(25245274, 25245384, "-", "12", ""), CDS(25235209, 25235226, "-", "12", "") ]) ]) ]) ] with open(self.tmp_ensembl_in_gtf, "w") as FH_gtf: FH_gtf.write("""#!genome-build GRCh38.p12 #!genome-version GRCh38 #!genome-date 2013-12 #!genome-build-accession NCBI:GCA_000001405.27 #!genebuild-last-updated 2018-07 6 havana gene 54770583 54771134 . + . gene_id \"ENSG00000220635\"; gene_version \"2\"; gene_name \"KRASP1\"; gene_source \"havana\"; gene_biotype \"processed_pseudogene\"; 6 havana transcript 54770583 54771134 . + . gene_id \"ENSG00000220635\"; gene_version \"2\"; transcript_id \"ENST00000407852\"; transcript_version \"2\"; gene_name \"KRASP1\"; gene_source \"havana\"; gene_biotype \"processed_pseudogene\"; transcript_name \"KRASP1-201\"; transcript_source \"havana\"; transcript_biotype \"processed_pseudogene\"; tag \"basic\"; transcript_support_level \"NA\"; 6 havana exon 54770583 54771134 . + . 
gene_id \"ENSG00000220635\"; gene_version \"2\"; transcript_id \"ENST00000407852\"; transcript_version \"2\"; exon_number \"1\"; gene_name \"KRASP1\"; gene_source \"havana\"; gene_biotype \"processed_pseudogene\"; transcript_name \"KRASP1-201\"; transcript_source \"havana\"; transcript_biotype \"processed_pseudogene\"; exon_id \"ENSE00001550689\"; exon_version \"2\"; tag \"basic\"; transcript_support_level \"NA\"; 12 ensembl_havana gene 25204789 25250936 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; 12 ensembl_havana transcript 25204789 25250931 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25250751 25250931 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"1\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; exon_id \"ENSE00001189804\"; exon_version \"4\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25245274 25245395 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; exon_id \"ENSE00000936617\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana CDS 25245274 25245384 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; protein_id \"ENSP00000308495\"; protein_version \"3\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana start_codon 25245382 25245384 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25227234 25227412 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; exon_id \"ENSE00001719809\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana CDS 25227234 25227412 . 
- 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; protein_id \"ENSP00000308495\"; protein_version \"3\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25225614 25225773 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"4\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; exon_id \"ENSE00001644818\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana CDS 25225614 25225773 . - 1 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"4\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; protein_id \"ENSP00000308495\"; protein_version \"3\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25204789 25209911 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; exon_id \"ENSE00002456976\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana CDS 25209798 25209911 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; protein_id \"ENSP00000308495\"; protein_version \"3\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana stop_codon 25209795 25209797 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; exon_number \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana five_prime_utr 25250751 25250931 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana five_prime_utr 25245385 25245395 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana three_prime_utr 25204789 25209794 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000311936\"; transcript_version \"7\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-202\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8702\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana transcript 25209168 25250936 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana exon 25250751 25250936 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; exon_number \"1\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; exon_id \"ENSE00002446502\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana exon 25245274 25245395 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; exon_id \"ENSE00000936617\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana CDS 25245274 25245384 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; protein_id \"ENSP00000452512\"; protein_version \"1\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana start_codon 25245382 25245384 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana exon 25209168 25209911 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; exon_id \"ENSE00002464674\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana CDS 25209798 25209911 . 
- 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; protein_id \"ENSP00000452512\"; protein_version \"1\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana stop_codon 25209795 25209797 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana five_prime_utr 25250751 25250936 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana five_prime_utr 25245385 25245395 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"5\"; 12 havana three_prime_utr 25209168 25209794 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000557334\"; transcript_version \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-204\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"5\"; 12 ensembl_havana transcript 25209431 25250803 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25250751 25250803 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"1\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; exon_id \"ENSE00002513959\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25245274 25245395 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; exon_id \"ENSE00000936617\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana CDS 25245274 25245384 . 
- 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; protein_id \"ENSP00000256078\"; protein_version \"4\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana start_codon 25245382 25245384 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25227234 25227412 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; exon_id \"ENSE00001719809\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana CDS 25227234 25227412 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; protein_id \"ENSP00000256078\"; protein_version \"4\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25225614 25225773 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"4\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; exon_id \"ENSE00001644818\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana CDS 25225614 25225773 . - 1 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"4\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; protein_id \"ENSP00000256078\"; protein_version \"4\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25215437 25215560 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; exon_id \"ENSE00001189807\"; exon_version \"5\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana CDS 25215444 25215560 . 
- 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; protein_id \"ENSP00000256078\"; protein_version \"4\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana stop_codon 25215441 25215443 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"5\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana exon 25209431 25209911 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; exon_number \"6\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; exon_id \"ENSE00002477035\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana five_prime_utr 25250751 25250803 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana five_prime_utr 25245385 25245395 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana three_prime_utr 25215437 25215440 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; tag \"basic\"; transcript_support_level \"1\"; 12 ensembl_havana three_prime_utr 25209431 25209911 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000256078\"; transcript_version \"8\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS8703\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana transcript 25233819 25250929 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana exon 25250764 25250929 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; exon_number \"1\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; exon_id \"ENSE00002530521\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana exon 25245274 25245395 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; exon_id \"ENSE00000936617\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana CDS 25245274 25245384 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; protein_id \"ENSP00000451856\"; protein_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana start_codon 25245382 25245384 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; exon_number \"2\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana exon 25233819 25235226 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; exon_id \"ENSE00002478081\"; exon_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana CDS 25235209 25235226 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; protein_id \"ENSP00000451856\"; protein_version \"1\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana stop_codon 25235206 25235208 . - 0 gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; exon_number \"3\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana five_prime_utr 25250764 25250929 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic\"; transcript_support_level \"1\"; 12 havana five_prime_utr 25245385 25245395 . - . 
gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic; other\"; transcript_support_level \"1\"; 12 havana three_prime_utr 25233819 25235205 . - . gene_id \"ENSG00000133703\"; gene_version \"11\"; transcript_id \"ENST00000556131\"; transcript_version \"1\"; gene_name \"KRAS\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"KRAS-203\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; tag \"basic;\"; transcript_support_level \"1\";""" ) # NCBI GTF self.ncbi_expected = [ Gene(54635272, 54640529, "+", "6", "KRASP1", {"id": "3844"}, None, [ Transcript( 54635272, 54640529, "+", "6", "gene14201", {"id": "gene14201"}, None, [Exon(54635272, 54640529, "+", "6", "gene14201_e1")]) ]), Gene(25357723, 25403865, "-", "12", "KRAS", {"id": "3845"}, None, [ Transcript( 25357723, 25403865, "-", "12", "rna36549", {"id": "rna36549"}, None, [ Exon(25357723, 25362845, "-", "12", "rna36549_e1"), Exon(25378548, 25378707, "-", "12", "rna36549_e2"), Exon(25380168, 25380346, "-", "12", "rna36549_e3"), Exon(25398208, 25398329, "-", "12", "rna36549_e5"), Exon(25403685, 25403865, "-", "12", "rna36549_e6") ], [ Protein(25362729, 25398318, "-", "12", None, None, None, [ CDS(25362729, 25362845, "-", "12", ""), CDS(25378548, 25378707, "-", "12", ""), CDS(25380168, 25380346, "-", "12", ""), CDS(25398208, 25398318, "-", "12", "") ]) ]), Transcript(25357723, 25403865, "-", "12", "rna36550", { "id": "rna36550" }, None, [ Exon(25357723, 25362845, "-", "12", "rna36550_e1"), Exon(25368371, 25368494, "-", "12", "rna36550_e2"), Exon(25378548, 25378707, "-", "12", "rna36550_e3"), Exon(25380168, 25380346, "-", "12", "rna36550_e4"), Exon(25398208, 25398329, "-", "12", "rna36550_e5"), Exon(25403685, 25403865, 
"-", "12", "rna36550_e6") ], [ Protein(25368375, 25398318, "-", "12", None, None, None, [ CDS(25368375, 25368494, "-", "12", ""), CDS(25378548, 25378707, "-", "12", ""), CDS(25380168, 25380346, "-", "12", ""), CDS(25398208, 25398318, "-", "12", ""), ]) ]) ]), Gene(2527306, 2529079, "+", "X", "CD99P1", {"id": "401577"}, None, [ Transcript( 2527306, 2529079, "+", "X", "rna58916", {"id": "rna58916"}, None, [ Exon(2527306, 2527522, "+", "X", "rna58916_e1"), Exon(2529037, 2529079, "+", "X", "rna58916_e2") ]) ]), Gene(2477306, 2479079, "+", "Y", "CD99P1", {"id": "401577"}, None, [ Transcript( 2477306, 2479079, "+", "Y", "rna61353", {"id": "rna61353"}, None, [ Exon(2477306, 2477522, "+", "Y", "rna61353_e1"), Exon(2479037, 2479079, "+", "Y", "rna61353_e2") ]) ]) ] with open(self.tmp_ncbi_in_gtf, "w") as FH_gtf: FH_gtf.write( """6 Curated Genomic exon 54635272 54640529 . + . transcript_id \"gene14201\"; gene_id \"3844\"; gene_name \"KRASP1\"; 12 BestRefSeq exon 25357723 25362845 . - . transcript_id \"rna36549\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25378548 25378707 . - . transcript_id \"rna36549\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25380168 25380346 . - . transcript_id \"rna36549\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25398208 25398329 . - . transcript_id \"rna36549\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25403685 25403865 . - . transcript_id \"rna36549\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq CDS 25362729 25362845 . - 0 transcript_id \"rna36549\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq CDS 25378548 25378707 . - 1 transcript_id \"rna36549\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq CDS 25380168 25380346 . - 0 transcript_id \"rna36549\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq CDS 25398208 25398318 . - 0 transcript_id \"rna36549\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25357723 25362845 . - . 
transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25368371 25368494 . - . transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25378548 25378707 . - . transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25380168 25380346 . - . transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25398208 25398329 . - . transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq exon 25403685 25403865 . - . transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq CDS 25368375 25368494 . - 0 transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq CDS 25378548 25378707 . - 1 transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq CDS 25380168 25380346 . - 0 transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; 12 BestRefSeq CDS 25398208 25398318 . - 0 transcript_id \"rna36550\"; gene_id \"3845\"; gene_name \"KRAS\"; X BestRefSeq exon 2527306 2527522 . + . transcript_id \"rna58916\"; gene_id \"401577\"; gene_name \"CD99P1\"; X BestRefSeq exon 2529037 2529079 . + . transcript_id \"rna58916\"; gene_id \"401577\"; gene_name \"CD99P1\"; Y BestRefSeq exon 2477306 2477522 . + . transcript_id \"rna61353\"; gene_id \"401577\"; gene_name \"CD99P1\"; Y BestRefSeq exon 2479037 2479079 . + . transcript_id \"rna61353\"; gene_id \"401577\"; gene_name \"CD99P1\";""" )