def construct_gene_object(ensembl, transcript_id):
    """ build a Transcript for a gene from the Ensembl databases

    Args:
        ensembl: EnsemblRequest object used to request data from Ensembl
        transcript_id: string for an Ensembl transcript ID

    Returns:
        a Transcript object, containing transcript coordinates, exon and CDS
        ranges, and the CDS and genomic sequences.

    Raises:
        ValueError if CDS from genomic sequence given gene coordinates and CDS
        retrieved from Ensembl do not match.
    """
    # the genomic sequence is requested with this many extra bases, and the
    # same value is handed to add_genomic_sequence() as the offset
    flank = 10

    # sequence data for the transcript
    region = ensembl.get_genomic_seq_for_transcript(transcript_id, expand=flank)
    chrom, start, end, strand, genomic_sequence = region
    cds_sequence = ensembl.get_cds_seq_for_transcript(transcript_id)

    # positions of the coding regions and exons
    cds_ranges = ensembl.get_cds_ranges_for_transcript(transcript_id)
    exon_ranges = ensembl.get_exon_ranges_for_transcript(transcript_id)

    # assemble the Transcript: coordinates first, then sequences
    tx = Transcript(transcript_id, chrom, start, end, strand)
    tx.set_exons(exon_ranges, cds_ranges)
    tx.set_cds(cds_ranges)
    tx.add_cds_sequence(cds_sequence)
    tx.add_genomic_sequence(genomic_sequence, offset=flank)

    return tx
def test_set_cds(self):
    """ test that set_cds() works correctly

    Covers three things: set_cds() raising ValueError when no exons have
    been set, CDS coordinates being returned as start/end dicts, and CDS
    boundaries that fall outside an exon being moved into a neighbouring
    exon.
    """
    exons = [(0, 200), (800, 1000)]
    cds = [(100, 200), (800, 900)]

    # make sure we raise an error if we try to set the CDS before the exons.
    # The Transcript is built outside the context manager so that only the
    # set_cds() call can satisfy the assertion.
    tx = Transcript('test', '1', 0, 1000, '+')
    with self.assertRaises(ValueError):
        tx.set_cds(cds)

    # check CDS positions
    self.gene.set_exons(exons, cds)
    self.gene.set_cds(cds)
    self.assertEqual(self.gene.get_cds(),
        [{'start': 100, 'end': 200}, {'start': 800, 'end': 900}])

    # check that CDS ends outside an exon are corrected: the two bases past
    # the end of the second exon are expected at the start of the third
    exons = [(0, 200), (300, 400), (800, 1000)]
    cds = [(100, 200), (300, 402)]
    self.gene.set_exons(exons, cds)
    self.gene.set_cds(cds)
    self.assertEqual(self.gene.get_cds(),
        [{'start': 100, 'end': 200}, {'start': 300, 'end': 400},
         {'start': 800, 'end': 802}])

    # and the mirror case: a CDS start two bases before the second exon is
    # expected at the end of the first exon
    cds = [(298, 400), (800, 1000)]
    self.gene.set_exons(exons, cds)
    self.gene.set_cds(cds)
    self.assertEqual(self.gene.get_cds(),
        [{'start': 198, 'end': 200}, {'start': 300, 'end': 400},
         {'start': 800, 'end': 1000}])
async def construct_gene_object(ensembl, transcript_id):
    """ build a Transcript for a gene from the Ensembl databases

    Args:
        ensembl: EnsemblRequest object used to request data from Ensembl
        transcript_id: string for an Ensembl transcript ID

    Returns:
        a Transcript object, containing transcript coordinates, exon and CDS
        ranges, and the CDS and genomic sequences.

    Raises:
        ValueError if CDS from genomic sequence given gene coordinates and CDS
        retrieved from Ensembl do not match.
    """
    # gather() awaits all four requests together and hands the results back
    # in the order the awaitables were passed in
    region, cds_sequence, cds_ranges, exon_ranges = await asyncio.gather(
        get_genomic_seq_for_transcript(ensembl, transcript_id, expand=10),
        get_cds_seq_for_transcript(ensembl, transcript_id),
        get_cds_ranges_for_transcript(ensembl, transcript_id),
        get_exon_ranges_for_transcript(ensembl, transcript_id),
    )
    chrom, start, end, strand, genomic_sequence = region

    # assemble the Transcript: coordinates first, then sequences. The offset
    # matches the expand value used for the genomic sequence request.
    tx = Transcript(transcript_id, chrom, start, end, strand)
    tx.set_exons(exon_ranges, cds_ranges)
    tx.set_cds(cds_ranges)
    tx.add_cds_sequence(cds_sequence)
    tx.add_genomic_sequence(genomic_sequence, offset=10)

    return tx
def construct_gene(self, name='TEST', chrom='1', start=100, end=179,
        strand='+', exons=None, cds=None):
    """ build a small Transcript for use in tests

    Args:
        name: transcript identifier
        chrom: chromosome, as a string
        start: transcript start position
        end: transcript end position
        strand: '+' or '-'
        exons: list of (start, end) tuples for the exons. Defaults to
            [(100, 119), (160, 179)].
        cds: list of (start, end) tuples for the coding regions. Defaults to
            [(110, 119), (160, 170)].

    Returns:
        a Transcript with exon and CDS coordinates set.
    """
    # build the default ranges per call, rather than sharing one mutable
    # list object between every call via the function signature
    if exons is None:
        exons = [(100, 119), (160, 179)]
    if cds is None:
        cds = [(110, 119), (160, 170)]

    tx = Transcript(name, chrom, start, end, strand)
    tx.set_exons(exons, cds)
    tx.set_cds(cds)

    return tx
def construct_gene(self, name='TEST', chrom='1', start=1000, end=2000,
        strand='+', exons=None, cds=None):
    """ build a small Transcript for use in tests

    Args:
        name: transcript identifier
        chrom: chromosome, as a string
        start: transcript start position
        end: transcript end position
        strand: '+' or '-'
        exons: list of (start, end) tuples for the exons. Defaults to
            [(1000, 1200), (1800, 2000)].
        cds: list of (start, end) tuples for the coding regions. Defaults to
            [(1100, 1200), (1800, 1900)].

    Returns:
        a Transcript with exon and CDS coordinates set.
    """
    # build the default ranges per call, rather than sharing one mutable
    # list object between every call via the function signature
    if exons is None:
        exons = [(1000, 1200), (1800, 2000)]
    if cds is None:
        cds = [(1100, 1200), (1800, 1900)]

    tx = Transcript(name, chrom, start, end, strand)
    tx.set_exons(exons, cds)
    tx.set_cds(cds)

    return tx
def test_add_genomic_sequencE_without_cds_coords(self):
    """ check that adding gDNA before the CDS coords are set raises an error
    """
    span = (10, 20)
    gdna = 'CGTAGACTGTACGCATCGATT'

    tx = Transcript("a", "1", 10, 20, "+")
    tx.set_exons([span], [span])

    # exons alone are not enough, the CDS coordinates have to be set as well
    with self.assertRaises(ValueError):
        tx.add_genomic_sequence(gdna, offset=5)

    # once the CDS is in place the same call goes through
    tx.set_cds([span])
    tx.add_genomic_sequence(gdna, offset=5)
def test___add__(self):
    """ test that __add__() works correctly

    Builds four transcripts on the same chromosome and strand, and checks
    that adding pairs of them gives the expected merged CDS regions.
    """
    exons = [(10, 20), (50, 60), (90, 100)]

    # name each CDS definition once, so the ranges given to set_exons() and
    # set_cds() for a transcript cannot drift apart
    cds_a = [(55, 60), (90, 100)]
    cds_b = [(50, 60), (90, 95)]
    cds_c = [(45, 65)]
    cds_d = [(30, 40)]

    a = Transcript("a", "1", 10, 100, "+")
    b = Transcript("b", "1", 10, 100, "+")
    c = Transcript("c", "1", 10, 100, "+")
    d = Transcript("d", "1", 10, 100, "+")

    a.set_exons(exons, cds_a)
    a.set_cds(cds_a)

    b.set_exons(exons, cds_b)
    b.set_cds(cds_b)

    # c and d each have a single exon, which is entirely coding
    c.set_exons(cds_c, cds_c)
    c.set_cds(cds_c)

    d.set_exons(cds_d, cds_d)
    d.set_cds(cds_d)

    # check that adding two Transcripts gives the union of CDS regions
    self.assertEqual((a + b).get_cds(),
        [{'start': 50, 'end': 60}, {'start': 90, 'end': 100}])
    self.assertEqual((a + c).get_cds(),
        [{'start': 45, 'end': 65}, {'start': 90, 'end': 100}])

    # check that addition is reversible
    self.assertEqual((c + a).get_cds(),
        [{'start': 45, 'end': 65}, {'start': 90, 'end': 100}])

    # check that adding previously unknown exons works
    self.assertEqual((a + d).get_cds(),
        [{'start': 30, 'end': 40}, {'start': 55, 'end': 60},
         {'start': 90, 'end': 100}])
def test___add__not_overlapping(self):
    ''' check that __add__() copes with transcripts which do not overlap
    '''
    # first transcript spans 10-50, as a single fully-coding exon
    left = Transcript("a", "1", 10, 50, "+")
    left_span = (10, 50)
    left.set_exons([left_span], [left_span])
    left.set_cds([left_span])
    left.add_genomic_sequence('N' * 40)

    # second transcript spans 60-80, leaving a gap after the first
    right = Transcript("b", "1", 60, 80, "+")
    right_span = (60, 80)
    right.set_exons([right_span], [right_span])
    right.set_cds([right_span])
    right.add_genomic_sequence('N' * 20)

    # the merged genomic sequence has to span from position 10 to 80
    merged = left + right
    self.assertEqual(len(merged.get_genomic_sequence()), 70)
def construct_gene(self):
    """ build a single-exon Transcript, with CDS and genomic sequence attached

    Returns:
        a Transcript named "TEST" on chrom "1", spanning 0-70 on the "+"
        strand, with one exon at 5-58 which is entirely coding.
    """
    exon_ranges = [(5, 58)]
    cds_ranges = [(5, 58)]

    cds_sequence = "ATGTGGGCTCCACCAGCAGCAATCATGGGATGGGCCCACCAAGAAGGTGGGTAA"
    genomic_sequence = "GGGGGATGTGGGCTCCACCAGCAGCAATCATGGGATGGGCCCACCAAGAAGGTGGGTAACCAGGCCCC"

    # coordinates have to be in place before the sequences are added
    tx = Transcript("TEST", "1", 0, 70, "+")
    tx.set_exons(exon_ranges, cds_ranges)
    tx.set_cds(cds_ranges)
    tx.add_cds_sequence(cds_sequence)
    tx.add_genomic_sequence(genomic_sequence)

    return tx
def test___add__cds_length_fixed(self):
    """ check that we can merge transcripts, even with fixed CDS coords
    """
    cds_sequence = 'ACTGTACGCAT'

    first_span = (10, 20)
    first = Transcript("a", "1", 10, 20, "+")
    first.set_exons([first_span], [first_span])
    first.set_cds([first_span])
    first.add_cds_sequence(cds_sequence)
    first.add_genomic_sequence('CGTAGACTGTACGCATCGATT', offset=5)

    second_span = (0, 10)
    second = Transcript("b", "1", 0, 10, "+")
    second.set_exons([second_span], [second_span])
    second.set_cds([second_span])
    second.add_cds_sequence(cds_sequence)
    second.add_genomic_sequence('CGTAGACTGTACGCATCGTAG', offset=5)

    # without a fix to tx.cpp to adjust an exon coordinate simultaneously,
    # the addition below would give an error. Only the absence of an
    # exception is being checked, so the result is discarded.
    _ = first + second
def test_get_de_novos_in_transcript(self):
    """ check that de novos inside the CDS of a transcript are identified
    """
    exons = [(10, 20), (30, 40), (90, 100)]
    coding = [(30, 40), (90, 95)]

    # define a simple transcript
    tx = Transcript("test1", '1', 10, 100, "+")
    tx.set_exons(exons, coding)
    tx.set_cds(coding)

    # pairs of (candidate sites, sites expected back)
    cases = [
        # only the site in the CDS is returned
        ([15, 35, 100], [35]),
        # multiple sites in the CDS can be returned
        ([15, 35, 90], [35, 90]),
        # an empty list in gives an empty list back
        ([], []),
    ]

    for sites, expected in cases:
        self.assertEqual(get_de_novos_in_transcript(tx, sites), expected)
def test___add__(self):
    """ test that __add__() works correctly

    Builds four transcripts on the same chromosome and strand, and checks
    that adding pairs of them gives the expected merged CDS regions. Also
    checks that adding None on either side gives back the transcript.
    """
    exons = [(10, 20), (50, 60), (90, 100)]

    # name each CDS definition once, so the ranges given to set_exons() and
    # set_cds() for a transcript cannot drift apart
    cds_a = [(55, 60), (90, 100)]
    cds_b = [(50, 60), (90, 95)]
    cds_c = [(45, 65)]
    cds_d = [(30, 40)]

    a = Transcript("a", "1", 10, 100, "+")
    b = Transcript("b", "1", 10, 100, "+")
    c = Transcript("c", "1", 10, 100, "+")
    d = Transcript("d", "1", 10, 100, "+")

    a.set_exons(exons, cds_a)
    a.set_cds(cds_a)

    b.set_exons(exons, cds_b)
    b.set_cds(cds_b)

    # c and d each have a single exon, which is entirely coding
    c.set_exons(cds_c, cds_c)
    c.set_cds(cds_c)

    d.set_exons(cds_d, cds_d)
    d.set_cds(cds_d)

    # check that adding two Transcripts gives the union of CDS regions
    self.assertEqual((a + b).get_cds(),
        [{'start': 50, 'end': 60}, {'start': 90, 'end': 100}])
    self.assertEqual((a + c).get_cds(),
        [{'start': 45, 'end': 65}, {'start': 90, 'end': 100}])

    # check that addition is reversible
    self.assertEqual((c + a).get_cds(),
        [{'start': 45, 'end': 65}, {'start': 90, 'end': 100}])

    # check that adding previously unknown exons works
    self.assertEqual((a + d).get_cds(),
        [{'start': 30, 'end': 40}, {'start': 55, 'end': 60},
         {'start': 90, 'end': 100}])

    # check that we can add transcript + None correctly
    self.assertEqual(a + None, a)
    self.assertEqual(None + a, a)
def set_transcript(self):
    """ build the Transcript expected for a known gene (ENST00000242577)

    Returns:
        a Transcript on chrom 12, "+" strand, with exon and CDS coordinates
        set, plus the CDS sequence and the genomic sequence (added with an
        offset of 10).
    """
    exons = [(120933859, 120934019), (120934219, 120934356),
        (120935876, 120936296)]
    coding = [(120934225, 120934356), (120935876, 120936013)]

    # the sequences are split into chunks purely to keep the lines short;
    # adjacent string literals are joined into one string
    cds_sequence = (
        "ATGTGCGACCGAAAGGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTC"
        "GGTGGAGTGCGCTACTCAGGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATC"
        "AAGAAGGAATTTGACAAGAAGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTA"
        "GTTATGTGACACATGAAACCAAACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCT"
        "GTTCAAATCTGGTTAA"
    )

    genomic_sequence = (
        "GGGCGGGGCCGGCGGGAGCAGGGCGGGGCCTGAGCACTAGGCGGCGGCGGCTGGCGTGGG"
        "GCTGCTTAGATGCGCCACGGTTTCGGTAGCGACGGTATCTCTAGCCGGGCCTGAGCTGTGCTAGCA"
        "CCTCCCCCAGGAGACCGTTGCAGTCGGCCAGCCCCCTTCTCCACGGTGAGAAACTCGGGGGGCCAG"
        "GGGGTGTCCTCGCTGCCTTATTTCGCCCCACTCCGGACTTAGCCCTCCGCGTAGCCCGCGCTTCCT"
        "GAGAAGTGGGGTGGGGGGCGTCGTCCCGTGGTGGCGCCGGCCGGGGTGGGGGCAGTTAGTGCCTGG"
        "GGGGCGCGGCCCAACTCAACCCCTTACCCCAGGCCTTGCCCACTAGGTAACCATGTGCGACCGAAA"
        "GGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTCGGTGGAGTGCGCTACTCA"
        "GGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATCAAGAAGGTGAGGATGGGCGC"
        "GGGGGCCGATACGCAGCCGGGAGCAGGGGGTTCCTTCCCCCCGATCCTGCTTTCCTAAGGGCGCCT"
        "GACAGGTCCCGGGAATACTGCTGGCGGCTTGGGGCGTAGAAGCTTCCAGAAAGGACGCAGATGCAT"
        "TTTGCGCTCCTGTGGAGAAGACCAGACCCCCGGCGTCCGAAGTTTTTTTTTTTTTTTTTTTAATTA"
        "CCCAGCTCCGCGGGGGGAAAGCGCCACCTAGCAACGGTATCTAAGATCAGGGAGCAGCGGTTCCCC"
        "CTTCTGTGTGGTTCCTGCGCCGAGGATCCATCTGGGTGTTCCGGAGGGGGGAGCTGCGTGGGTGTT"
        "TCCAGCCGGGCCGGGAGGAGATCTTGCCAGCCTTCCAGTGGGGAGTTGAGGGAAGGTGGTGGGTGG"
        "TGGCGGGGCTGGGGGCTGGGGTAGGGGCTTGGTAAATGGCAGTCTAGAAAGCCGGCAGGACTGCCA"
        "ACTTCTCGAGCAGTGTTTGCTGGAAGGGAAGAAAGCTGGCAGCCTAAGCCGTGGGAGGGTTCCAGT"
        "CGAGAATGGGAAGATGAAAGACTTCAGATGGAACAGAAATAAATGCCTTTTTTGACAAACGCAGCA"
        "GTGCGTGCCTCTAGCTTGCAAGAGCGTTACTCCCCTTCATAGCTTTAAAAGGTTTTCGCACTGCGT"
        "GCAGTTAGAGTAGCTAAATCTTGTGTGACGCTCCACAAACACTTGTAAGAATTTTGCAGAGAAAGA"
        "TAACCGTTGCCACCCAATGCCCCCCACAGGCATTCTACTCCCCAGTACCTCTTAGGGTGGGAGAAA"
        "TGGTGAAGAGTTGTTCCTACAACTTGCTAACCTAGTGGACAGGGTAGTAGATTAGCATCATCCGGA"
        "TAGATGTGAAGAGGACGGCTGTTTGGATAATAATTAAGGATAAAATTTGGCCAGTTGACAGATTCT"
        "GTTTCCAGCAGTTTTTACAGCAACAGTGGAGTGCTTCAGTATTGTGTTCCTGTAAATTTAATTTTG"
        "ATCCGCAATCATTTGGTATACAATGCTGTTTGAAGTTTTGTCCTATTGGAAAAGTCTTGTGTTGCA"
        "GGGGTGCAGTTAAGATCTTTGTGATGAGGAATGGGATGGGCTAATTTTTTGCCGTTTTCTTGGAAT"
        "TGGGGGCATGGCAAATACAGTAGGGTAGTTTAGTTCTCTACACAGAACATGATAAACTACACCTGT"
        "TGATGTCACCGTCTGTCAATGAATATTATAGAAGGTATGAAGGTGTAATTACCATAATAACAAAAC"
        "ACCCTGTCTTTAGGGCTGACCTTTCGTCCTTTGACCTCCTCAGCCTCCATTCCCATCTTCGCTCAG"
        "ACTGCAAGTATGTTTGTATTAATGTACTATGTAGGCGGCTTGGAGCTGGGGAACATTCTTTCATTC"
        "TAAGAATTTGCAGATGCTGACGTTCCTCCTTTCTGCCCCTACAGGCTCTGGCTTATCCAAGAGGCA"
        "AACACTGACCTCTGGTAATTAAAATCCTAGTTCTTTTCTTTTGTCTTTTCCAGGAATTTGACAAGA"
        "AGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTAGTTATGTGACACATGAAACCA"
        "AACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCTGTTCAAATCTGGTTAAAAGCATG"
        "GACTGTGCCACACACCCAGTGATCCATCCAAAAACAAGGACTGCAGCCTAAATTCCAAATACCAGA"
        "GACTGAAATTTTCAGCCTTGCTAAGGGAACATCTCGATGTTTGAACCTTTGTTGTGTTTTGTACAG"
        "GGCATTCTCTGTACTAGTTTGTCGTGGTTATAAAACAATTAGCAGAATAGCCTACATTTGTATTTA"
        "TTTTCTATTCCATACTTCTGCCCACGTTGTTTTCTCTCAAAATCCATTCCTTTAAAAAATAAATCT"
        "GATGCAGATGTGTATGTGTGTG"
    )

    # coordinates have to be in place before the sequences are added
    tx = Transcript("ENST00000242577", '12', 120933859, 120936296, "+")
    tx.set_exons(exons, coding)
    tx.set_cds(coding)
    tx.add_cds_sequence(cds_sequence)
    tx.add_genomic_sequence(genomic_sequence, offset=10)

    return tx
def set_transcript(self):
    """ build the Transcript expected for a known gene (ENST00000242577)

    Returns:
        a Transcript on chrom 12, "+" strand, with exon and CDS coordinates
        set, plus the CDS sequence and the genomic sequence (added with an
        offset of 10).
    """
    exons = [(120933859, 120934019), (120934219, 120934356),
        (120935876, 120936296)]
    coding = [(120934225, 120934356), (120935876, 120936013)]

    # the sequences are split into chunks purely to keep the lines short;
    # adjacent string literals are joined into one string
    cds_sequence = (
        "ATGTGCGACCGAAAGGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTC"
        "GGTGGAGTGCGCTACTCAGGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATC"
        "AAGAAGGAATTTGACAAGAAGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTA"
        "GTTATGTGACACATGAAACCAAACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCT"
        "GTTCAAATCTGGTTAA"
    )

    genomic_sequence = (
        "GGGCGGGGCCGGCGGGAGCAGGGCGGGGCCTGAGCACTAGGCGGCGGCGGCTGGCGTGGG"
        "GCTGCTTAGATGCGCCACGGTTTCGGTAGCGACGGTATCTCTAGCCGGGCCTGAGCTGTGCTAGCA"
        "CCTCCCCCAGGAGACCGTTGCAGTCGGCCAGCCCCCTTCTCCACGGTGAGAAACTCGGGGGGCCAG"
        "GGGGTGTCCTCGCTGCCTTATTTCGCCCCACTCCGGACTTAGCCCTCCGCGTAGCCCGCGCTTCCT"
        "GAGAAGTGGGGTGGGGGGCGTCGTCCCGTGGTGGCGCCGGCCGGGGTGGGGGCAGTTAGTGCCTGG"
        "GGGGCGCGGCCCAACTCAACCCCTTACCCCAGGCCTTGCCCACTAGGTAACCATGTGCGACCGAAA"
        "GGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTCGGTGGAGTGCGCTACTCA"
        "GGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATCAAGAAGGTGAGGATGGGCGC"
        "GGGGGCCGATACGCAGCCGGGAGCAGGGGGTTCCTTCCCCCCGATCCTGCTTTCCTAAGGGCGCCT"
        "GACAGGTCCCGGGAATACTGCTGGCGGCTTGGGGCGTAGAAGCTTCCAGAAAGGACGCAGATGCAT"
        "TTTGCGCTCCTGTGGAGAAGACCAGACCCCCGGCGTCCGAAGTTTTTTTTTTTTTTTTTTTAATTA"
        "CCCAGCTCCGCGGGGGGAAAGCGCCACCTAGCAACGGTATCTAAGATCAGGGAGCAGCGGTTCCCC"
        "CTTCTGTGTGGTTCCTGCGCCGAGGATCCATCTGGGTGTTCCGGAGGGGGGAGCTGCGTGGGTGTT"
        "TCCAGCCGGGCCGGGAGGAGATCTTGCCAGCCTTCCAGTGGGGAGTTGAGGGAAGGTGGTGGGTGG"
        "TGGCGGGGCTGGGGGCTGGGGTAGGGGCTTGGTAAATGGCAGTCTAGAAAGCCGGCAGGACTGCCA"
        "ACTTCTCGAGCAGTGTTTGCTGGAAGGGAAGAAAGCTGGCAGCCTAAGCCGTGGGAGGGTTCCAGT"
        "CGAGAATGGGAAGATGAAAGACTTCAGATGGAACAGAAATAAATGCCTTTTTTGACAAACGCAGCA"
        "GTGCGTGCCTCTAGCTTGCAAGAGCGTTACTCCCCTTCATAGCTTTAAAAGGTTTTCGCACTGCGT"
        "GCAGTTAGAGTAGCTAAATCTTGTGTGACGCTCCACAAACACTTGTAAGAATTTTGCAGAGAAAGA"
        "TAACCGTTGCCACCCAATGCCCCCCACAGGCATTCTACTCCCCAGTACCTCTTAGGGTGGGAGAAA"
        "TGGTGAAGAGTTGTTCCTACAACTTGCTAACCTAGTGGACAGGGTAGTAGATTAGCATCATCCGGA"
        "TAGATGTGAAGAGGACGGCTGTTTGGATAATAATTAAGGATAAAATTTGGCCAGTTGACAGATTCT"
        "GTTTCCAGCAGTTTTTACAGCAACAGTGGAGTGCTTCAGTATTGTGTTCCTGTAAATTTAATTTTG"
        "ATCCGCAATCATTTGGTATACAATGCTGTTTGAAGTTTTGTCCTATTGGAAAAGTCTTGTGTTGCA"
        "GGGGTGCAGTTAAGATCTTTGTGATGAGGAATGGGATGGGCTAATTTTTTGCCGTTTTCTTGGAAT"
        "TGGGGGCATGGCAAATACAGTAGGGTAGTTTAGTTCTCTACACAGAACATGATAAACTACACCTGT"
        "TGATGTCACCGTCTGTCAATGAATATTATAGAAGGTATGAAGGTGTAATTACCATAATAACAAAAC"
        "ACCCTGTCTTTAGGGCTGACCTTTCGTCCTTTGACCTCCTCAGCCTCCATTCCCATCTTCGCTCAG"
        "ACTGCAAGTATGTTTGTATTAATGTACTATGTAGGCGGCTTGGAGCTGGGGAACATTCTTTCATTC"
        "TAAGAATTTGCAGATGCTGACGTTCCTCCTTTCTGCCCCTACAGGCTCTGGCTTATCCAAGAGGCA"
        "AACACTGACCTCTGGTAATTAAAATCCTAGTTCTTTTCTTTTGTCTTTTCCAGGAATTTGACAAGA"
        "AGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTAGTTATGTGACACATGAAACCA"
        "AACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCTGTTCAAATCTGGTTAAAAGCATG"
        "GACTGTGCCACACACCCAGTGATCCATCCAAAAACAAGGACTGCAGCCTAAATTCCAAATACCAGA"
        "GACTGAAATTTTCAGCCTTGCTAAGGGAACATCTCGATGTTTGAACCTTTGTTGTGTTTTGTACAG"
        "GGCATTCTCTGTACTAGTTTGTCGTGGTTATAAAACAATTAGCAGAATAGCCTACATTTGTATTTA"
        "TTTTCTATTCCATACTTCTGCCCACGTTGTTTTCTCTCAAAATCCATTCCTTTAAAAAATAAATCT"
        "GATGCAGATGTGTATGTGTGTG"
    )

    # coordinates have to be in place before the sequences are added
    tx = Transcript("ENST00000242577", '12', 120933859, 120936296, "+")
    tx.set_exons(exons, coding)
    tx.set_cds(coding)
    tx.add_cds_sequence(cds_sequence)
    tx.add_genomic_sequence(genomic_sequence, offset=10)

    return tx