def build_mapping(record):
    """
    Create or update the LRG transcript mapping described by `record`.

    `record` is indexed by the keys 'chromosome', 'transcript', 'strand',
    'cds_start', 'cds_stop', 'gene', 'start', 'stop' and 'exons' (an
    iterable of (start, stop) pairs).

    Raises `ValueError` if no chromosome in `chromosomes` is named
    'chr' + record['chromosome'].
    """
    # Only use records on chromosomes we know.
    for candidate in chromosomes:
        if candidate.name == 'chr' + record['chromosome']:
            chromosome = candidate
            break
    else:
        raise ValueError()

    # The part before the 't' is the accession, the part after it is the
    # transcript number.
    accession, transcript_number = record['transcript'].split('t')
    transcript = int(transcript_number)

    if record['strand'] == '-1':
        orientation = 'reverse'
    else:
        orientation = 'forward'

    # Without a (truthy) CDS start we store no CDS at all.
    cds = ((record['cds_start'], record['cds_stop'])
           if record['cds_start'] else None)

    # TODO: Also take protein into account. For example, in LRG_321 (TP53)
    #   some transcripts occur twice (with different CDSs and different
    #   protein numbers).
    #   https://github.com/mutalyzer/mutalyzer/issues/372
    gene = record['gene']
    start = record['start']
    stop = record['stop']
    exon_starts = [exon_start for exon_start, _ in record['exons']]
    exon_stops = [exon_stop for _, exon_stop in record['exons']]

    return TranscriptMapping.create_or_update(
        chromosome, 'lrg', accession, gene, orientation, start, stop,
        exon_starts, exon_stops, 'ebi', transcript=transcript, cds=cds,
        select_transcript=True)
def build_mappings(records):
    """
    Yield one RefSeq transcript mapping per transcript found in `records`.

    Each record is indexed by 'transcript' and 'feature_type'; depending on
    the feature type also by 'feature_name', 'chromosome', 'orientation',
    'start' and 'stop'.

    This is a generator: nothing is evaluated until it is iterated. An
    `IndexError` is raised on first iteration if there is no GENE record
    with `-` as transcript value.
    """
    # Group the records per transcript and, within a transcript, per record
    # type. Every type maps to a list of records, although we expect only
    # one GENE record (with `-` as transcript value).
    # Note that there can be more than one RNA record per transcript if it
    # is split over different reference contigs.
    grouped = defaultdict(lambda: defaultdict(list))
    for record in records:
        grouped[record['transcript']][record['feature_type']].append(record)

    gene = grouped['-']['GENE'][0]['feature_name']

    for transcript, features in grouped.items():
        # The `-` entry only carries the gene record, not a transcript.
        if transcript == '-':
            continue

        accession, version = transcript.split('.')
        version = int(version)

        rna_parts = features['RNA']
        chromosome = rna_parts[0]['chromosome']
        if rna_parts[0]['orientation'] == '-':
            orientation = 'reverse'
        else:
            orientation = 'forward'
        start = min(part['start'] for part in rna_parts)
        stop = max(part['stop'] for part in rna_parts)

        exon_starts, exon_stops, cds_positions = [], [], []

        parts = features['UTR'] + features['CDS']
        for part in sorted(parts, key=itemgetter('start')):
            if exon_stops:
                if exon_stops[-1] > part['start'] - 1:
                    # This part starts before the end of the previous exon.
                    # We have no idea what to do in this case, so we ignore
                    # it. The number of transcripts affected is very small
                    # (e.g., NM_031860.1 and NM_001184961.1 in the GRCh37
                    # assembly).
                    continue
                adjacent = exon_stops[-1] == part['start'] - 1
            else:
                adjacent = False

            if part['feature_type'] == 'CDS':
                cds_positions.extend([part['start'], part['stop']])

            if adjacent:
                # Directly follows the previous exon, so it is the same exon
                # split over two entries (a CDS part and a UTR part, or
                # split over different reference contigs): extend it.
                exon_stops[-1] = part['stop']
            else:
                exon_starts.append(part['start'])
                exon_stops.append(part['stop'])

        cds = None
        if cds_positions:
            cds = min(cds_positions), max(cds_positions)

        # If no exons are annotated, we create one spanning the entire
        # transcript.
        if not exon_starts:
            exon_starts, exon_stops = [start], [stop]

        yield TranscriptMapping.create_or_update(
            chromosome, 'refseq', accession, gene, orientation, start, stop,
            exon_starts, exon_stops, 'ncbi', cds=cds, version=version)
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.

    Connects to the UCSC MySQL server (database `assembly.alias`), selects
    all mRNA transcripts whose gene name equals `gene`, and creates or
    updates a `TranscriptMapping` for each of them. All mappings are added
    to the session and committed once at the end.

    The database cursor and connection are always closed, also when the
    query fails.
    """
    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)

    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
    parameters = gene,

    # Fetch everything up front so we can release the remote connection
    # before touching our own database.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query, parameters)
            result = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        # The exon columns are comma-separated lists; empty items (e.g.
        # after a trailing comma) are skipped.
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', acc, geneName, orientation, txStart + 1,
            txEnd, exon_starts, exon_stops, 'ucsc', cds=cds,
            version=int(version))
        session.add(mapping)

    session.commit()
def import_from_reference(assembly, reference):
    """
    Import transcript mappings from a genomic reference.

    The reference is loaded with a GenBank retriever and, for every gene
    that has at least one transcript, a mapping on chromosome `chrM` of
    `assembly` is created or updated. All mappings are committed together
    at the end.

    Raises `ValueError` if the loaded record's `molType` is not 'm'.

    .. todo: Also report how much was added/updated.

    .. note: Currently no exon locations are supported, this has only been
       tested on mtDNA.
    """
    chromosome = assembly.chromosomes.filter_by(name='chrM').one()

    retriever = Retriever.GenBankRetriever(Output(__file__))
    record = retriever.loadrecord(reference)

    if record.molType != 'm':
        raise ValueError('Only mitochondial references are supported')

    select_transcript = len(record.geneList) > 1

    for gene in record.geneList:
        # We support exactly one transcript per gene: the first one when
        # ordered by name. Genes without transcripts are skipped.
        ordered = sorted(gene.transcriptList, key=attrgetter('name'))
        if not ordered:
            continue
        transcript = ordered[0]

        # We use gene.location for now (rather than the location of the
        # transcript's mRNA), it is always present and the same for our
        # purposes.
        start, stop = gene.location

        if gene.orientation == -1:
            orientation = 'reverse'
        else:
            orientation = 'forward'

        # Not every transcript has a CDS with a location.
        try:
            cds = transcript.CDS.location
        except AttributeError:
            cds = None

        session.add(TranscriptMapping.create_or_update(
            chromosome, 'refseq', record.source_accession, gene.name,
            orientation, start, stop, [start], [stop], 'reference', cds=cds,
            select_transcript=select_transcript,
            version=int(record.source_version)))

    session.commit()
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.

    Connects to the UCSC MySQL server (database `assembly.alias`), selects
    all mRNA transcripts whose gene name equals `gene`, and creates or
    updates a `TranscriptMapping` for each of them. All mappings are added
    to the session and committed once at the end.

    The database cursor and connection are always closed, also when the
    query fails.
    """
    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)

    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
    parameters = gene,

    # Fetch everything up front so we can release the remote connection
    # before touching our own database.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query, parameters)
            result = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        # The exon columns are comma-separated lists; empty items (e.g.
        # after a trailing comma) are skipped.
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', acc, geneName, orientation, txStart + 1,
            txEnd, exon_starts, exon_stops, 'ucsc', cds=cds,
            version=int(version))
        session.add(mapping)

    session.commit()
def hg19_transcript_mappings(db, hg19):
    """
    Fixture adding a selection of transcript mappings on the GRCh37/hg19
    genome assembly to the database session and committing them.
    """
    # One row per mapping:
    #   (chromosome name, accession, gene, orientation, start, stop,
    #    exon starts, exon stops, source, cds, select_transcript, version)
    # All mappings have reference type 'refseq' and transcript 1.
    rows = [
        ('chr11', 'NM_003002', 'SDHD', 'forward', 111957571, 111966518,
         [111957571, 111958581, 111959591, 111965529],
         [111957683, 111958697, 111959735, 111966518],
         'ncbi', (111957632, 111965694), False, 2),
        ('chr11', 'NM_012459', 'TIMM8B', 'reverse', 111955524, 111957522,
         [111955524, 111957364],
         [111956186, 111957522],
         'ncbi', (111956019, 111957492), False, 2),
        ('chr11', 'NR_028383', 'TIMM8B', 'reverse', 111955524, 111957522,
         [111955524, 111956702, 111957364],
         [111956186, 111957034, 111957522],
         'ncbi', None, False, 1),
        ('chr6', 'NM_000500', 'CYP21A2', 'forward', 32006082, 32009419,
         [32006082, 32006499, 32006871, 32007133, 32007323, 32007526,
          32007782, 32008183, 32008445, 32008646],
         [32006401, 32006588, 32007025, 32007234, 32007424, 32007612,
          32007982, 32008361, 32008548, 32009419],
         'ncbi', (32006200, 32008911), False, 5),
        ('chr22', 'NM_001145134', 'CPT1B', 'reverse', 51007290, 51017096,
         [51007290, 51007765, 51008005, 51008722, 51009320, 51009587,
          51009804, 51010435, 51010632, 51011304, 51011949, 51012764,
          51012922, 51014464, 51014627, 51015286, 51015753, 51016204,
          51016978],
         [51007510, 51007850, 51008097, 51008835, 51009472, 51009721,
          51009968, 51010551, 51010737, 51011489, 51012144, 51012848,
          51013029, 51014541, 51014764, 51015463, 51015892, 51016363,
          51017096],
         'ncbi', (51007767, 51016344), False, 1),
        ('chr22', 'NR_021492', 'LOC100144603', 'forward', 51021455, 51022356,
         [51021455, 51022027],
         [51021752, 51022356],
         'ncbi', None, False, 1),
        ('chr1', 'NM_001007553', 'CSDE1', 'reverse', 115259538, 115300624,
         [115259538, 115261234, 115262200, 115263160, 115266504, 115267842,
          115268832, 115269604, 115272879, 115273129, 115275225, 115276353,
          115276610, 115277063, 115279379, 115280092, 115280584, 115282313,
          115292442, 115300546],
         [115260837, 115261366, 115262363, 115263338, 115266623, 115267954,
          115269007, 115269711, 115273043, 115273269, 115275437, 115276478,
          115276738, 115277144, 115279476, 115280184, 115280693, 115282511,
          115292828, 115300624],
         'ncbi', (115260790, 115282511), False, 1),
        ('chr1', 'NM_001130523', 'CSDE1', 'reverse', 115259538, 115300671,
         [115259538, 115261234, 115262200, 115263160, 115266504, 115267842,
          115268832, 115269604, 115272879, 115273129, 115275225, 115276353,
          115276610, 115277063, 115279379, 115280584, 115282313, 115284148,
          115292442, 115300546],
         [115260837, 115261366, 115262363, 115263338, 115266623, 115267954,
          115269007, 115269711, 115273043, 115273269, 115275437, 115276478,
          115276738, 115277144, 115279476, 115280693, 115282511, 115284294,
          115292828, 115300671],
         'ncbi', (115260790, 115284285), False, 1),
        ('chr1', 'NM_002241', 'KCNJ10', 'reverse', 160007257, 160040051,
         [160007257, 160039812],
         [160012322, 160040051],
         'ncbi', (160011183, 160012322), False, 4),
        ('chr20', 'NM_001162505', 'TMEM189', 'reverse', 48740274, 48770335,
         [48740274, 48744512, 48746083, 48747402, 48760039, 48770054],
         [48741716, 48744724, 48746227, 48747484, 48760158, 48770335],
         'ncbi', (48741595, 48770174), False, 1),
        ('chr8', 'NM_017780', 'CHD7', 'forward', 61591339, 61779465,
         [61591339, 61653818, 61693559, 61707545, 61712947, 61714087,
          61720776, 61728946, 61732566, 61734349, 61734583, 61735062,
          61736399, 61741222, 61742881, 61748632, 61749376, 61750227,
          61750635, 61754203, 61754406, 61757423, 61757809, 61761074,
          61761610, 61763052, 61763591, 61763821, 61764578, 61765057,
          61765388, 61766922, 61768534, 61769004, 61773463, 61774755,
          61775107, 61777575],
         [61591641, 61655656, 61693989, 61707686, 61713084, 61714152,
          61720831, 61729060, 61732649, 61734486, 61734704, 61735305,
          61736575, 61741365, 61743136, 61748842, 61749571, 61750394,
          61750814, 61754313, 61754611, 61757622, 61757968, 61761163,
          61761713, 61763181, 61763663, 61763878, 61764806, 61765265,
          61766059, 61767082, 61768761, 61769447, 61773684, 61774895,
          61775211, 61779465],
         'ncbi', (61653992, 61778492), False, 2),
        ('chrM', 'NC_012920', 'ND4', 'forward', 10760, 12137,
         [10760],
         [12137],
         'reference', (10760, 12137), True, 1),
        ('chr1', 'NM_002001', 'FCER1A', 'forward', 159259504, 159278014,
         [159259504, 159272096, 159272644, 159273718, 159275778, 159277538],
         [159259543, 159272209, 159272664, 159273972, 159276035, 159278014],
         'ncbi', (159272155, 159277722), False, 2),
        ('chr7', 'XM_001715131', 'LOC100132858', 'reverse', 19828, 36378,
         [19828, 20834, 31060, 32957, 35335, 36224],
         [19895, 21029, 31437, 33107, 35541, 36378],
         'ncbi', (19828, 36378), False, 2),
        ('chrX', 'NM_004011', 'DMD', 'reverse', 31137345, 32430371,
         [31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
          31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
          31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
          31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
          31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
          31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
          32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
          32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
          32408188, 32429869, 32430279],
         [31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
          31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
          31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
          31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
          31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
          31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
          32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
          32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
          32408298, 32430030, 32430371],
         'ncbi', (31140036, 32430326), False, 3),
        ('chrX', 'NM_004019', 'DMD', 'reverse', 31196312, 31285024,
         [31196312, 31198487, 31200855, 31222078, 31224699, 31227615,
          31241164, 31279072, 31284927],
         [31196922, 31198598, 31201021, 31222235, 31224784, 31227816,
          31241238, 31279133, 31285024],
         'ncbi', (31196782, 31284946), False, 2),
        ('chrX', 'NM_004007', 'DMD', 'reverse', 31137345, 33038317,
         [31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
          31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
          31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
          31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
          31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
          31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
          32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
          32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
          32408188, 32429869, 32456358, 32459297, 32466573, 32472779,
          32481556, 32482703, 32486615, 32490281, 32503036, 32509394,
          32519872, 32536125, 32563276, 32583819, 32591647, 32591862,
          32613874, 32632420, 32662249, 32663081, 32715987, 32717229,
          32827610, 32834585, 32841412, 32862900, 32867845, 33038256],
         [31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
          31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
          31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
          31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
          31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
          31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
          32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
          32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
          32408298, 32430030, 32456507, 32459431, 32466755, 32472949,
          32481711, 32482816, 32486827, 32490426, 32503216, 32509635,
          32519959, 32536248, 32563451, 32583998, 32591754, 32591963,
          32613993, 32632570, 32662430, 32663269, 32716115, 32717410,
          32827728, 32834757, 32841504, 32862977, 32867937, 33038317],
         'ncbi', (31140036, 32834745), False, 2),
        ('chrX', 'NM_203473', 'PORCN', 'forward', 48367371, 48379202,
         [48367371, 48368172, 48369683, 48370280, 48370714, 48370977,
          48371223, 48372628, 48372913, 48374105, 48374278, 48374449,
          48375571, 48378763],
         [48367491, 48368344, 48369875, 48370323, 48370895, 48371107,
          48371240, 48372753, 48373013, 48374181, 48374341, 48374534,
          48375681, 48379202],
         'ncbi', (48368209, 48378864), False, 1),
        ('chrX', 'NM_000132', 'F8', 'reverse', 154064063, 154250998,
         [154064063, 154088707, 154089993, 154091358, 154124352, 154128141,
          154129646, 154130326, 154132181, 154132571, 154133086, 154134695,
          154156846, 154175973, 154182167, 154185232, 154189350, 154194245,
          154194701, 154197606, 154212962, 154215512, 154221211, 154225248,
          154227754, 154250685],
         [154066027, 154088883, 154090141, 154091502, 154124507, 154128226,
          154129717, 154130442, 154132363, 154132799, 154133298, 154134848,
          154159951, 154176182, 154182317, 154185446, 154189443, 154194416,
          154194962, 154197827, 154213078, 154215580, 154221423, 154225370,
          154227875, 154250998],
         'ncbi', (154065872, 154250827), False, 3),
        ('chr3', 'NM_000249', 'MLH1', 'forward', 37034841, 37092337,
         [37034841, 37038110, 37042446, 37045892, 37048482, 37050305,
          37053311, 37053502, 37055923, 37058997, 37061801, 37067128,
          37070275, 37081677, 37083759, 37089010, 37090008, 37090395,
          37091977],
         [37035154, 37038200, 37042544, 37045965, 37048554, 37050396,
          37053353, 37053590, 37056035, 37059090, 37061954, 37067498,
          37070423, 37081785, 37083822, 37089174, 37090100, 37090508,
          37092337],
         'ncbi', (37035039, 37092144), False, 3),
    ]

    for (chromosome_name, accession, gene, orientation, start, stop,
         exon_starts, exon_stops, source, cds, select_transcript,
         version) in rows:
        # Look up the chromosome for every row, right before building and
        # adding its mapping.
        chromosome = hg19.chromosomes.filter_by(name=chromosome_name).one()
        db.session.add(
            TranscriptMapping(chromosome,
                              'refseq',
                              accession,
                              gene,
                              orientation,
                              start,
                              stop,
                              exon_starts,
                              exon_stops,
                              source,
                              transcript=1,
                              cds=cds,
                              select_transcript=select_transcript,
                              version=version))

    db.session.commit()