def get_all_gene_ids(self):
    """
    List of all gene IDs (arbitrary order) that we consider in xbrowse.

    Reads every (gene.stable_id, seq_region.name) pair from the database and
    keeps the gene IDs whose seq_region name is not mapped to None by
    ensembl_parsing_utils.get_chr_from_seq_region_name().

    Per the original notes, this does *not* include genes that are:
    - not ensembl genes
    - not on a standard chromosome (in genomeloc)

    Returns:
        list of gene stable IDs, in the order the cursor yielded them.
    """
    cursor = self.db_conn.cursor()
    try:
        cursor.execute("select gene.stable_id, seq_region.name from gene "
                       "join seq_region on gene.seq_region_id=seq_region.seq_region_id")
        # The list is fully built here, before `finally` closes the cursor.
        return [
            row[0] for row in cursor
            if ensembl_parsing_utils.get_chr_from_seq_region_name(row[1]) is not None
        ]
    finally:
        # Release the cursor even when the query or the filtering raises.
        cursor.close()
def get_all_exons(self):
    """
    Get a list of all exons (order not guaranteed) from ensembl.
    Fetched from database, not REST.

    Rows whose seq_region name is mapped to None by
    ensembl_parsing_utils.get_chr_from_seq_region_name() are skipped.

    Returns:
        list of dicts, one per kept exon, with keys:
        - 'exon_id': exon.stable_id
        - 'xstart': genomeloc.get_single_location(chrom, seq_region_start)
        - 'xstop': genomeloc.get_single_location(chrom, seq_region_end)
    """
    cursor = self.db_conn.cursor()
    try:
        cursor.execute("select exon.stable_id, seq_region.name, exon.seq_region_start, exon.seq_region_end from exon "
                       "join seq_region on exon.seq_region_id=seq_region.seq_region_id")
        exons = []
        for exon_id, seq_region_name, start, stop in cursor:
            # `chrom` rather than `chr`, so the builtin chr() is not shadowed.
            chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(seq_region_name)
            if chrom is None:
                continue
            exon = dict(exon_id=exon_id)
            exon['xstart'] = genomeloc.get_single_location(chrom, start)
            exon['xstop'] = genomeloc.get_single_location(chrom, stop)
            exons.append(exon)
        return exons
    finally:
        # Release the cursor even when the query or row processing raises.
        cursor.close()
def get_gene_structure(self, gene_id):
    """
    Query ensembl API for the transcript/exon structure of a gene.
    This is the foundation of the elements in db.genes.

    Makes one REST request each for the gene, its transcripts, its exons and
    its CDS, plus one exon request per transcript.

    Args:
        gene_id: ensembl gene ID to look up.

    Returns:
        dict with keys 'chr', 'start', 'stop', 'xstart', 'xstop', 'gene_id',
        'symbol', 'description', 'biotype', 'transcripts', 'exons', 'cds'.
        - 'transcripts': list of dicts (transcript_id, biotype, start, stop,
          xstart, xstop, exons), where 'exons' is the list of that
          transcript's exon IDs sorted by start.
        - 'exons': list of dicts (exon_id, start, stop, xstart, xstop), one
          per distinct exon ID that belongs to one of the gene's transcripts.
        - 'cds': list of dicts (start, stop, xstart, xstop, transcripts,
          cds_id), one per distinct (start, stop) pair, sorted by
          (start, stop); cds_id is '<gene_id>-<1-based index>'.

    Raises:
        Exception: if zero or more than one gene matches gene_id, or the
            gene is on a nonstandard chromosome.
        requests.HTTPError: if a REST request returns an error status.
    """
    # NOTE(review): another definition named get_gene_structure follows this
    # one in the file; if both live in the same class the later one wins.
    # Confirm and remove one of them.

    def fetch_features(feature_id, feature_type):
        # One REST round trip: features of `feature_type` reported for `feature_id`.
        url = self._get_rest_url() + '/feature/id/%s' % feature_id
        params = {'content-type': 'application/json', 'feature': feature_type}
        response = requests.get(url, params=params)
        # Fail here on an HTTP error rather than on a confusing
        # TypeError/KeyError while picking apart an error payload.
        response.raise_for_status()
        return response.json()

    # gene basics
    matching_genes = [item for item in fetch_features(gene_id, 'gene') if item['ID'] == gene_id]
    if not matching_genes:
        raise Exception("No genes with ID %s" % gene_id)
    if len(matching_genes) > 1:
        raise Exception(">1 ensembl genes with ID %s" % gene_id)
    gene_json = matching_genes[0]

    seq_region_name = gene_json['seq_region_name']
    chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(seq_region_name)
    if chrom is None:
        # Report the raw seq_region name: `chrom` is None on this path, so
        # formatting it would never identify the offending region.
        raise Exception("Gene %s is on a nonstandard chromosome: %s" % (gene_id, seq_region_name))

    gene = {}
    gene['chr'] = chrom
    gene['start'] = gene_json['start']
    gene['stop'] = gene_json['end']
    gene['xstart'] = genomeloc.get_single_location(chrom, gene['start'])
    gene['xstop'] = genomeloc.get_single_location(chrom, gene['stop'])
    gene['gene_id'] = gene_json['ID']
    gene['symbol'] = gene_json['external_name']
    gene['description'] = gene_json['description']
    gene['biotype'] = gene_json['biotype']

    # transcripts
    transcript_json = [t for t in fetch_features(gene_id, 'transcript') if t['Parent'] == gene_id]
    gene['transcripts'] = []
    for t in transcript_json:
        transcript_id = t['ID']
        transcript = dict(
            transcript_id=transcript_id,
            biotype=t['biotype'],
            start=t['start'],
            stop=t['end'],
        )
        transcript['xstart'] = genomeloc.get_single_location(chrom, transcript['start'])
        transcript['xstop'] = genomeloc.get_single_location(chrom, transcript['stop'])

        # exons_for_transcript: IDs only, ordered by start coordinate
        transcript_exon_json = fetch_features(transcript_id, 'exon')
        transcript['exons'] = [
            e['ID']
            for e in sorted(transcript_exon_json, key=lambda x: x['start'])
            if e['Parent'] == transcript_id
        ]
        gene['transcripts'].append(transcript)

    # exons
    exon_json = fetch_features(gene_id, 'exon')
    transcript_ids = {t['transcript_id'] for t in gene['transcripts']}
    exon_ids_seen = set()
    gene['exons'] = []
    for e in exon_json:
        exon_id = e['ID']
        # skip exons that aren't actually in one of this gene's transcripts
        if e['Parent'] not in transcript_ids:
            continue
        # the same exon is listed once per parent transcript; keep the first
        if exon_id in exon_ids_seen:
            continue
        exon = {
            'exon_id': exon_id,
            'start': e['start'],
            'stop': e['end'],
        }
        exon['xstart'] = genomeloc.get_single_location(chrom, exon['start'])
        exon['xstop'] = genomeloc.get_single_location(chrom, exon['stop'])
        gene['exons'].append(exon)
        exon_ids_seen.add(exon_id)

    # cds
    cds_json = fetch_features(gene_id, 'cds')
    cds_map = {}  # map from (start, stop) -> {start, stop, xstart, xstop, transcripts}
    for c in cds_json:
        # skip cds that aren't actually in one of this gene's transcripts
        if c['Parent'] not in transcript_ids:
            continue
        cds_t = (c['start'], c['end'])
        if cds_t not in cds_map:
            cds_map[cds_t] = {
                'start': c['start'],
                'stop': c['end'],
                'xstart': genomeloc.get_single_location(chrom, c['start']),
                'xstop': genomeloc.get_single_location(chrom, c['end']),
                'transcripts': [],
            }
        cds_map[cds_t]['transcripts'].append(c['Parent'])
    gene['cds'] = sorted(cds_map.values(), key=lambda x: (x['start'], x['stop']))
    # cds IDs are 1-based positions in the sorted list
    for i, cds in enumerate(gene['cds']):
        cds['cds_id'] = '%s-%i' % (gene['gene_id'], i + 1)

    return gene
def get_gene_structure(self, gene_id):
    """
    Query ensembl API for the transcript/exon structure of a gene.
    This is the foundation of the elements in db.genes.

    Args:
        gene_id: ensembl gene ID to look up.

    Returns:
        dict describing the gene, with keys 'chr', 'start', 'stop', 'xstart',
        'xstop', 'gene_id', 'symbol', 'description', 'biotype',
        'transcripts', 'exons' and 'cds'. Every 'xstart'/'xstop' value is
        genomeloc.get_single_location(chromosome, position) for the matching
        'start'/'stop'.
        - 'transcripts': one dict per transcript whose Parent is gene_id,
          including 'exons', that transcript's exon IDs sorted by start.
        - 'exons': one dict per distinct exon ID whose Parent is one of
          those transcripts.
        - 'cds': one dict per distinct (start, stop) pair, sorted by
          (start, stop), listing the parent 'transcripts' and a 'cds_id' of
          the form '<gene_id>-<1-based index>'.

    Raises:
        Exception: if zero or more than one gene matches gene_id, or the
            gene is on a nonstandard chromosome.
        requests.HTTPError: if a REST request returns an error status.
    """
    # NOTE(review): an earlier definition named get_gene_structure precedes
    # this one in the file; if both live in the same class this one shadows
    # it. Confirm and remove one of them.

    def rest_features(feature_id, feature):
        # GET <rest url>/feature/id/<feature_id>?feature=<feature> as parsed JSON.
        reply = requests.get(
            self._get_rest_url() + '/feature/id/%s' % feature_id,
            params={'content-type': 'application/json', 'feature': feature},
        )
        # Surface HTTP failures directly instead of tripping over the error
        # payload further down.
        reply.raise_for_status()
        return reply.json()

    # gene basics
    candidates = [item for item in rest_features(gene_id, 'gene') if item['ID'] == gene_id]
    if not candidates:
        raise Exception("No genes with ID %s" % gene_id)
    if len(candidates) > 1:
        raise Exception(">1 ensembl genes with ID %s" % gene_id)
    gene_json = candidates[0]

    region = gene_json['seq_region_name']
    chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(region)
    if chrom is None:
        # Name the region that failed to map; the mapped value is None here
        # and would make the message useless.
        raise Exception("Gene %s is on a nonstandard chromosome: %s" % (gene_id, region))

    def xpos(position):
        # Position on this gene's chromosome in genomeloc's single-number form.
        return genomeloc.get_single_location(chrom, position)

    gene = {
        'chr': chrom,
        'start': gene_json['start'],
        'stop': gene_json['end'],
    }
    gene['xstart'] = xpos(gene['start'])
    gene['xstop'] = xpos(gene['stop'])
    gene['gene_id'] = gene_json['ID']
    gene['symbol'] = gene_json['external_name']
    gene['description'] = gene_json['description']
    gene['biotype'] = gene_json['biotype']

    # transcripts
    own_transcripts = [t for t in rest_features(gene_id, 'transcript') if t['Parent'] == gene_id]
    gene['transcripts'] = []
    for t in own_transcripts:
        transcript_id = t['ID']
        transcript = {
            'transcript_id': transcript_id,
            'biotype': t['biotype'],
            'start': t['start'],
            'stop': t['end'],
        }
        transcript['xstart'] = xpos(transcript['start'])
        transcript['xstop'] = xpos(transcript['stop'])

        # exons_for_transcript: sort by start first, then keep only this
        # transcript's own exons
        by_start = sorted(rest_features(transcript_id, 'exon'), key=lambda x: x['start'])
        transcript['exons'] = [e['ID'] for e in by_start if e['Parent'] == transcript_id]
        gene['transcripts'].append(transcript)

    transcript_ids = {t['transcript_id'] for t in gene['transcripts']}

    # exons
    exon_ids_seen = set()
    gene['exons'] = []
    for e in rest_features(gene_id, 'exon'):
        exon_id = e['ID']
        # skip exons that aren't actually in one of this gene's transcripts,
        # and exons already recorded via another transcript
        if e['Parent'] not in transcript_ids or exon_id in exon_ids_seen:
            continue
        exon = {
            'exon_id': exon_id,
            'start': e['start'],
            'stop': e['end'],
        }
        exon['xstart'] = xpos(exon['start'])
        exon['xstop'] = xpos(exon['stop'])
        gene['exons'].append(exon)
        exon_ids_seen.add(exon_id)

    # cds
    cds_map = {}  # map from (start, stop) -> {start, stop, xstart, xstop, transcripts}
    for c in rest_features(gene_id, 'cds'):
        # skip cds that aren't actually in one of this gene's transcripts
        if c['Parent'] not in transcript_ids:
            continue
        span = (c['start'], c['end'])
        if span not in cds_map:
            cds_map[span] = {
                'start': c['start'],
                'stop': c['end'],
                'xstart': xpos(c['start']),
                'xstop': xpos(c['end']),
                'transcripts': [],
            }
        cds_map[span]['transcripts'].append(c['Parent'])

    gene['cds'] = sorted(cds_map.values(), key=lambda x: (x['start'], x['stop']))
    # number the cds 1..N in sorted order
    for i, cds in enumerate(gene['cds']):
        cds['cds_id'] = '%s-%i' % (gene['gene_id'], i + 1)

    return gene