Esempio n. 1
0
 def get_all_gene_ids(self):
     """
     List of all gene IDs (arbitrary order) that we consider in xbrowse.

     Reads every (gene.stable_id, seq_region.name) pair from the database and
     keeps only the genes whose seq_region name is accepted by
     ensembl_parsing_utils.get_chr_from_seq_region_name (i.e. it does not
     return None). Per the original notes this is meant to exclude genes that
     are not on a standard chromosome.

     Returns: list of gene stable_id values
     """
     cursor = self.db_conn.cursor()
     try:
         cursor.execute("select gene.stable_id, seq_region.name from gene "
                        "join seq_region on gene.seq_region_id=seq_region.seq_region_id")
         # the list is fully built before the cursor is closed in `finally`
         return [
             row[0] for row in cursor
             if ensembl_parsing_utils.get_chr_from_seq_region_name(row[1]) is not None
         ]
     finally:
         # release the cursor even if the query or the chromosome lookup raises
         cursor.close()
Esempio n. 2
0
 def get_all_gene_ids(self):
     """
     List of all gene IDs (arbitrary order) that we consider in xbrowse.

     A gene is included only when
     ensembl_parsing_utils.get_chr_from_seq_region_name returns a non-None
     value for the name of the seq_region the gene is joined to; all other
     genes are left out of the result.

     Returns: list of gene stable_id values
     """
     cursor = self.db_conn.cursor()
     try:
         cursor.execute("select gene.stable_id, seq_region.name from gene "
                        "join seq_region on gene.seq_region_id=seq_region.seq_region_id")
         gene_ids = []
         for row in cursor:
             # row is (stable_id, seq_region name)
             if ensembl_parsing_utils.get_chr_from_seq_region_name(row[1]) is not None:
                 gene_ids.append(row[0])
         return gene_ids
     finally:
         # always close the cursor so it is not leaked on success or on error
         cursor.close()
Esempio n. 3
0
 def get_all_exons(self):
     """
     Get a list of all exons (order not guaranteed) from ensembl.
     Fetched from the database, not REST.

     Exons whose seq_region name is rejected by
     ensembl_parsing_utils.get_chr_from_seq_region_name (returns None)
     are skipped.

     Returns: list of dicts with keys 'exon_id', 'xstart' and 'xstop', where
     xstart/xstop are genomeloc.get_single_location applied to the exon's
     seq_region_start/seq_region_end.
     """
     cursor = self.db_conn.cursor()
     try:
         cursor.execute("select exon.stable_id, seq_region.name, exon.seq_region_start, exon.seq_region_end from exon "
                        "join seq_region on exon.seq_region_id=seq_region.seq_region_id")
         exons = []
         for row in cursor:
             # `chrom` rather than `chr` to avoid shadowing the builtin
             chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(row[1])
             if chrom is None:
                 continue
             exons.append({
                 'exon_id': row[0],
                 'xstart': genomeloc.get_single_location(chrom, row[2]),
                 'xstop': genomeloc.get_single_location(chrom, row[3]),
             })
         return exons
     finally:
         # release the cursor even if the query or a location lookup raises
         cursor.close()
Esempio n. 4
0
 def get_all_exons(self):
     """
     Get a list of all exons (order not guaranteed) from ensembl.
     Fetched from the database, not REST.

     Rows for which ensembl_parsing_utils.get_chr_from_seq_region_name
     returns None are dropped.

     Returns: list of dicts, each with 'exon_id' (exon.stable_id) plus
     'xstart' and 'xstop' computed by genomeloc.get_single_location from the
     chromosome and the exon's start/end columns.
     """
     cursor = self.db_conn.cursor()
     try:
         cursor.execute("select exon.stable_id, seq_region.name, exon.seq_region_start, exon.seq_region_end from exon "
                        "join seq_region on exon.seq_region_id=seq_region.seq_region_id")
         exons = []
         for row in cursor:
             exon_id, region_name, start, stop = row[0], row[1], row[2], row[3]
             # renamed from `chr` so the builtin is not shadowed
             chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(region_name)
             if chrom is None:
                 continue
             exon = dict(exon_id=exon_id)
             exon['xstart'] = genomeloc.get_single_location(chrom, start)
             exon['xstop'] = genomeloc.get_single_location(chrom, stop)
             exons.append(exon)
         return exons
     finally:
         # always close the cursor so it is not leaked on success or on error
         cursor.close()
Esempio n. 5
0
    def get_gene_structure(self, gene_id):
        """
        Query the ensembl REST API for the transcript/exon structure of a gene.
        This is the foundation of the elements in db.genes.

        Args:
            gene_id: ensembl gene ID to look up.

        Returns:
            dict with keys 'chr', 'start', 'stop', 'xstart', 'xstop',
            'gene_id', 'symbol', 'description', 'biotype', 'transcripts'
            (list of transcript dicts, each with its exon IDs sorted by
            start), 'exons' (unique exons belonging to the gene's
            transcripts) and 'cds' (unique (start, stop) coding segments,
            sorted, each listing the transcripts it belongs to).

        Raises:
            Exception: if the API returns zero or more than one gene with
                this ID, or if the gene's seq_region name does not map to a
                chromosome.
        """

        def fetch_features(feature_id, feature_type):
            # Every lookup hits the same endpoint and differs only in the
            # ID and the requested feature type.
            # NOTE(review): assumes the response body is a JSON list of
            # feature dicts - confirm against the REST server in use.
            url = self._get_rest_url() + '/feature/id/%s' % feature_id
            params = {
                'content-type': 'application/json',
                'feature': feature_type,
            }
            return requests.get(url, params=params).json()

        gene = {}

        # gene basics
        matching_genes = [
            item for item in fetch_features(gene_id, 'gene')
            if item['ID'] == gene_id
        ]
        if not matching_genes:
            raise Exception("No genes with ID %s" % gene_id)
        if len(matching_genes) > 1:
            raise Exception(">1 ensembl genes with ID %s" % gene_id)
        gene_json = matching_genes[0]

        seq_region_name = gene_json['seq_region_name']
        chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(
            seq_region_name)
        if chrom is None:
            # Report the raw seq_region name: the parsed chromosome is
            # always None on this path and would make the message useless.
            raise Exception("Gene %s is on a nonstandard chromosome: %s" %
                            (gene_id, seq_region_name))

        gene['chr'] = chrom
        gene['start'] = gene_json['start']
        gene['stop'] = gene_json['end']
        gene['xstart'] = genomeloc.get_single_location(chrom, gene['start'])
        gene['xstop'] = genomeloc.get_single_location(chrom, gene['stop'])

        gene['gene_id'] = gene_json['ID']
        gene['symbol'] = gene_json['external_name']
        gene['description'] = gene_json['description']
        gene['biotype'] = gene_json['biotype']

        # transcripts: keep only those whose parent is this gene
        transcript_json = [
            t for t in fetch_features(gene_id, 'transcript')
            if t['Parent'] == gene_id
        ]

        gene['transcripts'] = []
        for t in transcript_json:
            transcript_id = t['ID']
            transcript = dict(transcript_id=transcript_id,
                              biotype=t['biotype'],
                              start=t['start'],
                              stop=t['end'])
            transcript['xstart'] = genomeloc.get_single_location(
                chrom, transcript['start'])
            transcript['xstop'] = genomeloc.get_single_location(
                chrom, transcript['stop'])

            # exon IDs for this transcript, ordered by start coordinate
            transcript_exon_json = fetch_features(transcript_id, 'exon')
            transcript['exons'] = [
                e['ID']
                for e in sorted(transcript_exon_json, key=lambda x: x['start'])
                if e['Parent'] == transcript_id
            ]

            gene['transcripts'].append(transcript)

        # exons
        exon_json = fetch_features(gene_id, 'exon')

        transcript_ids = {t['transcript_id'] for t in gene['transcripts']}
        exon_ids_seen = set()
        gene['exons'] = []
        for e in exon_json:
            exon_id = e['ID']
            # skip exons that aren't actually in one of this gene's transcripts
            if e['Parent'] not in transcript_ids:
                continue
            # the same exon is listed once per parent transcript; keep the first
            if exon_id in exon_ids_seen:
                continue
            exon = {
                'exon_id': exon_id,
                'start': e['start'],
                'stop': e['end'],
            }
            exon['xstart'] = genomeloc.get_single_location(chrom, exon['start'])
            exon['xstop'] = genomeloc.get_single_location(chrom, exon['stop'])
            gene['exons'].append(exon)
            exon_ids_seen.add(exon_id)

        # cds
        cds_json = fetch_features(gene_id, 'cds')

        cds_map = {}  # map from (start, stop) -> {start, stop, transcripts}
        for c in cds_json:
            # skip cds that aren't actually in one of this gene's transcripts
            if c['Parent'] not in transcript_ids:
                continue
            cds_t = (c['start'], c['end'])
            if cds_t not in cds_map:
                cds_map[cds_t] = {
                    'start': c['start'],
                    'stop': c['end'],
                    'xstart': genomeloc.get_single_location(chrom, c['start']),
                    'xstop': genomeloc.get_single_location(chrom, c['end']),
                    'transcripts': [],
                }
            cds_map[cds_t]['transcripts'].append(c['Parent'])
        gene['cds'] = sorted(cds_map.values(),
                             key=lambda x: (x['start'], x['stop']))
        # cds IDs are 1-based positions in the sorted list
        for i, cds in enumerate(gene['cds']):
            cds['cds_id'] = '%s-%i' % (gene['gene_id'], i + 1)
        return gene
Esempio n. 6
0
    def get_gene_structure(self, gene_id):
        """
        Query the ensembl REST API for the transcript/exon structure of a gene.
        This is the foundation of the elements in db.genes.

        Args:
            gene_id: ensembl gene ID to look up.

        Returns:
            dict describing the gene: 'chr', 'start', 'stop', 'xstart', 'xstop', 'gene_id', 'symbol',
            'description', 'biotype', plus three lists:
              - 'transcripts': one dict per transcript whose Parent is this gene, including its exon IDs
                sorted by start
              - 'exons': each distinct exon that belongs to one of those transcripts
              - 'cds': distinct (start, stop) coding segments sorted by position, each with the list of
                transcripts that contain it and a generated 'cds_id'

        Raises:
            Exception: if no gene or more than one gene matches gene_id, or if the gene's seq_region name
                does not map to a chromosome.
        """

        def fetch_features(feature_id, feature_type):
            # All lookups share one endpoint; only the ID and feature type vary.
            # NOTE(review): assumes the response body is a JSON list of feature dicts - confirm.
            url = self._get_rest_url() + '/feature/id/%s' % feature_id
            params = {'content-type': 'application/json', 'feature': feature_type}
            return requests.get(url, params=params).json()

        gene = {}

        # gene basics
        gene_list_json = [item for item in fetch_features(gene_id, 'gene') if item['ID'] == gene_id]
        if not gene_list_json:
            raise Exception("No genes with ID %s" % gene_id)
        if len(gene_list_json) > 1:
            raise Exception(">1 ensembl genes with ID %s" % gene_id)
        gene_json = gene_list_json[0]

        seq_region_name = gene_json['seq_region_name']
        chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(seq_region_name)
        if chrom is None:
            # the parsed chromosome is always None here, so report the raw seq_region name instead
            raise Exception("Gene %s is on a nonstandard chromosome: %s" % (gene_id, seq_region_name))

        gene['chr'] = chrom
        gene['start'] = gene_json['start']
        gene['stop'] = gene_json['end']
        gene['xstart'] = genomeloc.get_single_location(chrom, gene['start'])
        gene['xstop'] = genomeloc.get_single_location(chrom, gene['stop'])

        gene['gene_id'] = gene_json['ID']
        gene['symbol'] = gene_json['external_name']
        gene['description'] = gene_json['description']
        gene['biotype'] = gene_json['biotype']

        # transcripts: keep only those whose parent is this gene
        transcript_json = [t for t in fetch_features(gene_id, 'transcript') if t['Parent'] == gene_id]

        gene['transcripts'] = []
        for t in transcript_json:
            transcript_id = t['ID']
            transcript = dict(
                transcript_id=transcript_id,
                biotype=t['biotype'],
                start=t['start'],
                stop=t['end']
            )
            transcript['xstart'] = genomeloc.get_single_location(chrom, transcript['start'])
            transcript['xstop'] = genomeloc.get_single_location(chrom, transcript['stop'])

            # exon IDs for this transcript, ordered by start coordinate
            transcript_exon_json = fetch_features(transcript_id, 'exon')
            transcript['exons'] = [
                e['ID'] for e in sorted(transcript_exon_json, key=lambda x: x['start']) if e['Parent'] == transcript_id
            ]

            gene['transcripts'].append(transcript)

        # exons
        exon_json = fetch_features(gene_id, 'exon')

        transcript_ids = {t['transcript_id'] for t in gene['transcripts']}
        exon_ids_seen = set()
        gene['exons'] = []
        for e in exon_json:
            exon_id = e['ID']
            # skip exons that aren't actually in one of this gene's transcripts
            if e['Parent'] not in transcript_ids:
                continue
            # an exon shared by several transcripts appears more than once; record it only the first time
            if exon_id in exon_ids_seen:
                continue
            exon = {
                'exon_id': exon_id,
                'start': e['start'],
                'stop': e['end'],
            }
            exon['xstart'] = genomeloc.get_single_location(chrom, exon['start'])
            exon['xstop'] = genomeloc.get_single_location(chrom, exon['stop'])
            gene['exons'].append(exon)
            exon_ids_seen.add(exon_id)

        # cds
        cds_json = fetch_features(gene_id, 'cds')

        cds_map = {}  # map from (start, stop) -> {start, stop, transcripts}
        for c in cds_json:
            # skip cds that aren't actually in one of this gene's transcripts
            if c['Parent'] not in transcript_ids:
                continue
            cds_t = (c['start'], c['end'])
            if cds_t not in cds_map:
                cds_map[cds_t] = {
                    'start': c['start'],
                    'stop': c['end'],
                    'xstart': genomeloc.get_single_location(chrom, c['start']),
                    'xstop': genomeloc.get_single_location(chrom, c['end']),
                    'transcripts': [],
                }
            cds_map[cds_t]['transcripts'].append(c['Parent'])
        gene['cds'] = sorted(cds_map.values(), key=lambda x: (x['start'], x['stop']))
        # cds IDs are 1-based positions in the sorted list
        for i, cds in enumerate(gene['cds']):
            cds['cds_id'] = '%s-%i' % (gene['gene_id'], i+1)
        return gene