Ejemplo n.º 1
0
 def __init__(self, cache_folder, genome_build):
     """ obtain the sequence for a transcript from ensembl
     
     Args:
         cache_folder: path to folder for caching data requested from Ensembl
         genome_build: string indicating the genome build ("grch37" or "grch38")
     """
     
     self.cache = EnsemblCache(cache_folder, genome_build)
     
     self.prior_time = time.time() - 1
     self.rate_limit = 0.067
     
     server_dict = {"grch37": "grch37.", "grch38": ""}
     
     self.server = "http://{}rest.ensembl.org".format(server_dict[genome_build])
     
     self.check_ensembl_api_version()
Ejemplo n.º 2
0
 def __init__(self, cache_folder, genome_build):
     """ obtain the sequence for a transcript from ensembl
     
     Args:
         cache_folder: path to folder for caching data requested from Ensembl
         genome_build: string indicating the genome build ("grch37" or "grch38")
     """
     
     self.cache = EnsemblCache(cache_folder, genome_build)
     
     self.prior_time = time.time() - 1
     self.rate_limit = 0.067
     
     server_dict = {"grch37": "grch37.", "grch38": ""}
     
     self.server = "http://{}rest.ensembl.org".format(server_dict[genome_build])
     
     self.check_ensembl_api_version()
Ejemplo n.º 3
0
def ensembl_cache(folder=None):
    ''' store/retrive repeated ensembl requests from a persistent sqlite cache
    '''
    if folder is None:
        folder = Path.home() / '.cache' / 'ensembl'
    cache = EnsemblCache(str(folder))

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            url = args[1]
            if 'rest.ensembl.org' in url:
                cached = cache.get_cached_data(url)
                if cached is not None:
                    return cached
                data = await func(*args, **kwargs)
                cache.cache_url_data(url, data)
                return data
            else:
                return await func(*args, **kwargs)

        return wrapper

    return decorator
Ejemplo n.º 4
0
 def setUpClass(self):
     self.temp_dir = tempfile.mkdtemp()
     self.cache = EnsemblCache(self.temp_dir)
Ejemplo n.º 5
0
class TestEnsemblCachePy(unittest.TestCase):
    """ unit test the EnsemblCache class
    """
    
    @classmethod
    def setUpClass(self):
        self.temp_dir = tempfile.mkdtemp()
        self.cache = EnsemblCache(self.temp_dir)
    
    @classmethod
    def tearDownClass(self):
        shutil.rmtree(self.temp_dir)
    
    def test_get_key_from_url(self):
        """ test that get_key_from_url() works correctly
        """
        
        self.assertEqual(self.cache.get_key_from_url("http://rest.ensembl.org/info/rest"), ("info.rest", 'grch38'))
        self.assertEqual(self.cache.get_key_from_url("http://rest.ensembl.org/xrefs/symbol/homo_sapiens/ABO"), ("xrefs.symbol.homo_sapiens.ABO", 'grch38'))
        self.assertEqual(self.cache.get_key_from_url("http://rest.ensembl.org/sequence/id/ENST00000378520?type=protein"), ("sequence.id.ENST00000378520.protein", 'grch38'))
        self.assertEqual(self.cache.get_key_from_url("http://rest.ensembl.org/feature/id/ENSG00000175164?feature=transcript"), ("feature.id.ENSG00000175164.transcript", 'grch38'))
        self.assertEqual(self.cache.get_key_from_url("http://rest.ensembl.org/sequence/id/ENST00000538324?type=genomic;expand_3prime=10;expand_5prime=10"), ("sequence.id.ENST00000538324.genomic", 'grch38'))
        self.assertEqual(self.cache.get_key_from_url("http://rest.ensembl.org/sequence/id/ENST00000538324?type=cds"), ("sequence.id.ENST00000538324.cds", 'grch38'))
        self.assertEqual(self.cache.get_key_from_url("http://rest.ensembl.org/feature/id/ENST00000538324?feature=exon"), ("feature.id.ENST00000538324.exon", 'grch38'))
        self.assertEqual(self.cache.get_key_from_url("http://rest.ensembl.org/vep/human/id/rs3887873/consequences?"), ("vep.human.id.rs3887873.consequences", 'grch38'))
        self.assertEqual(self.cache.get_key_from_url("http://rest.ensembl.org/vep/human/9:22125503-22125502:1/C/consequences?"), ("vep.human.9_22125503-22125502_1.C.consequences", 'grch38'))
        self.assertEqual(self.cache.get_key_from_url("http://grch37.rest.ensembl.org/vep/human/9:22125503-22125502:1/C/consequences?"), ("vep.human.9_22125503-22125502_1.C.consequences", 'grch37'))
    
    def test_get_cached_data(self):
        """ test that get_cached_data() works correctly
        """
        
        # set up the data to go in the database
        url = "http://rest.ensembl.org/feature/id/temp1?feature=exon"
        string = b"temp_data"
        
        # check that the data is not in the database to start
        self.assertIsNone(self.cache.get_cached_data(url))
        
        # insert the data in the database
        self.cache.cache_url_data(url, string)
        
        # check that some data is now in the database
        data = self.cache.get_cached_data(url)
        self.assertIsNotNone(data)
        
        # check that the data is correct if the row is in the database
        self.assertEqual(data, string)
    
    def test_get_cached_data_old_date(self):
        """ check that the cache ignores outdated data
        """
        url = "http://rest.ensembl.org/feature/id/temp1?feature=exon"
        string = b"temp_data"
        today = datetime.today()
        long_ago = today - timedelta(days=181)
        
        # check that obsolete data returns False
        self.cache.today = long_ago
        self.cache.cache_url_data(url, string)
        self.assertIsNotNone(self.cache.get_cached_data(url))
        
        self.cache.today = today
        self.assertIsNone(self.cache.get_cached_data(url))
    
    def test_cache_url_data(self):
        """ test that cache_url_data works correctly
        """
        
        # set up the data to go in the database
        url = "http://rest.ensembl.org/feature/id/temp2?feature=exon"
        temp_data = b"temp_data"
        
        # check that the data is not in before we insert it
        self.assertIsNone(self.cache.get_cached_data(url))
        
        # insert the data, then check that it has gone in
        self.cache.cache_url_data(url, temp_data)
        self.assertIsNotNone(self.cache.get_cached_data(url))
    
    def test_cache_load(self):
        """ make sure the cache can handle a reasonable load
    
        This test uses multiple threads writing to the cache simultaneously to
        show the cache can handle the load. Failure is shown by an exception.
        """
    
        cache_dir = os.path.join(self.temp_dir, 'loading')
        os.mkdir(cache_dir)
        text = lambda l: '{:x}'.format(random.getrandbits(l * 4)).strip().encode('utf8')
        url = lambda : 'example.com/base/sub/{}'.format(text(10))
        write = lambda cache: cache.cache_url_data(url(), text(100))
        
        class Runner(Thread):
            def __init__(self, counter=100):
                super(Runner, self).__init__()
                self.counter = counter
            def run(self):
                cache = EnsemblCache(cache_dir)
                while self.counter > 0:
                    write(cache)
                    self.counter -= 1
        
        try:
            threads = [ Runner() for x in range(50) ]
            [ x.start() for x in threads ]
            [ x.join() for x in threads ]
        except:
            self.fail("EnsemblCache failed under heavy load")
Ejemplo n.º 6
0
 def run(self):
     cache = EnsemblCache(cache_dir)
     while self.counter > 0:
         write(cache)
         self.counter -= 1
Ejemplo n.º 7
0
class EnsemblRequest(object):
    """ Uses the Ensembl REST API to obtain gene information from Ensembl.
    
    Can find:
         - gene IDs for a HGNC symbol
         - transcript IDs for a gene ID
         - exon coordinates for an ensembl transcript ID
         - CDS coordinates for an ensembl transcript ID
         - transcript and genomic DNA sequences for an ensembl transcript ID
    """
    
    def __init__(self, cache_folder, genome_build):
        """ obtain the sequence for a transcript from ensembl
        
        Args:
            cache_folder: path to folder for caching data requested from Ensembl
            genome_build: string indicating the genome build ("grch37" or "grch38")
        """
        
        self.cache = EnsemblCache(cache_folder, genome_build)
        
        self.prior_time = time.time() - 1
        self.rate_limit = 0.067
        
        server_dict = {"grch37": "grch37.", "grch38": ""}
        
        self.server = "http://{}rest.ensembl.org".format(server_dict[genome_build])
        
        self.check_ensembl_api_version()
    
    def check_ensembl_api_version(self):
        """ check the ensembl api version matches a currently working version
        
        This function is included so when the api version changes, we notice the
        change, and we can manually check the responses for the new version.
        """
        
        self.attempt = 0
        headers = {"content-type": "application/json"}
        ext = "/info/rest"
        r = self.ensembl_request(ext, headers)
        
        response = json.loads(r)
        self.cache.set_ensembl_api_version(response["release"])
    
    def open_url(self, url, headers):
        """ open url with python libraries
        """
        
        data = self.cache.get_cached_data(url)
        if data is not None:
            return data, 200, headers
        
        self.rate_limit_ensembl_requests()
        req = request.Request(url, headers=headers)
        
        try:
            handler = request.urlopen(req)
        except HTTPError as error:
            # if we get a http error, we still process the status code, since a
            # later step deals with different status codes differently.
            handler = error
        except (URLError, ConnectionResetError, TimeoutError):
            # if we get a ConnectionResetError, assume something has gone wrong
            # with the server. Later code will wait before retrying.
            return '', 500, headers
        
        status_code = handler.getcode()
        response = handler.read()
        if IS_PYTHON3:
            response = response.decode("utf-8")
        
        # parse the headers into a key, value dictionary
        headers = dict(zip(map(str.lower, handler.headers.keys()), handler.headers.values()))
        
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        logging.warning("{}\t{}\t{}".format(now, status_code, url))
        
        return response, status_code, headers
    
    def ensembl_request(self, ext, headers):
        """ obtain sequence via the ensembl REST API
        """
        
        self.attempt += 1
        if self.attempt > 5:
            raise ValueError("too many attempts, figure out why its failing")
        
        response, status, requested_headers = self.open_url(self.server + ext, headers=headers)
        
        # we might end up passing too many simultaneous requests, or too many
        # requests per hour, just wait until the period is finished before
        # retrying
        if status == 429:
            if "retry-after" in requested_headers:
                time.sleep(float(requested_headers["retry-after"]))
            elif "x-ratelimit-reset" in requested_headers:
                time.sleep(int(requested_headers["x-ratelimit-reset"]))
            
            return self.ensembl_request(ext, headers)
        # retry after 30 seconds if we get service unavailable error
        elif status in [500, 503, 504]:
            time.sleep(30)
            return self.ensembl_request(ext, headers)
        elif status != 200:
            raise ValueError("Invalid Ensembl response for {}\nheaders: {}\nresponse: {}".format(\
                    self.server + ext, requested_headers, response))
        
        # sometimes ensembl returns odd data. I don't know what it is, but the
        # json interpreter can't handle it. Rather than trying to catch it,
        # simply re-request the data
        if requested_headers["content-type"] == "application/json":
            try:
                json.loads(response)
            except ValueError:
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
                logging.warning("{}\t{}\t{}\t{}\t{}".format(now,
                    status, self.server + ext,
                    "cannot obtain json output"))
                return self.ensembl_request(ext, requested_headers)
        
        self.cache.cache_url_data(self.server + ext, response)
        
        return response
    
    def get_genes_for_hgnc_id(self, hgnc_symbol):
        """ obtain the ensembl gene IDs that correspond to a HGNC symbol
        """
        
        headers = {"content-type": "application/json"}
        
        # http://grch37.rest.ensembl.org/xrefs/symbol/homo_sapiens/KMT2A?content-type=application/json
        
        self.attempt = 0
        ext = "/xrefs/symbol/homo_sapiens/{}".format(hgnc_symbol)
        r = self.ensembl_request(ext, headers)
        
        genes = []
        for item in json.loads(r):
            if item["type"] == "gene":
                genes.append(item["id"])
        
        return genes
    
    def get_previous_symbol(self, hgnc_symbol):
        """ sometimes we get HGNC symbols that do not match the ensembl rest version
        that we are currently using. We can look for earlier HGNC symbols for
        the gene using the service at rest.genenames.org
        
        Args:
            hgnc_symbol: HGNC symbol for the gene (eg "MLL2")
        
        Returns:
            list of deprecated gene symbols (eg ["KMT2A"])
        """
        
        ensembl_server = self.server
        gene_names_server = "http://rest.genenames.org"
        
        self.server = gene_names_server
        headers = {"accept": "application/json", "content-type": "application/json"}
        ext = "/fetch/symbol/{}".format(hgnc_symbol)
        try:
            r = self.ensembl_request(ext, headers)
        finally:
            self.server = ensembl_server
        
        gene_json = json.loads(r)
        
        prev_gene = []
        docs = gene_json["response"]["docs"]
        
        # strip out any gene entries that have been invalidated
        docs = [ x for x in docs if x["status"] != "Entry Withdrawn"]
        
        if len(docs) == 0:
            pass
        elif len(docs) > 1:
            raise ValueError("{0} has more than one alternate symbol, which I haven't accounted for.".format(hgnc_symbol))
        elif "prev_symbol" in docs[0]:
            prev_gene = docs[0]["prev_symbol"]
        
        return prev_gene
        
    def get_transcript_ids_for_ensembl_gene_ids(self, gene_ids, hgnc_symbols):
        """ fetch the ensembl transcript IDs for a given ensembl gene ID.
        
        Args:
            gene_ids: list of Ensembl gene IDs for the gene
            hgnc_symbols: list of possible HGNC symbols for gene
        """
        
        chroms = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", \
             "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", \
              "X", "Y"}
        
        headers = {"content-type": "application/json"}
        
        transcript_ids = []
        for gene_id in gene_ids:
            self.attempt = 0
            ext = "/overlap/id/{}?feature=transcript".format(gene_id)
            r = self.ensembl_request(ext, headers)
            
            for item in json.loads(r):
                # ignore non-coding transcripts
                if item["biotype"] not in ["protein_coding", "polymorphic_pseudogene"]:
                    continue
                
                # ignore transcripts not on the standard chromosomes
                # (non-default chroms fail to map the known de novo variants
                # to the gene location
                if item["Parent"] != gene_id or item["seq_region_name"] not in \
                        chroms or \
                        all([symbol not in item["external_name"] for symbol in hgnc_symbols]):
                    continue
                transcript_ids.append(item["id"])
        
        return transcript_ids
    
    def get_genomic_seq_for_transcript(self, transcript_id, expand):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "application/json"}
        
        self.attempt = 0
        ext = "/sequence/id/{0}?type=genomic;expand_3prime={1};expand_5prime={1}".format(transcript_id, expand)
        r = self.ensembl_request(ext, headers)
        
        gene = json.loads(r)
        
        seq = gene["seq"]
        seq_id = gene["id"]
        
        if seq_id != transcript_id:
            raise ValueError("ensembl gave the wrong transcript")
        
        desc = gene["desc"].split(":")
        chrom = desc[2]
        start = int(desc[3]) + expand
        end = int(desc[4]) - expand
        strand_temp = int(desc[5])
        
        strand = "+"
        if strand_temp == -1:
            strand = "-"
        
        return (chrom, start, end, strand, seq)
    
    def get_cds_seq_for_transcript(self, transcript_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "text/plain"}
        
        self.attempt = 0
        ext = "/sequence/id/{}?type=cds".format(transcript_id)
        
        return self.ensembl_request(ext,  headers)
    
    def get_protein_seq_for_transcript(self, transcript_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "text/plain"}
        
        self.attempt = 0
        ext = "/sequence/id/{}?type=protein".format(transcript_id)
        
        return self.ensembl_request(ext, headers)
    
    def get_genomic_seq_for_region(self, chrom, start_pos, end_pos):
        """ obtain the sequence for a genomic region
        """
        
        headers = {"content-type": "text/plain"}
        
        self.attempt = 0
        ext = "/sequence/region/human/{}:{}..{}:1".format(chrom, start_pos, end_pos)
        
        return self.ensembl_request(ext, headers)
    
    def get_chrom_for_transcript(self, transcript_id, hgnc_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "application/json"}
        
        self.attempt = 0
        ext = "/overlap/id/{}?feature=gene".format(transcript_id)
        r =  self.ensembl_request(ext, headers)
        
        for gene in json.loads(r):
            if gene["external_name"] == hgnc_id:
                return gene["seq_region_name"]
        
        return None
    
    def get_exon_ranges_for_transcript(self, transcript_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "application/json"}
        
        self.attempt = 0
        ext = "/overlap/id/{}?feature=exon".format(transcript_id)
        r = self.ensembl_request(ext, headers)
        
        exon_ranges = []
        for exon in json.loads(r):
            if exon["Parent"] != transcript_id:
                continue
            
            start = exon["start"]
            end = exon["end"]
            
            exon_ranges.append((start, end))
        
        return exon_ranges
    
    def get_cds_ranges_for_transcript(self, transcript_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "application/json"}
        
        self.attempt = 0
        ext = "/overlap/id/{}?feature=cds".format(transcript_id)
        r = self.ensembl_request(ext, headers)
        
        cds_ranges = []
        for cds_range in json.loads(r):
            if cds_range["Parent"] != transcript_id:
                continue
            
            start = cds_range["start"]
            end = cds_range["end"]
            
            cds_ranges.append((start, end))
        
        return cds_ranges
    
    def rate_limit_ensembl_requests(self):
        """ limit ensembl requests to one per 0.067 s
        """
        
        current_time = time.time()
        diff_time = current_time - self.prior_time
        
        # sleep until the current rate limit period is finished
        if diff_time < self.rate_limit:
            time.sleep(self.rate_limit - diff_time)
        
        # set the access time to now, for later checks
        self.prior_time = time.time()
Ejemplo n.º 8
0
class EnsemblRequest(object):
    """ Uses the Ensembl REST API to obtain gene information from Ensembl.
    
    Can find:
         - gene IDs for a HGNC symbol
         - transcript IDs for a gene ID
         - exon coordinates for an ensembl transcript ID
         - CDS coordinates for an ensembl transcript ID
         - transcript and genomic DNA sequences for an ensembl transcript ID
    """
    
    def __init__(self, cache_folder, genome_build):
        """ obtain the sequence for a transcript from ensembl
        
        Args:
            cache_folder: path to folder for caching data requested from Ensembl
            genome_build: string indicating the genome build ("grch37" or "grch38")
        """
        
        self.cache = EnsemblCache(cache_folder, genome_build)
        
        self.prior_time = time.time() - 1
        self.rate_limit = 0.067
        
        server_dict = {"grch37": "grch37.", "grch38": ""}
        
        self.server = "http://{}rest.ensembl.org".format(server_dict[genome_build])
        
        self.check_ensembl_api_version()
    
    def check_ensembl_api_version(self):
        """ check the ensembl api version matches a currently working version
        
        This function is included so when the api version changes, we notice the
        change, and we can manually check the responses for the new version.
        """
        
        self.attempt = 0
        headers = {"content-type": "application/json"}
        ext = "/info/rest"
        r = self.ensembl_request(ext, headers)
        
        response = json.loads(r)
        release = response["release"].split(".")
        self.cache.set_ensembl_api_version(response["release"])
    
    def open_url(self, url, headers):
        """ open url with python libraries
        """
        
        data = self.cache.get_cached_data(url)
        if data is not None:
            return data, 200, headers
        
        self.rate_limit_ensembl_requests()
        req = request.Request(url, headers=headers)
        
        try:
            handler = request.urlopen(req)
        except HTTPError as error:
            # if we get a http error, we still process the status code, since a
            # later step deals with different status codes differently.
            handler = error
        except (URLError, ConnectionResetError, TimeoutError):
            # if we get a ConnectionResetError, assume something has gone wrong
            # with the server. Later code will wait before retrying.
            return '', 500, headers
        
        status_code = handler.getcode()
        response = handler.read()
        if IS_PYTHON3:
            response = response.decode("utf-8")
        
        # parse the headers into a key, value dictionary
        headers = dict(zip(map(str.lower, handler.headers.keys()), handler.headers.values()))
        
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        logging.warning("{}\t{}\t{}".format(now, status_code, url))
        
        return response, status_code, headers
    
    def ensembl_request(self, ext, headers):
        """ obtain sequence via the ensembl REST API
        """
        
        self.attempt += 1
        if self.attempt > 5:
            raise ValueError("too many attempts, figure out why its failing")
        
        response, status, requested_headers = self.open_url(self.server + ext, headers=headers)
        
        # we might end up passing too many simultaneous requests, or too many
        # requests per hour, just wait until the period is finished before
        # retrying
        if status == 429:
            if "retry-after" in requested_headers:
                time.sleep(float(requested_headers["retry-after"]))
            elif "x-ratelimit-reset" in requested_headers:
                time.sleep(int(requested_headers["x-ratelimit-reset"]))
            
            return self.ensembl_request(ext, headers)
        # retry after 30 seconds if we get service unavailable error
        elif status in [500, 503, 504]:
            time.sleep(30)
            return self.ensembl_request(ext, headers)
        elif status != 200:
            raise ValueError("Invalid Ensembl response: {} for {}.\nSubmitted URL was: {}{}\nheaders: {}\nresponse: {}".format(status, sequence_id, \
                    self.server, ext, requested_headers, response))
        
        # sometimes ensembl returns odd data. I don't know what it is, but the
        # json interpreter can't handle it. Rather than trying to catch it,
        # simply re-request the data
        if requested_headers["content-type"] == "application/json":
            try:
                json.loads(response)
            except ValueError:
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
                logging.warning("{}\t{}\t{}\t{}\t{}".format(now,
                    status, self.server + ext,
                    "cannot obtain json output"))
                return self.ensembl_request(ext, requested_headers)
        
        self.cache.cache_url_data(self.server + ext, response)
        
        return response
    
    def get_genes_for_hgnc_id(self, hgnc_symbol):
        """ obtain the ensembl gene IDs that correspond to a HGNC symbol
        """
        
        headers = {"content-type": "application/json"}
        
        # http://grch37.rest.ensembl.org/xrefs/symbol/homo_sapiens/KMT2A?content-type=application/json
        
        self.attempt = 0
        ext = "/xrefs/symbol/homo_sapiens/{}".format(hgnc_symbol)
        r = self.ensembl_request(ext, headers)
        
        genes = []
        for item in json.loads(r):
            if item["type"] == "gene":
                genes.append(item["id"])
        
        return genes
    
    def get_previous_symbol(self, hgnc_symbol):
        """ sometimes we get HGNC symbols that do not match the ensembl rest version
        that we are currently using. We can look for earlier HGNC symbols for
        the gene using the service at rest.genenames.org
        
        Args:
            hgnc_symbol: HGNC symbol for the gene (eg "MLL2")
        
        Returns:
            list of deprecated gene symbols (eg ["KMT2A"])
        """
        
        ensembl_server = self.server
        gene_names_server = "http://rest.genenames.org"
        
        self.server = gene_names_server
        headers = {"accept": "application/json"}
        ext = "/fetch/symbol/{}".format(hgnc_symbol)
        try:
            r = self.ensembl_request(ext, headers)
        finally:
            self.server = ensembl_server
        
        gene_json = json.loads(r)
        
        prev_gene = []
        docs = gene_json["response"]["docs"]
        
        # strip out any gene entries that have been invalidated
        docs = [ x for x in docs if x["status"] != "Entry Withdrawn"]
        
        if len(docs) == 0:
            pass
        elif len(docs) > 1:
            raise ValueError("{0} has more than one alternate symbol, which I haven't accounted for.".format(hgnc_symbol))
        elif "prev_symbol" in docs[0]:
            prev_gene = docs[0]["prev_symbol"]
        
        return prev_gene
        
    def get_transcript_ids_for_ensembl_gene_ids(self, gene_ids, hgnc_symbols):
        """ fetch the ensembl transcript IDs for a given ensembl gene ID.
        
        Args:
            gene_ids: list of Ensembl gene IDs for the gene
            hgnc_symbols: list of possible HGNC symbols for gene
        """
    
        chroms = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", \
             "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", \
              "X", "Y"}
        
        headers = {"content-type": "application/json"}
        
        transcript_ids = []
        for gene_id in gene_ids:
            self.attempt = 0
            ext = "/overlap/id/{}?feature=transcript".format(gene_id)
            r = self.ensembl_request(ext, headers)
            
            for item in json.loads(r):
                # ignore non-coding transcripts
                if item["biotype"] not in ["protein_coding", "polymorphic_pseudogene"]:
                    continue
                
                # ignore transcripts not on the standard chromosomes
                # (non-default chroms fail to map the known de novo variants
                # to the gene location
                if item["Parent"] != gene_id or item["seq_region_name"] not in \
                        chroms or \
                        all([symbol not in item["external_name"] for symbol in hgnc_symbols]):
                    continue
                transcript_ids.append(item["id"])
        
        return transcript_ids
    
    def get_genomic_seq_for_transcript(self, transcript_id, expand):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "application/json"}
        
        self.attempt = 0
        ext = "/sequence/id/{0}?type=genomic;expand_3prime={1};expand_5prime={1}".format(transcript_id, expand)
        r = self.ensembl_request(ext, headers)
        
        gene = json.loads(r)
        
        seq = gene["seq"]
        seq_id = gene["id"]
        
        if seq_id != transcript_id:
            raise ValueError("ensembl gave the wrong transcript")
        
        desc = gene["desc"].split(":")
        chrom = desc[2]
        start = int(desc[3]) + expand
        end = int(desc[4]) - expand
        strand_temp = int(desc[5])
        
        strand = "+"
        if strand_temp == -1:
            strand = "-"
        
        return (chrom, start, end, strand, seq)
    
    def get_cds_seq_for_transcript(self, transcript_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "text/plain"}
        
        self.attempt = 0
        ext = "/sequence/id/{}?type=cds".format(transcript_id)
        
        return self.ensembl_request(ext,  headers)
    
    def get_protein_seq_for_transcript(self, transcript_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "text/plain"}
        
        self.attempt = 0
        ext = "/sequence/id/{}?type=protein".format(transcript_id)
        
        return self.ensembl_request(ext, headers)
    
    def get_genomic_seq_for_region(self, chrom, start_pos, end_pos):
        """ obtain the sequence for a genomic region
        """
        
        headers = {"content-type": "text/plain"}
        
        self.attempt = 0
        ext = "/sequence/region/human/{}:{}..{}:1".format(chrom, start_pos, end_pos)
        
        return self.ensembl_request(ext, headers)
    
    def get_chrom_for_transcript(self, transcript_id, hgnc_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "application/json"}
        
        self.attempt = 0
        ext = "/overlap/id/{}?feature=gene".format(transcript_id)
        r =  self.ensembl_request(ext, headers)
        
        for gene in json.loads(r):
            if gene["external_name"] == hgnc_id:
                return gene["seq_region_name"]
        
        return None
    
    def get_exon_ranges_for_transcript(self, transcript_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "application/json"}
        
        self.attempt = 0
        ext = "/overlap/id/{}?feature=exon".format(transcript_id)
        r = self.ensembl_request(ext, headers)
        
        exon_ranges = []
        for exon in json.loads(r):
            if exon["Parent"] != transcript_id:
                continue
            
            start = exon["start"]
            end = exon["end"]
            
            exon_ranges.append((start, end))
        
        return exon_ranges
    
    def get_cds_ranges_for_transcript(self, transcript_id):
        """ obtain the sequence for a transcript from ensembl
        """
        
        headers = {"content-type": "application/json"}
        
        self.attempt = 0
        ext = "/overlap/id/{}?feature=cds".format(transcript_id)
        r = self.ensembl_request(ext, headers)
        
        cds_ranges = []
        for cds_range in json.loads(r):
            if cds_range["Parent"] != transcript_id:
                continue
            
            start = cds_range["start"]
            end = cds_range["end"]
            
            cds_ranges.append((start, end))
        
        return cds_ranges
    
    def rate_limit_ensembl_requests(self):
        """ limit ensembl requests to one per 0.067 s
        """
        
        current_time = time.time()
        diff_time = current_time - self.prior_time
        
        # sleep until the current rate limit period is finished
        if diff_time < self.rate_limit:
            time.sleep(self.rate_limit - diff_time)
        
        # set the access time to now, for later checks
        self.prior_time = time.time()
Ejemplo n.º 9
0
 def setUpClass(self):
     self.temp_dir = tempfile.mkdtemp()
     self.cache = EnsemblCache(self.temp_dir, "grch37")
     self.cache.set_ensembl_api_version("3.0.0")
Ejemplo n.º 10
0
 def run(self):
     cache = EnsemblCache(cache_dir, 'grch37')
     cache.set_ensembl_api_version('6.0')
     while self.counter > 0:
         write(cache)
         self.counter -= 1