def _POST_query(self, qs, scopes):
    """Build the newline-delimited request body for a batch of query terms.

    Every term contributes two lines to the body: a literal ``{}`` line
    followed by the JSON-encoded result of ``self._POST_single_query``
    (presumably the header/body pair of a multi-search request -- confirm
    against ``_return_query_kwargs``).

    Terms are matched only against compatible scopes:

    * integer-like terms (per ``is_int``) are searched in the requested
      scopes that are listed in ``INT_FIELDS``;
    * all other terms are searched in the remaining requested scopes;
    * when no compatible scope exists, ``_POST_single_query(term=None)``
      is emitted so the term still occupies a slot in the body.

    :param qs: iterable of query terms.
    :param scopes: iterable of field names to search; when empty/``None``
        ``self.default_scopes`` is used.
    :return: the value of ``self._return_query_kwargs({'body': ...})``.
    """
    INT_FIELDS = {'entrezgene', 'retired'}
    if not scopes:
        scopes = self.default_scopes

    # The scope split does not depend on the term: compute it once instead
    # of rebuilding the sets for every element of ``qs``.
    requested = set(scopes)
    int_scopes = requested.intersection(INT_FIELDS)
    other_scopes = requested.difference(INT_FIELDS)

    _q = []
    for term in qs:
        term_is_int = is_int(term)
        if term_is_int and int_scopes:
            # A fresh list per call, as before, so the callee may mutate it.
            single = self._POST_single_query(term, scopes=list(int_scopes))
        elif not term_is_int and other_scopes:
            single = self._POST_single_query(term, scopes=list(other_scopes))
        else:
            # Term type and requested scopes are incompatible.
            single = self._POST_single_query(term=None)
        _q.extend(['{}', json.dumps(single)])

    return self._return_query_kwargs({'body': '\n'.join(_q)})
def select_species(self):
    """Download and parse the Ensembl Metazoa ``dataset_names.txt.gz`` file.

    The file is fetched over FTP from ``ENSEMBL_FTP_HOST`` for
    ``self.release`` into a temporary file, parsed with ``tab2list``
    (columns 0, 4 and 5, first line treated as header) and the temporary
    file is always removed afterwards.

    :return: list of ``[col0, col5, col4]`` rows, where the last element
        (presumably a taxonomy id -- confirm against the file layout) is
        converted to ``int``, or ``None`` when it is not an integer.
    :raises: any error raised by the FTP transfer or by parsing propagates
        to the caller.
    """
    import pprint
    import tempfile

    # mkstemp() creates the file atomically; mktemp() only returns a name
    # and leaves a window in which another process can claim it.
    fd, outfile = tempfile.mkstemp(suffix='.txt.gz')
    try:
        self.logger.info('Downloading "dataset_names.txt.gz"...')
        with os.fdopen(fd, 'wb') as out_f:
            ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
            try:
                ftp.login()
                species_file = '/pub/metazoa/release-%s/mysql/metazoa_mart_%s/dataset_names.txt.gz' % (
                    self.release, self.release)
                ftp.retrbinary("RETR " + species_file, out_f.write)
            finally:
                # Release the connection even when the transfer fails.
                ftp.close()
        self.logger.info('Done.')

        # load saved file
        self.logger.info('Parsing "dataset_names.txt.gz"...')
        species_li = []
        for row in tab2list(outfile, (0, 4, 5), header=0):
            raw_id = row[1]
            # Non-integer values become None; a parsed 0 is also mapped to
            # None, exactly as the former ``and ... or None`` expression did.
            parsed_id = (int(raw_id) or None) if is_int(raw_id) else None
            species_li.append([row[0], row[2], parsed_id])
        self.logger.info('Done.')
    finally:
        os.remove(outfile)

    # Diagnostic dump of the parsed list; this is not an error condition,
    # so it is logged at debug level.
    self.logger.debug(pprint.pformat(species_li))
    return species_li
def get_all_species(self):
    """Download and parse the Ensembl ``species.txt.gz`` production table.

    The file is fetched over FTP from ``ENSEMBL_FTP_HOST`` for
    ``self.release`` into a temporary file, parsed with ``tab2list``
    (columns 1, 2 and 7, first line treated as header) and the temporary
    file is always removed afterwards.

    :return: list of ``[db_name, common_name, taxid]`` rows; ``taxid`` is
        an ``int``, or ``None`` when the column is not an integer. Rows
        whose ``db_name`` starts with ``"mus_musculus_"`` are dropped.
    :raises: any error raised by the FTP transfer or by parsing propagates
        to the caller.
    """
    import tempfile

    # mkstemp() creates the file atomically; mktemp() only returns a name
    # and leaves a window in which another process can claim it.
    fd, outfile = tempfile.mkstemp(suffix='.txt.gz')
    try:
        self.logger.info('Downloading "species.txt.gz"...')
        with os.fdopen(fd, 'wb') as out_f:
            ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
            try:
                ftp.login()
                species_file = '/pub/release-%s/mysql/ensembl_production_%s/species.txt.gz' % (self.release, self.release)
                ftp.retrbinary("RETR " + species_file, out_f.write)
            finally:
                # Release the connection even when the transfer fails.
                ftp.close()
        self.logger.info('Done.')

        # load saved file
        self.logger.info('Parsing "species.txt.gz"...')
        species_li = []
        # db_name,common_name,taxid
        for row in tab2list(outfile, (1, 2, 7), header=0):
            raw_taxid = row[-1]
            # Non-integer values become None; a parsed 0 is also mapped to
            # None, exactly as the former ``and ... or None`` expression did.
            taxid = (int(raw_taxid) or None) if is_int(raw_taxid) else None
            record = row[:-1] + [taxid]
            # as of ensembl 87, there are also mouse strains. keep only the
            # "original" one
            if not record[0].startswith("mus_musculus_"):
                species_li.append(record)
        self.logger.info('Done.')
    finally:
        os.remove(outfile)
    return species_li
def _select_species(self):
    """
    Return a list of species to download data for.

    The species list file named by ``self.get_species_file()`` is fetched
    over FTP from ``ENSEMBL_FTP_HOST`` into a temporary file, parsed with
    ``tab2list`` (columns 0, 4 and 5, first line treated as header) and the
    temporary file is always removed afterwards.

    Each entry is a list (not a tuple):
    [[species_name1, common_name1, taxid1],
     [species_name2, common_name2, taxid2], ...]
    where ``taxid`` is an ``int``, or ``None`` when the column is not an
    integer.
    """
    import pprint
    import tempfile

    # mkstemp() creates the file atomically; mktemp() only returns a name
    # and leaves a window in which another process can claim it.
    fd, outfile = tempfile.mkstemp(suffix='.txt.gz')
    try:
        self.logger.info('Downloading Species List...')
        with os.fdopen(fd, 'wb') as out_f:
            ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
            try:
                ftp.login()
                species_file = self.get_species_file()
                ftp.retrbinary("RETR " + species_file, out_f.write)
            finally:
                # Release the connection even when the transfer fails.
                ftp.close()
        self.logger.info('Done.')

        # load saved file
        self.logger.info('Loading Species List...')
        species_li = []
        for row in tab2list(outfile, (0, 4, 5), header=0):
            raw_taxid = row[1]
            # Non-integer values become None; a parsed 0 is also mapped to
            # None, exactly as the former ``and ... or None`` expression did.
            taxid = (int(raw_taxid) or None) if is_int(raw_taxid) else None
            # Output order is (column 0, column 5, column 4).
            species_li.append([row[0], row[2], taxid])
        self.logger.info('Done.')
    finally:
        os.remove(outfile)

    self.logger.debug('\n %s', pprint.pformat(species_li))
    return species_li
def _dis_max_query(self, q):
    """Build the ``dis_max`` query dict used for a free-text term.

    When ``is_int(q)`` is true the query contains a single clause: a
    ``term`` match on ``entrezgene`` with weight 8. Otherwise it contains
    the weighted ``function_score`` clauses listed below (symbol, name
    phrase, name, unigene, refseq/accession, go and a generic
    ``query_string``).

    The dict is built directly instead of formatting ``q`` into a JSON
    string and re-parsing it: that round trip raised ``ValueError`` as soon
    as ``q`` contained a control character such as a tab or a newline.

    :param q: query text (``str``).
    :return: ``{"dis_max": {"tie_breaker": 0, "boost": 1, "queries": [...]}}``
    """
    # '"' and '\' were historically stripped because they broke the JSON
    # round trip; the stripping is kept so existing queries behave the same.
    q = q.replace('"', '').replace('\\', '')

    def _scored(clause, weight):
        # Wrap a query clause in a function_score with a constant weight.
        return {"function_score": {"query": clause, "weight": weight}}

    if is_int(q):
        queries = [_scored({"term": {"entrezgene": int(q)}}, 8)]
    else:
        queries = [
            _scored({
                "match": {
                    "symbol": {
                        "query": q,
                        "analyzer": "whitespace_lowercase"
                    }
                }
            }, 5),
            # This makes phrase match of "cyclin-dependent kinase 2"
            # appear first
            _scored({"match_phrase": {"name": q}}, 4),
            _scored({
                "match": {
                    "name": {
                        "query": q,
                        "operator": "and",
                        "analyzer": "whitespace_lowercase"
                    }
                }
            }, 3),
            _scored({
                "match": {
                    "unigene": {
                        "query": q,
                        "analyzer": "string_lowercase"
                    }
                }
            }, 1.1),
            _scored({
                "multi_match": {
                    "query": q,
                    "fields": [
                        'refseq.rna',
                        'refseq.protein',
                        'accession.rna',
                        'accession.protein'
                    ],
                    "operator": "or"
                }
            }, 1.1),
            _scored({
                "match": {
                    "go": {
                        "query": q,
                        "analyzer": "string_lowercase"
                    }
                }
            }, 1.1),
            _scored({
                "query_string": {
                    "query": q,
                    "default_operator": "AND",
                    "auto_generate_phrase_queries": True
                }
            }, 1),
        ]

    return {
        "dis_max": {
            "tie_breaker": 0,
            "boost": 1,
            "queries": queries
        }
    }
def dismax(q):
    """Return a search body wrapping a ``dis_max`` query for ``q``.

    For an integer-like ``q`` (per ``is_int``) the only clause is a
    ``term`` match on ``entrezgene`` weighted 8; for anything else the
    weighted symbol / name / unigene / refseq-accession / go /
    ``query_string`` clauses are used.

    :param q: the query term.
    :return: ``{"query": {"dis_max": {...}}}``
    """

    def weighted(clause, weight):
        # Wrap a query clause in a function_score with a constant weight.
        return {"function_score": {"query": clause, "weight": weight}}

    def analyzed_match(field, analyzer):
        # Plain match on one field using the given analyzer.
        return {"match": {field: {"query": q, "analyzer": analyzer}}}

    if is_int(q):
        clauses = [weighted({"term": {"entrezgene": int(q)}}, 8)]
    else:
        name_all_terms = {
            "match": {
                "name": {
                    "query": q,
                    "operator": "and",
                    "analyzer": "whitespace_lowercase"
                }
            }
        }
        accession_any = {
            "multi_match": {
                "query": q,
                "fields": [
                    'refseq.rna',
                    'refseq.protein',
                    'accession.rna',
                    'accession.protein'
                ],
                "operator": "or"
            }
        }
        free_text = {
            "query_string": {
                "query": q,
                "default_operator": "AND",
                "auto_generate_phrase_queries": True
            }
        }
        clauses = [
            weighted(analyzed_match("symbol", "whitespace_lowercase"), 5),
            # This makes phrase match of "cyclin-dependent kinase 2"
            # appear first
            weighted({"match_phrase": {"name": q}}, 4),
            weighted(name_all_terms, 3),
            weighted(analyzed_match("unigene", "string_lowercase"), 1.1),
            weighted(accession_any, 1.1),
            weighted(analyzed_match("go", "string_lowercase"), 1.1),
            weighted(free_text, 1),
        ]

    body = {"tie_breaker": 0, "boost": 1, "queries": clauses}
    return {"query": {"dis_max": body}}