def prepare_client(self):
    """
    Create the biothings_client for this class and keep it in the state.

    The client is requested for ``self.client_name``; ``self.url`` is
    forwarded only when it is set. The result is stored under
    ``self._state["client"]`` and the registration is logged.

    :raises NotImplementedError: when ``client_name`` is empty
    :return: None
    """
    if not self.client_name:
        raise NotImplementedError("Define client_name in subclass")

    # Pass ``url`` through only when one is configured.
    extra_kwargs = {"url": self.url} if self.url else {}
    self._state["client"] = biothings_client.get_client(
        self.client_name, **extra_kwargs)

    message = "Registering biothings_client {}".format(self.client_name)
    self.logger.info(message)
def build_entrezgenes(result_list):
    """
    Build a dictionary of entrezgenes for each uniprot.

    :param result_list: iterable of records holding a 'uniprot' value under
        both their 'interactor_a' and 'interactor_b' entries
    :return: dict mapping each queried uniprot to the 'entrezgene' value
        returned for it; results without 'query' or 'entrezgene' are skipped
    """
    # Distinct uniprot entries from both sides of every record
    accessions = {
        record[side]['uniprot']
        for record in result_list
        for side in ('interactor_a', 'interactor_b')
    }

    # Query MyGene.info
    gene_client = biothings_client.get_client('gene')
    hits = gene_client.querymany(list(accessions),
                                 scopes='uniprot',
                                 species='human',
                                 as_generator=True)

    # Keep only the hits that name both the query and an entrezgene
    return {
        hit['query']: hit['entrezgene']
        for hit in hits
        if 'query' in hit and 'entrezgene' in hit
    }
def setUp(self):
    """Create the variant client and the two query-id fixtures."""
    genomic_ids = [
        'chr1:g.866422C>T',
        'chr1:g.876664G>A',
        'chr1:g.69635G>C',
        'chr1:g.69869T>A',
        'chr1:g.881918G>A',
        'chr1:g.865625G>A',
        'chr1:g.69892T>C',
        'chr1:g.879381C>T',
        'chr1:g.878330C>G',
    ]
    rs_ids = [
        'rs374802787',
        'rs1433078',
        'rs1433115',
        'rs377266517',
        'rs587640013',
        'rs137857980',
        'rs199710579',
        'rs186823979',
        'rs2276240',
        'rs372452565',
    ]

    self.mv = biothings_client.get_client("variant")
    self.query_list1 = genomic_ids
    self.query_list2 = rs_ids
def get_uniprot2geneid_dict(uniprot_list):
    """Return the UniProt to NCBI gene ID dictionary from BioThings.

    The ids are queried with scope 'uniprot' and the '_id' column of the
    resulting dataframe is used as the gene id. Keys are prefixed with
    'UniProt:' and values with 'NCBIGene:'; entries are accumulated through
    ``utils.add_elem_dictionary2``.
    """
    gene_client = get_client('gene')
    result_df = gene_client.querymany(uniprot_list,
                                      scopes='uniprot',
                                      fields='entrezgene',
                                      as_dataframe=True)

    # Rows without an '_id' are dropped before building the dictionary
    id_frame = result_df[['_id']].dropna()

    mapping = {}
    for query_id in id_frame.index:
        key = 'UniProt:' + str(query_id)
        cell = id_frame.at[query_id, '_id']
        # A non-string cell must offer .tolist()
        # (presumably a Series when the index label repeats - verify)
        gene_ids = [cell] if isinstance(cell, str) else cell.tolist()
        for gene_id in gene_ids:
            mapping = utils.add_elem_dictionary2(
                mapping, key, 'NCBIGene:' + str(gene_id))
    return mapping
def batch_query_mondo_from_doid(doid_list):
    """
    Convert a list of doids to mondo ids.

    Keyword arguments:
    doid_list: a list of doids

    Returns a dict keyed by the queried id. The value is the '_id' of the
    returned document, or the queried id itself when the document has no
    '_id' (such documents are also printed).
    """
    lookup = {}
    print('total doids: {}'.format(len(doid_list)))
    unique_ids = list(set(doid_list))
    print('unique doids: {}'.format(len(unique_ids)))

    # mydisease.info python client
    disease_client = get_client('disease')

    # ids are sent in batches of at most 1000
    chunk_size = 1000
    for start in range(0, len(unique_ids), chunk_size):
        # slicing stops at the end of the list, so the last batch may be short
        chunk = unique_ids[start:start + chunk_size]
        docs = disease_client.querymany(','.join(chunk),
                                        scopes="mondo.xrefs.doid",
                                        fields="_id")
        for doc in docs:
            if '_id' in doc:
                lookup[doc['query']] = doc['_id']
            else:
                print('can not convert', doc)
                lookup[doc['query']] = doc["query"]
    return lookup
def setUp(self):
    """Create the variant client and the two query-id fixtures.

    The service url is read from the V_CLIENT_HOST environment variable and
    falls back to "http://myvariant.info/v1".
    """
    host = os.environ.get("V_CLIENT_HOST", "http://myvariant.info/v1")
    self.mv = get_client("variant", url=host)

    genomic_ids = [
        'chr1:g.866422C>T',
        'chr1:g.876664G>A',
        'chr1:g.69635G>C',
        'chr1:g.69869T>A',
        'chr1:g.881918G>A',
        'chr1:g.865625G>A',
        'chr1:g.69892T>C',
        'chr1:g.879381C>T',
        'chr1:g.878330C>G',
    ]
    rs_ids = [
        'rs374802787',
        'rs1433078',
        'rs1433115',
        'rs377266517',
        'rs587640013',
        'rs137857980',
        'rs199710579',
        'rs186823979',
        'rs2276240',
        'rs372452565',
    ]
    self.query_list1 = genomic_ids
    self.query_list2 = rs_ids
def __init__(self, taxon):
    """Initialise the gene client, ontology code, metadata and associations.

    ``self.ont`` becomes 'mp' for taxon 'mouse' and 'hp' for taxon 'human';
    for any other taxon it is not assigned here.

    :param taxon: taxon name, stored as ``self.taxon`` and passed to
        ``load_associations``
    """
    GenericSimilarity.__init__(self)
    self.mg = get_client('gene')
    self.taxon = taxon

    if self.taxon == 'mouse':
        self.ont = 'mp'
    elif self.taxon == 'human':
        self.ont = 'hp'

    # Input and output share one description; each gets its own dict
    gene_set_spec = {
        'complexity': 'set',
        'id_type': 'HGNC',
        'data_type': 'gene',
    }
    self.meta = {
        'input_type': dict(gene_set_spec),
        'output_type': dict(gene_set_spec),
        'source': 'Monarch Biolink',
        'predicate': ['blm:has phenotype'],
    }

    # Load the associated Biolink (Monarch)
    # phenotype ontology and annotation associations
    self.load_associations(taxon)
def convert_nm_ids_to_flybase(df1):
    """
    Add the gene ids MyGene.info reports for the RefSeq index of ``df1``.

    Rows in which every value equals 0 are removed. The remaining index
    labels are queried with scope 'refseq' for species 'fruitfly', and the
    returned 'ensembl.gene' values are stored in a 'flybase_id' column.
    The last column of the frame (the new 'flybase_id' column, unless
    ``df1`` already had one) is then moved to the front.

    :param df1: DataFrame whose index holds the RefSeq ids to convert
    :return: a new DataFrame; ``df1`` itself is not modified
    """
    # Remove all rows summing to 0. The explicit copy makes the filtered
    # frame independent of df1, so the column assignment below does not
    # operate on a slice (avoids pandas' SettingWithCopyWarning).
    df11 = df1.loc[~(df1 == 0).all(axis=1)].copy()

    # Only one client is needed; a mygene.MyGeneInfo() instance used to be
    # created here and was immediately overwritten by this one.
    mg = get_client('gene')

    # Calling mygene to map NM_ IDs to Flybase names
    print("Calling mygene.")
    refseq_list = df11.index.tolist()
    df_geneIDs = mg.querymany(
        refseq_list,
        scopes="refseq",
        fields=["ensembl.gene", "uniprot", "symbol", "reporter"],
        species="fruitfly",
        as_dataframe=True)
    new_index_list = df_geneIDs["ensembl.gene"].tolist()

    # Plotting loss of gene IDs per replicate
    plot_NaN("df_merged", df_geneIDs)

    # NOTE(review): assumes the query returned exactly one row per queried
    # id, in query order; otherwise the lengths differ and this assignment
    # raises - confirm against real data.
    df11['flybase_id'] = new_index_list

    # Move the last column to the front
    cols = list(df11.columns)
    df11 = df11[[cols[-1]] + cols[:-1]]
    return df11
def __init__(self, taxon):
    """Initialise the gene client, metadata and GO associations.

    :param taxon: taxon name, stored as ``self.taxon`` and passed to
        ``load_associations``
    """
    GenericSimilarity.__init__(self)
    self.mg = get_client('gene')
    self.input_object = ''
    self.taxon = taxon
    self.ont = 'go'

    # Input and output share one description; each gets its own dict
    gene_set_spec = {
        'complexity': 'set',
        'id_type': 'HGNC',
        'data_type': 'gene',
    }
    predicates = [
        'blm:macromolecular machine to biological process association',
        # TODO: also requires a blm curie?
        'macromolecular machine to molecular activity association',
    ]
    self.meta = {
        'input_type': dict(gene_set_spec),
        'output_type': dict(gene_set_spec),
        'source': 'Monarch Biolink',
        'predicate': predicates,
    }

    # Load the functional catalog of
    # GO ontology and annotation associations
    self.load_associations(taxon)
def gene_to_uniprot_from_mygene(id):
    """
    Query MyGeneInfo with a gene and get its corresponding UniProt IDs.

    For each hit the 'Swiss-Prot' accession(s) are used when present and
    the 'TrEMBL' accession(s) otherwise. Hits without any 'uniprot' data
    are skipped rather than raising.

    :param id: query term handed to the MyGene.info ``query`` call (the
        name shadows the builtin but is kept for existing callers)
    :return: list of ids, each prefixed with 'UniProtKB:' unless it already
        starts with 'UniProtKB'; empty when nothing was found or the
        connection failed
    """
    uniprot_ids = []
    mg = get_client('gene')
    try:
        results = mg.query(id, fields='uniprot')
    except ConnectionError:
        logging.error("ConnectionError while querying MyGeneInfo with {}".format(id))
        return uniprot_ids

    for hit in results.get('hits') or []:
        # Not every hit carries a 'uniprot' field
        uniprot = hit.get('uniprot') or {}
        if 'Swiss-Prot' in uniprot:
            accessions = uniprot['Swiss-Prot']
        else:
            accessions = uniprot.get('TrEMBL', [])
        # A single accession is returned as a bare string and several as a
        # list; normalise so a lone string is not iterated char by char.
        if isinstance(accessions, str):
            accessions = [accessions]
        for accession in accessions:
            if not accession.startswith('UniProtKB'):
                accession = "UniProtKB:{}".format(accession)
            uniprot_ids.append(accession)
    return uniprot_ids
def __init__(self):
    """Create the Biolink API wrapper, the gene client and default metadata."""
    endpoint = Config().get_biolink_api_endpoint()
    self.blw = BioLinkApiWrapper(endpoint)
    self.mg = get_client('gene')
    self.meta = dict(taxon='human', limit=None)
class MyVariantInfo(get_client('variant', instance=False)):
    """Client for the MyVariant.info web services.

    The base class is whatever ``get_client('variant', instance=False)``
    returns; nothing is added or overridden here.

    Example:

        >>> mv = MyVariantInfo()

    """
def setUp(self):
    """Create the gene client and the query-id fixture.

    The service url is read from the G_CLIENT_HOST environment variable and
    falls back to 'http://mygene.info/v3'.
    """
    host = os.environ.get('G_CLIENT_HOST', 'http://mygene.info/v3')
    self.mg = get_client("gene", url=host)

    probe_ids = [
        '1007_s_at',
        '1053_at',
        '117_at',
        '121_at',
        '1255_g_at',
        '1294_at',
        '1316_at',
        '1320_at',
        '1405_i_at',
        '1431_at',
    ]
    self.query_list1 = probe_ids
def __init__(self, taxon):
    """Initialise the gene client and pick the ontology code for ``taxon``.

    ``self.ont`` becomes 'mp' for taxon 'mouse' and 'hp' for taxon 'human';
    for any other taxon it is not assigned here.

    :param taxon: taxon name, stored as ``self.taxon``
    """
    GenericSimilarity.__init__(self)
    self.mg = get_client('gene')
    self.taxon = taxon

    if self.taxon == 'mouse':
        self.ont = 'mp'
    elif self.taxon == 'human':
        self.ont = 'hp'
def setUp(self):
    """Create the chem client and the query-id fixture."""
    chem_keys = [
        "QCYGXOCMWHSXSU-UHFFFAOYSA-N",
        "ADFOMBKCPIMCOO-BTVCFUMJSA-N",
        "DNUTZBZXLPWRJG-UHFFFAOYSA-N",
        "DROLRDZYPMOKLM-BIVLZKPYSA-N",
        "KPBZROQVTHLCDU-GOSISDBHSA-N",
        "UTUUIUQHGDRVPU-UHFFFAOYSA-K",
        "WZWDUEKBAIXVCC-IGHBBLSQSA-N",
        "IAJIIJBMBCZPSW-BDAKNGLRSA-N",
        "NASIOHFAYPRIAC-JTQLQIEISA-N",
        "VGWIQFDQAFSSKA-UHFFFAOYSA-N",
    ]

    self.mc = biothings_client.get_client("chem")
    self.query_list1 = chem_keys
def __init__(self, taxon):
    """Initialise the gene client and load the GO associations.

    :param taxon: taxon name, stored as ``self.taxon`` and passed to
        ``load_associations``
    """
    GenericSimilarity.__init__(self)
    self.mg = get_client('gene')

    self.ont = 'go'
    self.taxon = taxon
    self.input_object = ''

    # Load the functional catalog of
    # GO ontology and annotation associations
    self.load_associations(taxon)
def _get_client(self):
    """
    Get Client - return a client appropriate for IDLookup.

    The 'gene' client is created on first use and kept in ``self.client``.
    According to the original author this method must be defined in the
    child class and is an artifact of multithreading.

    :return: the value of ``self.client``
    """
    if self.client:
        return self.client

    self.client = biothings_client.get_client('gene')
    return self.client
def __init__(self, taxon):
    """Initialise the gene client, ontology code and associations.

    ``self.ont`` becomes 'mp' for taxon 'mouse' and 'hp' for taxon 'human';
    for any other taxon it is not assigned here.

    :param taxon: taxon name, stored as ``self.taxon`` and passed to
        ``load_associations``
    """
    GenericSimilarity.__init__(self)
    self.mg = get_client('gene')
    self.taxon = taxon

    if self.taxon == 'mouse':
        self.ont = 'mp'
    elif self.taxon == 'human':
        self.ont = 'hp'

    # Load the associated Biolink (Monarch)
    # phenotype ontology and annotation associations
    self.load_associations(taxon)
def __init__(self, biothings_type: str, search_scope: Union[List[str], str],
             value_fields: Union[List[str], str]):
    """Create the client for ``biothings_type`` and store scope and fields.

    :param biothings_type: name handed to ``biothings_client.get_client``
    :param search_scope: one scope or a list of scopes; anything that is
        not exactly a ``list`` is wrapped in a one-element list
    :param value_fields: one field or a list of fields; wrapped the same way
    """
    super().__init__()
    self.client: biothings_client.BiothingClient = biothings_client.get_client(
        biothings_type)

    # Exact type check on purpose: only plain lists are stored unchanged
    if type(search_scope) is not list:
        search_scope = [search_scope]
    self.search_scope = search_scope

    if type(value_fields) is not list:
        value_fields = [value_fields]
    self.value_fields = value_fields
def mygene(self,
           query=None,
           fr="accession_prot",
           to=None,
           as_dataframe=False,
           returnall=False):
    """Run a MyGene ``querymany`` after translating the field names.

    ``fr`` and every entry of ``to`` are translated with
    ``self.__fieldTrans(..., dict_type="mygene")`` before being used as the
    query scopes and returned fields.

    :param query: terms to look up; ``None`` (the default) means an empty
        list
    :param fr: name of the field the query terms belong to
    :param to: names of the fields to return; ``None`` (the default) means
        ``["uniprot", "name"]``
    :param as_dataframe: forwarded to ``querymany``
    :param returnall: forwarded to ``querymany``
    :return: whatever ``querymany`` returns
    """
    from biothings_client import get_client

    # None sentinels replace the former mutable list defaults, which were
    # shared between calls.
    if query is None:
        query = []
    if to is None:
        to = ["uniprot", "name"]

    mg = get_client('gene')
    fr = self.__fieldTrans(field=fr, dict_type="mygene")
    to = [self.__fieldTrans(field=x, dict_type="mygene") for x in to]
    return mg.querymany(query,
                        scopes=fr,
                        fields=to,
                        as_dataframe=as_dataframe,
                        returnall=returnall)
def batch_query_entrez_from_locus_tag(locus_tag_list):
    """
    Convert a list of locus tags to entrez ids.

    Keyword arguments:
    locus_tag_list: a list of locus tags

    Returns a dict keyed by the queried tag. The value is the '_id' of the
    returned document, or the queried tag itself when the document has no
    '_id' (such documents are also printed). An empty input yields an
    empty dict without contacting the service.
    """
    mapping_dict = {}

    # Deduplicate while keeping the input order. A deduplicated list used
    # to be computed here but the raw input was queried instead.
    id_list = list(dict.fromkeys(locus_tag_list))
    if not id_list:
        return mapping_dict

    # initiate the gene python client
    client = get_client('gene')
    params = ','.join(id_list)
    res = client.querymany(params, scopes="locus_tag", fields="_id")
    for _doc in res:
        if '_id' in _doc:
            mapping_dict[_doc['query']] = _doc['_id']
        else:
            print('can not convert', _doc)
            mapping_dict[_doc['query']] = _doc["query"]
    return mapping_dict
def get_ensembl_ids(entrez_ids):
    """Look up the 'ensembl.gene' value for each Entrez gene id (human).

    :param entrez_ids: ids queried with scope 'entrezgene'
    :return: tuple ``(translate, drop_list)``; ``translate`` maps the
        queried id (as int) to the 'gene' value of its 'ensembl' entry
        (the first one when several are returned) and ``drop_list`` holds
        the queried ids (as int) whose result has no 'ensembl' entry
    """
    gene_client = get_client('gene')
    records = gene_client.querymany(entrez_ids,
                                    scopes='entrezgene',
                                    fields='ensembl.gene',
                                    species='human')

    translate = {}
    drop_list = []
    for record in records:
        entrez_id = int(record['query'])
        if "ensembl" not in record:
            drop_list.append(entrez_id)
            continue
        ensembl = record['ensembl']
        # Several entries come back as a list: keep the first one
        if isinstance(ensembl, list):
            ensembl = ensembl[0]
        translate[entrez_id] = ensembl['gene']
    return translate, drop_list
def import_ortholog(csv_file, pattern, nthread):
    """Append the rows for not-yet-indexed accessions to the index csv.

    The index file lives at ``<path>/csv/<pattern>/index_<name>_<pattern>.csv``
    where ``<name>`` is the csv file name without its last four characters.
    Accessions already listed in the index's 'uniprotID' column are skipped;
    the remaining ones are handed to ``fill_csv`` in ``nthread`` chunks.

    :param csv_file: path of the input csv, read through ``import_csv``
    :param pattern: label used in the messages and in the index file path
    :param nthread: number of chunks the grouped data is split into
    :return: path of the index file
    """
    print("Parsing csv")
    # Use the "data" directory next to the csv when it exists, otherwise
    # the directory two levels above the csv file.
    if os.path.exists("%s/data" % os.path.dirname(csv_file)):
        path = "%s/data" % os.path.dirname(csv_file)
    else:
        path = os.path.dirname(os.path.dirname(csv_file))
    file_name = os.path.basename(csv_file)
    os.makedirs("%s/csv/%s" % (path, pattern), exist_ok=True)
    # file_name[:-4] drops the last four characters
    # (presumably the ".csv" extension - confirm)
    index_file = '%s/csv/%s/index_%s_%s.csv' % (path, pattern, file_name[:-4], pattern)
    df = import_csv(csv_file)
    mt = get_client("taxon")
    print("Extracting %s phosphorylation site" % pattern)
    # Accessions already present in a non-empty index file
    uniprot_id_list = []
    if os.path.exists(index_file) and os.path.getsize(index_file) > 0:
        index_df = pd.read_csv(index_file, sep=';')
        uniprot_id_list = index_df["uniprotID"].value_counts().keys().tolist()
    print("Preparing queries")
    # Only accessions that are not in the index yet are converted
    uniprot_to_convert = set(df["acc"].tolist()) - set(uniprot_id_list)
    resp = uniprotid_to_geneid(uniprot_to_convert)
    sub_df = df[df["acc"].isin(list(uniprot_to_convert))]
    # NOTE(review): the result of reset_index() is discarded, so sub_df is
    # left unchanged by this line - confirm whether that is intended.
    sub_df.reset_index()
    with open(index_file, 'a+', newline='') as g:
        writer = csv.writer(g, delimiter=";")
        # Read one character from the start of the file to see whether it
        # is empty; the header row is written only in that case.
        g.seek(0)
        first_char = g.read(1)
        if not first_char:
            writer.writerow(['uniprotID', 'geneID', 'taxID', 'metazoan', 'code', 'seq_in_window', 'pos_sites', 'clusterID', 'sequence'])
        group_acc_seq = sub_df.groupby(["acc"], observed=True)
        data_thread = np.array_split(group_acc_seq, nthread)
        # fill_csv is expected to return objects with start() and join();
        # all of them are started before any is joined, and all share the
        # same csv writer.
        thread_list = []
        for data in data_thread:
            thread_list.append(fill_csv(data, uniprot_id_list, pattern, mt, path, writer, resp))
        for thread in thread_list:
            thread.start()
        for thread in thread_list:
            thread.join()
    return index_file
def batch_query_hgvs_from_rsid(rsid_list):
    """Map each rsid to the '_id' of the variant document returned for it.

    The ids are deduplicated and fetched in batches of at most 1000.

    :param rsid_list: list of rsids
    :return: dict keyed by the queried id; the value is the '_id' of the
        returned document, or the queried id itself when the document has
        no '_id' (such documents are also printed)
    """
    lookup = {}
    unique_ids = list(set(rsid_list))
    variant_client = get_client('variant')

    chunk_size = 1000
    for start in range(0, len(unique_ids), chunk_size):
        # slicing stops at the end of the list, so the last batch may be short
        chunk = unique_ids[start:start + chunk_size]
        docs = variant_client.getvariants(','.join(chunk), fields="_id")
        for doc in docs:
            if '_id' in doc:
                lookup[doc['query']] = doc['_id']
            else:
                print('can not convert', doc)
                lookup[doc['query']] = doc["query"]
    return lookup
def convert_genes(genes, conv_from, conv_to, species):
    """Look up the 'symbol' of every gene through the MyGene ``query`` call.

    Results are cached in 'data/enst-tmp.pickle': an existing cache is
    loaded first, genes already in it are not queried again, and the cache
    is rewritten after every 1000th gene and once more at the end.

    :param genes: gene ids to convert; must be iterable more than once
    :param conv_from: scope the ids are searched in
    :param conv_to: field requested from the service together with 'refseq'
    :param species: species filter forwarded to the query
    :return: tuple ``(new_ids, mapping, no_results)``; ``new_ids`` holds the
        value found for each gene in input order (``None`` when nothing was
        found), ``mapping`` is the full cache dict and ``no_results`` lists
        the genes whose value is ``None``
    """
    mg = get_client('gene')
    out = {}
    tmp_fname = 'data/enst-tmp.pickle'

    def _save_cache():
        # Context manager so the handle is closed and the data flushed
        with open(tmp_fname, 'wb') as cache_file:
            pickle.dump(out, cache_file)

    if os.path.isfile(tmp_fname):
        # NOTE: unpickling runs arbitrary code from the file; this is only
        # safe while the cache is written by this function alone.
        with open(tmp_fname, 'rb') as cache_file:
            out = pickle.load(cache_file)

    # The requested fields do not depend on the gene, so build them once
    fields = ','.join([conv_to, 'refseq'])
    for i, gene in enumerate(genes, start=1):
        if gene in out:
            continue
        query = mg.query(gene,
                         scopes=conv_from,
                         fields=fields,
                         species=species)
        out[gene] = None
        hits = query['hits'] if 'hits' in query else []
        # NOTE(review): the value is always read from 'symbol', whatever
        # conv_to is - confirm this is intended.
        if len(hits) > 0 and 'symbol' in hits[0]:
            out[gene] = hits[0]['symbol']
        if i % 1000 == 0:
            # periodic checkpoint plus progress output
            _save_cache()
            print(i, gene, out[gene])
    _save_cache()

    mapping = out
    no_results = []
    new_ids = []
    for x in genes:
        new_ids.append(mapping[x])
        if mapping[x] is None:
            no_results.append(x)
    return new_ids, mapping, no_results
def split_fasta(file):
    """Split a fasta file into a metazoa and a non-metazoa fasta file.

    The output directories 'metazoa' and 'non_metazoa' are created two
    levels above ``file``. When neither output file exists yet, every
    record is appended to one of them depending on ``is_metazoan`` for the
    part of the record id that precedes the first ':'. When either output
    file already exists nothing is read or written.

    :param file: path of the input fasta file
    :return: dict with the output paths under 'metazoa' and 'nonmetazoa'
    """
    file_name = os.path.basename(file)
    path = os.path.dirname(os.path.dirname(file))
    os.makedirs("%s/metazoa" % path, exist_ok=True)
    os.makedirs("%s/non_metazoa" % path, exist_ok=True)

    stem = os.path.splitext(file_name)[0]
    path2metazoa = "%s/metazoa/%s_metazoa.fasta" % (path, stem)
    path2nonmetazoa = "%s/non_metazoa/%s_non_metazoa.fasta" % (path, stem)

    if not os.path.exists(path2nonmetazoa) and not os.path.exists(
            path2metazoa):
        # Context managers close the input handle and every output handle;
        # handles used to be left open, one per written record.
        with open(file) as fasta_in:
            # One client for the whole file instead of one per record
            mt = get_client("taxon")
            for record in SeqIO.parse(fasta_in, "fasta"):
                position = str(record.id).find(":")
                taxID = record.id[:position]
                metazoa = is_metazoan(float(taxID), mt)
                f_out = path2metazoa if metazoa else path2nonmetazoa
                with open(f_out, 'a') as fasta_out:
                    SeqIO.write([record], fasta_out, "fasta")
    return {"metazoa": path2metazoa, "nonmetazoa": path2nonmetazoa}
def __init__(self):
    """Create the Biolink wrapper, the gene client and the metadata."""
    endpoint = Config().get_biolink_api_endpoint()
    self.blw = BioLinkWrapper(endpoint)
    self.mg = get_client('gene')
    self.input_object = ''

    input_spec = {
        'complexity': 'single',
        'id_type': ['MONDO', 'DO', 'OMIM'],
    }
    output_spec = {
        'complexity': 'set',
        'id_type': 'HGNC',
    }
    self.meta = {
        'data_type': 'disease',
        'input_type': input_spec,
        'output_type': output_spec,
        'taxon': 'human',
        'limit': None,
        'source': 'Monarch Biolink',
        'predicate': 'blm:gene associated with condition',
    }
def get_database_dict(database: pandas.DataFrame, gene_symbols: set) -> dict:
    """Return a 'query' -> 'entrezgene' dict, fetching missing symbols first.

    Symbols that are not in the 'query' column of ``database`` are looked
    up with the gene client (scope 'symbol', species 'human'). The results
    are appended to ``database`` and the combined frame is written to
    ``args.database``. When no symbol is missing nothing is queried or
    written.

    :param database: frame of earlier results; a missing 'query' column is
        treated as "no symbols known yet"
    :param gene_symbols: symbols that must be present in the result
    :return: dict mapping each 'query' value to its 'entrezgene' value
    """
    try:
        known_symbols = set(database['query'].to_list())
    except KeyError:
        known_symbols = set()

    missing_symbols = gene_symbols.difference(known_symbols)
    if missing_symbols:
        gene_client = biothings_client.get_client("gene")
        fetched = gene_client.querymany(missing_symbols,
                                        scopes='symbol',
                                        species='human',
                                        fields="entrezgene, ensembl")
        fetched_df = pandas.json_normalize(fetched)
        database = pandas.concat([database, fetched_df])
        database.to_csv(args.database)

    indexed = database[['query', 'entrezgene']].set_index('query')
    return indexed.to_dict()['entrezgene']
def uniprot_to_gene_from_mygene(id):
    """
    Query MyGeneInfo with a UniProtKB id and get its corresponding HGNC gene.

    Only the first hit is considered. A first hit without an 'HGNC' field
    leaves the result at ``None`` instead of raising.

    :param id: UniProtKB id; when it starts with 'UniProtKB' the part after
        the first ':' is used as the query (the name shadows the builtin
        but is kept for existing callers)
    :return: one-element list holding the id prefixed with 'HGNC:' (unless
        it already starts with 'HGNC'), or ``[None]`` when nothing was
        found or the connection failed
    """
    gene_id = None
    if id.startswith('UniProtKB'):
        id = id.split(':', 1)[1]
    mg = get_client('gene')
    try:
        results = mg.query(id, fields='HGNC')
    except ConnectionError:
        logging.error("ConnectionError while querying MyGeneInfo with {}".format(id))
    else:
        hits = results.get('hits')
        # Not every hit carries an 'HGNC' field
        if hits and 'HGNC' in hits[0]:
            # str() guards against a non-string value
            gene_id = str(hits[0]['HGNC'])
            if not gene_id.startswith('HGNC'):
                gene_id = 'HGNC:{}'.format(gene_id)
    return [gene_id]
def __init__(self, iquery, fields, annotfunc, transcript=False):
    """
    Validate the client name, create the client and run the query.

    :param iquery: query input information
    :param fields: annotation output, stored as ``self.ofields``
    :param annotfunc: client name; one of 'gene', 'variant', 'chem',
        'disease' or 'taxon'
    :param transcript: not used by this constructor
    :raises ClientAttributionError: when ``annotfunc`` is not a known client
        name
    """
    # TODO: check here whether iquery is a list or a str, in order to tell
    # whether the request covers several lists
    known_clients = set(['gene', 'variant', 'chem', 'disease', 'taxon'])
    self._variant = known_clients
    self.annotfunc = annotfunc
    self.iquery = iquery
    self.ofields = fields

    if self.annotfunc not in self._variant:
        message = "The function \"{}\" can't find in \"{}\"".format(
            self.annotfunc, '; '.join(self._variant))
        raise ClientAttributionError(message)

    self._myclient = get_client(self.annotfunc)
    self.query = self.__query()