def query(self):
    """Run the configured query twice and return ``(cursor, total)``.

    The query string (``self.q``), the requested fields and the
    ``entrezonly`` flag are read from the instance; the client is pointed at
    ``self.base_url``.

    Returns:
        A tuple ``(q, total)`` where ``q`` is the result of the
        ``fetch_all=True`` query and ``total`` is the ``'total'`` entry of a
        plain query with the same arguments.
    """
    client = MyGeneInfo(url=self.base_url)
    shared_kwargs = dict(fields=self.fields, entrezonly=self.entrezonly)

    # The first, plain query is only used to read the reported hit count.
    total = client.query(self.q, **shared_kwargs)['total']

    # The second query asks for everything and is what callers iterate over.
    cursor = client.query(self.q, fetch_all=True, **shared_kwargs)
    return cursor, total
def load_gene_set(self):
    """Annotate every input gene and append the result to ``self.gene_set``.

    Each entry of ``self.input_object['input']`` must provide ``'hit_id'``
    and ``'hit_symbol'``.  For every gene one dict with the keys
    ``'input_id'``, ``'sim_input_curie'`` and ``'input_symbol'`` is appended:

    * ids containing ``'MGI'`` are kept as ``input_id`` and get the
      ``'MGI'`` -> ``'MGI:MGI'`` replacement as ``sim_input_curie``;
    * ids containing ``'HGNC'`` are looked up through ``MyGeneInfo`` and get
      ``'UniProtKB:<Swiss-Prot>'`` of the first hit as ``sim_input_curie``;
    * any other id yields empty strings for both fields.
    """
    # Created on first use and then shared by all genes: the original built a
    # new client for every gene, including genes that never trigger a query.
    mg = None

    for gene in self.input_object['input']:
        hit_id = gene['hit_id']
        gene_curie = ''
        sim_input_curie = ''

        if 'MGI' in hit_id:
            gene_curie = hit_id
            sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')

        if 'HGNC' in hit_id:
            if mg is None:
                mg = MyGeneInfo()
            # The lookup is done with a lower-case 'hgnc' prefix.
            mg_hit = mg.query(
                hit_id.replace('HGNC', 'hgnc'),
                scopes='HGNC',
                species=self.input_object['parameters']['taxon'],
                fields='uniprot, symbol, HGNC',
                entrezonly=True)
            gene_curie = hit_id
            try:
                sim_input_curie = 'UniProtKB:{}'.format(
                    mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
            except (LookupError, TypeError) as e:
                # No hit, or a hit without a Swiss-Prot accession: report it
                # and keep whatever sim_input_curie was set before.
                print(gene, e)

        self.gene_set.append({
            'input_id': gene_curie,
            'sim_input_curie': sim_input_curie,
            'input_symbol': gene['hit_symbol']
        })
def get_summary(symbol):
    """Return the Entrez id and the NCBI gene-page summary for a gene symbol.

    The symbol is resolved to an Entrez gene id with a human-restricted
    ``MyGeneInfo`` query; the NCBI gene page for that id is then downloaded
    and the text between the ``<dt>Summary</dt>`` and ``<dt>Orthologs</dt>``
    markers is returned with HTML tags removed.

    Args:
        symbol: gene symbol to look up.

    Returns:
        ``(entrez_id, text)``.  If the id lookup fails the pair is
        ``("Not found", "No entries found. (Entrez ID not found)")``; if one
        of the page markers is missing, ``text`` is a "No entries found."
        message naming the missing marker.
    """
    version = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) '
               'Gecko/20071127 Firefox/2.0.0.11')
    mg = MyGeneInfo()
    try:
        hits = mg.query('symbol:%s' % symbol, species='human')['hits']
        entrez_id = hits[0]['entrezgene']
    except Exception as e:
        logging.info("Error with query: " + str(e))
        return "Not found", "No entries found. (Entrez ID not found)"

    url = 'http://www.ncbi.nlm.nih.gov/gene/' + str(entrez_id)
    # NOTE(review): `get` is defined outside this block.  If it is
    # requests.get, the second positional argument is `params`, so this
    # string would not be sent as a User-Agent header -- confirm.
    page = get(url, version).text

    start_marker = '<dt>Summary</dt>'
    start = page.find(start_marker)
    if start == -1:
        return entrez_id, "No entries found. (match_start = -1)"

    # Only the text after the summary heading is searched for the end marker.
    tail = page[start + len(start_marker):]
    end = tail.find('<dt>Orthologs</dt>')
    if end == -1:
        return entrez_id, "No entries found. (match_end = -1)"

    # Strip the HTML tags from the extracted fragment.
    return entrez_id, re.sub('<[^<]+?>', '', tail[:end])
def get_pert_agent(noble_coder, pert_text, title):
    """
    Extract perturbation agent

    Args:
        noble_coder: the execution path of Noble Coder
        pert_text: the perturbation text
        title: the title of the GSE

    Returns:
        the perturbation agent: the gene symbol of the first MyGeneInfo hit
        when there is one, otherwise the cleaned Noble Coder result, or None
        when Noble Coder finds nothing in either text
    """
    # Try to identify perturbation agent from perturbation text first,
    # if unsuccessful, try with title
    pert_agent = run_noble_coder(pert_text, noble_coder)
    if pert_agent is None:
        pert_agent = run_noble_coder(title, noble_coder)
    if pert_agent is None:
        return None

    # Normalise the text before using it as a query string
    for special_char in SPECIAL_CHARS:
        pert_agent = pert_agent.replace(special_char, " ")
    pert_agent = pert_agent.replace("Superfamily", "")

    # Extract gene symbol
    mg = MyGeneInfo()
    response = mg.query(pert_agent)
    # A response without "hits", or a top hit without "symbol", must not
    # raise: fall back to the cleaned text, as for an empty hit list.
    hits = response.get("hits")
    if hits:
        pert_agent = hits[0].get("symbol", pert_agent)
    return pert_agent
def load_gene_set(self, input_gene_set):
    """Annotate the genes of ``input_gene_set`` for the similarity search.

    Args:
        input_gene_set: object whose ``get_input_curie_set()`` returns an
            iterable of dicts with ``'hit_id'`` and ``'hit_symbol'``.

    Returns:
        A list with one dict per gene holding ``'input_id'``,
        ``'sim_input_curie'`` and ``'input_symbol'``:

        * ids containing ``'MGI'`` are kept as ``input_id`` and get the
          ``'MGI'`` -> ``'MGI:MGI'`` replacement as ``sim_input_curie``;
        * ids containing ``'HGNC'`` are looked up through ``MyGeneInfo``
          (species ``self.taxon``) and get ``'UniProtKB:<Swiss-Prot>'`` of
          the first hit as ``sim_input_curie``;
        * any other id yields empty strings for both fields.
    """
    # Created on first use and then shared by all genes: the original built a
    # new client for every gene, including genes that never trigger a query.
    mg = None
    annotated_gene_set = []

    for gene in input_gene_set.get_input_curie_set():
        hit_id = gene['hit_id']
        gene_curie = ''
        sim_input_curie = ''

        if 'MGI' in hit_id:
            gene_curie = hit_id
            sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')

        if 'HGNC' in hit_id:
            if mg is None:
                mg = MyGeneInfo()
            # The lookup is done with a lower-case 'hgnc' prefix.
            mg_hit = mg.query(hit_id.replace('HGNC', 'hgnc'),
                              scopes='HGNC',
                              species=self.taxon,
                              fields='uniprot, symbol, HGNC',
                              entrezonly=True)
            gene_curie = hit_id
            try:
                sim_input_curie = 'UniProtKB:{}'.format(
                    mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
            except (LookupError, TypeError) as e:
                # No hit, or a hit without a Swiss-Prot accession: report it
                # and keep whatever sim_input_curie was set before.
                print(__name__ + ".load_gene_set() Exception: ", gene, e)

        annotated_gene_set.append({
            'input_id': gene_curie,
            'sim_input_curie': sim_input_curie,
            'input_symbol': gene['hit_symbol']
        })

    return annotated_gene_set
def pathway_enrichment(gene_names, pipe_section=1, dbs=None, total_genes=20531,
                       p_cutoff=0.05, cache_path='../data/cache/'):
    """Compute pathway over-representation for a list of genes.

    Pathway annotations of the selected genes are fetched with
    ``MyGeneInfo.getgenes``; for every annotated pathway the number of genes
    carrying it is obtained from the ``'total'`` of a ``size=0`` query, and a
    hypergeometric survival function gives the enrichment p-value.

    Args:
        gene_names: names of the form ``a|b|...``; the part at index
            ``pipe_section`` after splitting on ``'|'`` is used as gene id.
        pipe_section: index of the gene id inside each ``'|'``-separated name.
        dbs: optional collection of pathway database keys to keep; ``None``
            keeps every database.
        total_genes: population size used for the hypergeometric test.
        p_cutoff: only pathways with ``p_value <= p_cutoff`` are returned.
        cache_path: directory holding the ``mygene_cache`` database.

    Returns:
        ``None`` when the gene annotations contain no ``'pathway'`` column,
        otherwise a DataFrame sorted by ``p_value`` with the columns ``id``,
        ``name``, ``db``, ``genes`` (selected genes in the pathway), ``sup``
        (their count), ``size`` (pathway size), ``p_value`` and ``ratio``
        (``sup / size``).  The frame is empty when no pathway was collected.
    """
    # Create the directory first; the cache database is placed inside it.
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    mg = MyGeneInfo()
    mg.set_caching(cache_db=os.path.join(cache_path, 'mygene_cache'), verbose=False)

    gene_ids = [g.split('|')[pipe_section] for g in gene_names]
    gene_info = mg.getgenes(geneids=gene_ids, fields='pathway',
                            as_dataframe=True, df_index=False)
    try:
        pathways = gene_info['pathway']
    except Exception as e:
        print(e)
        print('No pathways found with the selected genes:')
        print(gene_names)
        return None

    # One row per (pathway, gene) pair.
    rows = []
    # Series.items() replaces iteritems(), which newer pandas no longer has.
    for idx, annotation in pathways.items():
        # Genes without pathway annotation carry NaN instead of a dict.
        if not isinstance(annotation, dict):
            continue
        gene = str(gene_info['query'][idx])
        for db, entries in annotation.items():
            if dbs is not None and db not in dbs:
                continue
            # A database holds either a single pathway or a list of them.
            if not isinstance(entries, list):
                entries = [entries]
            for entry in entries:
                rows.append([entry['id'], entry['name'], db, gene])

    columns = ['id', 'name', 'db', 'genes']
    if not rows:
        return pd.DataFrame(columns=columns + ['sup', 'size', 'p_value', 'ratio'])

    # Collapse to one row per pathway with the list of its selected genes.
    p_df = pd.DataFrame(rows, columns=columns)
    p_df = p_df.groupby(['id', 'name', 'db'])['genes'].apply(list).reset_index()

    pathway_size = []
    for idx, p_row in p_df.iterrows():
        if idx % 50 == 0:
            print('querying {}/{}'.format(idx, p_df.shape[0]))
        # size=0: only the hit count is needed, not the documents.
        p_size = mg.query('pathway.{}.id:{}'.format(p_row['db'], p_row['id']),
                          size=0, verbose=False)['total']
        pathway_size.append(p_size)

    p_df['sup'] = [len(genes) for genes in p_df['genes']]
    p_df['size'] = pathway_size

    # P(X >= sup) when drawing len(gene_names) genes out of total_genes, of
    # which `size` belong to the pathway.
    nb_selected_genes = len(gene_names)
    p_df['p_value'] = hypergeom.sf(p_df['sup'].values - 1, total_genes,
                                   p_df['size'].values, nb_selected_genes)

    # Copy so that adding 'ratio' does not write into a slice of the frame.
    p_df = p_df[p_df['p_value'] <= p_cutoff].copy()
    p_df['ratio'] = p_df['sup'] / p_df['size']
    p_df = p_df.sort_values(by=['p_value']).reset_index(drop=True)
    return p_df
def get_mg_cursor(self, taxid, filter_f=None):
    """Return a cursor over all MyGene docs of one taxid plus the hit count.

    Args:
        taxid: taxonomy id; converted to ``str`` and passed as ``species``.
        filter_f: optional predicate called with each doc; only docs for
            which it returns a true value are kept.

    Returns:
        ``(q, total)`` where ``q`` is the ``fetch_all=True`` query result
        (wrapped in ``filter`` when ``filter_f`` is given) and ``total`` is
        the ``'total'`` entry of a plain query with the same arguments.
        ``total`` is not adjusted for ``filter_f``.
    """
    client = MyGeneInfo(url=self.base_url)
    shared_kwargs = dict(
        fields=self.fields,
        species=str(taxid),
        entrezonly=self.entrezonly,
    )

    # The first, plain query is only used to read the reported hit count.
    total = client.query(self.q, **shared_kwargs)['total']

    # The second query asks for everything and is what callers iterate over.
    cursor = client.query(self.q, fetch_all=True, **shared_kwargs)
    if filter_f:
        cursor = filter(filter_f, cursor)
    return cursor, total
def get_uid(name):
    """Look up the Swiss-Prot accession for a human gene symbol or alias.

    Args:
        name: gene symbol or alias to search for.

    Returns:
        The Swiss-Prot entry of the hit whose symbol equals ``name``; if no
        hit has exactly that symbol, a dict mapping every returned symbol to
        its Swiss-Prot entry (possibly empty).  Hits lacking a symbol or a
        Swiss-Prot entry are left out.
    """
    mg = MyGeneInfo()
    res = mg.query(name, scopes='symbol, alias', fields='uniprot, symbol',
                   species='human')

    # Build the mapping hit by hit.  The original filled two parallel lists
    # and zipped them; a hit missing one field made the lists drift apart, so
    # later symbols were paired with the wrong accession.
    uid_by_symbol = {}
    for hit in res['hits']:
        try:
            uid_by_symbol[hit['symbol']] = hit['uniprot']['Swiss-Prot']
        except (KeyError, TypeError):
            # Incomplete hit: nothing reliable to record for it.
            continue

    return uid_by_symbol.get(name, uid_by_symbol)
def convert_gene_api(query):
    """Map a query string to the Entrez gene id of its first human hit.

    Args:
        query: text passed to ``MyGeneInfo.query``.

    Returns:
        A one-entry dict ``{query: entrezgene}``.  The value is NaN when the
        query fails or when no hit has ``taxid`` 9606 together with an
        ``'entrezgene'`` field.
    """
    mg = MyGeneInfo()
    out_format = 'entrezgene'
    try:
        hits = mg.query(query)['hits']
    except Exception:
        # Best effort: a failed or malformed response counts as "no hits".
        hits = []

    out = float('nan')
    for hit in hits:
        if hit.get('taxid') == 9606 and out_format in hit:
            out = hit[out_format]
            # Stop at the first match.  The original kept looping and reset
            # the value to NaN whenever a later hit was not human.
            break
    return {query: out}
class FunctionalSimilarity(GenericSimilarity):
    """Functional (GO-based) gene similarity driven by an input object.

    Typical call order, as far as the methods show: ``load_input_object``,
    ``load_associations``, ``load_gene_set``, ``compute_similarity``.
    """

    def __init__(self, associations: AssociationSet = None):
        """Set up the MyGeneInfo client and empty state.

        ``associations`` is accepted for interface compatibility and is not
        used here.
        """
        GenericSimilarity.__init__(self)
        self.mg = MyGeneInfo()
        self.gene_set = []
        self.input_object = ''
        self.ont = 'go'
        self.group = ''
        self.meta = {
            'input_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'output_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'source': 'Monarch Biolink',
            'predicate': [
                'blm:macromolecular machine to biological process association',
                'macromolecular machine to molecular activity association'
            ]
        }

    def metadata(self):
        """Print the module metadata."""
        print("""Mod1A Functional Similarity metadata:""")
        pprint(self.meta)

    def load_input_object(self, input_object):
        """Store the input object and derive ``self.group`` from its taxon.

        ``self.group`` is only updated for the taxa ``'mouse'`` and
        ``'human'``.
        """
        self.input_object = input_object
        taxon = input_object['parameters']['taxon']
        if taxon in ('mouse', 'human'):
            self.group = taxon

    def load_associations(self):
        """Retrieve the associations for the current ontology and group."""
        self.retrieve_associations(ont=self.ont, group=self.group)

    def load_gene_set(self):
        """Annotate every input gene and append the result to ``self.gene_set``.

        Ids containing ``'MGI'`` get the ``'MGI'`` -> ``'MGI:MGI'``
        replacement as ``sim_input_curie``; ids containing ``'HGNC'`` are
        looked up through MyGeneInfo and get ``'UniProtKB:<Swiss-Prot>'`` of
        the first hit; any other id yields empty strings.
        """
        for gene in self.input_object['input']:
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''

            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')

            if 'HGNC' in hit_id:
                # Reuse the instance client instead of building one per gene.
                mg_hit = self.mg.query(
                    hit_id.replace('HGNC', 'hgnc'),
                    scopes='HGNC',
                    species=self.input_object['parameters']['taxon'],
                    fields='uniprot, symbol, HGNC',
                    entrezonly=True)
                gene_curie = hit_id
                try:
                    sim_input_curie = 'UniProtKB:{}'.format(
                        mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
                except (LookupError, TypeError) as e:
                    # No hit, or a hit without a Swiss-Prot accession.
                    print(gene, e)

            self.gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })

    def compute_similarity(self):
        """Run the Jaccard search and rewrite result ids as HGNC curies.

        Returns:
            The list returned by ``compute_jaccard`` with ``hit_id`` (human
            taxon only) and ``input_id`` replaced by ``symbol2hgnc`` results.
        """
        group = self.input_object['parameters']['taxon']
        lower_bound = float(self.input_object['parameters']['threshold'])
        results = self.compute_jaccard(self.gene_set, lower_bound)
        for result in results:
            if group == 'human':
                result['hit_id'] = self.symbol2hgnc(result['hit_symbol'])
            # NOTE(review): the input_id rewrite is applied for every taxon;
            # confirm it was not meant to be limited to 'human'.
            # One lookup is enough: the original repeated the identical
            # symbol2hgnc call for each input gene that differed.
            if any(gene['sim_input_curie'] != result['input_id']
                   for gene in self.gene_set):
                result['input_id'] = self.symbol2hgnc(result['input_symbol'])
        return results

    def symbol2hgnc(self, symbol):
        """Return ``'HGNC:<id>'`` for a human gene symbol.

        Returns ``None`` unless the query reports exactly one hit and that
        hit carries an ``'HGNC'`` field.
        """
        mg_hit = self.mg.query('symbol:{}'.format(symbol),
                               fields='HGNC,symbol,taxon',
                               species='human',
                               entrezonly=True)
        if mg_hit['total'] == 1:
            hgnc = mg_hit['hits'][0].get('HGNC')
            if hgnc is not None:
                return 'HGNC:{}'.format(hgnc)
        return None
class FunctionalSimilarity(GenericSimilarity):
    """Functional (GO-based) gene similarity for a fixed taxon."""

    def __init__(self, taxon):
        """Set up the MyGeneInfo client and load the associations for ``taxon``."""
        GenericSimilarity.__init__(self)
        self.mg = MyGeneInfo()
        self.input_object = ''
        self.taxon = taxon
        self.ont = 'go'
        self.meta = {
            'input_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'output_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'source': 'Monarch Biolink',
            'predicate': [
                'blm:macromolecular machine to biological process association',
                'macromolecular machine to molecular activity association'
            ]
        }

        # Load the functional catalog of
        # GO ontology and annotation associations
        self.load_associations(taxon)

    def metadata(self):
        """Print the module metadata."""
        print("""Mod1A Functional Similarity metadata:""")
        pprint(self.meta)

    def load_gene_set(self, input_gene_set):
        """Annotate the genes of ``input_gene_set`` for the similarity search.

        Args:
            input_gene_set: object whose ``get_input_curie_set()`` returns an
                iterable of dicts with ``'hit_id'`` and ``'hit_symbol'``.

        Returns:
            A list with one dict per gene holding ``'input_id'``,
            ``'sim_input_curie'`` and ``'input_symbol'``.  Ids containing
            ``'MGI'`` get the ``'MGI'`` -> ``'MGI:MGI'`` replacement as
            ``sim_input_curie``; ids containing ``'HGNC'`` are looked up
            through MyGeneInfo and get ``'UniProtKB:<Swiss-Prot>'`` of the
            first hit; any other id yields empty strings.
        """
        annotated_gene_set = []
        for gene in input_gene_set.get_input_curie_set():
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''

            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')

            if 'HGNC' in hit_id:
                # Reuse the instance client instead of building one per gene.
                mg_hit = self.mg.query(hit_id.replace('HGNC', 'hgnc'),
                                       scopes='HGNC',
                                       species=self.taxon,
                                       fields='uniprot, symbol, HGNC',
                                       entrezonly=True)
                gene_curie = hit_id
                try:
                    sim_input_curie = 'UniProtKB:{}'.format(
                        mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
                except (LookupError, TypeError) as e:
                    # No hit, or a hit without a Swiss-Prot accession.
                    print(__name__ + ".load_gene_set() Exception: ", gene, e)

            annotated_gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })

        return annotated_gene_set

    def compute_similarity(self, annotated_gene_set, threshold):
        """Run the Jaccard search and rewrite result ids as HGNC curies.

        Args:
            annotated_gene_set: list produced by ``load_gene_set``.
            threshold: lower bound for the Jaccard score; converted to float.

        Returns:
            The list returned by ``compute_jaccard`` with ``hit_id`` (human
            taxon only) and ``input_id`` replaced by ``symbol2hgnc`` results.
        """
        lower_bound = float(threshold)
        results = self.compute_jaccard(annotated_gene_set, lower_bound)
        for result in results:
            if self.taxon == 'human':
                result['hit_id'] = self.symbol2hgnc(result['hit_symbol'])
            # NOTE(review): the input_id rewrite is applied for every taxon;
            # confirm it was not meant to be limited to 'human'.
            # One lookup is enough: the original repeated the identical
            # symbol2hgnc call for each input gene that differed.
            if any(gene['sim_input_curie'] != result['input_id']
                   for gene in annotated_gene_set):
                result['input_id'] = self.symbol2hgnc(result['input_symbol'])
        return results

    def symbol2hgnc(self, symbol):
        """Return ``'HGNC:<id>'`` for a human gene symbol.

        Returns ``None`` unless the query reports exactly one hit and that
        hit carries an ``'HGNC'`` field.
        """
        mg_hit = self.mg.query('symbol:{}'.format(symbol),
                               fields='HGNC,symbol,taxon',
                               species='human',
                               entrezonly=True)
        if mg_hit['total'] == 1:
            hgnc = mg_hit['hits'][0].get('HGNC')
            if hgnc is not None:
                return 'HGNC:{}'.format(hgnc)
        return None