def _collect_pathway_records(queries, pathways, dbs=None):
    """Flatten per-gene pathway annotations into ``[id, name, db, gene]`` rows.

    ``queries`` and ``pathways`` are iterated in lockstep; ``dbs`` optionally
    restricts the pathway databases (annotation keys) that are kept.
    """
    records = []
    for query, annotation in zip(queries, pathways):
        # Entries that are not dicts (e.g. NaN for a gene without pathway
        # data) carry nothing to flatten.
        if not isinstance(annotation, dict):
            continue
        for db, hits in annotation.items():
            if dbs is not None and db not in dbs:
                continue
            # A database key maps either to one pathway dict or to a list of them.
            if not isinstance(hits, list):
                hits = [hits]
            for hit in hits:
                records.append([hit['id'], hit['name'], db, str(query)])
    return records


def pathway_enrichment(gene_names, pipe_section=1, dbs=None, total_genes=20531, p_cutoff=0.05,
                       cache_path='../data/cache/'):
    """Run a hypergeometric pathway over-representation test on a gene list.

    Each entry of ``gene_names`` is split on ``'|'`` and the field at position
    ``pipe_section`` is sent to ``MyGeneInfo.getgenes`` to fetch its pathway
    annotations. Genes are grouped per pathway, the size of every pathway is
    fetched with ``MyGeneInfo.query`` and a p-value is computed with
    ``scipy.stats.hypergeom.sf``.

    Parameters
    ----------
    gene_names : sequence of str
        Gene labels with ``'|'``-separated fields. Its length is used as the
        number of draws of the hypergeometric test.
    pipe_section : int
        Index of the ``'|'``-separated field holding the id to query.
    dbs : container of str, optional
        Pathway database keys to keep; ``None`` keeps every database.
    total_genes : int
        Population size of the hypergeometric test.
    p_cutoff : float
        Pathways with a p-value above this threshold are dropped.
    cache_path : str
        Directory holding the ``mygene_cache`` database; created if missing.

    Returns
    -------
    pandas.DataFrame or None
        One row per retained pathway with columns ``id``, ``name``, ``db``,
        ``genes`` (list of queried ids), ``sup`` (number of those ids),
        ``size`` (pathway size reported by the query), ``p_value`` and
        ``ratio`` (``sup / size``), sorted by ascending ``p_value``.
        ``None`` when the gene lookup has no ``'pathway'`` column; an empty
        frame with the same columns when no pathway record was collected.
    """
    # Create the cache directory first so the cache database path handed to
    # set_caching points into an existing directory.
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    mg = MyGeneInfo()
    mg.set_caching(cache_db=os.path.join(cache_path, 'mygene_cache'), verbose=False)

    gene_ids = [g.split('|')[pipe_section] for g in gene_names]
    gene_info = mg.getgenes(geneids=gene_ids, fields='pathway', as_dataframe=True, df_index=False)
    try:
        pathways = gene_info['pathway']
    except (KeyError, TypeError) as e:
        print(e)
        print('No pathways found with the selected genes:')
        print(gene_names)
        return None

    columns = ['id', 'name', 'db', 'genes']
    records = _collect_pathway_records(gene_info['query'], pathways, dbs)
    if not records:
        # Nothing to group or query: hand back an empty, fully-shaped result.
        return pd.DataFrame(columns=columns + ['sup', 'size', 'p_value', 'ratio'])

    # One row per (id, name, db) with the list of genes that hit the pathway.
    # Grouping with the keys as index and resetting afterwards yields exactly
    # the four expected columns regardless of the pandas version.
    p_df = pd.DataFrame(records, columns=columns)
    p_df = p_df.groupby(['id', 'name', 'db'])['genes'].apply(list).reset_index()

    n_pathways = p_df.shape[0]
    pathway_size = []
    for i, (db, pathway_id) in enumerate(zip(p_df['db'], p_df['id'])):
        if i % 50 == 0:
            print('querying {}/{}'.format(i, n_pathways))
        # size=0 requests no hits; only the 'total' count of the response is read.
        response = mg.query('pathway.{}.id:{}'.format(db, pathway_id), size=0, verbose=False)
        pathway_size.append(response['total'])
    p_df['sup'] = [len(genes) for genes in p_df['genes']]
    p_df['size'] = pathway_size

    # sf(sup - 1) is P(X >= sup) for a population of total_genes containing
    # `size` pathway members, with one draw per selected gene.
    nb_selected_genes = len(gene_names)
    p_df['p_value'] = hypergeom.sf(p_df['sup'].values - 1, total_genes,
                                   p_df['size'].values, nb_selected_genes)

    # Copy the filtered slice so adding a column does not write into a view.
    p_df = p_df[p_df['p_value'] <= p_cutoff].copy()
    p_df['ratio'] = p_df['sup'].astype(float) / p_df['size']
    return p_df.sort_values(by=['p_value']).reset_index(drop=True)
def ensemble_to_symbol(ens):
    """Fetch gene symbols for the given ids through ``MyGeneInfo.getgenes``.

    ``ens`` is passed unchanged as ``geneids`` (presumably Ensembl gene ids,
    judging by the function name -- confirm against callers). Only the first
    returned row for each queried id is kept.

    Returns a tuple ``(gene_symbol, gene_id)`` of arrays: the ``symbol``
    column, and ``"<symbol>|<query>"`` labels in which a missing value is
    written as ``'?'``.
    """
    client = MyGeneInfo()
    hits = client.getgenes(geneids=ens, fields='symbol', as_dataframe=True, df_index=False)

    # Keep a single row per queried id.
    unique_hits = hits.drop_duplicates('query').reset_index()

    symbols = unique_hits['symbol']
    labels = symbols.str.cat([unique_hits['query']], sep='|', na_rep='?')
    return symbols.values, labels.values