def checkOrg(org):
    """Cache the KEGG pathway listing of *org* in ``<org>_pathway.txt``.

    If the file is already present in the current working directory nothing
    is downloaded; otherwise the listing returned by ``KEGG().list`` is
    written to it. A short status message is printed in both cases.

    :param org: organism code used both in the KEGG query and the file name.
    """
    if os.path.exists('./%s_pathway.txt' % org):
        print('File already existed')
        return

    # Fetch first, so that the file is only created once the data is at hand.
    listing = KEGG().list('pathway', org)
    with open('%s_pathway.txt' % org, 'a', encoding='latin-1') as handle:
        handle.write(listing)
    print('Finish writing org file')
def extract_all(self, organism="hsa"):
    """Return the KEGG pathways of an organism as ``{identifier: name}``.

    The listing is requested with ``KEGG().list("pathway/<organism>")`` and
    is expected to hold one pathway per line, identifier and name separated
    by a tab. Empty lines are ignored.

    :param organism: KEGG organism code. Defaults to ``"hsa"``, the value
        that used to be hard-coded.
    :return: dictionary mapping the first tab-separated field of each line
        to the second one.
    :raises IndexError: if a non-empty line contains no tab.
    """
    from bioservices import KEGG

    kegg = KEGG()
    listing = kegg.list("pathway/{}".format(organism))

    pathway_dict = {}
    for line in listing.split("\n"):
        if not line:
            continue
        # Split once per line; avoid shadowing the builtin ``id``.
        fields = line.split("\t")
        pathway_dict[fields[0]] = fields[1]
    return pathway_dict
class KeggPathwayEnrichment():
    """KEGG pathway enrichment of differentially expressed genes.

    DRAFT IN PROGRESS

    Current input is the output of the rnadiff analysis::

        pe = KeggPathwayEnrichment("rnadiff", "eco")
        pe.barplot(pe.enrichment['down'])

        # Save all deregulated pathways found by the enrichment. One PNG is
        # written per pathway and a dictionary {pathway ID: summary
        # dataframe} is returned:
        up = pe.save_significant_pathways("up")
        down = pe.save_significant_pathways("down")
        for ID, summary in up.items():
            summary.to_csv("kegg_{}_up_regulated.csv".format(ID))
        for ID, summary in down.items():
            summary.to_csv("kegg_{}_down_regulated.csv".format(ID))

    This is transparent for ecoli: set the organism to "eco".

    For mus musculus, you will need to convert the input gene IDs into KEGG
    identifiers so as to compare them. The *mapper* argument serves that
    purpose: a dataframe indexed by the input gene IDs with a ``name``
    column holding the name used by KEGG. It can be built e.g. as follows::

        from bioservices import BioMart
        b = BioMart()
        datasets = b.get_datasets("ENSEMBL_MART_ENSEMBL")
        [x for x in datasets if 'mus' in x]
        # -> one of interest is obviously mmusculus_gene_ensembl
        attributes = b.attributes(dataset='mmusculus_gene_ensembl')
        filters = b.filters(dataset='mmusculus_gene_ensembl')
        b.new_query()
        b.add_dataset_to_xml('mmusculus_gene_ensembl')
        b.add_attribute_to_xml('ensembl_gene_id')
        b.add_attribute_to_xml('go_id')
        b.add_attribute_to_xml('entrezgene_id')
        b.add_attribute_to_xml('mgi_id')
        b.add_attribute_to_xml('external_gene_name')
        xml = b.get_xml()
        res = b.query(xml)

        import io
        import pandas as pd
        # assumes res is tab-separated text without header -- TODO confirm
        df = pd.read_csv(io.StringIO(res), sep="\\t", header=None)
        df.columns = ['ensembl', 'go', 'entrez', 'mgi', 'name']
        df = df.set_index('ensembl')
        # name should be the name used by kegg

    """

    def __init__(self, folder, organism, alpha=0.05, log2_fc=0,
                 progress=True, mapper=None, background=None):
        """Load the rnadiff results and all KEGG pathways, then run enrichment.

        :param folder: passed to ``RNADiffResults``.
        :param organism: KEGG organism code (e.g. "eco").
        :param alpha: passed to ``RNADiffResults``.
        :param log2_fc: passed to ``RNADiffResults``.
        :param progress: show a progress bar while downloading pathways.
        :param mapper: optional dataframe indexed by input gene ID with a
            ``name`` column (see class documentation).
        :param background: number of genes used as background. If not
            provided (or 0), the number of entries in the KEGG gene listing
            of the organism is used.
        """
        print("DRAFT in progress")

        from bioservices import KEGG
        self.kegg = KEGG(cache=True)
        self.kegg.organism = organism

        self.rnadiff = RNADiffResults(folder, alpha=alpha, log2_fc=log2_fc)

        # Some clean up: drop the "gene:" prefix from every identifier
        if "ID" in self.rnadiff.df.columns:
            self.rnadiff.df['ID'] = [
                x.replace("gene:", "") for x in self.rnadiff.df['ID']
            ]
        self.rnadiff.df.index = [
            x.replace("gene:", "") for x in self.rnadiff.df.index
        ]
        for key, values in self.rnadiff.gene_lists.items():
            self.rnadiff.gene_lists[key] = [
                x.replace("gene:", "") for x in values
            ]

        if background:
            self.background = background
        else:
            # strip() first so that a trailing newline in the listing does
            # not count as an extra gene (same parsing as in
            # find_pathways_by_gene)
            listing = self.kegg.list(self.kegg.organism)
            self.background = len(listing.strip().split("\n"))
        logger.info("Set number of genes to {}".format(self.background))

        self._load_pathways(progress=progress)

        self.mapper = mapper

        # Best effort: the instance stays usable (pathways are loaded) even
        # if the enrichment fails, but the reason is reported.
        try:
            self.compute_enrichment()
        except Exception as err:
            logger.critical(
                "An error occured while computing enrichments: {}".format(err))

    def _load_pathways(self, progress=True):
        """Download and parse all pathways of the organism.

        Populates ``self.pathways`` (parsed KEGG entries keyed by pathway ID
        without the "path:" prefix), ``self.gene_sets`` (pathway ID -> list
        of gene names) and ``self.df_pathways`` (one row per pathway).

        :param progress: show a progress bar.
        """
        # This is just loading all pathways once for all
        logger.info(
            "loading all pathways from KEGG. may take time the first time")
        self.pathways = {}
        pathway_ids = self.kegg.pathwayIds
        if progress:
            from easydev import Progress
            pb = Progress(len(pathway_ids))
        for i, ID in enumerate(pathway_ids):
            self.pathways[ID.replace("path:", "")] = self.kegg.parse(
                self.kegg.get(ID))
            if progress:
                pb.animate(i + 1)

        self.gene_sets = {}
        for ID, res in self.pathways.items():
            # Some cleanup: NAME is a list; keep the text before " - "
            res['NAME'] = res['NAME'][0].split(" - ", 1)[0]

            # save gene sets
            if "GENE" not in res:
                print("SKIPPED (no genes) {}: {}".format(ID, res['NAME']))
                continue
            # some pathways report genes as a dictionary
            # id:'gene name; description' (e.g. eco), others as a
            # dictionary id:'description'. In the latter case, the
            # identifier itself is used as the name.
            self.gene_sets[ID] = [
                description.split(';')[0] if ";" in description else geneID
                for geneID, description in res['GENE'].items()
            ]

        # save all pathways info
        self.df_pathways = pd.DataFrame(self.pathways).T
        # ENTRY/REFERENCE/DBLINKS only exist as columns if at least one
        # pathway provides them, so do not fail when they are absent.
        self.df_pathways = self.df_pathways.drop(
            columns=["ENTRY", "REFERENCE"], errors="ignore")
        if "DBLINKS" in self.df_pathways.columns:
            self.df_pathways['GO'] = [
                x['GO'] if isinstance(x, dict) and 'GO' in x else None
                for x in self.df_pathways["DBLINKS"]
            ]
            del self.df_pathways["DBLINKS"]
        else:
            self.df_pathways['GO'] = None

    def plot_genesets_hist(self, bins=20):
        """Plot the histogram of the gene set sizes.

        :param bins: number of bins passed to ``pylab.hist``.
        """
        N = len(self.gene_sets)
        pylab.clf()
        pylab.hist([len(v) for v in self.gene_sets.values()],
                   bins=bins, lw=1, ec="k")
        pylab.title("{} gene sets".format(N))
        pylab.xlabel("Gene set sizes")
        pylab.grid(True)
        _, b = pylab.xlim()
        pylab.xlim([0, b])

    def compute_enrichment(self, background=None):
        """Compute the enrichment of the up, down and all gene lists.

        Results are stored in ``self.enrichment`` under the keys 'up',
        'down' and 'all'.

        :param background: background size; defaults to ``self.background``.
        """
        if background is None:
            background = self.background
        self.enrichment = {}
        for category in ("up", "down", "all"):
            self.enrichment[category] = self._enrichr(
                category, background=background)

    def _enrichr(self, category, background=None, verbose=True):
        """Run ``gseapy.enrichr`` on a gene list against the KEGG gene sets.

        :param category: one of 'up', 'down', 'all' (gene list taken from
            the rnadiff results) or an explicit list of genes.
        :param background: background size; defaults to ``self.background``.
        :param verbose: passed to ``gseapy.enrichr``.
        :return: the object returned by ``gseapy.enrichr``.
        """
        if background is None:
            background = self.background

        if isinstance(category, list):
            gene_list = category
        else:
            assert category in ['up', 'down', 'all'], \
                "category must be 'up', 'down', 'all' or a list of genes"
            gene_list = list(self.rnadiff.gene_lists[category])

        if self.mapper is not None:
            # translate the input identifiers into the names used by KEGG
            logger.info("Input gene list of {} ids".format(len(gene_list)))
            identifiers = self.mapper.loc[gene_list]['name'].drop_duplicates(
            ).values
            logger.info("Mapped gene list of {} ids".format(len(identifiers)))
            gene_list = list(identifiers)

        enr = gseapy.enrichr(gene_list=gene_list,
                             gene_sets=self.gene_sets,
                             verbose=verbose,
                             background=background,
                             outdir="test",
                             no_plot=True)
        return enr

    def _get_final_df(self, df, cutoff=0.05, nmax=10):
        """Select and annotate the most significant pathways.

        Adds the ``name`` and ``size`` columns, keeps the rows whose
        adjusted p-value is <= *cutoff*, restricts them to the *nmax* most
        significant ones and returns them sorted by decreasing adjusted
        p-value (most significant last, convenient for horizontal plots).

        :param df: enrichment results with the columns ``Term``, ``Genes``
            and ``Adjusted P-value``. It is not modified.
        :param cutoff: maximum adjusted p-value.
        :param nmax: maximum number of rows returned.
        """
        df = df.copy()
        df['name'] = [self.pathways[x]['NAME'] for x in df.Term]
        df['size'] = [len(x.split(";")) for x in df.Genes]
        df = df.sort_values("Adjusted P-value")
        df.reset_index(drop=True, inplace=True)
        df = df[df["Adjusted P-value"] <= cutoff]

        # slicing copes with fewer than nmax rows
        df = df.iloc[0:nmax]
        df = df.sort_values("Adjusted P-value", ascending=False)
        return df

    def barplot(self, enrich, cutoff=0.05, nmax=10):
        """Horizontal bar plot of -log10(adjusted p-value) per pathway.

        :param enrich: an enrichment result (e.g. ``self.enrichment['up']``)
            with a ``results`` dataframe.
        :param cutoff: see :meth:`_get_final_df`.
        :param nmax: see :meth:`_get_final_df`.
        :return: the dataframe that was plotted.
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.barh(range(len(df)), -pylab.log10(df['Adjusted P-value']))
        pylab.yticks(range(len(df)), df.name)
        # 1.3 is about -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.grid(True)
        pylab.xlabel("Adjusted p-value (-log10)")
        pylab.ylabel("Gene sets")
        _, b = pylab.xlim()
        pylab.xlim([0, b])
        pylab.tight_layout()
        return df

    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=None):
        """Scatter plot of -log10(adjusted p-value) per pathway.

        Marker size and colour reflect the number of genes found in each
        pathway.

        :param enrich: an enrichment result (e.g. ``self.enrichment['up']``)
            with a ``results`` dataframe.
        :param cutoff: see :meth:`_get_final_df`.
        :param nmax: see :meth:`_get_final_df`.
        :param gene_set_size: unused; kept for backward compatibility.
        :return: the dataframe that was plotted (possibly empty).
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        if len(df) == 0:
            # nothing to draw; max() below would fail on an empty column
            logger.warning("No pathway found below the cutoff; nothing to plot")
            return df

        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      range(len(df)),
                      s=10 * df['size'],
                      c=df['size'])

        pylab.xlabel("Adjusted p-value (-log10)")
        pylab.ylabel("Gene sets")
        pylab.yticks(range(len(df)), df.name)
        _, b = pylab.xlim()
        pylab.xlim([0, b])
        pylab.grid(True)
        ax = pylab.gca()

        # legend with three representative gene-set sizes
        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls="")
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        # 1.3 is about -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        pylab.colorbar(pylab.gci())
        return df

    def _add_name_column(self, df):
        """Add in place a ``Name`` column taken from the mapper, if missing."""
        if 'Name' in df.columns:
            return
        # .loc[[index], ...] always returns a Series, whether the mapper
        # index is unique or not, so that iloc[0] is the (first) name and
        # never the first character of a string.
        df['Name'] = [
            self.mapper.loc[[index], 'name'].iloc[0] for index in df.index
        ]

    @staticmethod
    def _first_hit_by_name(df):
        """Return {lower-case Name: (pvalue, log2FoldChange)} of a dataframe.

        When several rows share the same name, the first one is kept.
        """
        hits = {}
        for name, pvalue, fc in zip(df.Name, df.pvalue, df.log2FoldChange):
            hits.setdefault(name.lower(), (pvalue, fc))
        return hits

    def _get_summary_pathway(self, pathway_ID):
        """Summarise the regulation of every gene of a pathway.

        Genes are searched by name (case insensitive) among the significant
        (padj<=0.05) down-regulated genes first, then among the
        up-regulated ones.

        As a side effect, ``self.genes``, ``self.df_down`` and
        ``self.df_up`` are set.

        :param pathway_ID: a pathway identifier found in ``self.df_pathways``.
        :return: dataframe with the columns ``type`` ("-", "+" or "="),
            ``name``, ``pvalue`` (-log10 of the p-value, None if not
            significant), ``fc`` (log2 fold change or None), ``keggid``
            and ``description``.
        """
        genes = self.df_pathways.loc[pathway_ID]['GENE']
        df_down = self.rnadiff.df.query(
            "padj<=0.05 and log2FoldChange<0").copy()
        df_up = self.rnadiff.df.query(
            "padj<=0.05 and log2FoldChange>0").copy()

        logger.info("Total down-regulated: {}".format(len(df_down)))
        logger.info("Total up-regulated: {}".format(len(df_up)))

        # gene name (text before the first ";") -> KEGG identifier
        name_to_keggid = {}
        for k, v in genes.items():
            name_to_keggid[v.split(";")[0]] = k

        self.genes = genes
        self.df_down = df_down
        self.df_up = df_up

        if self.mapper is not None:
            self._add_name_column(df_down)
            self._add_name_column(df_up)

        # Built once: both the membership test and the lookup are then
        # case insensitive, and no dataframe is scanned per gene.
        down_hits = self._first_hit_by_name(df_down)
        up_hits = self._first_hit_by_name(df_up)

        summary_names = []
        summary_keggids = []
        summary_types = []
        summary_pvalues = []
        summary_fcs = []
        for name, kegg_id in name_to_keggid.items():
            summary_names.append(name)
            summary_keggids.append(kegg_id)

            key = name.lower()
            if key in down_hits:
                pvalue, fc = down_hits[key]
                summary_pvalues.append(-pylab.log10(pvalue))
                summary_fcs.append(fc)
                summary_types.append("-")
            elif key in up_hits:
                pvalue, fc = up_hits[key]
                summary_pvalues.append(-pylab.log10(pvalue))
                summary_fcs.append(fc)
                summary_types.append("+")
            else:
                summary_pvalues.append(None)
                summary_fcs.append(None)
                summary_types.append("=")

        summary = pd.DataFrame({
            "type": summary_types,
            "name": summary_names,
            "pvalue": summary_pvalues,
            "fc": summary_fcs,
            "keggid": summary_keggids
        })
        summary['description'] = [
            self.pathways[pathway_ID]['GENE'][x] for x in summary.keggid
        ]
        return summary

    def _get_colors(self, summary):
        """Return the colour of each gene as {keggid: "background,foreground"}.

        Down-regulated genes ("-") are drawn in orange/red shades and
        up-regulated genes ("+") in green shades, darker as the ``pvalue``
        column (-log10 of the p-value) gets larger; other genes are grey.

        :param summary: dataframe with the columns ``type``, ``pvalue`` and
            ``keggid`` (see :meth:`_get_summary_pathway`).
        """
        colors = {}
        for _, row in summary.iterrows():
            pvalue = row['pvalue']
            type_ = row['type']
            kegg_id = row['keggid']
            if type_ == "-":
                if pvalue > 0 and pvalue < 5:
                    colors[kegg_id] = "#FF8C00,black"
                elif pvalue < 10:
                    colors[kegg_id] = "#FF0000,black"
                else:
                    # plain comma: the value is sent in a form POST, so a
                    # pre-encoded "%2C" would be encoded a second time
                    colors[kegg_id] = "#B22222,black"
            elif type_ == "+":
                if pvalue > 0 and pvalue < 5:
                    colors[kegg_id] = "#9ACD32,black"
                elif pvalue < 10:
                    colors[kegg_id] = "#008000,black"
                else:
                    colors[kegg_id] = "#006400,#000000"
            else:
                colors[kegg_id] = "grey,black"
        return colors

    def save_pathway(self, pathway_ID, scale=None, show=False, filename=None):
        """Download the coloured image of a pathway from the KEGG web site.

        :param pathway_ID: a pathway identifier found in ``self.df_pathways``.
        :param scale: unused.
        :param show: unused.
        :param filename: output PNG file; defaults to ``<pathway_ID>.png``.
        :return: the summary dataframe of the pathway (see
            :meth:`_get_summary_pathway`).
        :raises IndexError: if no PNG link is found in the KEGG response.
        """
        summary = self._get_summary_pathway(pathway_ID)
        colors = self._get_colors(summary)

        logger.info("pathway {} total genes: {}".format(
            pathway_ID, len(summary)))
        count_up = len(summary.query("type == '+'"))
        count_down = len(summary.query("type == '-'"))
        logger.info("this pathway down-regulared genes: {}".format(count_down))
        logger.info("this pathway up-regulated genes: {}".format(count_up))

        url = "https://www.kegg.jp/kegg-bin/show_pathway"
        #dcolor = "white" --> does not work with the post requests unlike get
        # requests
        params = {
            "map": pathway_ID,
            "multi_query": "\r\n".join(
                ["{} {}".format(k, v) for k, v in colors.items()])
        }

        # request and raw response are kept on the instance for inspection
        self.params = params
        import requests
        html_page = requests.post(url, data=params)
        self.tmp = html_page
        html_page = html_page.content.decode()

        # the image is referenced in the page by a src="....png" attribute
        links_to_png = [
            x for x in html_page.split() if "png" in x and x.startswith("src")
        ]
        if not links_to_png:
            raise IndexError(
                "No PNG link found in the KEGG response for pathway {}".format(
                    pathway_ID))
        link_to_png = links_to_png[0].replace("src=", "").replace('"', '')
        r = requests.get("https://www.kegg.jp/{}".format(link_to_png))

        if filename is None:
            filename = "{}.png".format(pathway_ID)

        with open(filename, "wb") as fout:
            fout.write(r.content)

        return summary

    def save_all_pathways(self):  #pragma: no cover
        """Save the image of every loaded pathway that contains genes.

        This does not do any enrichment. Just save all pathways once for
        all with useful information.
        """
        # gene_sets is keyed by the identifiers used in df_pathways (no
        # "path:" prefix) and only holds pathways having a GENE entry,
        # which is what save_pathway requires.
        for ID in self.gene_sets:
            self.save_pathway(ID)

    def save_significant_pathways(self, mode, cutoff=0.05, nmax=20,
                                  background=None):  #pragma: no cover
        """Save the images of the most significantly enriched pathways.

        :param mode: should be up, down or all.
        :param cutoff: maximum adjusted p-value.
        :param nmax: maximum number of pathways saved.
        :param background: background size; defaults to ``self.background``.
        :return: dictionary {pathway ID: summary dataframe}. Images are
            saved in ``<pathway ID>_<mode>.png``.
        """
        if background is None:
            background = self.background

        # select the relevant pathways
        df = self._enrichr(mode, background).results
        df = self._get_final_df(df, cutoff=cutoff, nmax=nmax)
        logger.warning("Found {} pathways to save".format(len(df)))
        if len(df) == nmax:
            logger.warning("Restricted pathways to {}".format(nmax))

        logger.info("saving {} deregulated pathways".format(len(df)))

        # save them
        summaries = {}
        for ID in df['Term']:
            summaries[ID] = self.save_pathway(
                ID, filename="{}_{}.png".format(ID, mode))
        return summaries

    def find_pathways_by_gene(self, gene_name, match="exact"):
        """Returns pathways that contain the gene name

        ::

            ke.find_pathways_by_gene("ysgA")

        As a side effect, ``self.keggid`` and ``self.gene_names`` are set to
        the identifiers and names parsed from the KEGG gene listing.

        :param gene_name: name of the gene to search for.
        :param match: "exact" to stop at the first gene whose name equals
            *gene_name*; any other value selects all genes whose name
            contains *gene_name*.
        :return: list of pathway identifiers (empty if the gene is not
            found).
        """
        # First let us find the kegg ID
        # NOTE(review): assumes the gene name is in the second
        # tab-separated column of the KEGG listing -- TODO confirm
        genes = self.kegg.list(self.kegg.organism).strip().split("\n")
        keggid = [x.split("\t")[0].strip() for x in genes]
        gene_names = [x.split("\t")[1].split(";")[0].strip() for x in genes]

        self.keggid = keggid
        self.gene_names = gene_names

        # candidates always is a list of identifiers without the organism
        # prefix, so that an unsuccessful exact search simply gives no
        # pathway instead of failing on an unhashable empty list
        exact = match == "exact"
        candidates = []
        for x, y in zip(keggid, gene_names):
            if exact:
                if gene_name == y:
                    candidates.append(x.split(":")[1])
                    break
            elif gene_name in y:
                candidates.append(x.split(":")[1])

        if not exact:
            logger.info("Found {} candidate(s): {}".format(
                len(candidates), candidates))
        elif candidates:
            logger.info("Found {} in {}".format(gene_name, candidates[0]))
        else:
            logger.info("Found no candidate for {}".format(gene_name))

        return [
            key for key, pathway in self.pathways.items()
            if "GENE" in pathway
            and any(candidate in pathway['GENE'] for candidate in candidates)
        ]