def getPathway(org, compare):
    """Group coloured KEGG entries by the pathways they share.

    Reads ``<org>_pathway.txt`` and keeps every 5-digit number found in it,
    reads ``<compare>.txt`` through ``read_doc`` and queries KEGG for the
    ``PATHWAY`` section of every entry listed there.

    :param org: prefix of the ``<org>_pathway.txt`` file listing the pathway
        numbers to keep.
    :param compare: prefix of the ``<compare>.txt`` file handed to
        ``read_doc``.
        NOTE(review): the code consumes the result as a flat sequence that
        alternates KEGG id and colour -- confirm against ``read_doc``.
    :return: tuple ``(kegg_color, final_com)``. ``kegg_color`` maps each id
        to ``"colour,colour"``. ``final_com`` maps a pathway id without its
        first three characters to the list of ids found in that pathway;
        only pathways hit by more than one id and present in the organism
        file are kept.
    """
    # Close the file deterministically and use a set for O(1) membership.
    with open('%s_pathway.txt' % org, "r") as handle:
        organism_pathways = set(re.findall(r'\d{5}', handle.read()))

    text = read_doc('%s.txt' % compare)
    kegg_color = {}
    entry_pathways = {}
    pathway_hits = []
    s = KEGG()
    final_com = {}

    # Items are consumed pairwise: identifier followed by its colour.
    for i in range(0, len(text) - 1, 2):
        kegg_color[text[i]] = text[i + 1] + ',' + text[i + 1]

    for entry_id in kegg_color:
        raw = s.get(entry_id)
        parsed = s.parse(raw)
        try:
            if 'PATHWAY' in parsed:
                entry_pathways[entry_id] = list(parsed['PATHWAY'].keys())
                pathway_hits.extend(entry_pathways[entry_id])
        except TypeError:
            # ``parsed`` was not a container, so ``raw`` may not be a string
            # either; convert it before concatenating so that reporting the
            # problem cannot itself raise a TypeError.
            print('Error:' + str(raw))

    shared = [
        pathway_id
        for pathway_id, count in Counter(pathway_hits).items()
        if count > 1 and pathway_id[3:] in organism_pathways
    ]
    for pathway_id in shared:
        final_com[pathway_id[3:]] = [
            entry_id
            for entry_id, pathways in entry_pathways.items()
            if pathway_id in pathways
        ]
    return kegg_color, final_com
def get_pathway(pathway):
    """Fetch a KEGG pathway entry and return its name and gene table.

    :param pathway: identifier handed to ``KEGG.get``.
    :return: the value returned by ``KEGG.get`` unchanged when it is an
        ``int`` (presumably an HTTP status code for a failed request --
        confirm against bioservices); otherwise the tuple
        ``(NAME, GENE)`` taken from the parsed entry.
    """
    kegg = KEGG()
    raw = kegg.get(pathway)
    if type(raw) is int:
        return raw
    parsed = kegg.parse(raw)
    return parsed['NAME'], parsed['GENE']
def teste4():
    """Return the PATHWAY table of the first KEGG module listed for human.

    The COMPOUND and NAME sections are looked up as well although they are
    not returned, so a module lacking either one still raises ``KeyError``.
    """
    kegg = KEGG()
    kegg.organism = "hsa"  # Homo sapiens (human)
    module_ids = kegg.moduleIds  # pathway modules
    first_module = kegg.parse(kegg.get(module_ids[0]))
    # e.g. {'C00074': 'Phosphoenolpyruvate', ...}
    compound_names = first_module["COMPOUND"]
    # e.g. {'map00010': 'Glycolysis / Gluconeogenesis', ...}
    pathways = first_module["PATHWAY"]
    # e.g. ['Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate']
    module_title = first_module["NAME"]
    return pathways
def Get_Drug_IDs(Brite_ID):
    """Return the unique KEGG drug IDs referenced by a BRITE entry.

    The entry is fetched from KEGG, every ``<a>`` element is collected and
    all ``D`` + five digit identifiers found in those elements are kept.

    :param Brite_ID: KEGG BRITE identifier handed to ``KEGG.get``.
    :return: numpy array of the distinct drug IDs (``np.unique`` returns
        them sorted).
    """
    # verbose must be the boolean False: the string "False" is truthy.
    k = KEGG(verbose=False)
    brite_page = k.get(Brite_ID)
    # 'utf-8' is the intended encoding name ('utf=8' is not a valid codec).
    e = easyXML(brite_page, 'utf-8')
    anchors = e.soup.findChildren("a")
    # The same drug may be linked several times, hence the de-duplication.
    all_drug_ids = re.findall(r"(D\d{5})", str(anchors))
    return np.unique(np.array(all_drug_ids))
def teste5():
    """Print the name of module M00627 if it is the pentose phosphate cycle.

    Prints ``"haha"`` when the module name does not contain
    ``"Pentose phosphate cycle"``.
    """
    kegg = KEGG()
    kegg.organism = "hsa"  # Homo sapiens (human)
    module_ids = kegg.moduleIds  # pathway modules
    module = kegg.parse(kegg.get("M00627"))
    title = module["NAME"][0]
    # Looked up but not used; kept so a missing REACTION section still
    # raises KeyError.
    reactions = module["REACTION"]
    message = title if "Pentose phosphate cycle" in title else "haha"
    print(message)
def build_csv(self, filename=None, Nmax=None):
    """Rebuild the dataframe of KEGG organisms and store it as ``self.df``.

    One KEGG request is issued per organism, so a full rebuild is slow
    (about one hour according to the original author).

    :param filename: when given, the dataframe is also written to this CSV
        file.
    :param Nmax: stop after this many organisms (for testing).
    """
    logger.info("Retrieving the kegg organisms and their definitions")
    from bioservices import KEGG
    import pandas as pd

    k = KEGG()
    results = []
    definition = []
    for i, item in enumerate(k.organismIds):
        # Fetch and parse each genome entry once; NAME and DEFINITION both
        # come from the same record, so a second request is wasted time.
        entry = k.parse(k.get(f"gn:{item}"))
        results.append(entry['NAME'])
        definition.append(entry['DEFINITION'])
        print(i, Nmax)
        if Nmax and i + 1 >= Nmax:
            break

    # NAME is a list whose first item is a comma-separated record:
    # first field -> ID, last field -> taxon, middle field (only when there
    # are exactly three fields) -> short name.
    fields = [x[0].split(",") for x in results]
    IDs = [parts[0] for parts in fields]
    taxon = [parts[-1] for parts in fields]
    names = [
        parts[1].strip() if len(parts) == 3 else None for parts in fields
    ]

    df = pd.DataFrame({
        'ID': IDs,
        'taxon': taxon,
        'name': names,
        'def': definition
    })
    df = df.fillna("")
    df.columns = ['ID', 'taxon', 'shortname', 'definition']
    df['definition'] = [x.lower() for x in df.definition]
    df['shortname'] = [x.lower() for x in df.shortname]
    self.df = df
    if filename:
        df.to_csv(filename)
def target_paths(target_dict):
    """Look up the KEGG pathways of every target gene.

    :param target_dict: mapping of target name to its KEGG gene identifier.
        Identifiers of length 0 or 1 are treated as "no data".
    :return: dict mapping each target name to the list of pathway ids found
        in the PATHWAY section of its KEGG entry, or to ``" "`` when the
        identifier is missing or the entry has no PATHWAY section.
    """
    k = KEGG(verbose=False)
    gene_path = {}
    for target_name, HSA in target_dict.items():
        # Skip targets without a usable identifier.
        if len(HSA) <= 1:
            gene_path[target_name] = " "
            continue
        # KEGG gene page for the lower-cased identifier.
        d = k.parse(k.get(HSA.lower()))
        if "PATHWAY" in d.keys():
            gene_path[target_name] = list(d["PATHWAY"].keys())
        else:
            gene_path[target_name] = " "
    return gene_path
def teste2():
    """Tokenise the reactions of the fourth KEGG module listed for human.

    Prints the module id, then returns a dict mapping each reaction id of
    that module to its equation string split on single spaces.
    """
    kegg = KEGG()
    kegg.organism = "hsa"
    module_ids = kegg.moduleIds
    print(module_ids[3])
    reactions = kegg.parse(kegg.get(module_ids[3]))["REACTION"]
    tokens_by_reaction = {}
    for reaction_id in reactions:
        tokens_by_reaction[reaction_id] = reactions[reaction_id].split(" ")
    return tokens_by_reaction
def teste6():
    """Tokenise the reactions of four fixed KEGG modules.

    Returns a dict mapping every reaction id found in modules M00001,
    M00002, M00013 and M00034 to its equation string split on single
    spaces. A reaction id present in several modules keeps the value from
    the last module processed.
    """
    kegg = KEGG()
    kegg.organism = "hsa"
    tokens_by_reaction = {}
    for module_id in ["M00001", "M00002", "M00013", "M00034"]:
        reactions = kegg.parse(kegg.get(module_id))["REACTION"]
        for reaction_id in reactions:
            equation = reactions[reaction_id]
            tokens_by_reaction[reaction_id] = equation.split(" ")
    return tokens_by_reaction
def kegg_get(*args):
    """Return ``KEGG().get(*args)``, memoised in memory and on disk.

    The cache lives on the function object (``kegg_get.cache``) and is
    persisted to ``kegg_get.cache`` in the current working directory after
    every new request. A cached ``None`` is treated as a miss and fetched
    again.

    :param args: positional arguments forwarded to ``KEGG.get``; the tuple
        of arguments is the cache key.
    :return: the cached or freshly fetched KEGG response.
    """
    if not hasattr(kegg_get, "cache"):
        if os.path.isfile("kegg_get.cache"):
            # NOTE: unpickling executes arbitrary code; only load a cache
            # file this program wrote itself.
            with open("kegg_get.cache", "rb") as f:
                kegg_get.cache = pickle.load(f)
        else:
            kegg_get.cache = {}

    cached = kegg_get.cache.get(args)
    if cached is not None:
        return cached

    result = KEGG().get(*args)
    kegg_get.cache[args] = result
    # Write to a temporary file first so an interrupted dump cannot corrupt
    # the existing cache, then swap it in. os.replace overwrites the target
    # on every platform, whereas os.rename fails on Windows when the target
    # already exists.
    with open("kegg_get.cache~", "wb") as f:
        pickle.dump(kegg_get.cache, f)
    os.replace("kegg_get.cache~", "kegg_get.cache")
    return result
def drug_dict(disease):
    """Return the DRUG section of a KEGG disease entry.

    :param disease: KEGG disease identifier handed to ``KEGG.get``.
    :return: the value stored under ``"DRUG"`` in the parsed entry
        (therapeutic drug information). Raises ``KeyError`` when the entry
        has no DRUG section.
    """
    kegg = KEGG(verbose=False)
    # KEGG.parse turns the flat-file text returned by KEGG.get into a dict.
    parsed = kegg.parse(kegg.get(disease))
    return parsed["DRUG"]
class KeggInfo:
    """Collect KEGG annotations for the genes of an ``AnnotationTable``.

    Attributes set by ``__init__``:
      * ``k`` -- the KEGG service object
      * ``org`` -- KEGG organism prefix used to build gene identifiers
      * ``genelist`` -- genes copied from the annotation table
      * ``genedict`` -- gene -> definition, filled by ``get_info``
      * ``targetdict`` -- sequence -> list of genes
    """

    DEFAULT_OUTPUT = "/Users/brianmendoza/Desktop/CRISPRs/lac_multi_data.txt"

    def __init__(self):
        self.k = KEGG()
        self.org = "lac"
        self.genelist = []
        self.genedict = {}
        self.a = AnnotationTable()
        # listed as sequence: list of genes
        self.targetdict = self.a.analyze_sequences()
        for gene in self.a.genes:
            self.genelist.append(gene)
        # get_info() is not called here; call it per gene to fill genedict.

    def get_info(self, gene):
        """Fetch the KEGG entry of *gene* and record its definition.

        :param gene: gene identifier, appended to ``self.org + ":"``.
        """
        entry_id = self.org + ":" + gene
        d = self.k.parse(self.k.get(entry_id))
        # ortho and pfam are collected for the (currently disabled) detailed
        # report; only the definition is stored.
        ortho = "unknown"
        pfam = "unknown"
        # Drops the first 9 characters of the definition's string form --
        # presumably a fixed prefix; confirm against real KEGG output.
        definition = str(d['DEFINITION'])[9:]
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if 'ORTHOLOGY' in d:
            ortho = str(d['ORTHOLOGY'])
        if 'MOTIF' in d:
            motif = d['MOTIF']
            if 'Pfam' in motif:
                pfam = str(motif['Pfam'])
        self.genedict[gene] = definition
        # Function-call form prints the same text on Python 2 and 3.
        print(gene + " info obtained")

    def make_file(self, path=DEFAULT_OUTPUT):
        """Write one ``sequence;count;gene;gene...`` line per target sequence.

        :param path: output file; defaults to the location the original
            script used.
        """
        # The context manager closes the file even if a write fails.
        with open(path, 'w') as f:
            for sequence in self.targetdict:
                genes = self.targetdict[sequence]
                sequenceLine = sequence + ";" + str(len(genes))
                for gene in genes:
                    # The last two characters of each gene id are dropped.
                    sequenceLine += ";" + gene[0:-2]
                f.write(sequenceLine + "\n")
def drug_targets(drug_dic):
    """Collect the target genes of every drug listed in *drug_dic*.

    :param drug_dic: mapping whose values each contain a KEGG drug id
        (``D`` followed by five digits) somewhere in their string form; the
        first id found in each value is used.
    :return: tuple ``(target_gene_dic, gene_drug)``. ``target_gene_dic``
        maps a gene name to the text following ``" ["`` on its TARGET line
        with ``]`` characters stripped from both ends (or ``""`` when the
        TARGET text contains no space). ``gene_drug`` maps a gene name to
        the list of drug ids that target it.

    NOTE(review): the separators used below (``" PATHWAY"``, ``"\\n "``,
    ``" ["``) must match the whitespace of the KEGG flat file exactly --
    confirm against a real entry.
    """
    k = KEGG(verbose=False)
    target_gene_dic = {}
    gene_drug = {}

    # First drug id found in each value.
    id_list = [
        re.findall(r"(D\d{5})", str(value))[0] for value in drug_dic.values()
    ]

    for drug_ID in id_list:
        d = k.parse(k.get(drug_ID))
        # Drugs without target information are ignored.
        if "TARGET" not in d.keys():
            continue
        targ = d["TARGET"]
        # Everything from the pathway section onwards is discarded.
        no_paths_pre = targ.split(" PATHWAY")
        gene_list = no_paths_pre[0].split("\n ")

        if targ.count(" ") > 0:
            for x in gene_list:
                # Separate the gene name from the bracketed identifier.
                gene_split = x.split(" [")
                gene_name = gene_split[0].split(" ")[0]
                target_gene_dic[gene_name] = gene_split[1].strip("]")
                gene_drug.setdefault(gene_name, []).append(drug_ID)
        else:
            # No identifier available: record the gene with an empty value.
            target_gene_dic[no_paths_pre[0]] = ""
            for x in gene_list:
                gene_name = x.split(" [")[0].split(" ")[0]
                gene_drug.setdefault(gene_name, []).append(drug_ID)

    return target_gene_dic, gene_drug
# M: total number of objects, # n: total number of type I objects # N: total number of type I objects drawn without replacement kegg_pathways_hyper = {p: (hypergeom.sf( len(subnetwork_proteins.intersection(kegg_pathways_proteins[p])), len(network_proteins), len(network_proteins.intersection(kegg_pathways_proteins[p])), len(subnetwork_proteins) ), len(subnetwork_proteins.intersection(kegg_pathways_proteins[p]))) for p in kegg_pathways_proteins if len(subnetwork_proteins.intersection(kegg_pathways_proteins[p])) > 0} print '[INFO] KEGG pathways enrichment done' kegg_pathways_hyper = DataFrame(kegg_pathways_hyper, index=['pvalue', 'intersection']).T kegg_pathways_hyper['adj.pvalue'] = multipletests(kegg_pathways_hyper['pvalue'], method='fdr_bh')[1] kegg_pathways_hyper = kegg_pathways_hyper.sort('adj.pvalue', ascending=False) kegg_pathways_hyper = kegg_pathways_hyper[kegg_pathways_hyper['adj.pvalue'] < 0.05] kegg_pathways_hyper['name'] = [re.findall('NAME\s*(.*) - Mus musculus\n?', kegg.get(p))[0] for p in kegg_pathways_hyper.index] kegg_pathways_hyper = kegg_pathways_hyper[kegg_pathways_hyper['adj.pvalue'] != 0.0] # Plot PPI network of the Kegg pathways for set_id, set_name in kegg_pathways_hyper['name'].tail(10).to_dict().items(): subnetwork_i.vs['label'] = [n.split('_')[0] if n in kegg_pathways_proteins[set_id] else '' for n in subnetwork_i.vs['name']] subnetwork_i.vs['shape'] = ['circle' for n in subnetwork_i.vs['name']] subnetwork_i.vs['color'] = [palette[hypothesis][1] if n in kegg_pathways_proteins[set_id] else palette[hypothesis][0] for n in subnetwork_i.vs['name']] subnetwork_i.vs['size'] = [17 if n in kegg_pathways_proteins[set_id] else 7 for n in subnetwork_i.vs['name']] igraph.plot( subnetwork_i, layout=layout, vertex_label_size=5, vertex_label_color='white', bbox=(0, 0, 360, 360),
def test_KEGGParser():
    """Exercise ``KEGG.parse`` on one entry of each database and on known
    problem entries.

    The first loop only checks that parsing does not raise. The later
    sections pin the parsed SEQUENCE content of two compounds and check that
    a list of previously failing ids is actually parsed (``parse`` must not
    hand back its input object unchanged).
    """
    s = KEGG()

    smoke_ids = (
        "cpd:C00001", "ds:H00001", "dr:D00001", "ev:E00001", "ec:1.1.1.1",
        "hsa:1525", "genome:T00001", "gl:G00001", "md:hsa_M00554",
        "ko:K00001", "path:hsa04914", "rc:RC00001", "rn:R00001",
        "rp:RP00001",
    )
    for smoke_id in smoke_ids:
        d = s.parse(s.get(smoke_id))

    d = s.parse(s.get('C15682'))
    assert d['SEQUENCE'][0]['TYPE'] == 'PK'
    expected_gene = "0-2 mycAI [UP:Q83WF0]; 3 mycAII [UP:Q83WE9]; 4-5 mycAIII [UP:Q83WE8]; 6 mycAIV [UP:Q83WE7]; 7 mycAV [UP:Q83WE6]"
    assert d['SEQUENCE'][0]['GENE'] == expected_gene
    assert d['SEQUENCE'][0]['ORGANISM'] == "Micromonospora griseorubida"

    # issue #79
    d = s.parse(s.get("C00395"))
    assert d["SEQUENCE"][0]["GENE"] == '[1] 0-2 pcbAB [UP:P19787] [2] 0-2 pcbAB [UP:P27742]'
    expected_organism = '[1] Penicillium chrysogenum [2] Emericella nidulans (Aspergillus nidulans [GN:ani] )'
    assert d["SEQUENCE"][0]["ORGANISM"] == expected_organism
    assert d['SEQUENCE'][0]['SEQUENCE'] == '0 Aad 1 Cys 2 Val'
    assert d['SEQUENCE'][0]['TYPE'] == "NRP"

    # issue #85
    bad_ids = [
        'D00136', 'D05177', 'D10223', 'H00434', 'H01656', 'H01663',
        'T40092', 'T40093', 'T40094', 'T40098', 'T40100', 'T40103',
        'T40107', 'T40123', 'T40129', 'T40136', 'T40139', 'T40145',
        'T40147', 'T40148', 'T40149', 'T40161', 'T40162', 'T40180',
        'T40182', 'T40183', 'T40195', 'T40196', 'T40197', 'T40208',
        'T40209', 'T40210', 'T40213', 'T40215', 'T40219', 'T40224',
        'T40226', 'T40233', 'T40236', 'T40238', 'T40242', 'T40248',
        'T40249', 'T40255', 'T40256', 'T40258', 'T40284', 'T40285',
        'T40286', 'T40287', 'T40288', 'T40289', 'T40295', 'T40303',
        'T40307', 'T40314', 'T40318', 'T40326', 'K20201',
    ]
    for entry_id in bad_ids:
        entry = s.get(entry_id)
        assert s.parse(entry) is not entry
class KeggPathwayEnrichment():
    """DRAFT IN PROGRESS

    KEGG pathway enrichment of the gene lists produced by an rnadiff
    analysis.

    Current input is the output of the rnadiff analysis

    ::

        pe = KeggPathwayEnrichment("rnadiff", "eco")
        pe.barplot(pe.enrichment['down'])

        # Save all deregulated pathways found by the enrichment; each call
        # returns a dictionary {pathway ID: summary dataframe}:
        up = pe.save_significant_pathways("up")
        down = pe.save_significant_pathways("down")
        for ID, summary in up.items():
            summary.to_csv("kegg_pathway_up_regulated_{}.csv".format(ID))
        for ID, summary in down.items():
            summary.to_csv("kegg_pathway_down_regulated_{}.csv".format(ID))

    This is transparent for ecoli: set the organism to eco.

    For mus musculus, you will need to convert the input gene IDs into
    KEGG identifiers so as to compare them. The *mapper* argument is a
    dataframe indexed by the input gene IDs with a ``name`` column holding
    the name used by KEGG. It can be built with BioMart::

        from bioservices import BioMart
        b = BioMart()
        datasets = b.get_datasets("ENSEMBL_MART_ENSEMBL")
        [x for x in datasets if 'mus' in x]
        # -> one of interest is obviously mmusculus_gene_ensembl
        attributes = b.attributes(dataset='mmusculus_gene_ensembl')
        filters = b.filters(dataset='mmusculus_gene_ensembl')
        b.new_query()
        b.add_dataset_to_xml('mmusculus_gene_ensembl')
        b.add_attribute_to_xml('ensembl_gene_id')
        b.add_attribute_to_xml('go_id')
        b.add_attribute_to_xml('entrezgene_id')
        b.add_attribute_to_xml('mgi_id')
        b.add_attribute_to_xml('external_gene_name')
        xml = b.get_xml()
        res = b.query(xml)

        import pandas as pd
        # build the dataframe ``df`` from ``res`` (step not shown in the
        # original notes), then:
        df.columns=['ensembl','go', 'entrez', 'mgi', 'name']
        df = df.set_index('ensembl')
        # name should be the name used by kegg

    """

    def __init__(self,
                 folder,
                 organism,
                 alpha=0.05,
                 log2_fc=0,
                 progress=True,
                 mapper=None,
                 background=None):
        """
        :param folder: rnadiff results folder handed to ``RNADiffResults``.
        :param organism: KEGG organism code (e.g. ``eco``).
        :param alpha: forwarded to ``RNADiffResults``.
        :param log2_fc: forwarded to ``RNADiffResults``.
        :param progress: show a progress bar while loading the pathways.
        :param mapper: optional dataframe mapping input gene IDs (index) to
            KEGG names (``name`` column).
        :param background: number of genes used as background; by default
            the number of genes KEGG lists for the organism.
        """
        print("DRAFT in progress")
        from bioservices import KEGG
        self.kegg = KEGG(cache=True)
        self.kegg.organism = organism

        self.rnadiff = RNADiffResults(folder, alpha=alpha, log2_fc=log2_fc)

        # some clean up: drop the "gene:" prefix everywhere
        if "ID" in self.rnadiff.df.columns:
            self.rnadiff.df['ID'] = [
                x.replace("gene:", "") for x in self.rnadiff.df['ID']
            ]
        self.rnadiff.df.index = [
            x.replace("gene:", "") for x in self.rnadiff.df.index
        ]
        for key, values in self.rnadiff.gene_lists.items():
            self.rnadiff.gene_lists[key] = [
                x.replace("gene:", "") for x in values
            ]

        if background:
            self.background = background
        else:
            # strip() first (as find_pathways_by_gene does) so a trailing
            # newline in the listing is not counted as an extra gene.
            self.background = len(
                self.kegg.list(self.kegg.organism).strip().split("\n"))
        logger.info("Set number of genes to {}".format(self.background))

        self._load_pathways(progress=progress)

        self.mapper = mapper

        # Best effort: the object remains usable (pathways are loaded) even
        # if the enrichment itself fails.
        try:
            self.compute_enrichment()
        except Exception:
            logger.critical("An error occured while computing enrichments")

    def _load_pathways(self, progress=True):
        """Download and parse all pathways of the organism once for all.

        Fills ``self.pathways`` (ID -> parsed entry), ``self.gene_sets``
        (ID -> list of gene names) and ``self.df_pathways``.
        """
        logger.info(
            "loading all pathways from KEGG. may take time the first time")
        self.pathways = {}
        from easydev import Progress
        pb = Progress(len(self.kegg.pathwayIds))
        for i, ID in enumerate(self.kegg.pathwayIds):
            self.pathways[ID.replace("path:",
                                     "")] = self.kegg.parse(self.kegg.get(ID))
            if progress:
                pb.animate(i + 1)

        # Some cleanup: keep the part of the name before the first " - "
        for ID in self.pathways.keys():
            name = self.pathways[ID]['NAME'][0]
            self.pathways[ID]['NAME'] = name.split(" - ", 1)[0]

        # save gene sets
        self.gene_sets = {}
        for ID in self.pathways.keys():
            res = self.pathways[ID]
            if "GENE" in res.keys():
                results = []
                # some pathways report genes as a dictionary
                # id:'gene name; description' (e.g. eco);
                # others report genes as a dictionary id:'description'
                for geneID, description in res['GENE'].items():
                    if ";" in description:
                        name = description.split(';')[0]
                    else:
                        name = geneID
                    results.append(name)
                self.gene_sets[ID] = results
            else:
                print("SKIPPED (no genes) {}: {}".format(ID, res['NAME']))

        # save all pathways info
        self.df_pathways = pd.DataFrame(self.pathways).T
        del self.df_pathways["ENTRY"]
        del self.df_pathways["REFERENCE"]
        go = [
            x['GO'] if isinstance(x, dict) and 'GO' in x.keys() else None
            for x in self.df_pathways.DBLINKS
        ]
        self.df_pathways['GO'] = go
        del self.df_pathways["DBLINKS"]

    def plot_genesets_hist(self, bins=20):
        """Plot the histogram of the gene set sizes."""
        N = len(self.gene_sets.keys())
        pylab.clf()
        pylab.hist([len(v) for k, v in self.gene_sets.items()],
                   bins=bins,
                   lw=1,
                   ec="k")
        pylab.title("{} gene sets".format(N))
        pylab.xlabel("Gene set sizes")
        pylab.grid(True)
        a, b = pylab.xlim()
        pylab.xlim([0, b])

    def compute_enrichment(self, background=None):
        """Compute the up, down and all enrichments into ``self.enrichment``."""
        if background is None:
            background = self.background
        self.enrichment = {}
        self.enrichment['up'] = self._enrichr("up", background=background)
        self.enrichment['down'] = self._enrichr("down", background=background)
        self.enrichment['all'] = self._enrichr("all", background=background)

    def _enrichr(self, category, background=None, verbose=True):
        """Run ``gseapy.enrichr`` against the KEGG gene sets.

        :param category: ``up``, ``down`` or ``all``, or an explicit list of
            genes.
        :param background: number of background genes (defaults to
            ``self.background``).
        :return: the object returned by ``gseapy.enrichr``.
        """
        if background is None:
            background = self.background

        if isinstance(category, list):
            gene_list = category
        else:
            assert category in ['up', 'down', 'all']
            gene_list = list(self.rnadiff.gene_lists[category])

        if self.mapper is not None:
            logger.info("Input gene list of {} ids".format(len(gene_list)))
            identifiers = self.mapper.loc[gene_list]['name'].drop_duplicates(
            ).values
            logger.info("Mapped gene list of {} ids".format(len(identifiers)))
            gene_list = list(identifiers)

        enr = gseapy.enrichr(gene_list=gene_list,
                             gene_sets=self.gene_sets,
                             verbose=verbose,
                             background=background,
                             outdir="test",
                             no_plot=True)

        return enr

    def _get_final_df(self, df, cutoff=0.05, nmax=10):
        """Return the *nmax* most significant pathways of *df*.

        Adds the ``name`` and ``size`` columns, keeps the rows with an
        adjusted p-value <= *cutoff*, keeps at most *nmax* of the smallest
        adjusted p-values and returns them sorted by decreasing adjusted
        p-value (convenient for horizontal bar plots).
        """
        df = df.copy()
        df['name'] = [self.pathways[x]['NAME'] for x in df.Term]
        df['size'] = [len(x.split(";")) for x in df.Genes]
        df = df.sort_values("Adjusted P-value")
        df.reset_index(drop=True, inplace=True)
        df = df[df["Adjusted P-value"] <= cutoff]

        if len(df) < nmax:
            nmax = len(df)
        df = df.iloc[0:nmax]
        df = df.sort_values("Adjusted P-value", ascending=False)
        return df

    def barplot(self, enrich, cutoff=0.05, nmax=10):
        """Bar plot of -log10(adjusted p-value) of the selected pathways.

        :return: the dataframe that was plotted.
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.barh(range(len(df)), -pylab.log10(df['Adjusted P-value']))
        pylab.yticks(range(len(df)), df.name)
        # 1.3 is about -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.grid(True)
        pylab.xlabel("Adjusted p-value (log10)")
        pylab.ylabel("Gene sets")
        a, b = pylab.xlim()
        pylab.xlim([0, b])
        pylab.tight_layout()
        return df

    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=[]):
        """Scatter plot of the selected pathways, sized by gene-set size.

        *gene_set_size* is currently unused (and never mutated).

        :return: the dataframe that was plotted.
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      range(len(df)),
                      s=10 * df['size'],
                      c=df['size'])

        # NOTE(review): the x values are -log10(adjusted p-value), the same
        # quantity as in barplot; the label below looks wrong -- confirm.
        pylab.xlabel("Odd ratio")
        pylab.ylabel("Gene sets")
        pylab.yticks(range(len(df)), df.name)
        a, b = pylab.xlim()
        pylab.xlim([0, b])
        pylab.grid(True)
        ax = pylab.gca()

        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls="")
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        ax = pylab.colorbar(pylab.gci())
        return df

    def _get_summary_pathway(self, pathway_ID):
        """Summarise the regulation of every gene of a pathway.

        :return: dataframe with one row per gene of the pathway and the
            columns type (``+`` up, ``-`` down, ``=`` otherwise), name,
            pvalue (-log10 of the p-value), fc (log2 fold change), keggid
            and description.
        """
        genes = self.df_pathways.loc[pathway_ID]['GENE']
        df_down = self.rnadiff.df.query(
            "padj<=0.05 and log2FoldChange<0").copy()
        df_up = self.rnadiff.df.query("padj<=0.05 and log2FoldChange>0").copy()

        logger.info("Total down-regulated: {}".format(len(df_down)))
        logger.info("Total up-regulated: {}".format(len(df_up)))

        # gene name (text before the first ";") -> KEGG id
        mapper = {}
        for k, v in genes.items():
            mapper[v.split(";")[0]] = k
        self.genes = genes
        self.df_down = df_down
        self.df_up = df_up
        summary_names = []
        summary_keggids = []
        summary_types = []
        summary_pvalues = []
        summary_fcs = []

        if self.mapper is not None:
            if 'Name' not in df_down.columns:
                df_down['Name'] = df_down['ID']
                Names = []
                for index in df_down.index:
                    Names.append(self.mapper.loc[index]['name'][0])
                df_down['Name'] = Names
            if 'Name' not in df_up.columns:
                df_up['Name'] = df_up['ID']
                Names = []
                for index in df_up.index:
                    Names.append(self.mapper.loc[index]['name'][0])
                df_up['Name'] = Names

        for name, kegg_id in mapper.items():
            summary_names.append(name)
            summary_keggids.append(kegg_id)

            if name.lower() in [x.lower() for x in df_down.Name]:
                pvalue = -pylab.log10(
                    df_down.query("Name==@name").pvalue.values[0])
                fc = df_down.query("Name==@name").log2FoldChange.values[0]
                summary_fcs.append(fc)
                summary_pvalues.append(pvalue)
                summary_types.append("-")
            elif name.lower() in [x.lower() for x in df_up.Name]:
                pvalue = -pylab.log10(
                    df_up.query("Name==@name").pvalue.values[0])
                summary_pvalues.append(pvalue)
                fc = df_up.query("Name==@name").log2FoldChange.values[0]
                summary_fcs.append(fc)
                summary_types.append("+")
            else:
                summary_pvalues.append(None)
                summary_fcs.append(None)
                summary_types.append("=")

        summary = pd.DataFrame({
            "type": summary_types,
            "name": summary_names,
            "pvalue": summary_pvalues,
            "fc": summary_fcs,
            "keggid": summary_keggids
        })
        summary['description'] = [
            self.pathways[pathway_ID]['GENE'][x] for x in summary.keggid
        ]
        return summary

    def _get_colors(self, summary):
        """Map each KEGG id of *summary* to a ``background,foreground`` colour.

        Down-regulated genes get orange/red shades, up-regulated genes get
        green shades (darker as the -log10 p-value grows) and the others
        are grey.
        """
        colors = {}
        for index, row in summary.iterrows():
            pvalue = row['pvalue']
            type_ = row['type']
            kegg_id = row['keggid']
            if type_ == "-":
                if pvalue > 0 and pvalue < 5:
                    colors[kegg_id] = "#FF8C00,black"
                elif pvalue < 10:
                    colors[kegg_id] = "#FF0000,black"
                else:
                    # Plain comma like every other entry: the value is sent
                    # as POST data, which is URL-encoded by requests, so a
                    # pre-encoded "%2C" would be encoded a second time.
                    colors[kegg_id] = "#B22222,black"
            elif type_ == "+":
                if pvalue > 0 and pvalue < 5:
                    colors[kegg_id] = "#9ACD32,black"
                elif pvalue < 10:
                    colors[kegg_id] = "#008000,black"
                else:
                    colors[kegg_id] = "#006400,#000000"
            else:
                colors[kegg_id] = "grey,black"
        return colors

    def save_pathway(self, pathway_ID, scale=None, show=False, filename=None):
        """Download the coloured PNG of a pathway from the KEGG web site.

        *scale* and *show* are currently unused.

        :param pathway_ID: pathway identifier as stored in ``self.pathways``.
        :param filename: output file; defaults to ``<pathway_ID>.png``.
        :return: the summary dataframe of the pathway.
        """
        summary = self._get_summary_pathway(pathway_ID)
        colors = self._get_colors(summary)

        logger.info("pathway {} total genes: {}".format(
            pathway_ID, len(summary)))
        count_up = len(summary.query("type == '+'"))
        count_down = len(summary.query("type == '-'"))
        logger.info("this pathway down-regulared genes: {}".format(count_down))
        logger.info("this pathway up-regulated genes: {}".format(count_up))

        url = "https://www.kegg.jp/kegg-bin/show_pathway"
        # dcolor = "white" --> does not work with the post requests unlike
        # get requests
        params = {
            "map":
            pathway_ID,
            "multi_query":
            "\r\n".join(["{} {}".format(k, v) for k, v in colors.items()])
        }

        self.params = params
        import requests
        html_page = requests.post(url, data=params)

        self.tmp = html_page
        html_page = html_page.content.decode()

        # The image is the first src=... token of the page mentioning png.
        links_to_png = [
            x for x in html_page.split() if "png" in x and x.startswith("src")
        ]
        link_to_png = links_to_png[0].replace("src=", "").replace('"', '')
        r = requests.get("https://www.kegg.jp/{}".format(link_to_png))

        if filename is None:
            filename = "{}.png".format(pathway_ID)

        with open(filename, "wb") as fout:
            fout.write(r.content)

        return summary

    def save_all_pathways(self):  #pragma: no cover
        """Save every pathway of the organism (no enrichment involved)."""
        for ID in self.kegg.pathwayIds:
            # self.pathways / self.df_pathways are keyed without the
            # "path:" prefix (see _load_pathways).
            self.save_pathway(ID.replace("path:", ""))

    def save_significant_pathways(self,
                                  mode,
                                  cutoff=0.05,
                                  nmax=20,
                                  background=None):  #pragma: no cover
        """Save the images of the significantly enriched pathways.

        :param mode: should be up, down or all.
        :return: dictionary pathway ID -> summary dataframe.
        """
        if background is None:
            background = self.background

        # select the relevant pathways
        df = self._enrichr(mode, background).results
        df = self._get_final_df(df, cutoff=cutoff, nmax=nmax)
        logger.warning("Found {} pathways to save".format(len(df)))
        if len(df) == nmax:
            logger.warning("Restricted pathways to {}".format(nmax))

        logger.info("saving {} deregulated pathways".format(len(df)))

        summaries = {}
        # save them
        for ID in df['Term']:
            summary = self.save_pathway(ID,
                                        filename="{}_{}.png".format(ID, mode))
            summaries[ID] = summary
        return summaries

    def find_pathways_by_gene(self, gene_name, match="exact"):
        """Returns pathways that contain the gene name

        ::

            ke.find_pathways_by_gene("ysgA")

        :param gene_name: gene name to look for.
        :param match: ``"exact"`` keeps the first gene whose name equals
            *gene_name*; any other value keeps every gene whose name
            contains *gene_name*.
        :return: list of pathway IDs (empty when the gene is not found).
        """
        # First let us find the kegg ID
        genes = self.kegg.list(self.kegg.organism).strip().split("\n")

        keggid = [x.split("\t")[0].strip() for x in genes]
        gene_names = [x.split("\t")[1].split(";")[0].strip() for x in genes]

        self.keggid = keggid
        self.gene_names = gene_names

        exact = match == "exact"
        candidates = []
        for x, y in zip(keggid, gene_names):
            if exact:
                if gene_name == y:
                    candidates.append(x.split(":")[1])
                    break
            elif gene_name in y:
                candidates.append(x.split(":")[1])

        if not candidates:
            # Previously an exact search without a hit went on to test an
            # (unhashable) empty list against the GENE keys and crashed.
            logger.warning("No gene found for {}".format(gene_name))
            return []

        if exact:
            logger.info("Found {} in {}".format(gene_name, candidates[0]))
        else:
            logger.info("Found {} candidate(s): {}".format(
                len(candidates), candidates))

        paths = []
        for key in self.pathways.keys():
            if "GENE" in self.pathways[key]:
                gene_ids = self.pathways[key]['GENE'].keys()
                for candidate in candidates:
                    if candidate in gene_ids:
                        paths.append(key)
        return list(set(paths))
sita = ab_dict[gene][-1] sita_q = 'ncbi-proteinid:{g}'.format(g=sita[:-2]) if sita_q in convDb: ab_dict[gene].append(convDb[sita_q]) annotated.append(gene) else: no_joy_for_sita.append(sita) counter = 0 # get kegg pathways for gene in annotated: counter += 1 if (len(ab_dict[gene])) == 4: call, conf, sita, kSita = ab_dict[gene] keggObj = s.get(kSita) keggParse = s.parse(keggObj) ko = [] if 'ORTHOLOGY' in keggParse.keys(): ko = list(keggParse['ORTHOLOGY'].keys()) else: ko = ['None'] assert len(ko) == 1, '{ko:{k}\ngene:{g}'.format(ko=ko, g=gene) ab_dict[gene].append(ko[0]) if (counter % 1000) == 0: print('Finshed:{c}'.format(c=counter)) f = open('/home/ndh0004/Documents/keggPthCor/gene_dictv2.pckl', 'wb') pickle.dump(ab_dict, f) f.close()
"""
Created on Thu May 01 20:57:44 2018

@author: Tina Lai
"""
# TARGETS Quiz Instructions
# Based on the KEGG id ("br:br08329") generate a list of target genes and
# their pathways.
from bioservices import KEGG
from bioservices import easyXML
import re
import numpy as np
import os

# verbose must be the boolean False: the string "False" is truthy.
k = KEGG(verbose=False)
k_id = k.get("br:br08329")  # This pulls up the KEGG BRITE id file

# To parse through the file, wrap it in an easyXML object.
# 'utf-8' is the intended encoding name ('utf=8' is not a valid codec).
e = easyXML(k_id, 'utf-8')

# The drug IDs are tagged with "<a", so use the soup's findChildren to pull
# out the elements tagged with "<a".
results = e.soup.findChildren("a")

# The regular expression finds every drug ID mentioned in those elements,
# including redundant mentions of the same drug ID.
all_drug_ids = re.findall(r"(D\d{5})", str(results))

# Use numpy to keep the unique drug IDs: convert the list into an array,
# then apply numpy's unique function.
array = np.array(all_drug_ids)
unique_drug_ids = np.unique(array)
class MetabolicNetwork(MyGraph):
    """Graphs built from the reactions of KEGG modules.

    ``modules`` is a list of KEGG module ids; when it is anything other
    than a ``list`` it is replaced by every module id KEGG reports for the
    organism. All graph methods add their edges to the same ``self.gr``
    graph.
    """

    def __init__(self, modules, organism="hsa"):
        MyGraph.__init__(self, {})
        self.gr = MyGraph()
        self.modules = modules
        self.s = KEGG()
        self.s.organism = organism  # Homo sapiens as default

    def __kegg_dic(self):
        """Return {reaction id: equation tokens} for the selected modules.

        Each equation string is split on single spaces, e.g.
        ``'R01015': ['C00111', '->', 'C00118']`` or
        ``'R01070': ['C05378', '->', 'C00111', '+', 'C00118']``.
        Modules whose lookup raises ``KeyError`` (for instance no REACTION
        section) are skipped.
        """
        if type(self.modules) is not list:
            self.modules = self.s.moduleIds
        tokens_by_reaction = {}
        for module_id in self.modules:
            try:
                parsed = self.s.parse(self.s.get(module_id))
                reactions = parsed['REACTION']
                for reaction_id in reactions:
                    equation = reactions[reaction_id]
                    tokens_by_reaction[reaction_id] = equation.split(" ")
            except KeyError:
                pass
        return tokens_by_reaction

    def c_c_graph(self):
        """Compound-compound graph: one edge substrates -> products.

        Two compounds on the same side are merged into a single
        ``"A+B"`` node.
        """
        graph = self.gr
        for tokens in self.__kegg_dic().values():
            separator = tokens[1]
            if separator == "+":
                # Two substrates; try two products first and fall back to a
                # single product when the equation is too short.
                try:
                    products = "+".join([str(tokens[4]), str(tokens[6])])
                    substrates = "+".join([str(tokens[0]), str(tokens[2])])
                    graph.addEdge(substrates, products)
                except IndexError:
                    substrates = "+".join([str(tokens[0]), str(tokens[2])])
                    graph.addEdge(substrates, tokens[4])
            elif separator == "->":
                # One substrate; same two-then-one product fallback.
                try:
                    products = "+".join([str(tokens[2]), str(tokens[4])])
                    graph.addEdge(tokens[0], products)
                except IndexError:
                    graph.addEdge(tokens[0], tokens[2])
        return graph.printGraph()

    def r_r_graph(self):
        """Reaction-reaction graph.

        An edge k -> r is added when the product side of k (its last
        compound, or its last two compounds joined by ``+``) equals the
        leading substrate(s) of r.
        """
        reactions = self.__kegg_dic()
        graph = self.gr
        for source_id, source in reactions.items():
            last = len(source) - 1
            single_product = source[last - 1] == "->"
            for target_id, target in reactions.items():
                if single_product:
                    if source[last] == target[0]:
                        graph.addEdge(source_id, target_id)
                    continue
                product_pair = "+".join(
                    [str(source[last - 2]), str(source[last])])
                try:
                    substrate_pair = "+".join(
                        [str(target[0]), str(target[2])])
                    if product_pair == substrate_pair:
                        graph.addEdge(source_id, target_id)
                except IndexError:
                    pass
        return graph.printGraph()

    def r_c_graph(self):
        """Reaction-compound graph.

        For every pair (k, r) linked as in ``r_r_graph`` the edges
        k -> equation(k) -> r -> equation(r) are added, where
        ``equation(x)`` is the token list of x joined without separator.
        """
        reactions = self.__kegg_dic()
        graph = self.gr
        for source_id, source in reactions.items():
            last = len(source) - 1
            single_product = source[last - 1] == "->"
            for target_id, target in reactions.items():
                if single_product:
                    if source[last] == target[0]:
                        source_text = "".join(source)
                        target_text = "".join(target)
                        graph.addEdge(source_id, source_text)
                        graph.addEdge(source_text, target_id)
                        graph.addEdge(target_id, target_text)
                    continue
                product_pair = "+".join(
                    [str(source[last - 2]), str(source[last])])
                try:
                    substrate_pair = "+".join(
                        [str(target[0]), str(target[2])])
                    if product_pair == substrate_pair:
                        source_text = "".join(source)
                        target_text = "".join(target)
                        graph.addEdge(source_id, source_text)
                        graph.addEdge(source_text, target_id)
                        graph.addEdge(target_id, target_text)
                except IndexError:
                    pass
        return graph.printGraph()

    def modules_name(self):
        """Print ``<module id>-<module name>`` for every selected module."""
        if type(self.modules) is not list:
            self.modules = self.s.moduleIds
        for module_id in self.modules:
            parsed = self.s.parse(self.s.get(module_id))
            # e.g. ['Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate']
            title = parsed["NAME"][0]
            print("-".join([module_id, title]))

    def compounds_name(self):
        """Print each module id followed by its ``<compound id>-<name>`` lines."""
        if type(self.modules) is not list:
            self.modules = self.s.moduleIds
        for module_id in self.modules:
            print(module_id)
            parsed = self.s.parse(self.s.get(module_id))
            # e.g. {'C00074': 'Phosphoenolpyruvate', ...}
            compounds = parsed["COMPOUND"]
            for compound_id in compounds.keys():
                print("-".join([compound_id, compounds[compound_id]]))

    def pathway_name(self):
        """Print ``<pathway id>-<pathway name>`` for every selected module."""
        if type(self.modules) is not list:
            self.modules = self.s.moduleIds
        for module_id in self.modules:
            parsed = self.s.parse(self.s.get(module_id))
            # e.g. {'map00010': 'Glycolysis / Gluconeogenesis', ...}
            pathways = parsed["PATHWAY"]
            for pathway_id in pathways.keys():
                print("-".join([pathway_id, pathways[pathway_id]]))

    def nodes_degree(self):
        """Return ``allDegrees()`` of the underlying graph."""
        return self.gr.allDegrees()

    def clustering(self):
        """Return ``allClusteringCoefs()`` of the underlying graph."""
        return self.gr.allClusteringCoefs()

    def connections(self, n1, n2):
        """Return ``distance(n1, n2)`` in the underlying graph."""
        return self.gr.distance(n1, n2)
def test_KEGGParser():
    """Parse one entry of each KEGG database, then pin compound C15682.

    The loop only checks that ``KEGG.parse`` does not raise; the assertions
    check the parsed SEQUENCE section of C15682.
    """
    s = KEGG()
    smoke_ids = (
        "cpd:C00001", "ds:H00001", "dr:D00001", "ev:E00001", "ec:1.1.1.1",
        "hsa:1525", "genome:T00001", "gl:G00001", "md:hsa_M00554",
        "ko:K00001", "path:hsa04914", "rc:RC00001", "rn:R00001",
        "rp:RP00001",
    )
    for smoke_id in smoke_ids:
        d = s.parse(s.get(smoke_id))

    d = s.parse(s.get('C15682'))
    assert d['SEQUENCE'][0]['TYPE'] == 'PK'
    expected_gene = "0-2 mycAI [UP:Q83WF0]; 3 mycAII [UP:Q83WE9]; 4-5 mycAIII[UP:Q83WE8]; 6 mycAIV [UP:Q83WE7]; 7 mycAV [UP:Q83WE6]"
    assert d['SEQUENCE'][0]['GENE'] == expected_gene
    assert d['SEQUENCE'][0]['ORGANISM'] == "Micromonospora griseorubida"
def test_KEGGParser():
    """Smoke test: ``KEGG.parse`` must not raise on one entry per database."""
    s = KEGG()
    smoke_ids = (
        "cpd:C00001", "ds:H00001", "dr:D00001", "ev:E00001", "ec:1.1.1.1",
        "hsa:1525", "genome:T00001", "gl:G00001", "md:hsa_M00554",
        "ko:K00001", "path:hsa04914", "rc:RC00001", "rn:R00001",
        "rp:RP00001",
    )
    for smoke_id in smoke_ids:
        d = s.parse(s.get(smoke_id))