def teste4():
    s = KEGG()
    s.organism = "hsa"  # Homo sapiens (human)
    modules = s.moduleIds  # pathway modules
    dic = s.parse(s.get(modules[0]))
    compounds = dic["COMPOUND"]  # dictionary with the names of the compounds {'C00074': 'Phosphoenolpyruvate', ...}
    pathway = dic["PATHWAY"]  # {'map00010': 'Glycolysis / Gluconeogenesis', ...}
    module_name = dic["NAME"]  # ['Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate']
    return pathway
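# A minimal sketch of how the dictionary returned by teste4() could be
# inspected, assuming bioservices is installed and KEGG is reachable; the
# first module returned by moduleIds may differ between KEGG releases.
pathways = teste4()
for map_id, name in pathways.items():  # e.g. 'map00010' -> 'Glycolysis / Gluconeogenesis'
    print(map_id, name)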
def __init__(
    self,
    gene_lists,
    taxon,
    dataframe,
    kegg_organism=None,
    enrichment_params={
        "padj": 0.05,
        "log2_fc": 3,
        "max_entries": 3000,
        "kegg_background": None,
        "mapper": None,
        "preload_directory": None,
        "plot_compute_levels": False,
        "plot_logx": True,
    },
    go_only=False,
    kegg_only=False,
    command="",
):
    """.. rubric:: constructor"""
    super().__init__()
    self.title = "Enrichment"
    self.command = command
    #self.rnadiff_folder = rnadiff_folder
    self.gene_lists = gene_lists
    self.enrichment_params = enrichment_params
    self.data = dataframe
    self.taxon = taxon

    if taxon == 10090:
        self.organism = "mmu"
    elif taxon == 9606:
        self.organism = "hsa"
    else:
        if kegg_organism is None:
            logger.error("You must specify the KEGG organism name if not human or mouse: e.g., eco for E. coli")
            # figure out the organism from taxon
            raise NotImplementedError
        else:
            from bioservices import KEGG
            k = KEGG()
            k.organism = kegg_organism  # validates the organism name
            self.organism = kegg_organism

    if self.enrichment_params['preload_directory']:
        pathname = self.enrichment_params['preload_directory']
        if os.path.exists(pathname) is False:
            logger.error(f"{pathname} does not exist")
            sys.exit(1)

    #from sequana.rnadiff import RNADiffResults
    #self.rnadiff = RNADiffResults(self.rnadiff_folder)
    self.rnadiff = {}

    self.create_report_content(go_only=go_only, kegg_only=kegg_only)
    self.create_html("enrichment.html")
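# Hedged usage sketch only: the enclosing class is not shown above, so
# "EnrichmentModule" is a placeholder name and the gene lists / dataframe
# below are dummy inputs chosen purely for illustration.
import pandas as pd

dummy_gene_lists = {"up": ["gene1", "gene2"], "down": ["gene3"]}
dummy_df = pd.DataFrame({"log2FoldChange": [2.5, -4.1, 0.3], "padj": [0.001, 0.02, 0.9]})

report = EnrichmentModule(      # placeholder for the actual class name
    gene_lists=dummy_gene_lists,
    taxon=9606,                 # human, so self.organism resolves to "hsa"
    dataframe=dummy_df,
    kegg_only=True,
)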
def teste5():
    s = KEGG()
    s.organism = "hsa"  # Homo sapiens (human)
    modules = s.moduleIds  # pathway modules
    dic = s.parse(s.get("M00627"))
    module_name = dic["NAME"][0]
    reactions = dic["REACTION"]
    if "Pentose phosphate cycle" in module_name:
        print(module_name)
    else:
        print("haha")
def teste2():
    s = KEGG()
    s.organism = "hsa"
    modules = s.moduleIds
    print(modules[3])
    dic = s.parse(s.get(modules[3]))
    reactions = dic["REACTION"]
    dic_reac = {}
    for reac in reactions:
        teste = reactions[reac]
        string = teste.split(" ")
        dic_reac[reac] = string
    return dic_reac  # a dictionary with reaction IDs as keys and lists of compounds as values
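# Sketch of how to inspect the result of teste2(); the exact tokens depend on
# how the KEGG flat file formats the REACTION equations and on the bioservices
# parser, so the split may need adjusting.
reactions = teste2()
for reac_id, parts in list(reactions.items())[:3]:
    print(reac_id, parts)  # reaction ID -> list of tokens from the equation string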
def teste6():
    s = KEGG()
    s.organism = "hsa"
    modules = ["M00001", "M00002", "M00013", "M00034"]
    dic_reac = {}
    for mod in modules:
        dic = s.parse(s.get(mod))
        reactions = dic["REACTION"]
        for reac in reactions:
            teste = reactions[reac]
            string = teste.split(" ")
            dic_reac[reac] = string
    return dic_reac
def download_pathway_ids(organism, cache=False):
    """
    Query KEGG for a recent list of pathways for an organism.

    Parameters
    ----------
    organism : str
        A KEGG organism code. For example 'hsa'.

    cache : bool, optional, default: False
        If True, results are cached by `bioservices`. This can save time
        but you will eventually miss out on new database releases if your
        cache is old.

    Returns
    -------
    `list`
        List of str pathway identifiers.
    """
    kegg = KEGG(cache=cache)
    kegg.organism = organism
    pathways = kegg.pathwayIds
    return pathways
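# Usage sketch, assuming "from bioservices import KEGG" at module level;
# identifiers come back in the 'path:hsa00010' style used elsewhere on this page.
human_pathways = download_pathway_ids('hsa')
print(len(human_pathways), human_pathways[:3])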
def kegg():
    k = KEGG()
    k.organismIds  # fetch the list of valid organism identifiers from KEGG
    k.organism = "hsa"
    return k
def find_pathways_organism(cvDict, preDefList=[], writeGraphml=True, organism="hsa"):
    # set up empty dictionaries for converting codes
    aliasDict, koDict, orgDict = {}, {}, {}
    # parse the dictionary of ko codes
    nc.parseKEGGdict('inputData/ko00001.keg', aliasDict, koDict)

    # try to retrieve and parse the dictionary containing organism gene names to codes conversion
    try:
        url = urllib2.urlopen('http://rest.kegg.jp/list/' + organism)
        text = url.readlines()
        # reads KEGG dictionary of identifiers between numbers and actual protein names
        # and saves it to a python dictionary
        for line in text:
            line_split = line.split('\t')
            k = line_split[0].split(':')[1]
            nameline = line_split[1].split(';')
            name = nameline[0]
            if ',' in name:
                nameline = name.split(',')
                name = nameline[0]
                for entry in range(1, len(nameline)):
                    aliasDict[nameline[entry].strip()] = name.upper()
            orgDict[k] = name
    except:
        print('Could not get library: ' + organism)

    k = KEGG()  # read KEGG from bioservices
    k.organism = organism
    minOverlap = 5
    if len(preDefList) == 0:
        pathwayList = list(k.pathwayIds)
    else:
        pathwayList = list(preDefList)

    # set up a converter to retain only numbers from KEGG pathway codes
    allChars = string.maketrans('', '')
    noDigits = allChars.translate(allChars, string.digits)

    genes = set(cvDict.keys())  # find the list of genes included in dataset
    for x in pathwayList:
        x = x.replace("path:", "")
        code = str(x)
        code = code.translate(allChars, noDigits)  # eliminate org letters
        coder = str('ko' + code)  # add ko
        graph = nx.DiGraph()  # open a graph object
        nc.uploadKEGGcodes([coder], graph, koDict)  # get ko pathway
        coder = str(organism + code)  # set up with org letters
        uploadKEGGcodes_org([coder], graph, orgDict, koDict, organism)  # get org pathway

        # check to see if there is a connected component, simplify graph and print if so
        allNodes = set(graph.nodes())
        test = len(allNodes.intersection(genes))
        print("Pathway: ", x, " Overlap: ", test, " Edges: ", len(graph.edges()))
        # if there is more than a 1 node connected component, run BONITA
        if len(list(nx.connected_component_subgraphs(graph.to_undirected()))) > 0:
            #nx.write_graphml(graph, coder + '_before.graphml')
            if len(genes.intersection(graph.nodes())) > minOverlap:  # if there are 5 genes shared
                # simplify graph to nodes in dataset
                graph = simplifyNetworkpathwayAnalysis(graph, cvDict)
                nx.write_graphml(graph, coder + '.graphml')  # write graph out
                nx.write_gpickle(graph, coder + '.gpickle')  # write graph out
                print('nodes: ', str(len(graph.nodes())), ', edges:', str(len(graph.edges())))
                print(graph.nodes())
                if len(graph.nodes()) > 0:
                    # save the removed nodes and omics data values for just those nodes in the particular pathway
                    pathwaySampleList = [{} for q in range(len(geneDict[list(graph.nodes())[0]]))]
                    for noder in graph.nodes():
                        for jn in range(len(pathwaySampleList)):
                            pathwaySampleList[jn][noder] = geneDict[noder][jn]
                    pickle.dump(pathwaySampleList, open(coder + "_sss.pickle", "wb"))
def pathway_to_dataframe(pathway_id, org='hsa', verbose=False, cache=False):
    """
    Extract protein-protein interactions from a KEGG pathway into a pandas
    DataFrame. NOTE: Interactions will be directionless.

    Parameters
    ----------
    pathway_id : str
        Pathway identifier to parse into a dataframe. Example: 'path:hsa00010'

    org : str or None, optional, default: 'hsa'
        If supplied, filters out all interactions with identifiers that are
        not in the dictionary created from :func:`kegg_to_uniprot`. If None,
        all interactions are parsed regardless of mappability to UniProt.

    verbose : bool, optional, default: False
        If True, logs messages to stdout to inform of current progress.

    cache : bool, optional, default: False
        If True, HTTP responses are cached by `bioservices`. This can save
        time but you will eventually miss out on new database releases if
        your cache is old.

    Returns
    -------
    `pd.DataFrame`
        DataFrame with 'source', 'target', 'label', 'pubmed', and
        'experiment_type' columns.
    """
    kegg = KEGG(cache=cache)
    kegg.organism = org
    kegg_to_up = kegg_to_uniprot(org, cache)
    res = kegg.parse_kgml_pathway(pathway_id)
    sources = []
    targets = []
    labels = []

    if verbose:
        logger.info("Parsing pathway {}".format(pathway_id))

    for rel in res['relations']:
        id1 = rel['entry1']
        id2 = rel['entry2']
        name1 = res['entries'][[x['id'] for x in res['entries']].index(id1)]['name']
        name2 = res['entries'][[x['id'] for x in res['entries']].index(id2)]['name']
        type1 = res['entries'][[x['id'] for x in res['entries']].index(id1)]['type']
        type2 = res['entries'][[x['id'] for x in res['entries']].index(id2)]['type']
        reaction_type = rel['name'].replace(' ', '-')
        link_type = rel['link']

        if link_type not in links_to_include:
            continue

        if type1 not in types_to_include or type2 not in types_to_include:
            continue

        for a in name1.strip().split(' '):
            for b in name2.strip().split(' '):
                valid_db_a = (kegg.organism in a or 'ec' in a)
                valid_db_b = (kegg.organism in b or 'ec' in b)
                valid_db_a &= (a in kegg_to_up)
                valid_db_b &= (b in kegg_to_up)
                if valid_db_a and valid_db_b:
                    sources.append(a)
                    targets.append(b)
                    labels.append(reaction_type)

    interactions = make_interaction_frame(sources, targets, labels)
    return interactions
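# Possible call, assuming the module-level helpers referenced above
# (kegg_to_uniprot, make_interaction_frame, links_to_include, types_to_include)
# are defined alongside this function.
df = pathway_to_dataframe('path:hsa00010', org='hsa', verbose=True)
print(df.head())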
sns.set(style='ticks', palette='pastel', color_codes=True)

# ---- Import network
network = read_csv('%s/files/string_mouse_network_filtered_800.txt' % wd, sep='\t')
network_proteins = set(network['protein1']).intersection(network['protein2'])

# ---- Set-up UniProt
uniprot = UniProt(cache=True)

# ---- Set-up QuickGO bioservice
quickgo = QuickGO(cache=True)

# ---- Set-up KEGG bioservice
kegg, kegg_parser = KEGG(cache=True), KEGGParser()
kegg.organism = 'mmu'
print '[INFO] KEGG service configured'

kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds}
print '[INFO] KEGG pathways extracted: ', len(kegg_pathways)

# Convert KEGG pathways Gene Name to UniProt
k2u = kegg.conv('uniprot', 'mmu')
kegg_pathways_proteins = {
    p: {
        k2u[x].split(':')[1]
        for i in kegg_pathways[p]['entries'] if i['type'] == 'gene'
        for x in i['name'].split(' ') if x in k2u
    }
    for p in kegg_pathways
}

kegg_uniprot_acc_map = {x for p in kegg_pathways_proteins for x in kegg_pathways_proteins[p]}
kegg_uniprot_acc_map = {p: uniprot.get_fasta(str(p)).split(' ')[0].split('|')[2] for p in kegg_uniprot_acc_map}

kegg_pathways_proteins = {
    p: {kegg_uniprot_acc_map[i] for i in kegg_pathways_proteins[p]}
    for p in kegg_pathways_proteins
}
print '[INFO] KEGG pathways Ids converted to UniProt: ', len(kegg_pathways_proteins)