def test_parser_roundtrip(self):
    """Fetch ko00680 from KEGG, serialise it back to KGML and re-parse it.

    The re-parsed pathway must keep the same name and the same number of
    relations as the pathway parsed from the remote handle.
    """
    with kegg_get("ko00680", "kgml") as kgml_stream:
        downloaded = KGML_parser.read(kgml_stream)

    serialised = downloaded.get_KGML()
    with io.StringIO(serialised) as buffer:
        reparsed = KGML_parser.read(buffer)

    self.assertEqual(downloaded.name, reparsed.name)
    self.assertEqual(len(downloaded.relations), len(reparsed.relations))
def main():
    """Search KEGG for a pathway by name and print the names of its genes.

    The search term is taken from ``sys.argv[1]``. When the search returns
    several candidates the user is asked to pick one by index. The chosen
    identifier has ``map`` replaced by ``hsa`` before its KGML is downloaded
    and parsed.

    Returns None in every case; problems are reported on stdout.
    """
    # Guard against a missing command-line argument instead of raising
    # IndexError.
    if len(sys.argv) < 2:
        print("Usage: provide the pathway name to search for as the first argument")
        return

    query = sys.argv[1].replace(" ", "+")
    result = KEGG_REST.kegg_find('PATHWAY', query)
    result_txt = result.read().split('\n')
    if len(result_txt) == 1:
        print("Search found no results")
        return

    choice = 0
    if len(result_txt) > 2:
        print("More than 1 result:")
        for index, r in enumerate(result_txt):
            output = r.split("\t")
            if len(output) == 2:
                print(str(index) + "\t" + output[1])
        try:
            choice = int(input("Which one? "))
        except ValueError:
            print("Invalid selection")
            return
        # Only rows that were offered above (two tab-separated fields) are
        # acceptable; this also rejects out-of-range indices and the trailing
        # empty line of the response.
        if not 0 <= choice < len(result_txt) or len(result_txt[choice].split("\t")) != 2:
            print("Invalid selection")
            return

    identifier = result_txt[choice].split("\t")[0].strip()
    identifier = identifier.replace("map", "hsa")
    pathway_kgml = KEGG_REST.kegg_get(identifier, "kgml")
    pathway = KEGG_KGML_PARSER.read(pathway_kgml)
    for i in pathway.genes:
        print(i.name)
def plot_pathway(self, enriched_genes, pathway_id='hsa05322', figurename=None):
    """Draw a KEGG pathway as a PDF with enriched genes highlighted.

    The first graphics element of every pathway entry is recoloured: entries
    for which ``gene_is_enriched`` returns a truthy name get the "enriched"
    box/text colours and are relabelled with that name; all others get the
    "non enriched" colours and keep only the first comma-separated name.

    :param enriched_genes: passed through to ``gene_is_enriched``
    :param pathway_id: KEGG pathway identifier to download
    :param figurename: output file name; defaults to ``<pathway_id>.pdf`` and
        must end in ``.pdf``
    :return: the parsed (and recoloured) pathway object
    """
    # Resolve the output name first; only PDF output is accepted.
    figurename = figurename or '%s.pdf' % pathway_id
    assert (figurename.endswith('.pdf'))

    # Download and parse the pathway.
    pathway = KGML_parser.read(kegg_get(pathway_id, "kgml"))

    # Recolour and relabel the primary box of every entry.
    for entry in pathway.entries.values():
        box = entry.graphics[0]
        hit = gene_is_enriched(enriched_genes, box.name)
        if hit:
            box.bgcolor = self.enriched_box_color
            box.fgcolor = self.enriched_text_color
            box.name = hit
        else:
            box.bgcolor = self.non_enriched_box_color
            box.fgcolor = self.non_enriched_text_color
            box.name = box.name.split(',')[0]

    canvas = KGMLCanvas(pathway, import_imagemap=True, fontsize=self.fontsize)
    canvas.draw(figurename)
    print('Drawn: ', figurename)
    return pathway
def test_parse_remote_pathway(self):
    """Download a KEGG pathway from the KEGG server and parse its KGML."""
    # Download the KEGG ko03070 pathway as a filehandle. The context manager
    # closes the handle even when parsing raises, so the connection is not
    # leaked on failure.
    with kegg_get("ko03070", "kgml") as h:
        pathway = KGML_parser.read(h)
    self.assertEqual(pathway.name, "path:ko03070")
def get_pathway_kgml(pathway_id='hsa04151'):
    """Load a KEGG pathway's KGML (downloading it once) and prune it.

    The KGML is cached as ``DATA_ROOT/<pathway_id>.kgml``. If the cache file
    is missing it is fetched from ``HOST``; a non-200 response terminates the
    process with exit status -1.

    :param pathway_id: KEGG pathway identifier
    :return: the ``(entries, relations)`` pair produced by ``prune_kgml``
    """
    print('Reading pathway:', pathway_id)
    kgml_url = f'{HOST}/get/{pathway_id}/kgml'
    cache_file = DATA_ROOT / f'{pathway_id}.kgml'

    # Download only when there is no cached copy yet.
    if not cache_file.exists():
        print('Could not find hsa file getting from kegg:', kgml_url)
        response = requests.get(kgml_url)
        if response.status_code != 200:
            print('Failed to get pathway:', cache_file, response.text)
            sys.exit(-1)
        with open(cache_file, 'w') as out:
            out.write(response.text)
        print('Saved KGML file at:', cache_file)

    # Parse the cached KGML.
    with open(cache_file, 'r') as src:
        pathway = KGML_parser.read(src)

    print('\n', pathway, sep='')
    print(' entry:', len(pathway.entries))
    print(' reaction:', len(pathway.reactions))
    print(' relation:', len(pathway.relations))

    entries, relations = prune_kgml(pathway)

    print('Finished reading:', pathway_id)
    # Relations carrying a ``_pamogk`` attribute are reported as "new".
    tagged = sum(1 for rel in relations if hasattr(rel, '_pamogk'))
    print(' entry:', len(entries.keys()), 'relation:', len(relations),
          'new_relation:', tagged)
    return entries, relations
def draw_kegg_map(map_id):
    """Download the KGML for ``map_id`` and render it to ``<map_id>.pdf``.

    The canvas is created with ``import_imagemap=True`` so the drawing is
    layered on the KEGG background image.
    """
    kgml_handle = kegg_get(map_id, "kgml")
    parsed = KGML_parser.read(kgml_handle)
    canvas = KGMLCanvas(parsed, import_imagemap=True)
    canvas.draw("%s.pdf" % map_id)
def load_pathway(self, pathway_ID, organism_ID = ""):
    '''
    Download a pathway's KGML from KEGG and parse it.

    :param pathway_ID: (str) - numeric suffix of the pathway ID
    :param organism_ID: (str) - organism prefix of the pathway ID; when empty
        the reference "ko" pathway is requested
    :return: (object) - parsed KGML pathway, or None when the
        organism-specific lookup or parse fails
    '''
    if not organism_ID:
        print(pathway_ID)
        pathway = kegg_get("ko" + pathway_ID, "kgml")
        return KGML_parser.read(pathway)

    try:
        pathway = kegg_get(organism_ID + pathway_ID, "kgml")
        return KGML_parser.read(pathway)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate.
        print("Invalid IDs")
        return None
def colorCompounds(pathname, cpdlist, size=20):
    """Render a KEGG pathway to ``<pathname>.pdf`` with chosen compounds in red.

    :param pathname: KEGG pathway identifier to download
    :param cpdlist: container of graphic names to highlight
    :param size: width and height applied to highlighted compound graphics
    """
    pathway = KGML_parser.read(kegg_get(pathname, "kgml"))

    shapes = (
        shape
        for compound in pathway.compounds
        for shape in compound.graphics
    )
    for shape in shapes:
        if shape.name not in cpdlist:
            continue
        shape.bgcolor = '#ff0000'
        shape.width = size
        shape.height = size

    KGMLCanvas(pathway, import_imagemap=True).draw("%s.pdf" % pathname)
def get_pathway_ko_association_table():
    """Create and fill ``enzyme.pathway2ortholog_associations``.

    For every pathway listed in ``enzyme.kegg_pathway`` the KGML of the
    matching ``ko`` reference pathway is downloaded from the KEGG REST
    service, and one row ``(pathway_id, node_id, ko_id)`` is inserted for each
    KO identifier attached to each ortholog node. Pathways whose KGML cannot
    be downloaded are reported and skipped.

    The MySQL password is read from the ``SQLPSW`` environment variable.
    """
    import os
    import re
    # ``import urllib`` alone does not guarantee the ``request`` submodule is
    # loaded, so import it explicitly.
    import urllib.request

    import MySQLdb
    from chlamdb.biosqldb import manipulate_biosqldb
    from Bio.KEGG.KGML import KGML_parser

    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="enzyme")  # name of the data base
    cursor = conn.cursor()

    sql = 'create table enzyme.pathway2ortholog_associations (pathway_id INT, node_id INT, ko_id varchar(200), ' \
          ' index pathway_id(pathway_id), index node_id(node_id), index ko_id(ko_id));'
    cursor.execute(sql, )
    conn.commit()

    sql2 = 'select pathway_name,pathway_id from enzyme.kegg_pathway'
    cursor.execute(sql2, )
    pathway2pathway_id = manipulate_biosqldb.to_dict(cursor.fetchall())

    # Values are bound by the driver rather than interpolated into the SQL
    # text, so KO names coming from the remote service cannot alter the
    # statement.
    insert_sql = 'insert into enzyme.pathway2ortholog_associations values(%s, %s, %s)'

    for pathway in pathway2pathway_id:
        print(pathway)
        url_template = 'http://rest.kegg.jp/get/%s/kgml' % re.sub(
            'map', 'ko', pathway)
        print(url_template)
        try:
            f = urllib.request.urlopen(url_template)
        except Exception as err:
            # Best effort: not every map has a ko counterpart. Report the
            # failure instead of hiding it, and let KeyboardInterrupt and
            # SystemExit propagate.
            print('Skipping %s: %s' % (pathway, err))
            continue
        # Close the HTTP response as soon as its body has been read.
        with f:
            kgml_text = f.read().decode("UTF-8")
        pathway_KGML = KGML_parser.read(kgml_text)

        # An ortholog name looks like "ko:K00001 ko:K00002"; splitting on
        # "ko:" leaves the bare KO ids plus an empty leading piece.
        for o in pathway_KGML.orthologs:
            ko_ids = {i.rstrip() for i in o.name.split('ko:')}
            ko_ids.discard('')
            for ko in ko_ids:
                cursor.execute(insert_sql,
                               (pathway2pathway_id[pathway], o.id, ko))
        conn.commit()
def get_network(org, opt='ec'):
    """Build a ``KeggParser`` graph from the KEGG pathways listed for ``org``.

    For each pathway line returned by ``kegg_list('pathway', org)`` two KGML
    documents are parsed and stored on the parser object: the pathway itself
    and the pathway whose id is ``"path:" + opt`` followed by the last five
    characters of the listed id. ``get_relations`` and ``get_reactions`` are
    then called on the parser. Pathways for which fetching or parsing raises
    are recorded in a local list and skipped.

    :param org: organism code passed to ``kegg_list``
    :param opt: prefix used to build the second pathway id; also stored as
        ``graph.ref``
    :return: the parser object after ``building_graph(2)`` has been called
    """
    # Creating a Parser Object
    graph = k.KeggParser()
    # Store pathways that doesn't have EC numbers
    error = []
    # Getting organism
    list1 = r.kegg_list('pathway', org).read()
    list1 = list1.split('\n')
    # Drops the first empty string (the piece after the final newline);
    # raises ValueError if the response contains no empty piece.
    list1.remove('')
    print('Retrieving data from KEGG PATHWAY database. ' + str(time.ctime()))
    # For each path getting enzymes and reactions
    for path in list1:
        try:
            # Each line is tab-separated; path[0] is used as the pathway id.
            path = path.split('\t')
            # print (path[0])
            graph.genes = parse.read(r.kegg_get(path[0], 'kgml'))
            graph.genes_default = parse.read(
                r.kegg_get("path:" + opt + path[0][-5:], 'kgml'))
            graph.path = path
        except Exception:
            error.append(path[0])
            continue
        # print ("getting relations")
        graph.get_relations()
        # print ("getting reaction")
        graph.get_reactions()
    # print ('Unretrieved data',error)
    graph.ref = opt
    # NOTE(review): ``genes`` is accumulated but never read afterwards.
    genes = 0
    for i in graph.ec_org_target.items():
        genes += len(i[1].split())
    # print (graph.ec_org_target.keys())
    # Building Graph
    graph.building_graph(2)
    return (graph)
def pathway_genes(pathway: str) -> set:
    """Collect the gene labels referenced by a KEGG pathway.

    Every whitespace-separated identifier in each gene entry's name is passed
    to ``kegg_label_gene_names`` and the results are merged into one set.
    """
    response = _get(pathway, form='kgml')
    # The KGML parser is given a file-like object wrapping the response text.
    parsed = KGML_parser.read(io.StringIO(response.text))

    collected = set()
    for entry in parsed.genes:
        for kegg_id in entry.name.split():
            collected.update(kegg_label_gene_names(kegg_id))
    return collected
def xml_to_txt(self):
    """
    Convert the XML file generated by self.getKGML() into the plain-text
    reaction layout presented in class.

    Reads ``<self.name>.xml`` and writes ``<self.name>.txt``. One line is
    built per reaction name, shaped as ``name: s1 + s2 <=> p1 + p2`` when the
    reaction type is "reversible" and with ``=>`` otherwise. Every reaction,
    substrate and product name encountered is also passed to
    ``self.dictionary``.
    """
    # Parse inside a context manager so the XML handle is always released.
    with open(self.name + '.xml', 'r') as xml_file:
        pathway = KGML_parser.read(xml_file)
    reactions = pathway._reactions

    # The output file is closed even if a lookup below raises.
    with open(self.name + '.txt', 'w+') as f:
        for r_id in reactions.keys():
            reaction_names = reactions[r_id]._names
            substrates_ids_t = pathway._reactions[r_id]._substrates
            products_ids_t = pathway._reactions[r_id]._products
            # The same id sometimes carries more than one reaction name, so
            # loop until every name has been consumed.
            while reaction_names != []:
                substrates_ids = list(substrates_ids_t)
                products_ids = list(products_ids_t)
                name = reaction_names.pop()
                self.dictionary(name)  # add the reaction name to the dictionary
                line = name + ': '
                # Same idea as the loop above, for the substrates.
                while substrates_ids:
                    sub = substrates_ids.pop()
                    self.dictionary(pathway.entries[sub]._names[0])
                    if len(substrates_ids) > 0:
                        line += str(pathway.entries[sub]._names[0]) + ' + '
                    else:
                        line += str(pathway.entries[sub]._names[0]) + ' '
                # Add the symbol chosen for reversible / irreversible.
                if pathway._reactions[r_id].type == "reversible":
                    line += '<=> '
                else:
                    line += '=> '
                # Same idea as the loops above, for the products.
                while products_ids:
                    prod = products_ids.pop()
                    self.dictionary(pathway.entries[prod]._names[0])
                    if len(products_ids) > 0:
                        line += str(pathway.entries[prod]._names[0]) + ' + '
                    else:
                        line += str(pathway.entries[prod]._names[0]) + '\n'
                f.write(line)
def getPic(self, pidpath, pathwaydir):
    """
    Draw one highlighted pathway PDF per line of the input file.

    Input: the file at ``pidpath`` (pid.txt), one pathway per line as
    ``<pathway id><TAB><ko>;<ko>;...``. Output: ``<pathway id>.pdf`` files in
    the ``pathwaydir`` folder, which is created if missing.

    For each pathway the KGML and PNG stored in GridFS are written to
    ``pathway.kgml`` / ``pathway.png`` in the current working directory, and
    every entry whose name matches one of the KO patterns gets a red
    foreground colour.
    """
    fs = gridfs.GridFS(self.mongodb)
    # Context manager: the input list is closed even if drawing fails.
    with open(pidpath) as f:
        if not os.path.exists(pathwaydir):
            os.makedirs(pathwaydir)
        for i in f:
            # Skip blank lines, which have no id / KO fields to split.
            if not i.strip():
                continue
            i = i.strip('\n').split('\t')
            pid = i[0]
            koid = i[1].split(';')
            matched_ids = []
            kgml_path = os.path.join(os.getcwd(), "pathway.kgml")
            png_path = os.path.join(os.getcwd(), "pathway.png")
            if os.path.exists(kgml_path) and os.path.exists(png_path):
                os.remove(kgml_path)
                os.remove(png_path)
            # GridFS returns raw bytes, so both scratch files are opened in
            # binary mode; text mode can corrupt the PNG and rejects bytes on
            # Python 3.
            with open("pathway.kgml", "wb") as k, open("pathway.png", "wb") as p:
                result = self.png_coll.find_one({"pathway_id": pid})
                if result:
                    kgml_id = result['pathway_ko_kgml']
                    png_id = result['pathway_ko_png']
                    k.write(fs.get(kgml_id).read())
                    p.write(fs.get(png_id).read())
            # Parse only after the scratch files are flushed and closed.
            with open("pathway.kgml") as kgml_handle:
                p_kgml = KGML_parser.read(kgml_handle)
            p_kgml.image = png_path
            # Each KO is used as a regular-expression pattern against the
            # entry name.
            for ko in koid:
                for degree in p_kgml.entries.values():
                    if re.search(ko, degree.name):
                        matched_ids.append(degree.id)
            for n in matched_ids:
                for graphic in p_kgml.entries[n].graphics:
                    graphic.fgcolor = '#CC0000'
            canvas = KGMLCanvas(p_kgml, import_imagemap=True)
            canvas.draw(pathwaydir + '/' + pid + '.pdf')
    # Parenthesised single argument prints identically on Python 2 and 3.
    print("getPic finished!!!")
def parseKGML():
    """Build node and edge text fragments from the KGML files in ``KGMLs``.

    Each file under ``kgml_files/`` contributes one node record (pathway
    name, title, its ``cpd:`` compounds and ``mmu:`` gene products) and one
    edge record for every linked map whose name contains ``mmu`` and differs
    from the pathway itself.

    :return: ``(nodestring, edgestring)``. Both are hand-assembled text;
        every record is followed by a comma, including the last one.
    """
    # Pieces are collected in lists and joined once at the end instead of
    # growing two strings with repeated concatenation.
    node_chunks = ['"nodes":[\n']
    edge_chunks = ['"edges":[\n']
    assignId = 0
    for files in KGMLs:
        path = "kgml_files/" + files
        # Close each KGML file as soon as it has been parsed.
        with open(path, 'r') as handle:
            Maps = KGML_parser.read(handle)
        name = Maps.name
        description = Maps.title

        # retrieve list of compounds in each pathway
        KEGGcompounds = []
        for compound in Maps.compounds:
            temp_compound_name = str(compound.name).split()[0]
            if "cpd:" in temp_compound_name:
                KEGGcompounds.append(temp_compound_name)
        strKEGGcompounds = str(KEGGcompounds).replace("'", '"')

        # retrieve list of genes products in each pathway
        KEGGgeneproduct = []
        for gene in Maps.genes:
            temp_gene_name = str(gene.name).split()[0]
            if "mmu:" in temp_gene_name:
                KEGGgeneproduct.append(temp_gene_name)
        strKEGGgeneproduct = str(KEGGgeneproduct).replace("'", '"')

        node_chunks.append(
            '{"data":{ "id":"' + name + '", "name":"' + description +
            '","compounds":' + strKEGGcompounds + ',"gene products":' +
            strKEGGgeneproduct + '}},')

        for linked_map in Maps.maps:
            entry = str(linked_map.name)
            if entry != name and "mmu" in entry:
                edge_chunks.append(
                    '{"data":{ "id":"' + str(assignId) + '", "source":"' +
                    name + '","target":"' + entry + '"}},')
                assignId = assignId + 1

    node_chunks.append("],")
    edge_chunks.append("]")
    return "".join(node_chunks), "".join(edge_chunks)
def readKGML(kgml):
    """Return the first pathway yielded by ``KGML_parser.parse(kgml)``.

    Uses the built-in ``next()`` rather than the generator's ``.next()``
    method, which only exists on Python 2.

    :raises StopIteration: if the parser yields no pathway
    """
    return next(KGML_parser.parse(kgml))
def kgml_file_to_digraph(kgml_file):
    """Parse a KEGG KGML file and convert to a NetworkX directed graph.

    :param kgml_file: path of the KGML file to read
    :return: the graph produced by ``pw2graph`` for the parsed pathway
    """
    # The context manager closes the file even if parsing raises.
    with open(kgml_file, 'r') as fh:
        pw = KGML_parser.read(fh)
    digraph = pw2graph(pw)
    return digraph
def main():
    """Load a KEGG pathway, chosen by name, into a Neo4j database.

    Expects the name of the pathway as the first command-line argument. The
    pathway's KGML is downloaded from KEGG (with ``map`` replaced by
    ``ORGANISM`` in the identifier), the target database is emptied, one
    ``CREATE`` statement is issued for all genes, compounds, reactions, maps
    and relations, and finally nodes sharing a name are merged.

    Connection settings are read from the ``KGML2NEO4J`` section of the
    ``server_config`` file. Returns None; problems are reported on stdout.
    """
    # Guard against a missing command-line argument instead of raising
    # IndexError.
    if len(sys.argv) < 2:
        print("Usage: provide the pathway name to search for as the first argument")
        return

    # Get the KGML from KEGG
    query = sys.argv[1].replace(" ", "+")
    result = KEGG_REST.kegg_find('PATHWAY', query)
    result_txt = result.read().split('\n')
    if len(result_txt) == 1:
        print("Search found no results")
        return

    choice = 0
    if len(result_txt) > 2:
        print("More than 1 result:")
        for index, r in enumerate(result_txt):
            output = r.split("\t")
            if len(output) == 2:
                print(str(index) + "\t" + output[1])
        try:
            choice = int(input("Which one? "))
        except ValueError:
            print("Invalid selection")
            return
        # Only rows that were offered above (two tab-separated fields) are
        # acceptable; this also rejects out-of-range indices and the trailing
        # empty line of the response.
        if not 0 <= choice < len(result_txt) or len(result_txt[choice].split("\t")) != 2:
            print("Invalid selection")
            return

    identifier = result_txt[choice].split("\t")[0].strip()
    identifier = identifier.replace("map", ORGANISM)
    pathway_kgml = KEGG_REST.kegg_get(identifier, "kgml")
    pathway = KEGG_KGML_PARSER.read(pathway_kgml)

    config = configparser.ConfigParser()
    config.read("server_config")
    if "KGML2NEO4J" not in config:
        print("Server config not found!")
        return
    username = config["KGML2NEO4J"]['username']
    password = config["KGML2NEO4J"]['password']
    server_uri = config["KGML2NEO4J"]['uri']

    db = database(server_uri, username, password)
    # Start from an empty graph.
    db.run_query("MATCH (n) DETACH DELETE n")

    query_list = [
        db.make_gene_query(pathway.genes),
        db.make_compound_query(pathway.compounds),
        db.make_reaction_query(pathway.reaction_entries),
        db.make_map_query(pathway.maps),
        db.make_relations_query(pathway.relations)
    ]
    # Join only the non-empty fragments; when there is nothing to create,
    # skip the statement rather than sending a bare "CREATE".
    fragments = [q for q in query_list if len(q) > 0]
    if fragments:
        db.run_query("CREATE " + ",".join(fragments))

    # Merge matching nodes
    merge_query = (
        "MATCH (n1),(n2) WHERE ANY (x IN n1.name WHERE x IN n2.name) "
        "and id(n1) < id(n2) WITH [n1,n2] as ns "
        "CALL apoc.refactor.mergeNodes(ns) YIELD node RETURN node"
    )
    db.run_query(merge_query)
# TODO: integrate relations involving compounds? if False and entry.type == "compound": return entry.id f_members = open(os.path.join(outdir, 'members.txt'), 'w') for filename in os.listdir(kgmldir): nodes = {} pathway_nodes = set() if not filename.endswith(".xml") or not filename.startswith("hsa"): continue kgid = filename[:-4] kgml_file = os.path.join(kgmldir, filename) f = open(kgml_file) print(filename) parsed = KGML_parser.read(f) f.close() pathway_name = parsed.title # Find and map components components = set() # Enter the parsed pathway info and extract components for k, entry in parsed.entries.items(): if entry.type != "gene": continue rnames = entry.name.split(" ") names = [ converter.handler.to_uniprot(converter.KEGG_IDX, u) for u in rnames ]
# Maps from the extracted node id (e.g. EGFR) to the Node Object nodes = set() # Map to prevent multiple creation of edges between two nodes uses concatenated node ids as keys (e.g. EGFR_C00010) # edgeDict = dict() # batch = WriteBatch(graph) for filename in files: url = args.data_dir + filename print ('Loading file ' + url) currentNodes = set() currentEdges = dict() with open(url) as f: pathway = KGML_parser.read(f, 'r') # Maps from the reaction id (e.g. rn:R05134) to the Node Objects that are part of this reaction reactionToNode = dict() # Maps from the internal pathway id (e.g. 23) to the compound id (e.g. C00010) compoundDict = dict() for gene in pathway.genes: for geneName in gene.name.split(' '): gene_id = geneName[4:] if gene_id not in nodes: gName = gene_id if gene_id in geneNames: gName = geneNames[gene_id] importer.add_node(['_Network_Node', 'Gene'], gene_id, {'name': gName, 'idType': 'ENTREZ', 'url': 'http://www.kegg.jp/dbget-bin/www_bget?hsa:' + gene_id})
kegg[keggID[0]].extend( [x for x in keggID[1:] if x not in kegg[keggID[0]]]) org = proteinMapping[currentGene] for ec in keggID[1:]: if org not in ecToGeneOrg: ecToGeneOrg[org] = defaultdict(list) if currentGene not in ecToGeneOrg[org][ec]: ecToGeneOrg[org][ec].append(currentGene) # process all found kegg pathways for k in kegg: print("Processing: {}".format(k)) stats[k] = defaultdict(int) processedIDs = set() # load current pathway pathway = KGML_parser.read(kegg_get("ko{}".format(k), "kgml")) # get information on EC numbers in kegg pathway for ec in kegg[k]: print(" EC: {}".format(ec)) if True: foundOrtho = False # query KEGG for ecInfo in kegg_get("ec:{}".format(ec)): ecInfoLabel = ecInfo[:12] if "ORTHOLOGY" in ecInfoLabel: foundOrtho = True KOToEC[ecInfo[12:18]].append(ec) # KOToGene[ecInfo[12:18]].extend(ecToGene[ec]) else: foundOrtho = foundOrtho and len(ecInfoLabel.strip()) == 0
def parser_xml(pathway_nodes_df, save=True):
    """Download and parse the KGML of every listed pathway.

    For each pathway name in column ``0`` of ``pathway_nodes_df`` this
    collects

    * the linked maps that are themselves in the list (pathway-to-pathway
      links), and
    * the relations whose two entries are both gene entries, with node ids
      written as ``<pathway>_<entry id>``.

    Self links and duplicate links (in either direction) are removed from the
    pathway-to-pathway table.

    :param pathway_nodes_df: object whose ``[0]`` item lists the pathway names
    :param save: when true, pickle the per-pathway dictionary to
        ``data/pathway2enzyme.pickle.txt`` and write the link table to
        ``data/pathway2pathway.csv``
    :return: dict with keys ``'pathway2enzyme'`` and ``'pathway2pathway'``
    """
    print('Executing function parser_xml:')
    pathwaylist = list(pathway_nodes_df[0])  # list of pathways
    enzymeDict = {}  # result dictionary
    path2path = list()  # [pathway, linked map] pairs
    procress, allnum = 0, len(pathwaylist)  # progress display
    # Report progress about five times. The step is clamped to 1 so that a
    # list of fewer than five pathways does not trigger a modulo by zero.
    step = max(allnum // 5, 1)
    for pathwayname in pathwaylist:
        if procress % step == 0:
            print(
                f"--- Parsing xml {procress}/{allnum} pathwayname: {pathwayname}"
            )
        procress += 1
        # Fetch the KGML for this pathway online.
        pathway2xml = KGML_parser.read(kegg_get(pathwayname, "kgml"))
        # Every map entry in the KGML that is also in the list is treated as
        # interacting with this pathway.
        path2path.extend([[pathwayname, maps.name]
                          for maps in pathway2xml.maps
                          if maps.name in pathwaylist])
        # Empty frame for the entry-to-entry relations of this pathway.
        relation2entry = pd.DataFrame(columns=('id1', 'id2'))
        genelist = pathway2xml.genes
        for i, gene in enumerate(pathway2xml.relations):
            if gene.entry1 in genelist and gene.entry2 in genelist:
                relation2entry.loc[i, :] = [
                    pathwayname + '_' + str(gene.entry1.id),
                    pathwayname + '_' + str(gene.entry2.id)
                ]
        entry_nodes = list(
            set(relation2entry['id1'].tolist() +
                relation2entry['id2'].tolist()))
        id2gene = [(pathwayname + '_' + str(gene.id), gene.name)
                   for gene in genelist]
        enzymeDict[pathwayname] = {
            'entry_nodes': entry_nodes,
            'entry_entry_edges': relation2entry,
            'entry2gene': id2gene
        }
    print('--- Finish parsing xml')
    print('--- Processing data...')
    # De-duplicate the pathway links.
    path2path = pd.DataFrame(path2path, columns=('path1', 'path2'))
    rows = [
        i for i in path2path.index
        if path2path.iat[i, 0] == path2path.iat[i, 1]
    ]
    path2path2 = path2path.drop(rows, axis=0)  # drop rows with path1 == path2
    path2path2 = path2path2.drop_duplicates(['path1', 'path2'],
                                            keep='first')  # drop repeated rows
    # Treat (a, b) and (b, a) as the same link by comparing sorted pairs.
    a = path2path2.apply(lambda x: str(sorted(x.tolist())), axis=1)
    pos = pd.DataFrame(a).duplicated()
    # ``~`` is the supported way to invert a boolean mask.
    path2path_drop = path2path2.loc[~pos, :].reset_index()
    # Save the results.
    if save:
        print('--- Saving data pathway2enzyme.pickle.txt')
        with open('data/pathway2enzyme.pickle.txt', 'wb') as file:
            pickle.dump(enzymeDict, file)
        print('--- Saving data pathway2pathway.csv')
        path2path_drop[['path1', 'path2']].to_csv('data/pathway2pathway.csv',
                                                  header=0,
                                                  index=0)
    return {'pathway2enzyme': enzymeDict, 'pathway2pathway': path2path_drop}
def gatherDetails(makeNclusters,trimPath,forRelatedness,folderName,CO_fromMATLAB,KO_Norm2Mean,Insitu_TPM_DIA,Insitu_TPM_DIN,Insitu_TPM_Oth):
    """Count and plot, per pathway and K-means cluster, the genes and compounds found in the data.

    For every cluster ``kn`` in ``range(makeNclusters)`` three output
    directories are created under ``folderName + str(kn)`` (``pathway_files``,
    ``reaction_files``, ``species_files``). For every pathway in ``trimPath``
    the function

    * records how many of the pathway's genes/compounds occur in the cluster,
    * for each gene/reaction that links to at least one compound of the
      cluster, saves a PNG time-course plot in ``reaction_files``,
    * if anything was linked, recolours the KEGG map and saves it as a PDF in
      ``pathway_files`` (plus per-gene species plots in ``species_files``).

    Finally the pathway description and the BRITE groups read from
    ``br08901.keg`` are added to the count table.

    :param makeNclusters: number of K-means clusters to loop over
    :param trimPath: iterable of pathway ids; used as the index of the result
    :param forRelatedness: table with ``KEGG`` and ``kmeans`` attributes and
        the columns 'S1'..'S5'
    :param folderName: prefix of the per-cluster output directories
    :param CO_fromMATLAB: passed to ``convertRItoCO``; its ``cNumber`` values
        are consulted when colouring compounds
    :param KO_Norm2Mean: table whose index is consulted when colouring genes
    :param Insitu_TPM_DIA: table looked up by K number for the species plot
    :param Insitu_TPM_DIN: table looked up by K number for the species plot
    :param Insitu_TPM_Oth: table looked up by K number for the species plot
    :return: the ``gatherCounts`` DataFrame
    :raises ValueError: if any of the output directories already exists
    """
    colLabel = ['nCpds','nGenes'] #starting with this is easiest - makes one list, no need to flatten
    for item in range(makeNclusters):
        colLabel.append('Km' + str(item) + '_cpd')
        colLabel.append('Km' + str(item) + '_gene')

    gatherCounts = pd.DataFrame(0, index = trimPath, columns = colLabel)

    #setup the strings to match first
    rnString = re.compile('(?:[rn:R])(\d+)$') #will return R00190
    cpdString = re.compile('(?:[cpd:C])(\d+)$') #will return C00190

    size = 20 #turns out I can increase the size of the compounds in the plots

    for kn in range(makeNclusters):
        fullSet = set(forRelatedness.KEGG)
        oneK = forRelatedness[forRelatedness.kmeans == kn] #get gene & transcript information for one Kmeans group
        getKm = 'Km' + str(kn)

        #check if the directories exist, one for pathway files
        directoryPDF = folderName + str(kn) + '/pathway_files'
        if not os.path.exists(directoryPDF):
            os.makedirs(directoryPDF)
        else:
            raise ValueError('Krista - be careful, this folder already exists')

        #check if the directories exist, one for reaction files
        directoryPNG = folderName + str(kn) + '/reaction_files'
        if not os.path.exists(directoryPNG):
            os.makedirs(directoryPNG)
        else:
            raise ValueError('Krista - be careful, this folder already exists')

        #check if the directories exist, one for species
        directorySpecies = folderName + str(kn) + '/species_files'
        if not os.path.exists(directorySpecies):
            os.makedirs(directorySpecies)
        else:
            raise ValueError('Krista - be careful, this folder already exists')

        for item in trimPath: #searching within one pathway at a time
            plotPathway = [] #gather up yes/no and will only plot if have linked genes/mtabs
            genes = getKfrom_ko(item)
            compounds = getCfrom_ko(item)
            gatherCounts.loc[item,'nCpds'] = len(compounds)
            gatherCounts.loc[item,'nGenes'] = len(genes)
            #have to track genes and compounds differently for the biopython plotting later on
            setG = set(genes)
            setC = set(compounds)
            setB = set(oneK.KEGG)
            intGenes = setG.intersection(setB)
            intCompounds = setC.intersection(setB)
            gatherCounts.loc[item,(getKm + '_gene')] = len(intGenes)
            gatherCounts.loc[item,(getKm + '_cpd')] = len(intCompounds)
            for gen in intGenes: #go through each gene...one at a time
                rnList = kegg_link('reaction',gen).read() #get the list of reactions for that gene
                #can have cases where there is a gene and no reaction (K02906 for example). This returns rnList = '\n'
                #since this is not actually empty...need a few way to filter those out
                test = '\n'
                if test != rnList:
                    for line in rnList.rstrip().split('\n'):
                        # The per-reaction tallies are reset for every reaction line.
                        countCpd = []
                        countGene = []
                        m = rnString.search(line) #get the reaction number
                        cpdList = kegg_link('cpd',m.group(0)).read() #now go get the compounds for that reaction
                        #can have no compounds in a reaction (only glycans, begin with G, nothing I have matched)
                        if len(cpdList) > 1: #will be true if cpdList includes compounds
                            for line2 in cpdList.rstrip().split('\n'):
                                m2 = cpdString.search(line2).group(0)
                                #now that I have a compound, check if it is in intCompounds
                                if m2 in intCompounds:
                                    countCpd.append(m2)
                                    countGene.append(gen)
                                    plotPathway.append('yes')
                        ##Now, plot the PNG files (one for each reaction within a pathway)
                        if len(countCpd) > 0:
                            dayList = ['S1','S2','S3','S4','S5']
                            kData = pd.DataFrame(columns = dayList)
                            for k in set(countGene):
                                kData = kData.append(oneK.ix[k,dayList])
                            cData = pd.DataFrame(columns = dayList)
                            for co in set(countCpd): #convert CO to RI, can have multiple options
                                j = findRInumber(oneK,co)
                                cData = cData.append(oneK.loc[j,dayList])
                            # Compounds are drawn in black, genes in red.
                            fig,ax = plt.subplots(1)
                            cData.T.plot(color = 'k',ax=ax)
                            kData.T.plot(color = 'r',ax=ax)
                            handles, labels = ax.get_legend_handles_labels()
                            #convert the RI numbers to COnumbers for the figure
                            for ia, a in enumerate(labels):
                                #add compound/gene name to the legend
                                if a[0]== 'R':
                                    tLabel = convertRItoCO(CO_fromMATLAB,a)
                                    fn = kegg_list(tLabel).read()
                                    labels[ia] = fn
                                elif a[0] == 'K':
                                    fn = kegg_list(a).read()
                                    labels[ia] = fn
                            ax.legend(handles, labels, bbox_to_anchor = ([-1, 0.5]))
                            fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
                            pngName = 'pathway' + item + '_' + m.group(0) + '.png'
                            fig.savefig(directoryPNG + '/' + pngName, bbox_inches = 'tight')
                            pngName = None #empty it in case that is where I am having issues
                            plt.close()

            if len(plotPathway)>0:
                ## plot the pathway map for this pathway, get details from KEGG for plotting
                useColors = pal.colorbrewer.qualitative.Set1_4.hex_colors
                # NOTE(review): insert() mutates the list returned by the palette
                # attribute; if that list is shared between accesses, one more
                # leading entry is added per plotted pathway - confirm.
                useColors.insert(0,'#f7f7f7') ## insert white at beginning
                # order of colors: white, red, blue,green,purple
                sd = 0 #not in dataset
                sk = 1 #in K means group and pathway
                sa = 2 #in pathway, in any K means (for genes, bc overlap in numbers)
                sn = 3 #in pathway, not in K means group (compounds only)
                su = 4 #unconnected gene or compound
                line1 = useColors[sd] + ', not in dataset' + '\n'
                line2 = useColors[sk] + ', in K means group and pathway' + '\n'
                line3 = useColors[sa] + ', #in pathway, in any K means (for genes, bc overlap in numbers)' +'\n'
                line4 = useColors[sn] + ', #in pathway, not in K means group (compounds only)' + '\n'
                line5 = useColors[su] + ', #unconnected gene or compound' + '\n'
                # The colour legend is rewritten in the current working directory
                # each time a pathway map is plotted.
                file = open("readme_colorsInPathways.txt", "w")
                file.write(line1 + line2 + line3 + line4 + line5)
                file.close()

                pathway = KGML_parser.read(kegg_get(item, "kgml"))
                for element in pathway.orthologs:
                    #print element.name
                    for graphic in element.graphics:
                        tg = element.name[3:9] #skip over the 'ko:'
                        if (tg in intGenes):
                            #in the pathway AND in the set for this particular K means group
                            graphic.bgcolor = useColors[sk] #
                            #if this is something in the pathway, plot up the species for the K number
                            if tg in Insitu_TPM_DIA.index.tolist():
                                Dk=Insitu_TPM_DIA.loc[tg]
                            else:
                                Dk = 0/Insitu_TPM_DIA.iloc[0] #make an empty frame
                            if tg in Insitu_TPM_DIN.index.tolist():
                                Nk=Insitu_TPM_DIN.loc[tg]
                            else:
                                Nk = 0/Insitu_TPM_DIN.iloc[0]
                            if tg in Insitu_TPM_Oth.index.tolist():
                                Ok=Insitu_TPM_Oth.loc[tg]
                            else:
                                Ok = 0/Insitu_TPM_Oth.iloc[0]
                            fig,ax=plt.subplots(1)
                            ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
                            ax.set_xticks(range(5))
                            ax.set_xticklabels([1,2,3,4,5])
                            ax.set_ylabel('In situ TPM')
                            plt.title(tg + ', lt orange=diatoms, blue=dinos, dk orange=other')
                            fig.savefig(directorySpecies + '/' + tg + '_species.png',bbox_inches='tight')
                            plt.close()
                        elif (tg in fullSet) and (tg in genes) and (tg not in intGenes):
                            #in the pathway AND in the set of genes from RI, allow any Kmeans group for genes
                            graphic.bgcolor = useColors[sa] #
                        elif (tg not in fullSet) and (tg in genes) and (tg not in KO_Norm2Mean.index.tolist()):
                            #in the pathway, but *not* in anything from the RI samples
                            graphic.bgcolor = useColors[sd] #
                        elif (tg not in fullSet) and (tg in genes) and (tg in KO_Norm2Mean.index.tolist()):
                            #an unconnected gene in the RI data
                            graphic.bgcolor = useColors[su] #

                # Change the colours of compounds (mostly same as genes
                for element in pathway.compounds:
                    for graphic in element.graphics:
                        tc = element.name[4:10] #skip over the 'cpd:'
                        if (tc in intCompounds):
                            #in the pathway AND in the set for this particular K means group
                            graphic.bgcolor = useColors[sk] #
                            graphic.width = size
                            graphic.height = size
                        elif (tc in fullSet) and (tc in compounds) and (tc not in intCompounds):
                            #in the pathway AND in the set of compounds from RI, but *not* in this Kmeans group
                            graphic.bgcolor = useColors[sn] #
                            graphic.width = size
                            graphic.height = size
                        elif (tc not in fullSet) and (tc in compounds) and (tc not in CO_fromMATLAB.cNumber.values):
                            #in the pathway, but *not* in anything from the RI samples
                            graphic.bgcolor = useColors[sd] #
                        elif (tc not in fullSet) and (tc in compounds) and (tc in CO_fromMATLAB.cNumber.values): #seems like a hack
                            #unconnected compound in the RI data
                            graphic.bgcolor = useColors[su] #
                            graphic.width = size
                            graphic.height = size

                canvas = KGMLCanvas(pathway, import_imagemap=True)
                pdfName = 'mapWithColors_' + str(item) + '.pdf'
                canvas.draw(directoryPDF + '/' + pdfName)
                pdfName = None #empty it in case that is where I am having issues

    #stick the pathway information into gatherCounts before I export...
    #want to export gatherCounts, with the added pathway name as a new column
    gatherCounts['pathwayInfo'] = ''
    gatherCounts['pathwayGroup_A'] = ''
    gatherCounts['pathwayGroup_B'] = ''
    gatherCounts['pathwayGroup_C'] = ''

    #go read in the file from KEGG
    D = glob.glob('br08901.keg') #from http://www.genome.jp/kegg-bin/get_htext?br08901.keg; 3/15/2016
    # If the file is not found, allBRITE stays an empty list and the .loc
    # lookup in the loop below will fail.
    allBRITE=[]
    for idx,nof in enumerate(D):
        allBRITE = ReadBRITEfile(nof)

    #put the pathway name and group into the data frame before exporting it
    for item in gatherCounts.index:
        #if this error appears: IndexError: index 0 is out of bounds for axis 0 with size 0
        #KEGG has updated a pathway, but not the BRITE file (see below for work around)
        pathstr = kegg_list(item).read()
        #this next line splits the string at the '\t', then keeps the piece at index = 1, and strips off the '\n'
        gatherCounts.loc[item,('pathwayInfo')] = pathstr.split('\t')[1].rstrip()
        t = allBRITE.loc[allBRITE['map']==item[2:]]
        #put in a check to see if t.empty ...will be empty if KEGG updated pathway and not BRITE file
        if t.empty is False:
            gatherCounts.set_value(item,'pathwayGroup_A',t['A'].values[0])
            gatherCounts.set_value(item,'pathwayGroup_B',t['B'].values[0])
            gatherCounts.set_value(item,'pathwayGroup_C',t['C'].values[0])

    return gatherCounts
def gatherDetails(enterPathway,folderName,useCO,CO_values):
    """Draw one KEGG pathway map as a PDF with compounds coloured by their values.

    The pathway id ``enterPathway`` is tried first; if fetching it raises, the
    ``ko`` pathway built from characters 3-7 of the id is used instead.
    Compounds of the pathway that also occur in ``useCO`` are coloured from a
    four-colour palette according to their entry in ``CO_values`` and the map
    is written to ``folderName/mapWithColors_<pathway>.pdf``. Nothing is drawn
    when the selected values sum to zero.

    :param enterPathway: pathway id to plot
    :param folderName: output directory, created if missing
    :param useCO: iterable of compound ids for which data exists
    :param CO_values: values looked up by compound id with ``.loc``; the code
        calls ``value_counts`` on the selection - presumably a pandas Series,
        verify against the caller
    :return: None
    """
    #check if the directories exist, one for pathway files
    if not os.path.exists(folderName):
        os.makedirs(folderName)
    #else:
        #raise ValueError('Be careful, this folder already exists')

    #only one pathway at a time
    setKeep = 1
    try:
        kegg_get(enterPathway).read()
    except:
        #use the ko map if there is nothing species specific...this can also fail...
        usePathway = 'ko' + enterPathway[3:8]
        setKeep = 0
        try:
            kegg_get(usePathway).read()
        except:
            # A failure of the fallback is ignored here; usePathway keeps the ko id.
            pass

    if setKeep:
        usePathway = enterPathway

    #get the compounds and genes for this pathway
    genes = getKfrom_ko(usePathway)
    compounds = getCfrom_ko(usePathway)

    #figure out which ones I have data for...
    setG = set(genes)
    setC = set(compounds)
    setT = set(useCO)
    intCompounds = setC.intersection(setT)

    ## plot the pathway map for this pathway, get details from KEGG for plotting (%must be at least 4 colors)
    useColors = pal.colorbrewer.diverging.PuOr_4.hex_colors
    #useColors = pal.colorbrewer.diverging.RdYlBu_11.hex_colors

    #set the color of the mtab based on its value, only scale the values from this particular pathway
    useCOsubset = CO_values.loc[intCompounds]
    cmin = useCOsubset.min() #find min and max...ignore NaN and inf for the moment
    cmax = useCOsubset.replace([np.inf],np.nan).dropna(how = 'all').max()

    size = 20 #increase the size of the compounds in the plots

    #can have all zeros...
    if sum(useCOsubset.dropna())==0:
        pass
        #print('No measured metabolites in pathway ' + usePathway)
    elif len(useCOsubset.value_counts())==1:
        #only two color options: yes/no
        # NOTE(review): ``dummy`` is built but not read again in this branch.
        dummy = useCOsubset.copy(deep = True)
        dummy.replace([np.inf],np.nan,inplace = True)
        # Turn the values into 0 (NaN) / 1 (anything else) flags.
        for idx,item in enumerate(useCOsubset):
            if np.isnan(item):
                useCOsubset.iloc[idx] = int(0)
            else:
                useCOsubset.iloc[idx] = int(1)

        #go get the pathway information and customize the plot
        pathway = KGML_parser.read(kegg_get(usePathway, "kgml")) #no choice in gene color: green

        # Change the colors of compounds
        for element in pathway.compounds:
            for graphic in element.graphics:
                tc = element.name[4:10] #skip over the 'cpd:'
                if (tc in intCompounds):
                    #in the pathway, set the color
                    tempColor = useCOsubset.loc[tc]
                    graphic.bgcolor = useColors[int(tempColor)]
                    graphic.width = size
                    graphic.height = size

        canvas = KGMLCanvas(pathway, import_imagemap=True)
        pdfName = 'mapWithColors_' + str(usePathway) + '.pdf'
        canvas.draw(folderName + '/' + pdfName)
        pdfName = None #empty it in case that is where I am having issues
    else:
        # NOTE(review): ``dummy`` is built but not read again in this branch.
        dummy = useCOsubset.copy(deep = True)
        dummy.replace([np.inf],np.nan,inplace = True)
        for idx,item in enumerate(useCOsubset):
            if np.isnan(item):
                useCOsubset.iloc[idx] = 0
            elif np.isinf(item):
                useCOsubset.iloc[idx] = 10*cmax #make inf 10x the biggest value

        #now, find cmax again...use that downstream
        cmax = useCOsubset.replace([np.inf],np.nan).dropna(how = 'all').max()

        #use histogram to make the bins (complete hack)
        a,bin_edges = np.histogram(useCOsubset,bins = len(useColors)-3,range = (cmin,cmax))

        #now...put zero at beginning and inf at end
        #BUT - can actually have cases with values for all metabolites (novel concept)
        try:
            nz = useCOsubset.value_counts()[0] #count the number of zeros
            a = np.insert(a,0,nz)
            bin_edges = np.insert(bin_edges,0,0)
        except:
            pass

        try:
            nm = useCOsubset.value_counts()[cmax]
            a = np.append(a,nm)
            bin_edges = np.append(bin_edges,cmax)
        except:
            pass

        #then find the index for each number...this will be the index into useColors
        useIdx = np.digitize(useCOsubset,bin_edges)
        color_df = pd.DataFrame({'mtab': useCOsubset,'idx':useIdx})

        #go get the pathway information and customize the plot
        pathway = KGML_parser.read(kegg_get(usePathway, "kgml")) #no choice in gene color: green

        # Change the colors of compounds
        for element in pathway.compounds:
            for graphic in element.graphics:
                tc = element.name[4:10] #skip over the 'cpd:'
                if (tc in intCompounds):
                    #in the pathway, set the color
                    tempColor = color_df.loc[tc,'idx']
                    # The digitize result is shifted down by one before it is
                    # used as an index into useColors.
                    graphic.bgcolor = useColors[int(tempColor)-1]
                    graphic.width = size
                    graphic.height = size

        canvas = KGMLCanvas(pathway, import_imagemap=True)
        pdfName = 'mapWithColors_' + str(usePathway) + '.pdf'
        #Tracer()()
        canvas.draw(folderName + '/' + pdfName)
        pdfName = None #empty it in case that is where I am having issues
def map2highlighted_map(map_id, ko_list, ko2freq, biodb, outpath='test.pdf', taxon_id=False, n_species=60):
    """Draw a KEGG ``ko`` map with ortholog boxes shaded by frequency, then convert it to SVG.

    The KGML of the ``ko`` version of ``map_id`` is downloaded from the KEGG
    REST service. For every ortholog node that has at least one KO in
    ``ko2freq`` the frequencies of those KOs are summed and the node is shaded
    on a 0..``n_species`` scale: green when one of its KOs is in ``ko_list``,
    orange-red otherwise. The map is drawn to ``outpath``, converted with
    inkscape, and the resulting SVG is post-processed by ``edit_svg_map``.

    :param map_id: KEGG map id; 'map' is replaced by 'ko' to build the URL
    :param ko_list: KOs that select the green colour scale
    :param ko2freq: mapping KO id -> frequency (converted with int())
    :param biodb: passed through to ``edit_svg_map``
    :param outpath: path of the PDF to write; the SVG name is derived from it
    :param taxon_id: passed through to ``edit_svg_map``
    :param n_species: upper bound of the colour normalisation
    """
    import re
    from chlamdb.biosqldb import shell_command
    from Bio.Graphics.KGML_vis import KGMLCanvas
    from Bio.Graphics import KGML_vis
    import urllib.request
    from Bio.KEGG.KGML.KGML_pathway import Pathway, Reaction, Relation
    import Bio.KEGG.KGML.KGML_pathway
    from Bio.KEGG.KGML import KGML_parser
    from Bio.Graphics.ColorSpiral import ColorSpiral
    import matplotlib.cm as cm
    from matplotlib.colors import rgb2hex
    import matplotlib as mpl

    # NOTE(review): ``values`` is not read again below.
    values = [float(i) for i in ko2freq.values()]

    # Two colour scales sharing the same 0..n_species normalisation.
    norm = mpl.colors.Normalize(vmin=0, vmax=n_species)
    cmap = cm.OrRd
    cmap2 = cm.Greens
    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    m2 = cm.ScalarMappable(norm=norm, cmap=cmap2)

    url_template = 'http://rest.kegg.jp/get/%s/kgml' % re.sub(
        'map', 'ko', map_id)
    print(url_template)
    f = urllib.request.urlopen(url_template)
    from Bio.Graphics import KGML_vis

    # The decoded XML text, not the handle, is handed to the parser.
    pathway = KGML_parser.read(f.read().decode('UTF-8'))

    kgml_map = KGMLCanvas(pathway, show_maps=True)

    # Let's use some arbitrary colours for the orthologs
    # NOTE(review): ``cs`` is not read again below.
    cs = ColorSpiral(a=2, b=0.2, v_init=0.85, v_final=0.5, jitter=0.03)
    # Loop over the orthologs in the pathway, and change the
    # background colour
    orthologs = [e for e in pathway.orthologs]
    for o in orthologs:
        match = False
        if 'K00163' in o.name:
            print('##################################')
        # Splitting on 'ko:' yields the bare KO ids (plus an empty first piece).
        ko_temp_list = set([i.rstrip() for i in o.name.split('ko:')])
        if len(ko_temp_list.intersection(set(ko2freq.keys()))) > 0:

            ko_keep = []
            for ko in ko_temp_list:
                if ko in ko2freq:
                    ko_keep.append(ko)
                if ko in ko_list:
                    match = True
            o.name = 'ko:' + ' ko:'.join(ko_keep)
            total = sum([
                int(ko2freq[i])
                for i in ko_temp_list.intersection(set(ko2freq.keys()))
            ])

            for g in o.graphics:
                if match:
                    g.bgcolor = rgb2hex(m2.to_rgba(float(total)))
                else:
                    #print 'no match!!!!'
                    #print ko_temp_list
                    #print ko2freq.keys()
                    #print 'TOTAL:', total
                    g.bgcolor = rgb2hex(m.to_rgba(float(total)))
            # NOTE(review): o.name was just rebuilt to start with 'ko:', so
            # split('ko:')[0] is the empty string here and the name becomes
            # " (<total>)" - confirm this is intended.
            o.name = "%s (%s)" % (o.name.split('ko:')[0], total)
        #else:
        #    for g in o.graphics:
        #        g.bgcolor = '#FFFFFF'

    # Default settings are for the KGML elements only

    # We need to use the image map, and turn off the KGML elements, to see
    # only the .png base map. We could have set these values on canvas
    # instantiation
    kgml_map.import_imagemap = True
    kgml_map.show_maps = True
    kgml_map.show_orthologs = True
    kgml_map.draw_relations = False
    kgml_map.show_compounds = False
    kgml_map.show_genes = False
    kgml_map.show_compounds = False
    kgml_map.show_genes = False
    kgml_map.draw(outpath)
    ''' print 'DIRLISAT:', dir(pathway) maps = [m for m in pathway.maps] for map in maps: for g in map.graphics: print g.name '''

    #print re.sub('pdf', 'svg', outpath)
    # re.sub replaces every occurrence of 'pdf' in outpath, not only the
    # extension. outpath is also interpolated into the command string as-is.
    shell_command.shell_command(
        'inkscape %s --export-plain-svg=%s' %
        (outpath, re.sub('pdf', 'svg', outpath)))  # 'pdf2svg %s %s all'
    t = edit_svg_map("%s" % re.sub('pdf', 'svg', outpath),
                     ko2freq.keys(),
                     biodb,
                     map_id,
                     taxon_id=taxon_id)
    #print "%s" % re.sub('pdf', 'svg', outpath)
    t.write("%s" % re.sub('pdf', 'svg', outpath))
def kegg_get_pathway(identifier):
    """Download and parse the KGML of a KEGG pathway.

    Every "path:" in ``identifier`` is removed and every "map" is replaced
    by "hsa" before the KGML is requested through ``KEGG_REST.kegg_get``.

    Args:
        identifier: KEGG pathway identifier string.

    Returns:
        The object produced by ``KEGG_KGML_PARSER.read`` for the download.
    """
    identifier_sanitised = identifier.replace("path:", "").replace(
        "map", "hsa")
    pathway_kgml = KEGG_REST.kegg_get(identifier_sanitised, "kgml")
    # The handle is not needed once parsing has returned (or failed), so
    # close it here instead of leaving the connection open.
    try:
        return KEGG_KGML_PARSER.read(pathway_kgml)
    finally:
        pathway_kgml.close()
# Store the downloaded KGML next to the other pathway files; the context
# manager guarantees the file is flushed and closed.
with open(os.path.join(KEGG_data_folder, identifier + '.kgml'), 'w') as kgml_file:
    kgml_file.write(KGML_handle.read())

#%%
parse_pathways = 1
if parse_pathways:
    pathways = []
    lengths = []
    for filename in os.listdir(KEGG_data_folder):
        if not filename.endswith('kgml'):
            continue
        # Close each KGML file as soon as it has been parsed so the loop does
        # not accumulate open file handles.
        with open(os.path.join(KEGG_data_folder, filename)) as KGML_handle:
            pathway = KGML_parser.read(KGML_handle)

        # Directed graph with one edge per KGML relation (entry1 -> entry2).
        net = nx.DiGraph()
        for relation in pathway.relations:
            net.add_edge(relation.entry1.id, relation.entry2.id)
        lengths.append(len(net))

        # Only networks with more than 5 and fewer than 100 nodes are used.
        if 5 < len(net) < 100:
            # Transposed adjacency matrix of the network.
            gold_Jac = nx.adj_matrix(net).toarray().T
            # Node sets of all simple cycles; maybe needed later to correlate
            # results with the number of nodes that are in loops.
            cycle_list = [set(cycle) for cycle in nx.simple_cycles(net)]
            # Find distinct aggregated cycles.
            aggregated_cycles = []
            while cycle_list:
                aggr_cycle = cycle_list.pop()
# Statements are batched into transactions; commit() decides, based on these
# counters, when the current transaction is flushed.
max_statements = 200
num_statements = 0
total_num_statements = 0
tx = graph.cypher.begin()

for filename in files:
    url = datapath + filename
    print("Loading file " + url)
    currentNodes = set()

    # KGML_parser.read takes the handle only. The stray 'r' that used to be
    # passed as a second argument looked like a file mode meant for open(),
    # whose default mode is already 'r'.
    with open(url) as f:
        pathway = KGML_parser.read(f)

    # Maps from the reaction id (e.g. rn:R05134) to the Node Objects that are
    # part of this reaction
    reactionToNode = {}
    # Maps from the internal pathway id (e.g. 23) to the compound id
    # (e.g. C00010)
    compoundDict = {}

    for gene in pathway.genes:
        for geneName in gene.name.split(" "):
            # Drop the first four characters of each name
            # (presumably an organism prefix such as 'hsa:' -- confirm).
            gene_id = geneName[4:]
            if gene_id not in nodes:
                add_node(tx, ["_Network_Node", "Gene"], gene_id,
                         {"name": gene_id, "idType": "ENTREZ"})
                num_statements += 1
                total_num_statements += 1
                # Start a fresh transaction whenever commit() reports that it
                # flushed the current one.
                if commit(tx, num_statements, max_statements,
                          total_num_statements):
                    num_statements = 0
                    tx = graph.cypher.begin()
def _print_names_and_x(entries):
    """Print each entry's name followed by the x of each of its graphics."""
    for entry in entries:
        print(entry.name)
        for glyph in entry.graphics:
            print(glyph.x)


# Take the second tab-separated column of every row of the KEGG organism
# listing as the organism code.
organisms = REST.kegg_list("organism").read()
organismlist = [row.split("\t")[1] for row in organisms.rstrip().split("\n")]

# Download and parse the KGML of pathway hsa01100.
human_map = KGML_parser.read(REST.kegg_get("hsa01100", option="kgml"))

# Dump name and graphics x-coordinates, first for the compounds, then for
# the reaction entries.
cpds = human_map.compounds
_print_names_and_x(cpds)

rxns = human_map.reaction_entries
_print_names_and_x(rxns)
def test_parse_remote_pathway(self):
    """Fetch the ko03070 KGML from the KEGG server and check the parsed name."""
    expected_name = "path:ko03070"
    with kegg_get("ko03070", "kgml") as remote_handle:
        parsed = KGML_parser.read(remote_handle)
    self.assertEqual(parsed.name, expected_name)