def download_all_kegg_pathways(species_code='mmu'):
    """Download every KEGG pathway of a species together with its genes.

    Args:
        species_code: KEGG organism code handed to ``REST.kegg_list``
            (defaults to ``'mmu'``).

    Returns:
        dict keyed by pathway id (first tab-separated column of the
        listing).  Each value is a dict with the keys ``'name'`` (second
        column of the listing), ``'geneid'`` (set of int) and
        ``'gene_symbol'`` (set of str).
    """
    pathways_str = REST.kegg_list("pathway", species_code).read()
    pathways = {}
    for row in pathways_str.rstrip().split('\n'):
        if not row:
            # An empty listing splits into [''] and would otherwise raise
            # IndexError when the name column is read.
            continue
        fields = row.split('\t')
        pathways[fields[0]] = {'name': fields[1]}

    def get_genes_for(pathways):
        """Fill 'geneid' and 'gene_symbol' of every pathway in place."""
        for pathway in pathways:
            pathways[pathway]['geneid'] = set()
            pathways[pathway]['gene_symbol'] = set()
            pathway_file = REST.kegg_get(pathway).read()  # query and read each pathway

            # Walk through the flat file, keeping track of which section
            # we are in; only the GENE section is read.
            current_section = None
            for line in pathway_file.rstrip().split("\n"):
                section = line[:12].strip()  # section names are within 12 columns
                if section:
                    current_section = section
                if current_section != "GENE":
                    continue
                try:
                    gene_identifiers, _ = line[12:].split("; ")[:2]
                    geneid, gene_symbol = gene_identifiers.split()
                    geneid = int(geneid)
                except ValueError:
                    # Line does not look like "<int id> <symbol>; <description>":
                    # discard it (best effort), but let anything else propagate.
                    continue
                pathways[pathway]['geneid'].add(geneid)
                pathways[pathway]['gene_symbol'].add(gene_symbol)

    get_genes_for(pathways)
    return pathways
def load_taxon_id(self, in_code=None):
    '''Return the taxon id for an organism code, looked up directly in KEGG.

    The result is cached on ``self.taxon_id``; when that attribute is already
    set it is returned without any network access.

    Args:
        in_code: organism code matched against column 1 of the KEGG
            ``organism`` listing.  When ``None`` it is taken from
            ``self.tables['run_infos'].organism[0]``.

    Returns:
        int: the value parsed from the matching row of the KEGG ``genome``
        listing.

    Raises:
        IndexError: if no row matches the code in either listing.

    TODO: cache the org_table and gen_table in cache/
    '''
    if self.taxon_id is not None:
        return self.taxon_id

    # Imported lazily so the cached path above needs no BioPython.
    import Bio.KEGG.REST as kegg  # requires BioPython 1.65 or later!

    if in_code is None:
        in_code = self.tables['run_infos'].organism[0]

    def _fetch_table(database):
        """Download one KEGG listing and parse it as a header-less TSV."""
        raw = ''.join(kegg.kegg_list(database).readlines())
        buf = cStringIO.StringIO(raw)
        try:
            return pd.read_table(buf, sep='\t', header=None)
        finally:
            buf.close()

    # ``.loc`` replaces the removed ``.ix`` indexer; with a boolean mask the
    # two select exactly the same rows.
    org_table = _fetch_table('organism')
    kegg_code = org_table.loc[org_table[1] == in_code][0].values[0]

    gen_table = _fetch_table('genome')
    genome_row = gen_table.loc[gen_table[0] == 'genome:' + kegg_code]
    # The id is the third ", "-separated item of column 1, up to the first "; ".
    taxon_id = int(genome_row[1].values[0].split(', ')[2].split('; ')[0])

    self.taxon_id = taxon_id
    return taxon_id
def get(dbentry, option=None):
    """Fetch a KEGG entry through ``REST.kegg_get`` (written as a static method).

    Args:
        dbentry: entry identifier forwarded to ``REST.kegg_get``.
        option: only selects the return type; it is NOT forwarded to KEGG.
            NOTE(review): confirm that this is intended and that callers do
            not expect e.g. ``option="aaseq"`` to reach the REST call.

    Returns:
        A ``KEGG_Entry`` built from the response text when ``option`` is
        ``None``; otherwise the response text encoded to ``bytes``.
    """
    raw = REST.kegg_get(dbentry).read()
    if option is None:
        return KEGG_Entry(raw)
    return str.encode(raw)
def add_kegg_descript2(hit):
    """Look up module, KO number and annotation of a gene hit (best effort).

    Every piece that cannot be retrieved or parsed is reported as the string
    ``"none"``; the function never raises because of a failed KEGG lookup.

    Args:
        hit: gene identifier passed to ``REST.kegg_find`` / ``REST.kegg_link``.

    Returns:
        The result of ``strip_lines_list([module, KEGG, ann])`` where ``ann``
        has first been passed through ``reduce_func_len``.
    """
    try:
        desc = REST.kegg_find("genes", hit).read()
    except Exception:
        # The lookup itself failed: nothing can be derived.
        module = KEGG = ann = "none"
    else:
        try:
            KEGG = re.search(r"K[0-9]{5}", desc).group(0)
        except Exception:
            KEGG = "none"

        try:
            # Bug fix: the original called .replace() on the match object
            # (which has no such method), so this branch always failed and
            # the fallback below was used even when a K number was present.
            ann = re.search(r"(?<=K[0-9]{5}).*", desc).group(0).replace("\n", "")
        except Exception:
            try:
                # No K number: use the text between the first tab and the
                # first ";" instead.
                ann = desc.split("\t")[1].split(";")[0].replace("\n", "")
            except Exception:
                ann = "none"

        try:
            mod = REST.kegg_link('module', hit).read()
            module = mod.split(":")[2].split("_")[-1].replace("\n", "")
        except Exception:
            module = "none"

    ann = reduce_func_len(ann)
    return strip_lines_list([module, KEGG, ann])
def add_kegg_descript2(hit):
    """Return module, KO number and annotation for a gene hit (best effort).

    Any value that cannot be fetched or parsed is replaced by ``"none"``.

    Args:
        hit: gene identifier passed to ``REST.kegg_find`` / ``REST.kegg_link``.

    Returns:
        ``strip_lines_list([module, KEGG, ann])``, with ``ann`` shortened by
        ``reduce_func_len`` beforehand.
    """
    try:
        desc = REST.kegg_find("genes", hit).read()
    except Exception:
        # Without the description none of the three values can be derived.
        module = KEGG = ann = "none"
    else:
        # KO number: first "K" followed by five digits.
        try:
            KEGG = re.search(r"K[0-9]{5}", desc).group(0)
        except Exception:
            KEGG = "none"

        # Annotation: rest of the line after the KO number.
        try:
            # Bug fix: .replace() used to be applied to the match object
            # instead of the matched text, which raised on every call and
            # silently forced the fallback parsing below.
            matched = re.search(r"(?<=K[0-9]{5}).*", desc)
            ann = matched.group(0).replace("\n", "")
        except Exception:
            try:
                ann = desc.split("\t")[1].split(";")[0].replace("\n", "")
            except Exception:
                ann = "none"

        # Module: parsed from the link listing of the hit.
        try:
            mod = REST.kegg_link('module', hit).read()
            module = mod.split(":")[2].split("_")[-1].replace("\n", "")
        except Exception:
            module = "none"

    ann = reduce_func_len(ann)
    return strip_lines_list([module, KEGG, ann])
def kooo(cccc):
    """Return the KEGG gene links of the KO entry that matches a gene query.

    Steps: search ``genes`` for ``cccc``, take the first identifier of the
    result, look up its ``ko`` link, then list all genes linked to that KO.

    Args:
        cccc: query string passed to ``REST.kegg_find("genes", ...)``.

    Returns:
        str: raw text of ``REST.kegg_link("genes", <ko id>)``, or ``""`` when
        the search returns no identifier or no KO link is found.
    """
    found = REST.kegg_find("genes", cccc).read()
    gene_ids = re.findall(r'^\S+', found)
    if not gene_ids:
        # Empty search result: previously an IndexError.
        return ""

    ko_links = REST.kegg_link("ko", gene_ids[0]).read()
    print("xxx", ko_links)
    if len(ko_links) < 4:
        return ""

    ko_ids = re.findall(r'ko:\S+', ko_links)
    if not ko_ids:
        return ""
    return REST.kegg_link("genes", ko_ids[0]).read()
def get_kegg(self, K_number):
    """Store the KEGG description of ``K_number`` in ``kegg_reference``.

    Nothing is done when the table already holds a row for ``K_number``.
    Otherwise the listing is fetched and split into identifier, name and EC
    number.  A row is inserted and committed even when fetching or parsing
    fails; the columns that could not be filled are stored as NULL and the
    error is written to stderr.
    """
    print(K_number)
    self.cur.execute(
        """SELECT K_number from kegg_reference where K_number = ?""",
        (K_number, ))
    already_stored = self.cur.fetchone()
    if already_stored:
        return

    name = None
    identifier = None
    ec_number = None
    try:
        listing = REST.kegg_list(K_number).read()
        # Everything after the first tab: "<identifier>;<definition>".
        description = listing.strip().split("\t", 1)[1]
        identifier, definition = description.split(";", 1)
        # "<name>[EC:<numbers>]" -> ["<name>", "<numbers>"]
        name_and_ec = definition.rstrip("]").split("[EC:")
        name = name_and_ec[0]
        ec_number = name_and_ec[1] if len(name_and_ec) == 2 else None
    except Exception as e:
        sys.stderr.write("\t".join([K_number, str(e)]))
    finally:
        self.cur.execute(
            """INSERT OR IGNORE INTO kegg_reference(K_number, name, identifier, ec_number) VALUES(?,?,?,?)""",
            (K_number, name, identifier, ec_number))
        sys.stderr.write(str(self.cur.lastrowid) + "\n")
        self.conx.commit()
def _download_links(self, dbs=("pathway", "enzyme", "reaction", "compound")):
    """Download the link tables between KEGG databases and save them as JSON.

    For every ordered pair ``(sourcedb, targetdb)`` of ``dbs`` the link
    listing is fetched and written to
    ``<self.path>/links/<sourcedb>_<targetdb>.json`` as a mapping from the
    first column of the listing to the list of second-column ids.

    Args:
        dbs: KEGG database names to cross-link (any iterable; the default is
            an immutable tuple instead of a shared mutable list).

    Returns:
        None.  The mappings are written to disk, not returned.
    """
    links_path = os.path.join(self.path, 'links')
    for sourcedb, targetdb in itertools.permutations(dbs, 2):
        links_raw = REST.kegg_link(targetdb, sourcedb)

        mapping = {}
        for row in links_raw.read().splitlines():
            fields = row.split('\t')
            if len(fields) < 2:
                # Blank or malformed row: nothing to record.
                continue
            mapping.setdefault(fields[0], []).append(fields[1])

        # Write json of the collected links
        if not os.path.exists(links_path):
            os.makedirs(links_path)
        link_path = os.path.join(links_path, sourcedb + "_" + targetdb + ".json")
        with open(link_path, 'w') as f:
            json.dump(mapping, f, indent=2)
def get_kegg_gene_to_external_map(species):
    """Build a table linking each KEGG gene id to its external gene names.

    Legacy function kept for goverlap; deprecated.

    Each listing line contributes one row per ", "-separated name found in
    its second field.  The returned DataFrame has the columns
    ``entrezgene`` (first field with the ``<species>:`` prefix removed) and
    ``gene``.
    """
    gene_listing = REST.kegg_list(species)
    prefix_or_newline = re.compile(r"{}:|\n".format(species))
    field_pattern = re.compile(r"[^\t;\n]+")

    records = []
    for raw_line in gene_listing:
        # Lines may arrive as bytes; text has no .decode and is used as is.
        try:
            raw_line = raw_line.decode("utf-8")
        except AttributeError:
            pass
        fields = field_pattern.findall(prefix_or_newline.sub("", raw_line))
        records.extend(
            {"entrezgene": fields[0], "gene": alias}
            for alias in fields[1].split(", ")
        )

    return DataFrame.from_dict(records)
def extractGeneFromPathway(pathway):
    """Fetch one pathway from KEGG and extract its genes.

    The flat file is downloaded with ``REST.kegg_get`` and parsed by
    ``extractGeneFromLocalPathway``, whose result is returned unchanged.
    """
    # e.g. pathway = 'path:hsa00230'
    message = 'query http://rest.kegg.jp/get/path:%s' % pathway
    print(message)

    # Query and read the pathway; the section-by-section parsing of the
    # file is done by the local-file helper.
    response = REST.kegg_get(pathway)
    return extractGeneFromLocalPathway(response.read())
def add_kegg_descript(hit):
    """Return ``[KO number, annotation]`` for a gene hit.

    The KO number is the first ``K`` followed by five digits in the search
    result; the annotation is the remainder of that line.  Raises
    ``AttributeError`` when the result contains no KO number.
    """
    listing = REST.kegg_find("genes", hit).read()

    ko_match = re.search(r"K[0-9]{5}", listing)
    ko_number = ko_match.group(0)

    tail_match = re.search(r"(?<=K[0-9]{5}).*", listing)
    annotation = tail_match.group(0)

    return [ko_number, annotation]
def add_kegg_descript(hit):
    """Look up a gene hit and return its KO number and annotation as a list.

    Both values come from the text of ``REST.kegg_find("genes", hit)``: the
    first ``K#####`` token, and whatever follows it on the same line.  An
    ``AttributeError`` is raised if no such token is present.
    """
    search_text = REST.kegg_find("genes", hit).read()
    found = []
    for pattern in (r"K[0-9]{5}", r"(?<=K[0-9]{5}).*"):
        found.append(re.search(pattern, search_text).group(0))
    return found
def __get_all_information(self):
    """Populate ``self.modules`` or ``self.reactions`` for this pathway.

    When ``self.byModules`` is true, the flat-file text returned by
    ``self.get_raw_data()`` is scanned and the second whitespace-separated
    field of every line of the MODULE section is collected into
    ``self.modules``.  Otherwise the reactions linked to ``path:<self.id>``
    are fetched and their ids (the part after ``:`` in the second column)
    are collected into ``self.reactions``.
    """
    if self.byModules:
        info = self.get_raw_data().split("\n")
        self.modules = []
        in_module_section = False
        for text in info:
            if text.startswith("MODULE"):
                in_module_section = True
            elif text[:1].strip():
                # Any other line starting in column 0 opens a new section
                # (or is the "///" terminator).  The original only stopped on
                # DISEASE/DBLINKS/REFERENCE/KO_PATHWAY and therefore kept
                # collecting ids from any other section following MODULE.
                in_module_section = False
            if not in_module_section:
                continue
            fields = re.split(r'\s+', text)
            if len(fields) > 1:  # blank lines carry no module id
                self.modules.append(fields[1])
    else:
        data = kegg_api.kegg_link("reaction", "path:" + self.id)
        self.reactions = []
        for line in data:
            line = line.strip()
            if not line:
                # An empty listing line has no columns to split.
                continue
            reaction = line.split("\t")[1].split(":")[1]
            self.reactions.append(reaction)
def create_id_name_dict(db):
    """Return a dict mapping each id listed for ``db`` to its name.

    The id is the first and the name the second tab-separated column of
    every line of ``REST.kegg_list(db)``.
    """
    listing = REST.kegg_list(db).read()
    rows = [entry.split('\t') for entry in listing.splitlines()]
    return {columns[0]: columns[1] for columns in rows}
def get_kegg_annotation():
    """Fetch the KEGG annotation of ec:5.4.2.2 and print its class names.

    The entry is written to ``ec_5.4.2.2.txt`` in the working directory,
    parsed back with ``Enzyme.parse`` and the ``classname`` of the first
    record is printed.

    :return: None
    """
    filename = "ec_5.4.2.2.txt"
    request = REST.kegg_get("ec:5.4.2.2")
    # Context managers make sure both handles are closed (and the data is
    # flushed) before the file is read back.
    with open(filename, "w") as out:
        out.write(request.read())
    with open(filename) as handle:
        # Materialise the records while the file is still open.
        record = list(Enzyme.parse(handle))[0]
    print(record.classname)
def kegg_search(database, query):
    """Search a KEGG database and return the ids of all hits.

    Args:
        database: KEGG database name passed to ``KEGG_REST.kegg_find``.
        query: search text; spaces are replaced by ``+`` before the request.

    Returns:
        list of str: the first tab-separated column of every result line,
        or ``[]`` when the response is empty or starts with an empty line.
    """
    response = KEGG_REST.kegg_find(database, query.replace(" ", "+"))
    # splitlines() drops only the line terminators, so a completely empty
    # response no longer raises IndexError and a final hit without trailing
    # newline is no longer discarded.
    lines = response.read().splitlines()
    if not lines or lines[0] == "":
        return []
    return [line.split('\t')[0] for line in lines]
def save_all_kegg_pathway_files(paths):
    """Download and save every KEGG pathway file of the given species.

    Each pathway listed for a species is fetched through the KEGG REST API
    and written to ``<directory>/<pathway id with ':' replaced by '_'>.txt``.
    The directory is created on demand.

    Args:
        paths (dict of str:str): A mapping from species reference to the
            output directory used for that species.
    """
    for species, out_dir in paths.items():
        for listing_row in REST.kegg_list("pathway", species):
            # The pathway id is the first whitespace-separated token.
            pathway_id = listing_row.split()[0]
            contents = REST.kegg_get(dbentries=pathway_id).read()

            # Target file inside the species directory.
            file_stem = pathway_id.replace(":", "_")
            target = os.path.join(out_dir, "{}.txt".format(file_stem))
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            with open(target, "w") as outfile:
                outfile.write(contents)
def make_kegg(ec_list):
    """Download the KEGG entry of every EC number into ``Kegg\\<ec>.txt``.

    Entries whose file already exists are not downloaded again.  Failures
    are reported on stdout and do not stop the loop.

    Args:
        ec_list: iterable of entry identifiers passed to ``REST.kegg_get``.
    """
    from Bio.KEGG import REST
    for ec in ec_list:
        print("Tratar de kegg...")
        try:
            keggname = "Kegg\\" + ec + ".txt"
            if not os.path.isfile(keggname):
                # Fetch first: a failed request must not leave an empty file
                # behind, because that file would block every later retry.
                payload = REST.kegg_get(ec).read()
                if not isinstance(payload, bytes):
                    # The file is opened in binary mode; text responses
                    # would otherwise fail with TypeError on write.
                    payload = payload.encode("utf-8")
                with open(keggname, 'wb') as out:
                    out.write(payload)
            print("Kegg SUCCESS!!!")
        except Exception:
            # Best effort per entry; KeyboardInterrupt/SystemExit still
            # propagate so the loop can be aborted.
            print('kegg request failed or file already exists')
def queryAllPathway(fpathway=None, fpathwayInfo=None, hsa='hsa'):
    """List all KEGG pathways of an organism and optionally save them.

    Args:
        fpathway: if given, the list of pathway ids is saved with
            ``saveList(ids, fpathway)``.
        fpathwayInfo: if given, the list of ``(id, description)`` tuples is
            saved with ``saveList(info, fpathwayInfo)``.
        hsa: KEGG organism code (defaults to ``'hsa'``).

    Returns:
        list of str: pathway ids without any ``<db>:`` prefix.
    """
    listing = REST.kegg_list("pathway", hsa).read()
    repair_pathways = []
    repair_pathways_info = []
    for line in listing.rstrip().split("\n"):
        if not line:
            # Empty listing: nothing to parse.
            continue
        entry, description = line.split("\t", 1)
        # Take the part after the last ':' so ids both with a database
        # prefix ("path:hsa00010") and without one ("hsa00010") are handled;
        # indexing [1] raised IndexError for the latter.
        entry = entry.split(':')[-1]
        repair_pathways.append(entry)
        repair_pathways_info.append((entry, description))
    if fpathway:
        saveList(repair_pathways, fpathway)
    if fpathwayInfo:
        saveList(repair_pathways_info, fpathwayInfo)
    return repair_pathways
def get_EC_num(geneName):
    """Return the EC numbers associated with a gene as a comma-joined string.

    The gene is searched in KEGG ``genes``; the first result line containing
    ``eco:`` supplies the enzyme name (text between the first tab and the
    first ``;``).  That name is then searched in KEGG ``enzyme`` and the id
    of every hit (text between the first ``:`` and the first tab) is
    collected.  All text is lower-cased before parsing.

    Args:
        geneName: gene name passed to ``REST.kegg_find('genes', ...)``.

    Returns:
        str: EC numbers joined by ``,``; ``''`` when no ``eco:`` line is
        found, when a line cannot be parsed, or when there are no hits.
    """
    # retrieve gene data from KEGG
    gene_lines = REST.kegg_find('genes', geneName).read().lower().splitlines()

    # find the first line mentioning 'eco' to get the enzyme name
    eco_line = next((line for line in gene_lines if 'eco:' in line), '')
    if eco_line == '':
        return ''
    try:
        enzyme_name = eco_line[eco_line.index('\t') + 1:eco_line.index(';')]
    except ValueError:
        # No tab or no ';' on the line: treat like every other parse
        # failure in this function instead of raising.
        return ''

    # find enzyme name in KEGG and get the EC numbers associated to it
    enzyme_lines = REST.kegg_find('enzyme', enzyme_name).read().lower().splitlines()
    ec_numbers = []
    for line in enzyme_lines:
        try:
            ec_numbers.append(line[line.index(':') + 1:line.index('\t')])
        except ValueError:
            return ''
    return ','.join(ec_numbers)
def find_kegg(genes):
    """Find, for every gene, the ``lpl`` KEGG pathways whose file mentions it.

    A gene matches a pathway when its text occurs as a substring of any line
    of the pathway's flat file.

    Args:
        genes: iterable of gene identifier strings.

    Returns:
        dict mapping each matched gene to the matching pathway id.  The value
        is a single ``str`` when exactly one pathway matched and a ``list``
        of ids when several matched (shape kept for existing callers).
    """
    count = 0
    lpl_pathways = REST.kegg_list("pathway", "lpl").read()
    entries = []
    for line in lpl_pathways.rstrip().split("\n"):
        entry, description = line.split("\t")
        entries.append(entry)
    print(entries)

    # Each pathway file is downloaded once and reused for every gene; the
    # original re-downloaded every file for every gene.
    pathway_lines = {}
    pathway = {}
    for i in genes:
        for entry in entries:
            count += 1
            if entry not in pathway_lines:
                pathway_lines[entry] = REST.kegg_get(entry, option=None).readlines()
            if not any(i in s for s in pathway_lines[entry]):
                continue
            print(entry)
            print(i)
            if i in pathway:
                # The gene is already a key: turn its value into a list of
                # all pathways found so far and add the new pathway to it.
                known = pathway[i]
                merged = list(known) if isinstance(known, list) else [known]
                if entry not in merged:
                    merged.append(entry)
                pathway[i] = merged
            else:
                # First match: add the gene as a new key.
                pathway[i] = entry
                print(pathway, count)
    print(pathway)
    return pathway
def update_files(base_dir="/data/databases/kegg/"):
    """Refresh the local KEGG listings and download missing pathway KGML files.

    Writes ``<db>.txt`` for the pathway, ko, cpd and brite listings, then for
    every ``map`` pathway of ``pathway.txt`` downloads ``ko<number>`` in KGML
    format to ``<base_dir>ko/ko<number>.kgml`` unless that file exists.

    Args:
        base_dir: output directory prefix; it is concatenated as is, so it
            must end with a path separator.
    """
    for db in ["pathway", "ko", "cpd", "brite"]:
        # Fetch before opening so a failed request does not truncate the
        # previous copy of the listing.
        data = REST.kegg_list(db).read()
        with open(base_dir + db + ".txt", "w") as h:
            h.write(data)
    # wget http://www.kegg.jp/kegg-bin/download_htext?htext=br08901.keg&format=json&filedir=
    # wget http://www.kegg.jp/kegg-bin/download_htext?htext=br08001.keg&format=json&filedir=

    with open(base_dir + "pathway.txt") as listing:
        L = list(listing)
    for pathway in tqdm(L):
        fields = pathway.split()
        if not fields:
            continue
        # Accept ids with or without a "<db>:" prefix ("path:map00010",
        # "map00010"); anything without "map" is skipped.
        _, found, number = fields[0].rpartition("map")
        if not found:
            continue
        pw = "ko" + number
        kgmlpath = base_dir + "ko/" + pw + ".kgml"
        if os.path.exists(kgmlpath):
            continue
        try:
            data = REST.kegg_get(pw, option="kgml").read()
        except Exception:
            # Best effort: a pathway whose KGML cannot be fetched is skipped.
            # The file is only created after a successful download, so the
            # next run retries instead of finding an empty file.
            continue
        with open(kgmlpath, "w") as h:
            h.write(data)
        sleep(1)  # pause between successful downloads
def query_reversible_reaction(reaction_list):
    """Select the reversible reactions from a list of KEGG reaction ids.

    A reaction is reported once for every line of its KEGG entry that starts
    with ``EQUATION`` and contains ``<=>``.

    input: list of reactions (list), e.g. ["R00709"]
    output: list of reversible reactions (list)
    """
    reversible = []
    for reaction in reaction_list:
        entry_text = REST.kegg_get(reaction).read()
        entry_lines = entry_text.rstrip().split("\n")
        reversible.extend(
            reaction
            for entry_line in entry_lines
            if entry_line.startswith("EQUATION") and "<=>" in entry_line
        )
    return reversible
def get_kegg_path_to_gene_map(species):
    """Build a table linking KEGG pathways to the genes of ``species``.

    Returns a DataFrame with the columns ``kegg_pathway`` and ``kegg_gene``,
    taken from the first two tab-separated columns of the link listing after
    the ``path:<species>`` and ``<species>:`` prefixes have been removed.
    """
    link_lines = REST.kegg_link(species, "pathway")
    strip_prefixes = re.compile(r"path:{0}|{0}:|\n".format(species))

    records = []
    for raw_line in link_lines:
        columns = strip_prefixes.sub("", raw_line).split("\t")
        records.append({"kegg_pathway": columns[0], "kegg_gene": columns[1]})

    return DataFrame.from_dict(records)
def get_genes_for(pathways):
    """Fill in the genes of every pathway, in place.

    For each key of ``pathways`` the KEGG flat file is fetched and the GENE
    section parsed; the gene ids and symbols found are stored as sets under
    ``pathways[key]['geneid']`` (int) and ``pathways[key]['gene_symbol']``
    (str).

    Args:
        pathways: dict keyed by pathway id whose values are dicts.

    Returns:
        None.
    """
    for pathway in pathways:
        pathways[pathway]['geneid'] = set()
        pathways[pathway]['gene_symbol'] = set()
        pathway_file = REST.kegg_get(pathway).read()  # query and read each pathway

        # Walk through the flat file, keeping track of which section we are
        # in; only the GENE section is read.
        current_section = None
        for line in pathway_file.rstrip().split("\n"):
            section = line[:12].strip()  # section names are within 12 columns
            if section:
                current_section = section
            if current_section != "GENE":
                continue
            try:
                gene_identifiers, _ = line[12:].split("; ")[:2]
                geneid, gene_symbol = gene_identifiers.split()
                geneid = int(geneid)
            except ValueError:
                # Line is not "<int id> <symbol>; <description>": discard it.
                # Only parse failures are ignored; other errors propagate.
                continue
            pathways[pathway]['geneid'].add(geneid)
            pathways[pathway]['gene_symbol'].add(gene_symbol)
def get_pathway_to_definition_map(species):
    """Build a table linking each KEGG pathway of ``species`` to its definition.

    Returns a DataFrame with the columns ``kegg_pathway`` (id with the
    ``path:<species>`` prefix removed) and ``kegg_pathway_definition`` (the
    description up to the first " - ").
    """
    pathway_listing = REST.kegg_list("pathway", species)
    strip_prefix = re.compile(r"path:{}|\n".format(species))

    records = []
    for raw_line in pathway_listing:
        pathway, definition = strip_prefix.sub("", raw_line).split("\t")
        records.append({
            "kegg_pathway": pathway,
            # Everything after " - " is the species suffix.
            "kegg_pathway_definition": definition.split(" - ")[0],
        })

    return DataFrame.from_dict(records)
def get_kegg_gene_to_external_map(species):
    """Build a table linking each KEGG gene id to its external gene names.

    Every listing line yields one row per ", "-separated name in its second
    field.  The returned DataFrame has the columns ``kegg_gene`` (first
    field with the ``<species>:`` prefix removed) and ``gene``.
    """
    gene_listing = REST.kegg_list(species)
    prefix_or_newline = re.compile(r"{}:|\n".format(species))
    field_pattern = re.compile(r"[^\t;\n]+")

    records = []
    for raw_line in gene_listing:
        fields = field_pattern.findall(prefix_or_newline.sub("", raw_line))
        records.extend(
            {"kegg_gene": fields[0], "gene": alias}
            for alias in fields[1].split(", ")
        )

    return DataFrame.from_dict(records)
def main(pathway):
    """Return promotor-region accession names for the genes of a KEGG pathway.

    The pathway flat file is fetched and its GENE section parsed; every gene
    symbol contributes one entry ``promotor_region_<symbol>_<id>``.

    Args:
        pathway: pathway identifier passed to ``REST.kegg_get``.

    Returns:
        list of str, one entry per distinct gene symbol, in file order.

    Raises:
        ValueError: if a GENE line is not of the form
            ``<id> <symbol>; <description>``.
    """
    # Single-argument print() behaves the same under Python 2 and 3.
    print("Fetching gene names related to pathway %s from the current KEGG database..." % pathway)
    promotor_gene_accessions = []
    # Symbols already emitted.  The original tested the bare symbol against
    # the list of prefixed accession strings, which could never match, so
    # duplicates were not filtered.
    seen_symbols = set()
    pathway_file = REST.kegg_get(pathway).read()  # query and read the pathway

    # Walk through the flat file, keeping track of which section we are in;
    # only the GENE section is read.
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # maxsplit=1: a description containing "; " must not break the unpack.
        gene_identifiers, _ = line[12:].split("; ", 1)
        gene_id, gene_symbol = gene_identifiers.split()
        if gene_symbol in seen_symbols:
            continue
        seen_symbols.add(gene_symbol)
        promotor_gene_accessions.append(
            "promotor_region_" + gene_symbol + "_" + gene_id)
    return promotor_gene_accessions
def t_KEGG_Query():
    """Exercise the Bio.KEGG REST wrapper with a fixed series of queries.

    Every query is issued, its response is read completely and the URL that
    was requested is printed.
    """
    print("Testing Bio.KEGG.query\n\n")

    # (REST function name, positional arguments), issued in this order.
    queries = [
        # info tests
        ("kegg_info", ("kegg",)),
        ("kegg_info", ("pathway",)),
        # list tests
        ("kegg_list", ("pathway",)),
        ("kegg_list", ("pathway", "hsa")),
        ("kegg_list", ("organism",)),
        ("kegg_list", ("hsa",)),
        ("kegg_list", ("T01001",)),
        ("kegg_list", ("hsa:10458+ece:Z5100",)),
        ("kegg_list", (["hsa:10458", "ece:Z5100"],)),
        ("kegg_list", ("cpd:C01290+gl:G00092",)),
        ("kegg_list", (["cpd:C01290", "gl:G00092"],)),
        ("kegg_list", ("C01290+G00092",)),
        ("kegg_list", (["C01290", "G00092"],)),
        # find tests
        ("kegg_find", ("genes", "shiga+toxin")),
        ("kegg_find", ("genes", ["shiga", "toxin"])),
        ("kegg_find", ("compound", "C7H10O5", "formula")),
        ("kegg_find", ("compound", "O5C7", "formula")),
        ("kegg_find", ("compound", "174.05", "exact_mass")),
        ("kegg_find", ("compound", "300-310", "mol_weight")),
        # get tests
        ("kegg_get", ("cpd:C01290+gl:G00092",)),
        ("kegg_get", (["cpd:C01290", "gl:G00092"],)),
        ("kegg_get", ("C01290+G00092",)),
        ("kegg_get", (["C01290", "G00092"],)),
        ("kegg_get", ("hsa:10458+ece:Z5100",)),
        ("kegg_get", (["hsa:10458", "ece:Z5100"],)),
        ("kegg_get", ("hsa:10458+ece:Z5100", "aaseq")),
        ("kegg_get", (["hsa:10458", "ece:Z5100"], "aaseq")),
        ("kegg_get", ("hsa05130", "image")),
        # conv tests
        ("kegg_conv", ("eco", "ncbi-geneid")),
        ("kegg_conv", ("ncbi-geneid", "eco")),
        ("kegg_conv", ("ncbi-gi", "hsa:10458+ece:Z5100")),
        ("kegg_conv", ("ncbi-gi", ["hsa:10458", "ece:Z5100"])),
        # link tests
        ("kegg_link", ("pathway", "hsa")),
        ("kegg_link", ("hsa", "pathway")),
        ("kegg_link", ("pathway", "hsa:10458+ece:Z5100")),
        ("kegg_link", ("pathway", ["hsa:10458", "ece:Z5100"])),
    ]

    for func_name, args in queries:
        resp = getattr(REST, func_name)(*args)
        resp.read()
        print(resp.url)
# Script fragment: list the KEGG pathways of one species, single out those
# whose description mentions "DNA", and prepare per-gene result lists.
from Bio.KEGG import REST
from bioservices import Reactome
import csv
from input import inp

#gene_list = ['POLD1', 'POLE3', 'ABO', 'TP53']
gene_list = inp
specie = "hsa"  # organism code passed to the KEGG pathway listing

# Raw listing text; parsed below as one "<entry>\t<description>" row per line.
human_pathways = REST.kegg_list("pathway", specie).read()

human_pathways_dict = {}    # entry -> description, for every pathway
repair_pathways = []        # entries whose description contains "DNA"
repair_pathways_dict = {}   # entry -> description, for those entries only
for line in human_pathways.rstrip().split("\n"):
    entry, description = line.split("\t")
    human_pathways_dict[entry] = description
    if "DNA" in description:
        repair_pathways.append(entry)
        repair_pathways_dict[entry] = description

rejected = []
# One (initially empty) list per input gene.
gene_dict = dict((gene,[]) for gene in gene_list)

# Progress counter: prints "<current> // <total>" for every pathway.
i = 0
len_ = len(human_pathways_dict.keys())
for pathway in human_pathways_dict.keys():
    i += 1
    print str(i) + ' // ' + str(len_)
    #print pathway
# Script fragment: collect the KEGG organism codes and walk the compounds of
# the hsa01100 KGML map.
from Bio.Seq import Seq
from Bio.KEGG import Enzyme
from Bio.KEGG import REST
from Bio.KEGG.KGML import KGML_parser
from Bio.KEGG import Map

# Disabled example: fetch and parse a single enzyme entry.
#request = REST.kegg_get("ec:5.4.2.2")
#open("ec_5.4.2.2.txt",'w').write(request.read())
#records = Enzyme.parse(open("ec_5.4.2.2.txt"))
#record = list(records)[0]
#print(record.classname)
#print(record.entry)

# Organism listing: the second tab-separated column of each row is collected.
organisms = REST.kegg_list("organism").read()
organismlist = []
for line in organisms.rstrip().split("\n"):
    #print(line)
    code = line.split("\t")[1]
    organismlist.append(code)
#print(organismlist)

#parser = KGML_parser.KGMLparser()
#open("human_map.xml",'w').write(REST.kegg_get("hsa05130",option="kgml").read())

# Download the hsa01100 map in KGML format and parse it straight from the
# response handle.
human_map = KGML_parser.read(REST.kegg_get("hsa01100",option="kgml"))

# Print every compound name followed by the x value of each of its graphics.
cpds = human_map.compounds
for cpd in cpds:
    print(cpd.name)
    graphics = cpd.graphics
    for graphic in graphics:
        print(graphic.x)

rxns = human_map.reaction_entries