def create_keggids_csv(filename, org, output="ecoli_keggids.csv"):
    """Look up a KEGG id for every gene of an organism and save them as TSV.

    Args:
        filename: tab-separated file with (at least) the columns ``Locus``
            (gene name) and ``Locus tag``.
        org: KEGG abbreviation of the organism, passed to ``KEGG.find``.
        output: path of the tab-separated file to write. Defaults to
            ``"ecoli_keggids.csv"``, the path that used to be hard-coded.
    """
    # ``tupleize_cols`` only affected multi-index headers and was removed in
    # pandas 1.0, where passing it raises TypeError.
    df = pd.read_csv(filename, sep="\t")
    gene_list = df['Locus'].tolist()
    bid_list = df['Locus tag'].tolist()

    k = KEGG()
    kid_list = []
    # Find the KEGG id for each locus tag.
    for gene in bid_list:
        hit = k.find(org, gene)
        # A failed request does not return text; an empty answer has no
        # second token.  Both are recorded as "no" instead of crashing, the
        # same marker extract_orthologs() skips.
        tokens = hit.split() if hasattr(hit, "split") else []
        kid_list.append(tokens[1] if len(tokens) > 1 else "no")

    # Assemble the result table and save it.
    new_df = pd.DataFrame(
        {'gene': gene_list, 'b_id': bid_list, 'kegg_id': kid_list},
        columns=['gene', 'b_id', 'kegg_id'],
    )
    new_df.to_csv(output, sep="\t", index=False)
def print_alignment_kegg(model):
    """Rewrite ``cor.txt`` into a human-readable ``cor_readable.txt``.

    Every line of ``cor.txt`` that contains the ``:***:`` separator is read
    as ``<kegg id>:***:<model reaction id>``.  Unless a side is the literal
    ``MULTIR``, the KEGG id is replaced by the text following ``NAME`` in
    its KEGG entry and the model id by ``model.reactions[<id>]``.  The pair
    is printed and written to the output file.

    Args:
        model: object whose ``reactions`` attribute can be indexed by the
            reaction ids found in ``cor.txt``.
    """
    kegg = KEGG()
    # ``with`` closes both files even if a lookup raises part-way through.
    with open("cor.txt") as f, open("cor_readable.txt", "w") as f_o:
        for line in f:
            if ":***:" not in line:
                continue
            k, b = line.split(":***:")
            b = b.strip()
            if k != "MULTIR":
                entry = kegg.get(k)
                # Keep the raw id when the entry has no NAME field or the
                # request did not return text.
                start = entry.find("NAME") if hasattr(entry, "find") else -1
                if start != -1:
                    # partition() also handles a NAME line without a trailing
                    # newline, where find()-based slicing lost the last
                    # character.
                    k = entry[start + 4:].partition("\n")[0].strip()
            if b != "MULTIR":
                b = model.reactions[b]
            print(k, ":***:", b)
            # %s formatting also accepts reaction objects, which plain
            # string concatenation rejected with TypeError.
            f_o.write("%s:***:%s\n" % (k, b))
def show_pathway():
    """Look up and display the human p53 signaling pathway in KEGG."""
    k = KEGG(verbose=True)
    # The title must read "Homo sapiens"; the previous literal had the word
    # partially replaced by asterisks and could never match a KEGG title.
    k.lookfor_pathway("p53 signaling pathway - Homo sapiens (human)")
    print(k.show_pathway("path:hsa04115"))
def parse_kgml(self, ec_file=""):
    """Fill the reaction and metabolite tables from the KGML in ``self.kgml``.

    For every ``<reaction>`` element this records its names, id,
    substrates, products and reversibility.  For every ``<entry>`` of type
    ``compound`` it records the metabolite name/id mapping.  EC numbers are
    loaded from ``ec_file`` via ``self.load_ECs``; when no file is given or
    loading fails, they are collected per gene through ``self.get_EC``.

    Args:
        ec_file: path handed to ``self.load_ECs``; an empty value skips the
            file and goes straight to the per-gene lookup.
    """
    # http://biopython.org/DIST/docs/api/Bio.KEGG.KGML.KGML_parser-pysrc.html
    # https://github.com/deep-introspection/kegg-kgml-parser-python/blob/master/keggparser/parse_KGML.py
    tree = ET.fromstring(self.kgml)

    # Element.getiterator() was removed in Python 3.9; iter() is the
    # drop-in replacement and is also available on Python 2.7.
    for reaction in tree.iter('reaction'):
        r_id = reaction.get('id')
        r_name = reaction.get('name')  # may hold several space-separated names
        self.reactions[r_id] = set(r_name.split())  # set of individual names
        self.reaction_ids[r_name] = r_id
        self.listed_reactions.append(r_id)

        for sub in reaction.iter('substrate'):
            self.reaction_metabolites[r_id].add(sub.get('id'))
            self.reaction_reactants[r_id].add(sub.get('id'))

        for prod in reaction.iter('product'):
            self.reaction_metabolites[r_id].add(prod.get('id'))
            self.reaction_products[r_id].add(prod.get('id'))

        self.reversibility_reactions[r_id] = (
            1 if reaction.get('type') == 'reversible' else 0)

    EC_file_loaded = False
    if ec_file:
        try:
            self.load_ECs(ec_file)
            EC_file_loaded = True
        except Exception:
            # Best effort: fall back to a fresh KEGG client.  A bare
            # ``except`` would also have trapped KeyboardInterrupt.
            self.kegg = KEGG()

    for entry in tree.iter('entry'):
        entry_type = entry.get('type')
        if entry_type == 'gene' and not EC_file_loaded:
            gene_reaction_name = entry.get('reaction')
            if gene_reaction_name is None:
                # Gene entries without a reaction have nothing to attach
                # EC numbers to; previously this raised KeyError.
                continue
            gene_reaction_id = self.reaction_ids[gene_reaction_name]
            for g in entry.get('name').split():
                for e in self.get_EC(g):
                    self.reaction_ECs[gene_reaction_id].add(e)
        elif entry_type == 'compound':
            metabolite = entry.get('name')
            metabolite_id = entry.get('id')
            self.metabolites[metabolite_id] = metabolite
            self.metabolite_ids[metabolite] = metabolite_id
            self.listed_metabolites.append(metabolite_id)
def extract_sequences(dict, flist):
    """Fetch ortholog nucleotide sequences from KEGG, one FASTA file per key.

    Args:
        dict: mapping of KEGG id -> list of ortholog groups, each group
            being a list of KEGG gene ids.  (The parameter name shadows the
            builtin; it is kept so keyword callers keep working.)
        flist: collection of file names that already exist; a key whose
            ``<key>.fas`` is listed is skipped.

    Files are written to ``orthologs_fastas/<key>.fas``.
    """
    k = KEGG()
    for key, ortholog_groups in dict.items():
        if (key + ".fas") in flist:
            print(key + " is already created !!!")
            continue

        # Collect the sequences first and join once; repeated ``+=`` on a
        # growing string is quadratic.
        sequences = []
        for group in ortholog_groups:
            for gene_id in group:
                data_seq = k.get(gene_id, option="ntseq", parse=True)
                if isinstance(data_seq, int):
                    # An int is a status code, not a sequence; concatenating
                    # it used to raise TypeError and abort the whole run.
                    print("no sequence for " + str(gene_id))
                    continue
                sequences.append(data_seq)

        print("writing : " + key + ".fas")
        with open(os.path.join('orthologs_fastas/', key + '.fas'), 'w') as f:
            f.write("".join(seq + "\n" for seq in sequences))
def pathwayInfo(code):
    """Return ``[code, name, class]`` for a KEGG pathway code.

    The entry is fetched and parsed with ``KEGG``.  ``name`` is the first
    item of the parsed ``NAME`` field and ``class`` is the string form of
    the ``CLASS`` field; either is ``'NA'`` when the field is absent.
    Commas are replaced by semicolons so the values cannot break a
    comma-separated output downstream.
    """
    searcher = KEGG()
    entry = searcher.parse(searcher.get(code))

    if 'NAME' in entry.keys():
        name_field = str(entry['NAME'][0].replace(',', ';'))
    else:
        name_field = 'NA'

    if 'CLASS' in entry.keys():
        class_field = str(entry['CLASS']).replace(',', ';')
    else:
        class_field = 'NA'

    return [code, name_field, class_field]
def get_kegg_info(stId):
    """Fetch the KEGG entry for ``stId`` and return the parsed result."""
    client = KEGG()
    return client.parse(client.get(stId))
def get_genes_from_kegg_pathway(pathway):
    """Return the second column of a KEGG pathway's ``GENE`` field.

    Each item obtained by iterating the parsed ``GENE`` field is split on a
    single space and must yield exactly two parts; the tuple of second
    parts is returned.
    """
    from bioservices.kegg import KEGG

    client = KEGG()
    client.organism = 'hsa'
    entry_text = client.get(pathway)
    gene_field = client.parse(entry_text)['GENE']

    # Transpose the two-part rows into two columns; only the second is used.
    split_rows = [item.split(' ') for item in gene_field]
    _first_column, symbol = zip(*split_rows)
    return symbol
def retrieve_kegg_formula(reactome_compound_name):
    """Return the chemical formula KEGG lists for a compound, or ``None``.

    Args:
        reactome_compound_name: compound reference in which the text
            ``COMPOUND`` is replaced by ``cpd`` to form the KEGG query.

    Returns:
        The second whitespace-separated token of the entry's ``FORMULA``
        line, or ``None`` when the request fails, the entry has no such
        line, or the line carries no value.
    """
    k = KEGG()
    compound_name = reactome_compound_name.replace('COMPOUND', 'cpd')
    res = k.get(compound_name)
    if not hasattr(res, 'split'):
        # A failed request yields a status code instead of text; calling
        # .split() on it used to raise AttributeError.
        return None
    for line in res.split('\n'):
        if line.startswith('FORMULA'):
            tokens = line.split()
            return tokens[1] if len(tokens) > 1 else None
    return None
def get_single_compound_metadata_online(compound_id):
    """Fetch metadata for one compound from KEGG or ChEBI.

    Ids starting with ``C`` (case-insensitive) are looked up in KEGG and
    returned parsed; any other id is prefixed with ``CHEBI:`` and resolved
    through ``ChEBI.getCompleteEntity``.
    """
    if not compound_id.upper().startswith('C'):
        chebi_client = ChEBI()
        return chebi_client.getCompleteEntity('CHEBI:' + compound_id)

    kegg_client = KEGG()
    raw_entry = kegg_client.get(compound_id)
    return kegg_client.parse(raw_entry)
def load_kegg(gene, organism):
    """Return the KEGG pathway names of a gene as one comma-separated string.

    Args:
        gene: gene identifier passed to ``KEGG.get_pathway_by_gene``.
        organism: KEGG organism code.

    Returns:
        ``', '``-joined pathway names, or ``''`` when the gene has no
        pathways or the lookup fails.
    """
    k = KEGG()
    result_line = ''
    try:
        pathways = k.get_pathway_by_gene(gene, organism)
        if pathways:
            result_line = ', '.join(pathways.values())
    except Exception:
        # Best effort: report and return the empty string.  ``Exception``
        # (not a bare except) lets Ctrl-C and SystemExit through.
        print(" Gene '{0}' is not in KEGG database".format(gene))
    return result_line
def get_kegg(self, pathway_id):
    """Download a pathway's KGML, parse it, then save the result.

    A new ``KEGG`` client is stored on ``self.kegg`` and the KGML text on
    ``self.kgml``.  ``pathway_id`` is then handed to ``self.parse_kgml``
    and ``self.save_kegg``.
    """
    self.kegg = KEGG()
    self.kgml = self.kegg.get(pathway_id, "kgml")
    # NOTE(review): pathway_id is passed where parse_kgml names its
    # parameter ``ec_file``; confirm that is intended.
    self.parse_kgml(pathway_id)
    self.save_kegg(pathway_id)
def extract_orthologs(filename):
    """Map each KEGG id to its orthologs in the listed organisms.

    Args:
        filename: tab-separated file with a ``kegg_id`` column; rows whose
            id is ``"no"`` are skipped.

    Returns:
        dict of KEGG id -> list of groups, each group being a list of
        ``"<organism>:<gene>"`` strings.  Only organisms found in the
        ``KEGG`` column of ``gammaproteo.csv`` are kept.
    """
    orthos_dict = {}
    k = KEGG()

    # ``tupleize_cols`` was removed in pandas 1.0 (TypeError when passed).
    df = pd.read_csv(filename, sep="\t")
    df_gamma = pd.read_csv('gammaproteo.csv', sep="\t")
    # Set membership is O(1); the list was scanned once per GENES entry.
    gamma_codes = set(df_gamma['KEGG'].tolist())

    for keggid in df['kegg_id']:
        if keggid == "no":
            continue
        print(str(keggid))

        dict_data = k.parse(k.get(keggid))
        if isinstance(dict_data, int):
            # A failed request comes back as an int; nothing to read.
            continue

        ortho_list = []
        for key, value in dict_data['GENES'].items():
            organism = key.lower()
            if organism not in gamma_codes:
                continue
            # Only the text before the first "(" is used, split on
            # whitespace into individual gene ids.
            gene_tokens = value.split('(')[0].split()
            ortho_list.append([organism + ":" + tok for tok in gene_tokens])
        orthos_dict[keggid] = ortho_list
    return orthos_dict
def id2seq(self, hsa):
    """Write the amino-acid sequence of a KEGG gene to ``dummy.txt``.

    The file holds a ``>`` header line with ``hsa`` followed by the
    ``AASEQ`` field with all whitespace removed.  The sequence is empty
    when the entry has no usable ``AASEQ`` field.

    Args:
        hsa: KEGG gene identifier.

    Returns:
        None.
    """
    s = KEGG()
    dict_d = s.parse(s.get(hsa))
    try:
        seq = re.sub(r'\s+', '', dict_d['AASEQ'])
    except (KeyError, TypeError):
        # KeyError: no AASEQ field.  TypeError: the result is not
        # subscriptable, or the field is not text.
        seq = ''
    # ``with`` closes the file even if the write fails.
    with open("dummy.txt", "w") as text_file:
        text_file.write('>' + str(hsa) + '\n' + seq)
    return None
def kegg(inputInteractions):
    """Pair each interaction's first element with its gene's KEGG pathways.

    Args:
        inputInteractions: iterable of indexable items; ``item[1]`` must
            offer ``getName()`` and ``item[0]`` is copied to the output.

    Returns:
        list of ``[item[0], pathway_name]`` pairs, one per pathway returned
        by ``get_pathway_by_gene(name, "hsa")``.
    """
    from bioservices.kegg import KEGG

    client = KEGG()
    interactions = []
    for items in inputInteractions:
        print(items[1].getName())
        try:
            pathways = client.get_pathway_by_gene(items[1].getName(), "hsa")
            if not pathways:
                continue
            for pathway_name in list(pathways.values()):
                interactions.append([items[0], pathway_name])
        except AttributeError:
            print("Gene name error!!!!!!!!!")
    return interactions
def getData(self):
    """Download the KEGG entry of every drug.

    Obs. IT TAKES TIME: one request is made per drug id.

    Returns:
        dict of drug id -> result of ``KEGG.get`` for that id.
    """
    mykegg = KEGG()
    # Read the id list once and reuse it for the count and the loop.
    drug_ids = mykegg.drugIds
    print('There are %d drugs in Kegg' % len(drug_ids))

    # The loop previously used an undefined name ``k`` instead of
    # ``mykegg`` and failed with NameError.
    data = dict()
    for ID in drug_ids:
        data[ID] = mykegg.get(ID)

    print('Finish!')
    return data
def get_compound_metadata_online(kegg_ids):
    """Fetch the display name of each KEGG compound.

    Args:
        kegg_ids: sequence of KEGG ids.

    Returns:
        dict of id -> ``{'display_name': <first NAME with ';' removed>}``.
        Ids whose entry cannot be subscripted are reported and left out.
    """
    s = KEGG()
    metadata_map = {}
    total = len(kegg_ids)
    for i, kegg_id in enumerate(kegg_ids):
        if i % 10 == 0:
            print("Retrieving %d/%d KEGG records" % (i, total))
        # Reset per iteration so the error message below never shows the
        # previous record, and never hits an unbound name on the first one.
        d = None
        try:
            d = s.parse(s.get(kegg_id))
            first_name = d['NAME'][0]
            first_name = first_name.replace(';', '')  # strip last ';' character
            metadata_map[kegg_id] = {'display_name': first_name}
        except TypeError:
            print('kegg_id=%s parsed_data=%s' % (kegg_id, d))
    return metadata_map
def get_seq(filename):
    """Group the sequences of an alignment by organism name.

    Args:
        filename: FASTA file name inside ``alignments_nogaps/``.  Record
            ids are expected as ``<org>_<gene>`` and are turned into the
            KEGG id ``<org>:<gene>``.

    Returns:
        dict of organism name (second and third words of the KEGG
        ``ORGANISM`` field) -> list of sequences, each a list of
        characters.  Records KEGG cannot resolve are printed and skipped.
    """
    k = KEGG()
    records = SeqIO.parse(os.path.join('alignments_nogaps/', filename), "fasta")

    orgdict = {}
    for record in records:
        idsplit = record.id.split('_', 1)
        kegg_id = idsplit[0] + ':' + idsplit[1]
        handle = k.get(kegg_id)
        if isinstance(handle, int):
            # An int is a status code: the lookup failed.
            print(kegg_id)
            continue
        words = k.parse(handle)['ORGANISM'].split()
        org = words[1] + " " + words[2]
        # Group in one pass; the parallel lists were rescanned for every
        # organism, which was O(n^2).
        orgdict.setdefault(org, []).append(list(str(record.seq)))
    return orgdict
def queryKegg(theIDs):
    """Resolve ids against the KEGG ``acb`` organism and fetch each entry.

    The first three characters of every id are dropped before searching.
    The text before the first tab of the search result is then fetched and
    parsed.

    Returns:
        ``(parsed_entries, trimmed_ids)`` as two parallel lists.
    """
    print("Currently querying KEGG...")
    client = KEGG()
    parsed_entries = list()
    trimmed_ids = list()
    for raw_id in theIDs:
        trimmed = raw_id[3:]
        search_fields = client.find("acb", trimmed).split('\t')
        entry_text = client.get(search_fields[0])
        parsed_entries.append(client.parse(entry_text))
        trimmed_ids.append(trimmed)
    return parsed_entries, trimmed_ids
def main():
    """Fetch and parse the KEGG entry of every gene, then dump them as JSON.

    Gene ids are the keys of ``hsa_gene_list.json``; the parsed entries are
    written to ``ginfo.json`` keyed by gene id.
    """
    kegg_client = KEGG()

    # KEGG gene ID & gene symbol pairs
    with open("hsa_gene_list.json", "r") as gene_file:
        gene_data = json.load(gene_file)

    parsed_by_gene = {}
    for gene in gene_data.keys():
        print(gene)
        raw_entry = kegg_client.get(gene)
        parsed_by_gene[gene] = kegg_client.parse(raw_entry)

    with open('ginfo.json', 'w') as out_file:
        json.dump(parsed_by_gene, out_file)
def main():
    """Download the KGML file of every pathway listed in ``../hsa_list.txt``.

    Each line of the list is a pathway id, optionally prefixed with
    ``path:``.  The KGML is requested from the KEGG REST service and saved
    to ``pathways/path_<id>``.
    """
    with open("../hsa_list.txt") as listing:
        entries = listing.read().replace('path:', '').split('\n')
    # Drop blank entries wherever they occur.  The previous unconditional
    # pop() discarded a real pathway when the file had no trailing newline.
    list_path = [entry for entry in entries if entry.strip()]

    for i, hsa in enumerate(list_path, 1):
        print("# of pathways processed:  %d" % i)
        # Request KGML file for a pathway
        req_url = 'http://rest.kegg.jp/get/' + hsa + '/kgml'
        kgml = requests.get(req_url)
        if not kgml.ok:
            # Do not store an HTTP error page as if it were KGML.
            print("could not download %s (HTTP %s)" % (hsa, kgml.status_code))
            continue
        with open('pathways/path_' + hsa, 'w') as out:
            out.write(kgml.text)
def get_reaction_ECs_from_kegg(self):
    """Rebuild ``self.reaction_ECs`` by querying KEGG for each reaction id.

    Every item of ``self.model.reactions`` is split on single spaces into
    reaction ids.  For each id not yet present in ``self.reaction_ECs``,
    the ``ENZYME`` field of its KEGG entry is fetched and added.  Any
    exception raised while handling one item is printed and the loop moves
    on to the next item.

    NOTE(review): block nesting was reconstructed from a whitespace-
    collapsed source; confirm it against the original layout.
    """
    self.reaction_ECs = defaultdict(set)
    kegg = KEGG()
    for r in self.model.reactions:
        # ECs is reset per model reaction, not per id, so with several ids
        # in one item the later ids also receive the earlier ids' ECs.
        # NOTE(review): confirm whether that sharing is intended.
        ECs = []
        try:
            reacts = r.split(" ")
            for i in reacts:
                # A membership test on a defaultdict does not create the key.
                if i not in self.reaction_ECs:
                    print("KEGG reaction", i)
                    ECs += kegg.parse(kegg.get(i))['ENZYME']
                    for e in ECs:
                        self.reaction_ECs[i].add(e)
        except Exception as inst:
            # Best effort: report the problem and continue.
            print(inst)
        #for e in ECs:
        #    self.reaction_ECs[r].add(e)
    print("EC data loaded from KEGG")
import numpy as np import pandas as pd import matplotlib.pyplot as plt from bioservices.kegg import KEGG from sklearn.cluster import KMeans np.set_printoptions(threshold=np.nan) data = [] df = pd.read_csv('RPKMs.csv', delimiter=",") k = KEGG() #for i in range(100): # print(i,"****") # print("//\n",k.get_pathway_by_gene(str(df["symbol"][i]), "hsa")) def search_pathways_4_list(list_of_genes): matrix = [[0 for j in range(len(list_of_genes))] for i in range(0)] list_of_pathways = [] dict_of_genes = {} for i, gene in enumerate(list_of_genes): try: pathways = k.get_pathway_by_gene(gene, "hsa") if pathways != None: pathways = pathways.values()
def _search_wikipathways(query, species=None):
    """Query WikiPathways ``findPathwaysByText`` and wrap each unique hit."""
    namespace = "{http://www.wikipathways.org/webservice}"
    # XML child tag (without namespace) -> key used for the hit
    field_keys = {
        "id": "identifier",
        "score": "score",
        "url": "url",
        "name": "name",
        "species": "species",
        "revision": "revision",
    }

    url = "http://webservice.wikipathways.org/" + "/findPathwaysByText"
    # Let requests encode the values instead of concatenating them raw.
    params = {"query": str(query)}
    if species is not None:
        params["species"] = str(species)
    r = requests.get(url, params=params,
                     headers={"Content-Type": "application/json"})
    # Raises for 4xx/5xx; the sys.exit() that used to follow was
    # unreachable.
    r.raise_for_status()

    pathways = []
    seen_ids = set()
    for child in ET.fromstring(r.text):
        hit = {}
        for subchild in child:
            if subchild.tag.startswith(namespace):
                key = field_keys.get(subchild.tag[len(namespace):])
                if key is not None:
                    hit[key] = subchild.text
        identifier = hit["identifier"]
        # The old check compared the id string against a list of Pathway
        # objects and so never filtered anything.
        if identifier in seen_ids:
            continue
        seen_ids.add(identifier)
        pathways.append(gnomics.objects.pathway.Pathway(
            identifier=identifier,
            identifier_type="WikiPathways ID",
            name=hit["name"],
            taxon=hit["species"],
            source="WikiPathways"))
    return pathways


def _search_kegg_pathways(query):
    """Search KEGG pathways by text and wrap hits with a known id prefix."""
    # Checked in this order; the first marker found in the id wins.
    id_types = (
        ("map", "KEGG MAP PATHWAY ID"),
        ("ko", "KEGG KO PATHWAY ID"),
        ("ec", "KEGG EC PATHWAY ID"),
        ("rn", "KEGG RN PATHWAY ID"),
    )
    k = KEGG()
    listing = k.find("pathway", query)
    if not hasattr(listing, "split"):
        # The request did not return text; nothing to parse.
        return []

    pathways = []
    for row in listing.split("\n"):
        columns = row.split("\t")
        if len(columns) == 1:
            continue
        path_id = columns[0].strip().split(":")[1]
        path_name = columns[1].strip()
        for marker, id_type in id_types:
            if marker in path_id:
                pathways.append(gnomics.objects.pathway.Pathway(
                    identifier=path_id,
                    identifier_type=id_type,
                    source="KEGG",
                    name=path_name))
                break
        else:
            # Unknown prefix: show the raw entry, as before.
            print(k.get(path_id))
    return pathways


def search(query, source="wikipathways", result_format="xml", species=None,
           genes=None, user=None):
    """Search pathway databases by free text.

    Args:
        query: search text.
        source: ``"wikipathways"``, ``"kegg"`` or ``"all"``
            (case-insensitive).
        result_format: accepted for compatibility; not used.
        species: optional species filter for WikiPathways.
        genes: when given, the KEGG text search is skipped (a gene-based
            KEGG search is not implemented).
        user: accepted for compatibility; not used.

    Returns:
        list of ``gnomics.objects.pathway.Pathway`` objects, WikiPathways
        hits first.

    Raises:
        requests.HTTPError: when the WikiPathways request fails.
    """
    chosen = source.lower()
    path_array = []
    if chosen in ["wikipathways", "all"]:
        path_array.extend(_search_wikipathways(query, species))
    if chosen in ["kegg", "all"] and genes is None:
        path_array.extend(_search_kegg_pathways(query))
    return path_array
def _edge_label(attrs):
    """Return the display label for an edge from its attribute dict."""
    name = attrs['name']
    if name in ('phosphorylation', 'dephosphorylation'):
        return attrs['value']
    # Order matters: 'dephosphorylation' contains 'phosphorylation'.
    # Activation and inhibition labels are removed (shown by colour).
    shorthand = (
        ('dephosphorylation', '-p'),
        ('phosphorylation', '+p'),
        ('activation', ''),
        ('inhibition', ''),
    )
    for keyword, replacement in shorthand:
        if keyword in name:
            return name.replace(keyword, replacement).replace('\n', ', ')
    return name


def _edge_color(name):
    """Return the translucent colour for an edge: green, red or blue."""
    if 'activation' in name:
        return 'rgba(26, 148, 49, 0.3)'  # green
    if 'inhibition' in name:
        return 'rgba(255, 0, 0, 0.3)'  # red
    return 'rgba(0, 0, 255, 0.3)'  # blue


def _node_symbol(attrs):
    """Return the label of a node, preferring symbol over gene names."""
    if attrs['type'] == 'map':
        return attrs['gene_names']
    for field in ('symbol', 'gene_names'):
        if field in attrs:
            return attrs[field]
    return attrs['name']


def pathwayVisualization(KEGG_id, path_to_csv, redirect=True, compound=False):
    """
    The pathwayVisualization function returns a graph visualization based on user input

    Args:
        KEGG_id (str): string specifying KEGG pathway ID to visualize
        path_to_csv (str): string specifying data to overlay on graph
        redirect (bool): True to split nodes into their components. Defaults to True
        compound (bool): True to display compounds (such as Ca2+). Defaults to False

    Returns:
        A graph visualization using the visjs_network function from visjs_2_jupyter
    """
    s = KEGG()
    result = s.parse_kgml_pathway(KEGG_id)
    ETroot = parsingXML(KEGG_id, s)

    G = nx.DiGraph()
    max_id, compound_array = addNodes(G, result)
    setCoord(G, ETroot)

    if redirect is False:
        getNodeSymbols(G, s, compound)
    else:
        parent_list, parent_dict = splitNodes(G, s, max_id)

    complex_array, component_array, node_dict, comp_dict = undefNodes(G, ETroot)

    if redirect is False:
        addEdges(G, result, component_array, node_dict)
    else:
        addAndRedirectEdges(G, result, complex_array, component_array,
                            parent_list, parent_dict, node_dict, comp_dict)

    # add reactions to graph
    addReaction(G, ETroot)

    # Edge labels and colours are computed before any node is removed.
    # G[u][v] is the edge attribute dict in networkx 1.x and 2.x alike.
    # The activation/inhibition case used to call str.remove(), which does
    # not exist and raised AttributeError.
    edge_to_name = dict()
    edge_to_color = dict()  # edges are transparent
    for edge in G.edges():
        print(edge)
        attrs = G[edge[0]][edge[1]]
        edge_to_name[edge] = _edge_label(attrs)
        edge_to_color[edge] = _edge_color(attrs['name'])

    # for graph with split nodes
    if redirect is True:
        # remove undefined nodes from graph
        G.remove_nodes_from(complex_array)
        # remove nodes with more than one gene
        G.remove_nodes_from(parent_list)

    if compound is False:
        # remove compound nodes
        G.remove_nodes_from(compound_array)

    id_to_log2fold = log2FoldChange(G, path_to_csv)

    # Create color scale with negative as green and positive as red
    my_scale = spectra.scale(["green", "#CCC", "red"]).domain([-4, 0, 4])

    # per-node attributes; nodes without log2fold data get a neutral grey
    nodes = list(G.nodes())
    nodes_dict = []
    for n, attrs in G.nodes(data=True):
        if n in id_to_log2fold:
            color = my_scale(id_to_log2fold[n][0]).hexcode
        else:
            color = '#f1f1f1'
        nodes_dict.append({
            "id": attrs['gene_names'],
            "degree": G.degree(n),
            "color": color,
            "node_shape": "box",
            "node_size": 10,
            'border_width': 1,
            "id_num": _node_symbol(attrs),
            "x": attrs['x'],
            "y": attrs['y'],
        })

    # map to indices for source/target in edges
    node_map = dict(zip(nodes, range(len(nodes))))

    # per-edge attributes for the edges that survived node removal
    edges_dict = [{
        "source": node_map[edge[0]],
        "target": node_map[edge[1]],
        "color": edge_to_color[edge],
        "id": edge_to_name[edge],
        "edge_label": '',
        "hidden": 'false',
        "physics": 'true',
    } for edge in list(G.edges())]

    # html file label for first graph (must manually increment later)
    time_stamp = 1700

    # create graph here
    return visJS_module.visjs_network(nodes_dict, edges_dict,
                                      time_stamp=time_stamp,
                                      node_label_field="id_num",
                                      edge_width=3,
                                      border_color="black",
                                      edge_arrow_to=True,
                                      edge_font_size=15,
                                      edge_font_align="top",
                                      physics_enabled=False,
                                      graph_width=1000,
                                      graph_height=1000)
def main():
    """Build a gene -> reaction network and dump it to keggMetabNetwork.json.

    For every pathway in ``hsa_list.txt`` the stored KGML file is read.
    Each ``<reaction>`` element is matched to the ``<entry>`` with the same
    ``id`` to obtain gene ids.  Each gene's reaction ids are read from
    ``reacs/reac_<gene id>``, and every reaction is annotated with its
    direction (from ``KEGG_Reac.json``, ``"NA"`` when absent) and the two
    values returned by ``get_metabs``.  Results accumulate per gene across
    pathways in ``data``.

    NOTE(review): block nesting was reconstructed from a whitespace-
    collapsed source; confirm it against the original layout.
    """
    # Start KEGG interface for querying
    k = KEGG()
    # Create a dict to store final network output
    data = dict()
    # Create list of hsa (human) pathways
    list_path = open("hsa_list.txt").read().replace('path:', '').split('\n')
    # Remove newline
    # NOTE(review): this drops the last entry unconditionally; it is only
    # the blank one if the file ends with a newline.
    list_path.pop()
    # Read in KEGG reaction ID & reversibility information
    with open("KEGG_Reac.json", "r") as fp:
        reac_data = json.load(fp)
    # Read in KEGG gene data
    # (gene_data is loaded but not referenced again in this function)
    with open("ginfo.json", "r") as fp2:
        gene_data = json.load(fp2)
    # Keep track of # of pathways processed
    i = 0
    for hsa in list_path:
        i += 1
        print "# of pathways processed: ", i
        # Open previously extracted KGML files
        kgml = open("etc_scripts/KEGG_DB_PATH/pathways/path_" + hsa).read()
        # Construct element tree
        root = ET.fromstring(kgml)
        # Iterate through ALL reactions
        for reaction in root.findall("./reaction"):
            gene_ids = []
            # gene_names, subs_list and prods_list are never used below
            gene_names = []
            subs_list = []
            prods_list = []
            # 'id' to look up in 'graphics' to extract gene name
            id_look = reaction.attrib["id"]
            # Iterate through 'entry' to retrieve gene IDs
            for entry in root.findall("./entry"):
                if entry.attrib["id"] == id_look:
                    gene_ids = entry.attrib["name"].split(' ')
            # Define dict for storing {gene id: reaction id's}
            r_ids = dict()
            # Iterate through the gene IDs to retrieve corresponding list of reaction IDs
            for g_id in gene_ids:
                r_ids[g_id] = []
                # Open previously extracted reaction information
                with open('reacs/reac_' + g_id, 'r') as rp:
                    line = rp.readline()
                    # With gene ids as key, store corresponding reaction ids
                    while line:
                        # second whitespace-separated field, text after 'rn:'
                        r_ids[g_id].append(line.split()[1].split('rn:')[1])
                        line = rp.readline()
            # Loop to organize into the final output
            # The loop variable reuses the name r_ids; items() is evaluated
            # once before the first rebinding, so iteration is unaffected.
            for g_id, r_ids in r_ids.items():
                # Stores reaction ids and their info
                vals = dict()
                # Iterate through list of reactions to get metabolite information
                for r_id in r_ids:
                    # Get the list of substrates and products
                    metabs = get_metabs(k, r_id)
                    # Check if reaction exists in reaction DB
                    if r_id in reac_data.keys():
                        r_type = reac_data[r_id]
                    else:
                        # If it doesn't exist, assign NA as direction
                        r_type = "NA"
                    # Intermediate result to add to a gene of the current loop iteration
                    vals[r_id] = {
                        "DIRECTION": r_type,
                        "R_SUBS": metabs[0],
                        "R_PROD": metabs[1]
                    }
                # Check to see if the gene has been encountered previously
                if g_id in data:
                    # Store the current info to a temp reaction information
                    temp = data[g_id]
                    # Retrieve the current reaction information for the gene
                    temp_list = get_react(temp)
                    # Iterate through the existing information on reaction...
                    # If a new reaction is seen, it is added to temp reaction information
                    for r in vals.keys():
                        if r not in temp_list:
                            temp[r] = vals[r]
                    # Finalize reaction information to be added to the gene
                    data[g_id] = temp
                else:
                    data[g_id] = vals
    with open('keggMetabNetwork.json', 'w') as f:
        json.dump(data, f)
blast_text = blastHits[ids] else: blast_text = 'NULL' if pfamHits.get(ids) != None: pfam_text = pfamHits[ids] else: pfam_text = 'NULL' if prositeHits.get(ids) != None: prosite_text = prositeHits[ids] else: prosite_text = 'NULL' # Get the KEGG hits kegg = KEGG() kegg_text = '' gene_id = gene_ids[ids] KEGG_IDs = kegg.get_pathway_by_gene(gene_id, "acb") if KEGG_IDs != None: for KEGG_ID in KEGG_IDs: kegg_text += KEGG_IDs[KEGG_ID] + ' [' + KEGG_ID + ']; ' kegg_text = kegg_text[:-2] else: kegg_text = 'NULL' comments = 'NULL' row = ids + '\t' + blast_text + '\t' + pfam_text + '\t' + prosite_text + '\t' + kegg_text + '\t' + GO_IDs + '\t' + comments + '\n' output.write(row) output.close()
def KeggAPI(self):
    """Fetch the KEGG entry for ``self.kegg_id`` and return it parsed."""
    # One client serves both the request and the parsing; a second client
    # used to be built for the same lookup.
    client = KEGG()
    kegg_data = client.parse(client.get(self.kegg_id))
    return kegg_data
``` python3 structure_processor.py "" "" --filter_genes "TP53" ``` saves to different files data for gene named "TP53" (this parameter can be comma-separated list of gene names). Saves to pictures/ fragments if they are found. Otherwise saves to different files. after processing saves to "processed_genes.log" gene names from parameter list. To rerun with the same gene list, remove lines corresponding to names from this file or remove the whole file - currently it is used to skip gene names which were already processed. Another call option might be incorrect now. """ from bioservices.kegg import KEGG keggParser = KEGG() import pickle import argparse ORGANISM = "hsa" GENES = ["p53"] # sample gene PDB_PATH = "pdb" import os, prody, pystache, logging if not os.path.exists(PDB_PATH): os.mkdir(PDB_PATH) # TODO: for now I haven't checked if pathPDBFolder creates this folder - # if it is created, this check should be removed. prody.proteins.localpdb.pathPDBFolder(PDB_PATH)
def mapSpecies(mousepeptrackfilename):
    """Annotate a mouse peptide table with human homologs and their metadata.

    Steps, in order:
      1. Download the MGI mouse/human homology report to ``MouseToHuman.tsv``
         in the current directory and build a mouse -> human map from its
         ``SWISS_PROT IDs`` column, pairing rows by ``HomoloGene ID``.
      2. Rewrite ``mousepeptrackfilename`` in place with an extra
         ``Human UniProtKB Accession`` column (``'NA'`` when unmapped).
      3. For each distinct human accession, fetch the UniProt XML and
         collect protein name, gene name, organism, taxonomy id, diseases,
         DrugBank links and GO terms.  Diseases are merged with the data
         loaded from the pickle named by ``disGenData()``.  The result is
         pickled to ``humanUniprotfuncinfodic.obj``.
      4. Map the accessions to KEGG ids through the UniProt ``uploadlists``
         service in batches of 2000, fetch each KEGG entry and pickle the
         ``PATHWAY`` data to ``humankeggdic.obj``.

    Args:
        mousepeptrackfilename: path of the tab-separated peptide table; it
            is read and then overwritten.

    Returns:
        None.

    NOTE(review): this is Python 2 code (``urllib.urlretrieve``,
    ``urllib2``, a ``print`` statement).  Block nesting was reconstructed
    from a whitespace-collapsed source; confirm it against the original.
    """
    # seconds to wait before retrying the UniProt mapping request
    RETRY_TIME = 20.0
    mouseTohumanfilepath = os.path.join(os.getcwd(), 'MouseToHuman.tsv')
    print("Extracting Mouse to Human Map data, job starts",
          str(datetime.datetime.now()))
    #increase the field size of CSV
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
    try:
        urllib.urlretrieve(
            'http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt',
            mouseTohumanfilepath)
        urllib.urlcleanup()
    except:
        # Best effort: a failed download is only reported.  The open()
        # below then relies on a file left by an earlier run.
        print("Can't able to download MouseToHuman.tsv file!!")
    # columns read from the homology report, in this order
    colnameMousHu = [
        'HomoloGene ID', 'Common Organism Name', 'NCBI Taxon ID', 'Symbol',
        'EntrezGene ID', 'Mouse MGI ID', 'HGNC ID', 'OMIM Gene ID',
        'Genetic Location', 'Genomic Coordinates (mouse: , human: )',
        'Nucleotide RefSeq IDs', 'Protein RefSeq IDs', 'SWISS_PROT IDs'
    ]
    mouseHumandata = []
    homologID = []
    with open(mouseTohumanfilepath) as mhtsvfile:
        mhreader = csv.DictReader(mhtsvfile, delimiter='\t')
        for mhrow in mhreader:
            mhtemplist = []
            for i in colnameMousHu:
                mhtempdata = str(mhrow[i]).strip()
                mhtemplist.append(mhtempdata)
            # keep only rows with a non-empty last column (SWISS_PROT IDs)
            if len(mhtemplist[-1].strip()) > 0:
                homologID.append(mhtemplist[0])
                mouseHumandata.append(mhtemplist)
    homologID = list(set(homologID))
    homologID.sort()
    # mouse SWISS_PROT id -> human SWISS_PROT id, paired by HomoloGene ID
    mousehumandic = {}
    for homologidItem in homologID:
        tempHumanHomoUniID = ''
        tempMouseHomoUniID = ''
        # NOTE(review): full rescan of all rows per homolog id (quadratic).
        # When a group has several mouse or human rows, the last one wins.
        for item in mouseHumandata:
            if homologidItem == item[0]:
                if 'mouse' in item[1].strip().lower():
                    tempMouseHomoUniID = item[-1].strip()
                else:
                    tempHumanHomoUniID = item[-1].strip()
        if len(tempMouseHomoUniID.strip()) > 0 and len(
                tempHumanHomoUniID.strip()) > 0 and tempHumanHomoUniID.strip(
                ).upper() != 'NA':
            mousehumandic[tempMouseHomoUniID] = tempHumanHomoUniID
    # Output header: the input columns plus the human accession added here.
    colname=['UniProtKB Accession','Protein','Gene','Organism','Peptide Sequence','Summary Concentration Range Data','All Concentration Range Data','All Concentration Range Data-Sample LLOQ Based','Peptide ID',\
    'Special Residues','Molecular Weight','GRAVY Score','Transitions','Retention Time','Analytical inofrmation',\
    'Gradients','AAA Concentration','CZE Purity','Panel','Knockout','LLOQ','ULOQ','Sample LLOQ','Protocol','Trypsin','QC. Conc. Data','Human UniProtKB Accession']
    finalresult = []
    finalresult.append(colname)
    humanUniprotID = []
    with open(mousepeptrackfilename) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            templist = []
            for i in colname[:-1]:
                tempdata = str(row[i]).strip()
                templist.append(tempdata)
            if len(str(templist[0]).strip()) > 0:
                # the part before '-' in the accession is the lookup key
                if templist[0].split('-')[0] in mousehumandic:
                    humanUniprotID.append(
                        mousehumandic[templist[0].split('-')[0]])
                    templist.append(mousehumandic[templist[0].split('-')[0]])
                else:
                    templist.append('NA')
            finalresult.append(templist)
    # Overwrite the input file with the extended table.
    with open(mousepeptrackfilename, 'wb') as pf:
        pwriter = csv.writer(pf, delimiter='\t')
        pwriter.writerows(finalresult)
    # disGenData() returns the name of a pickle file, which is loaded here
    disGenDataDicName = disGenData()
    #disGenDataDicName='disGen.obj'
    disGenDataDic = cPickle.load(open(disGenDataDicName, 'rb'))
    unqhumanUniprotID = list(set(humanUniprotID))
    humanUniprotfuncinfodic = {}
    countProt = 0
    for subcode in unqhumanUniprotID:
        # pause between UniProt requests
        time.sleep(2)
        # per-protein defaults, used when the lookup below fails
        drugbanklist = []
        PN = 'NA'
        GN = 'NA'
        OG = 'NA'
        OGID = 'NA'
        dislist = []
        unidislist = []
        unidisURLlist = []
        disgendislist = []
        disgendisURLlist = []
        GoIDList = []
        GoNamList = []
        GoTermList = []
        GOinfo = []
        try:
            countProt += 1
            if countProt % 1000 == 0:
                print str(
                    countProt
                ), "th protein Protein Name, Gene, Organism Name,drug bank data,disease data job starts", str(
                    datetime.datetime.now())
            SGrequestURL = "https://www.uniprot.org/uniprot/" + str(
                subcode) + ".xml"
            SGunifile = urllib.urlopen(SGrequestURL)
            SGunidata = SGunifile.read()
            SGunifile.close()
            try:
                SGunidata = minidom.parseString(SGunidata)
                # --- DrugBank cross-references and taxonomy id ---
                try:
                    drugdata = (SGunidata.getElementsByTagName('dbReference'))
                    for duItem in drugdata:
                        if (duItem.attributes['type'].value
                            ).upper() == 'DRUGBANK':
                            try:
                                drugname = (str(
                                    duItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip())
                                drugid = str(
                                    duItem.attributes['id'].value).strip()
                                durl = '<a target="_blank" href="https://www.drugbank.ca/drugs/' + drugid + '">' + drugname + '</a>'
                                drugbanklist.append(durl)
                            except:
                                pass
                        if (duItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    duItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass
                # --- GO cross-references (second pass over dbReference) ---
                try:
                    godata = (SGunidata.getElementsByTagName('dbReference'))
                    for gItem in godata:
                        if (gItem.attributes['type'].value).upper() == 'GO':
                            try:
                                # property value is split on ':' into
                                # aspect letter ([0]) and term name ([1])
                                gonamedetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[1]
                                gotermdetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[0]
                                GoNamList.append(gonamedetails)
                                goid = str(
                                    gItem.attributes['id'].value).strip()
                                GoIDList.append(goid)
                                tempGoTerm = None
                                if gotermdetails.lower() == 'p':
                                    tempGoTerm = 'Biological Process'
                                if gotermdetails.lower() == 'f':
                                    tempGoTerm = 'Molecular Function'
                                if gotermdetails.lower() == 'c':
                                    tempGoTerm = 'Cellular Component'
                                GoTermList.append(tempGoTerm)
                                # NOTE(review): for an unrecognised aspect
                                # letter tempGoTerm stays None and this
                                # concatenation raises TypeError, which the
                                # bare except hides after None has already
                                # been appended to GoTermList.
                                tempGOData = gonamedetails + ';' + goid + ';' + tempGoTerm
                                GOinfo.append(tempGOData)
                            except:
                                pass
                        if (gItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    gItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass
                # --- protein name: recommendedName, else submittedName ---
                try:
                    try:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('recommendedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue
                    except:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('submittedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue
                except IndexError:
                    pass
                # --- gene name ---
                try:
                    try:
                        GN = ((
                            SGunidata.getElementsByTagName('gene')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        GN = 'NA'
                except IndexError:
                    pass
                # --- organism name ---
                try:
                    try:
                        OG = ((
                            SGunidata.getElementsByTagName('organism')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        OG = 'NA'
                except IndexError:
                    pass
                # --- UniProt disease annotations ---
                try:
                    disdata = SGunidata.getElementsByTagName('disease')
                    for dItem in disdata:
                        disname = ''
                        disshort = ''
                        disURL = ''
                        disID = ''
                        try:
                            disname = (dItem.getElementsByTagName('name')[0]
                                       ).firstChild.nodeValue
                            disID = (dItem.attributes['id'].value).upper()
                        except:
                            pass
                        try:
                            disshort = (dItem.getElementsByTagName('acronym')
                                        [0]).firstChild.nodeValue
                        except:
                            pass
                        if len(disname.strip()) > 0:
                            disURL = '<a target="_blank" href="https://www.uniprot.org/diseases/' + disID + '">' + str(
                                disname.strip()) + '(' + str(
                                    disshort) + ')' + '</a>'
                            dislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidisURLlist.append(disURL)
                except IndexError:
                    pass
            except ExpatError:
                # unparsable XML: keep the defaults for this accession
                pass
        except IOError:
            # network/file error: keep the defaults for this accession
            pass
        # '|'-joined output fields, 'NA' when nothing was collected
        drugbankdata = 'NA'
        disdata = 'NA'
        uniDisData = 'NA'
        uniDisURLData = 'NA'
        disgenDisData = 'NA'
        disgenDisURLData = 'NA'
        goiddata = 'NA'
        gonamedata = 'NA'
        gotermdata = 'NA'
        goData = 'NA'
        # merge diseases from the loaded pickle, keyed by gene name
        if GN != 'NA' and GN in disGenDataDic:
            disgendislist = disGenDataDic[GN][0]
            disgendisURLlist = disGenDataDic[GN][1]
            if len(dislist) > 0:
                dislist = dislist + disGenDataDic[GN][0]
            else:
                dislist = disGenDataDic[GN][0]
        # set() removes duplicates; the joined order is therefore arbitrary
        if len(GoIDList) > 0:
            goiddata = '|'.join(list(set(GoIDList)))
        if len(GoNamList) > 0:
            gonamedata = '|'.join(list(set(GoNamList)))
        if len(GoTermList) > 0:
            gotermdata = '|'.join(list(set(GoTermList)))
        if len(GOinfo) > 0:
            goData = '|'.join(list(set(GOinfo)))
        if len(drugbanklist) > 0:
            drugbankdata = '|'.join(list(set(drugbanklist)))
        if len(dislist) > 0:
            disdata = '|'.join(list(set(dislist)))
        if len(unidislist) > 0:
            uniDisData = '|'.join(list(set(unidislist)))
        if len(unidisURLlist) > 0:
            uniDisURLData = '|'.join(list(set(unidisURLlist)))
        if len(disgendislist) > 0:
            disgenDisData = '|'.join(list(set(disgendislist)))
        if len(disgendisURLlist) > 0:
            disgenDisURLData = '|'.join(list(set(disgendisURLlist)))
        humanUniprotfuncinfodic[subcode] = [
            PN, GN, OG, OGID, disdata, uniDisData, uniDisURLData,
            disgenDisData, disgenDisURLData, drugbankdata, goiddata,
            gonamedata, gotermdata, goData
        ]
    # NOTE(review): loading uses cPickle but dumping uses pickle; both
    # names must be imported at module level.
    hudicfile = 'humanUniprotfuncinfodic.obj'
    hudicf = open(hudicfile, 'wb')
    pickle.dump(humanUniprotfuncinfodic, hudicf, pickle.HIGHEST_PROTOCOL)
    hudicf.close()
    print("Extracting KEGG pathway name, job starts",
          str(datetime.datetime.now()))
    # human UniProt accession -> [pathway names, 'id;name|id;name|...']
    hkeggdictfile = {}
    huniproturl = 'https://www.uniprot.org/uploadlists/'
    hk = KEGG()
    for hkx in range(0, len(unqhumanUniprotID), 2000):
        # NOTE(review): countProt still holds the count from the loop above
        # and is incremented by hkx + 2000, so the progress message below
        # fires irregularly.
        countProt += hkx + 2000
        if countProt % 2000 == 0:
            print(str(countProt), "th protein kegg job starts",
                  str(datetime.datetime.now()))
        huniprotcodes = ' '.join(unqhumanUniprotID[hkx:hkx + 2000])
        huniprotparams = {
            'from': 'ACC',
            'to': 'KEGG_ID',
            'format': 'tab',
            'query': huniprotcodes
        }
        # retry the mapping request until it stops raising HTTPError
        while True:
            try:
                hkuniprotdata = urllib.urlencode(huniprotparams)
                hkuniprotrequest = urllib2.Request(huniproturl, hkuniprotdata)
                hkuniprotresponse = urllib2.urlopen(hkuniprotrequest)
                for hkuniprotline in hkuniprotresponse:
                    hkudata = hkuniprotline.strip()
                    # skip the header line of the tab-separated answer
                    if not hkudata.startswith("From"):
                        hkuinfo = hkudata.split("\t")
                        if len(hkuinfo[1].strip()):
                            hkegg = hk.get(hkuinfo[1].strip())
                            hkudict_data = hk.parse(hkegg)
                            try:
                                try:
                                    if len(str(hkuinfo[0]).strip()) > 5:
                                        tempkeggData = '|'.join(
                                            '{};{}'.format(key, value)
                                            for key, value in
                                            hkudict_data['PATHWAY'].items())
                                        hkeggdictfile[hkuinfo[0].strip()] = [
                                            hkudict_data['PATHWAY'].values(),
                                            tempkeggData
                                        ]
                                except TypeError:
                                    # parse result was not subscriptable
                                    pass
                            except KeyError:
                                # entry has no PATHWAY field
                                pass
                break
            except urllib2.HTTPError:
                time.sleep(RETRY_TIME)
                print(
                    'Hey, I am trying again until succeeds to get data from KEGG!',
                    str(datetime.datetime.now()))
                pass
    hkdicfile = 'humankeggdic.obj'
    hkdicf = open(hkdicfile, 'wb')
    pickle.dump(hkeggdictfile, hkdicf, pickle.HIGHEST_PROTOCOL)
    hkdicf.close()