"""
KEGG module example
====================

Histogram of KEGG pathways relations
"""
#################################################
#
# Only the plotting helpers that are actually used are imported, so it is
# obvious where every name in this example comes from.
from pylab import grid, hist, title, xlabel, ylabel

# extract all relations from all pathways
from bioservices.kegg import KEGG

s = KEGG()
s.organism = "hsa"

# The organism has more than 260 pathways and parsing each one takes time,
# so only the first ``max_pathways`` are retrieved here.
max_pathways = 10
results = [s.parse_kgml_pathway(x) for x in s.pathwayIds[0:max_pathways]]
relations = [x['relations'] for x in results]

# plot the distribution of the number of relations per pathway (20 bins)
hist([len(this) for this in relations], 20)
xlabel('number of relations')
ylabel('#')
title("number of relations per pathways")
grid(True)
class KEGGPathways:
    """
    KEGG PATHWAY Database API
    """

    def __init__(self, organism="Homo sapiens"):
        """
        Args:
            organism: organism name or common name (ex. 'Homo sapiens',
                'human'); matching is case-insensitive

        Raises:
            KeyError: if the organism is not present in the KEGG organism list
        """
        self.database = KEGG()
        self.organism = self.get_organism_code(organism.lower())

    def search_by_gene(self, gene_name: str):
        """
        Args:
            gene_name: gene name (ex. 'BRCA2')

        Returns:
            Dictionary with ids of all pathways containing given gene as keys
            and their full names as values. Empty dictionary when nothing
            is found.
        """
        try:
            pathways = self.database.get_pathway_by_gene(
                gene_name, self.organism)
        except AttributeError:
            # NOTE(review): presumably raised by the KEGG client when the
            # gene lookup yields nothing parsable - confirm against
            # bioservices. Treated as "no pathways".
            return {}
        return pathways if pathways else {}

    def get_pathway(self, pathway_id: str, self_loops: bool = False):
        """
        Args:
            pathway_id: KEGG pathway id (ex. 'hsa04110')
            self_loops: information about whether or not include self loops
                in returned graph

        Returns:
            `networkx.DiGraph` object: Directed graph depicting pathway, with
            a comma-separated string containing gene names as graph nodes and
            directed edges representing interactions between genes.
            Each edge has attribute 'type', which is a list of interaction
            types between two nodes. Edges that replace a path through a
            non-gene node carry the type ['indirect']. The graph is empty
            when the pathway cannot be parsed.
        """
        G = nx.DiGraph()
        try:
            pathway = self.database.parse_kgml_pathway(pathway_id)
        except TypeError:
            # incorrect pathway_id
            pathway = None
        if not pathway:
            return G

        # only intra-pathway interactions taken into account: entries
        # without gene names are ignored
        names = {
            entry['id']: {'name': entry['gene_names'], 'type': entry['type']}
            for entry in pathway['entries']
            if entry['gene_names']
        }

        for rel in pathway['relations']:
            if rel['entry1'] not in names or rel['entry2'] not in names:
                continue
            first = names[rel['entry1']]
            second = names[rel['entry2']]
            e1, e2 = first['name'], second['name']
            G.add_node(e1, type=first['type'])
            G.add_node(e2, type=second['type'])
            if G.has_edge(e1, e2):
                # several relations between the same pair accumulate
                G[e1][e2]['type'] = G[e1][e2]['type'] + [rel['name']]
            elif e1 != e2 or self_loops:
                # assumption of interaction direction entry1 -> entry2
                # TODO: validate
                G.add_edge(e1, e2, type=[rel['name']])

        # only interactions between genes: every non-gene node is bypassed by
        # connecting its predecessors directly to its successors, then removed
        not_gene_nodes = []
        # G.nodes(data=True) is used instead of G.node[...] because the
        # latter was removed from networkx; the list() snapshot keeps the
        # iteration independent of the edges added below.
        for node, attrs in list(G.nodes(data=True)):
            if attrs['type'] == 'gene':
                continue
            incoming = list(G.in_edges(node))
            outgoing = list(G.out_edges(node))
            for source, _ in incoming:
                for _, target in outgoing:
                    if source != target or self_loops:
                        G.add_edge(source, target, type=['indirect'])
            not_gene_nodes.append(node)
        G.remove_nodes_from(not_gene_nodes)
        return G

    def fetch_organism_codes(self):
        """
        Returns:
            Dictionary with lowercase organism names as keys, and KEGG
            organism codes as values
            { 'homo sapiens' : 'hsa', 'human' : 'hsa', ... }
        """
        codes = {}
        for line in self.database.list('organism').split('\n'):
            fields = line.split('\t')
            if len(fields) < 3:
                # blank line, or a line without code and name columns
                continue
            code, org = fields[1], fields[2]
            if '(' in org:
                # "Scientific name (common name)": drop the closing
                # parenthesis and register every part as an alias
                for alias in org[:-1].split('('):
                    codes[alias.strip().lower()] = code
            else:
                # keys are always lowercase so lookups are case-insensitive
                # for organisms without a common name as well
                codes[org.strip().lower()] = code
        return codes

    def get_organism_code(self, org: str):
        """
        Args:
            org: organism name (ex. 'Homo sapiens', 'human') - lowercase and
                uppercase optional

        Returns:
            str: KEGG organism code

        Raises:
            KeyError: if the organism name is not known
        """
        codes = self.fetch_organism_codes()
        try:
            return codes[org.lower()]
        except KeyError:
            print('Invalid organism name.')
            raise

    def get_gene_code(self, gen: str):
        """
        Args:
            gen: gene name (ex. 'FGR', 'NIPAL1')

        Returns:
            The response of the KEGG `find` query for the gene within the
            configured organism, or an empty string when the response is
            empty (a single newline).
        """
        code_gen = self.database.find(self.organism, gen)
        if code_gen == '\n':
            print('Invalid gene name: ' + str(gen))
            return ''
        return code_gen
def pathwayVisualization(KEGG_id, path_to_csv, redirect=True, compound=False):
    """
    The pathwayVisualization function returns a graph visualization based on
    user input

    Args:
        KEGG_id (str): string specifying KEGG pathway ID to visualize
        path_to_csv (str): string specifying data to overlay on graph
        redirect (bool): True to split nodes into their components.
            Defaults to True
        compound (bool): True to display compounds (such as Ca2+).
            Defaults to False

    Returns:
        A graph visualization using the visjs_network function from
        visjs_2_jupyter, or None (after printing a message) when the KEGG
        request for KEGG_id answers 404 or 400.
    """
    s = KEGG()

    res = s.get(KEGG_id, "kgml")
    if res in (404, 400):
        # call form of print works on both Python 2 and Python 3
        print(KEGG_id + ' is not a valid KEGG ID')
        return

    result = s.parse_kgml_pathway(KEGG_id)
    ETroot = parsingXML(KEGG_id, s)

    G = nx.DiGraph()
    max_id, compound_array = addNodes(G, result)
    setCoord(G, ETroot)

    # NOTE: the identity checks against True/False are kept on purpose so
    # that non-bool arguments behave exactly as before.
    if redirect is False:
        getNodeSymbols(G, s, compound)
    else:
        parent_list, parent_dict = splitNodes(G, s, max_id)

    complex_array, component_array, node_dict, comp_dict = undefNodes(G, ETroot)

    if redirect is False:
        addEdges(G, result, component_array, node_dict)
    else:
        addAndRedirectEdges(G, result, complex_array, component_array,
                            parent_list, parent_dict, node_dict, comp_dict)

    # add reactions to graph
    addReaction(G, ETroot)

    # Labels and colors are computed for every edge before any node is
    # removed; G.edges(data=True) is available on all networkx versions,
    # unlike the G.edge[...] attribute.
    edge_to_name = dict()
    edge_to_color = dict()  # edges are transparent
    for source, target, attrs in G.edges(data=True):
        edge_to_name[(source, target)] = _edge_label(attrs)
        edge_to_color[(source, target)] = _edge_color(attrs['name'])

    # for graph with split nodes
    if redirect is True:
        # remove undefined nodes from graph
        G.remove_nodes_from(complex_array)

        # remove nodes with more than one gene
        G.remove_nodes_from(parent_list)

    if compound is False:
        # remove compound nodes
        G.remove_nodes_from(compound_array)

    # Per-node attributes; G.nodes(data=True) replaces the G.node[...]
    # attribute, which newer networkx releases no longer provide.
    node_attrs = list(G.nodes(data=True))
    node_to_symbol = {node: _node_symbol(attrs) for node, attrs in node_attrs}
    # getting name of nodes
    node_to_gene = {node: attrs['gene_names'] for node, attrs in node_attrs}
    # getting x and y coord of nodes
    node_to_x = {node: attrs['x'] for node, attrs in node_attrs}
    node_to_y = {node: attrs['y'] for node, attrs in node_attrs}

    id_to_log2fold = log2FoldChange(G, path_to_csv)

    # Create color scale with negative as green and positive as red
    my_scale = spectra.scale(["green", "#CCC", "red"]).domain([-4, 0, 4])

    # color nodes based on log2fold data; nodes without data are light grey
    node_to_color = dict()
    for node in G.nodes():
        if node in id_to_log2fold:
            node_to_color[node] = my_scale(id_to_log2fold[node][0]).hexcode
        else:
            node_to_color[node] = '#f1f1f1'

    # getting nodes in graph
    nodes = list(G.nodes())
    # map to indices for source/target in edges
    node_map = {node: index for index, node in enumerate(nodes)}

    # getting edges in graph
    edges = list(G.edges())

    # lists of dictionaries that hold per node and per edge attributes
    nodes_dict = [{"id": node_to_gene[n],
                   "degree": G.degree(n),
                   "color": node_to_color[n],
                   "node_shape": "box",
                   "node_size": 10,
                   'border_width': 1,
                   "id_num": node_to_symbol[n],
                   "x": node_to_x[n],
                   "y": node_to_y[n]} for n in nodes]

    edges_dict = [{"source": node_map[source],
                   "target": node_map[target],
                   "color": edge_to_color[(source, target)],
                   "id": edge_to_name[(source, target)],
                   "edge_label": '',
                   "hidden": 'false',
                   "physics": 'true'} for source, target in edges]

    # html file label for first graph (must manually increment later)
    time_stamp = 1700

    # create graph here
    return visJS_module.visjs_network(nodes_dict, edges_dict,
                                      time_stamp=time_stamp,
                                      node_label_field="id_num",
                                      edge_width=3,
                                      border_color="black",
                                      edge_arrow_to=True,
                                      edge_font_size=15,
                                      edge_font_align="top",
                                      physics_enabled=False,
                                      graph_width=1000,
                                      graph_height=1000)


def _edge_label(attrs):
    """Return the display label for an edge from its attribute dictionary."""
    name = attrs['name']
    if name in ('phosphorylation', 'dephosphorylation'):
        # pure (de)phosphorylation edges are labelled with their 'value'
        label = attrs['value']
    elif 'dephosphorylation' in name:
        # checked before 'phosphorylation', which is a substring of it
        label = name.replace('dephosphorylation', '-p')
    elif 'phosphorylation' in name:
        label = name.replace('phosphorylation', '+p')
    else:
        label = name

    # activation / inhibition are conveyed by the edge color, so they are
    # stripped from the label (the comma-separated forms first)
    for token in ('activation, ', 'inhibition, ', 'activation', 'inhibition'):
        label = label.replace(token, '')
    return label


def _edge_color(name):
    """Return the transparent rgba color matching an edge's relation name."""
    if 'activation' in name:
        return 'rgba(26, 148, 49, 0.3)'  # green
    if 'inhibition' in name:
        return 'rgba(255, 0, 0, 0.3)'  # red
    return 'rgba(0, 0, 255, 0.3)'  # blue


def _node_symbol(attrs):
    """Return the label shown for a node from its attribute dictionary."""
    if attrs['type'] == 'map':
        return attrs['gene_names']
    # prefer the symbol, then the gene names, then the raw name
    for key in ('symbol', 'gene_names'):
        if key in attrs:
            return attrs[key]
    return attrs['name']
"""
KEGG module example
====================

Histogram of KEGG pathways relations
"""
#################################################
#
# Only the plotting helpers that are actually used are imported, so it is
# obvious where every name in this example comes from.
from pylab import grid, hist, title, xlabel, ylabel

# extract all relations from all pathways
from bioservices.kegg import KEGG

s = KEGG()
s.organism = "hsa"

# The organism has more than 260 pathways and parsing each one takes time,
# so only the first ``max_pathways`` are retrieved here.
max_pathways = 10
results = [s.parse_kgml_pathway(x) for x in s.pathwayIds[0:max_pathways]]
relations = [x['relations'] for x in results]

# plot the distribution of the number of relations per pathway (20 bins)
hist([len(this) for this in relations], 20)
xlabel('number of relations')
ylabel('#')
title("number of relations per pathways")
grid(True)