def create_background_network(save_name='background_network'):
    """
    Build one background network from all supported databases.

    Loads the Reactome FI, KEGG, HMDB, BioGrid and SIGNOR networks (every
    loader except Reactome is asked for a fresh download), prints pairwise
    node/edge overlap statistics, composes the networks into one graph and
    pickles the result to ``'<save_name>.p.gz'``.

    Parameters
    ----------
    save_name : str
        Prefix of the output file; ``'.p.gz'`` is appended.

    Returns
    -------
    The composed network, as returned by ``nt.delete_disconnected_network``
    and after ``nt.standardize_edge_types`` has been applied to it.
    """
    reactome_network = load_reactome_fi()
    kegg_network = load_all_of_kegg(fresh_download=True)
    hmdb_network = load_hmdb_network(fresh_download=True)
    biogrid_network = load_biogrid_network(fresh_download=True)
    signor_network = load_signor(fresh_download=True)

    def find_overlap(n1, n2):
        """Print node/edge overlap and difference counts of two graphs."""
        nodes1 = set(n1.nodes())
        nodes2 = set(n2.nodes())
        e1 = set(n1.edges())
        e2 = set(n2.edges())
        print("\tnode overlap = {}".format(len(nodes1.intersection(nodes2))))
        # Differences are reported as "only in n1 | only in n2".
        print("\tnode difference = {} | {}".format(
            len(nodes1.difference(nodes2)),
            len(nodes2.difference(nodes1)))
        )
        print("\tedge overlap = {}".format(len(e2.intersection(e1))))
        print("\tedge difference = {} | {}".format(
            len(e1.difference(e2)),
            len(e2.difference(e1)))
        )

    network_list = [hmdb_network, kegg_network, biogrid_network,
                    reactome_network, signor_network]
    names = ['hmdb', 'kegg', 'biogrid', 'reactome', 'signor']

    # Compare every ordered pair of distinct databases.
    for first_net, first_name in zip(network_list, names):
        for second_net, second_name in zip(network_list, names):
            if first_name != second_name:
                print('{} : {}'.format(first_name, second_name))
                find_overlap(first_net, second_net)

    full_network = nt.compose_all(network_list)
    # Keep the returned graph, matching how the other callers in this
    # module use nt.delete_disconnected_network.
    full_network = nt.delete_disconnected_network(full_network)
    nt.standardize_edge_types(full_network)

    n_nodes = len(full_network.nodes)
    n_edges = len(full_network.edges)
    print("Background network {} nodes and {} edges".format(n_nodes, n_edges))
    # NOTE(review): nx.write_gpickle was removed in networkx 3.0 - confirm
    # the networkx version this project pins.
    nx.write_gpickle(full_network, '{}.p.gz'.format(save_name))
    return full_network
def build_network(seed_species, species='hsa', save_name=None,
                  all_measured_list=None, trim_source_sink=False,
                  use_reactome=True, use_hmdb=False, use_biogrid=True,
                  use_signor=True, verbose=False):
    """
    Construct a network from a list of gene names.

    The KEGG pathways containing at least one seed are composed into a
    starting network, which is optionally expanded with the selected
    databases, cleaned up, and optionally written to disk.

    Parameters
    ----------
    seed_species : list
        list of genes to construct network
    species : str
        species of proteins ('hsa': human, 'mmu':murine)
    save_name : str, optional
        output name to save network. When given, the network is written to
        ``'<save_name>.gml'`` and ``'<save_name>.p'``.
    all_measured_list : list
        list of all species that should be considered in network. When
        None, every node of the KEGG-derived network is used instead.
    trim_source_sink : bool, optional
        Remove source and sink nodes if they are not measured in network
    use_reactome : bool
        Add ReactomeFunctionalInteraction reaction to network
    use_hmdb : bool
        Add HMDB reaction to network
    use_biogrid : bool
        Add BioGrid reaction to network
    use_signor : bool
        Add SIGNOR reaction to network
    verbose : bool
        Forwarded to the database loaders.

    Returns
    -------
    networkx.DiGraph
    """
    path_to_graph, node_to_path = load_kegg_mappings(species, verbose=False)

    seed_species = set(x.upper() for x in seed_species)

    # Swap every HMDB accession listed in cm.hmdb_accession_to_main for the
    # first entry it maps to.
    old_accession = set(
        i for i in seed_species
        if i.startswith('HMDB') and i in cm.hmdb_accession_to_main
    )
    updated_accession = set(
        cm.hmdb_accession_to_main[i][0] for i in old_accession
    )
    seed_species.difference_update(old_accession)
    seed_species.update(updated_accession)

    # Collect every KEGG pathway that contains at least one seed.
    seeds_in_kegg = seed_species.intersection(node_to_path)
    pathway_list = set()
    for seed in seeds_in_kegg:
        pathway_list.update(node_to_path[seed])

    # Pathways without edges add nothing to the composed network.
    graph_list = []
    for each in pathway_list:
        tmp = path_to_graph[each]
        if len(tmp.edges) == 0:
            continue
        graph_list.append(tmp)
    end_network = nt.compose_all(graph_list)

    if all_measured_list is None:
        all_measured_set = set(i.upper() for i in end_network.nodes)
    else:
        all_measured_set = set(str(x).upper() for x in all_measured_list)
    all_measured_set.update(seed_species)

    # Same HMDB accession replacement as for the seeds. hmdb_ids is a
    # separate set, so all_measured_set can be mutated inside the loop.
    hmdb_ids = set(i for i in all_measured_set if i.startswith('HMDB'))
    for i in hmdb_ids:
        if i in cm.hmdb_accession_to_main:
            all_measured_set.remove(i)
            all_measured_set.add(cm.hmdb_accession_to_main[i][0])

    # Pass verbose by keyword so it cannot bind to a different leading
    # parameter of a loader (e.g. fresh_download).
    networks_to_expand = []
    if use_hmdb:
        networks_to_expand.append(load_hmdb_network(verbose=verbose))
    if use_reactome:
        networks_to_expand.append(load_reactome_fi(verbose=verbose))
    if use_biogrid:
        networks_to_expand.append(load_biogrid_network(verbose=verbose))
    if use_signor:
        networks_to_expand.append(load_signor(verbose=verbose))

    if networks_to_expand:
        entire_expansion_network = nt.compose_all(networks_to_expand)
        end_network = expand_by_db(end_network, entire_expansion_network,
                                   all_measured_set)

    print("Trimming network")
    # makes all similar edge names the same
    nt.standardize_edge_types(end_network)
    # removes everything not connected to the largest graph
    end_network = nt.delete_disconnected_network(end_network)

    if trim_source_sink:
        # NOTE(review): this passes the raw all_measured_list (possibly
        # None, not upper-cased) rather than all_measured_set - confirm
        # that is what trim_sink_source_nodes expects.
        end_network = nt.trim_sink_source_nodes(end_network,
                                                all_measured_list,
                                                remove_self_edge=True)

    if save_name is not None:
        nx.write_gml(end_network, '{}.gml'.format(save_name))
        # NOTE(review): nx.write_gpickle was removed in networkx 3.0 -
        # confirm the networkx version this project pins.
        nx.write_gpickle(end_network, '{}.p'.format(save_name))

    final_nodes = set(end_network.nodes)
    n_hits = len(seed_species.intersection(final_nodes))
    print('Network has {} nodes and {} edges'.format(len(final_nodes),
                                                     len(end_network.edges)))
    print("Found {} of {} seed species in network"
          "".format(n_hits, len(seed_species)))

    if all_measured_list is not None:
        n_measured_hits = len(set(all_measured_list).intersection(final_nodes))
        print("Found {} of {} background species in network"
              "".format(n_measured_hits, len(all_measured_list)))

    return end_network
def create_background_network(save_name='background_network',
                              fresh_download=False, verbose=True,
                              create_overlap=False):
    """
    Build one background network from all supported databases.

    Loads the KEGG, HMDB, BioGrid, SIGNOR and Reactome FI networks,
    optionally saves heatmaps of their pairwise node/edge overlaps to
    ``compare_network_dbs.png``, composes them into a single graph and
    writes that graph to ``'<save_name>.p.gz'`` and ``'<save_name>.gml'``.

    Parameters
    ----------
    save_name : str
        Name of the network
    fresh_download : bool
        Download a fresh copy of the databases
    verbose : bool
        Print information about the databases
    create_overlap : bool
        Creates a figure comparing the databases

    Returns
    -------
    nx.DiGraph
    """
    kegg_network = db.load_all_of_kegg(fresh_download=fresh_download,
                                       verbose=verbose)
    hmdb_network = db.load_hmdb_network(fresh_download=fresh_download,
                                        verbose=verbose)
    biogrid_network = db.load_biogrid_network(fresh_download=fresh_download,
                                              verbose=verbose)
    signor_network = db.load_signor(fresh_download=fresh_download,
                                    verbose=verbose)
    reactome_network = db.load_reactome_fi(verbose=verbose)

    network_list = [
        hmdb_network,
        kegg_network,
        biogrid_network,
        reactome_network,
        signor_network,
    ]
    names = ['hmdb', 'kegg', 'biogrid', 'reactome', 'signor']

    def find_overlap(n1, n2):
        """Print and return the (node, edge) overlap counts of two graphs."""
        nodes1 = set(n1.nodes())
        nodes2 = set(n2.nodes())
        e1 = set(n1.edges())
        e2 = set(n2.edges())
        edge_overlap = len(e2.intersection(e1))
        node_overlap = len(nodes1.intersection(nodes2))
        print("\tnode overlap = {}".format(node_overlap))
        print("\tedge overlap = {}".format(edge_overlap))
        return node_overlap, edge_overlap

    if create_overlap:
        n_dbs = len(names)
        pal = sns.light_palette("purple", as_cmap=True)
        # Builtin int instead of np.int: the alias was deprecated in
        # NumPy 1.20 and removed in 1.24.
        node_mat = np.zeros((n_dbs, n_dbs), dtype=int)
        edge_mat = np.zeros((n_dbs, n_dbs), dtype=int)

        # Overlap is symmetric: compute the upper triangle and mirror it.
        # The diagonal stays zero.
        for i in range(n_dbs):
            for j in range(i + 1, n_dbs):
                n_overlap, e_overlap = find_overlap(network_list[i],
                                                    network_list[j])
                node_mat[i, j] = node_mat[j, i] = n_overlap
                edge_mat[i, j] = edge_mat[j, i] = e_overlap

        fig = plt.figure(figsize=(10, 4))

        # add_subplot makes the new axes current, so the pyplot and
        # seaborn calls below draw into it.
        fig.add_subplot(121)
        plt.title("Number of node overlaps")
        sns.heatmap(node_mat, fmt='d', annot=True, linewidths=0.02,
                    cmap=pal, yticklabels=names, xticklabels=names)
        plt.yticks(rotation=0)

        fig.add_subplot(122)
        plt.title("Number of edge overlaps")
        sns.heatmap(edge_mat, fmt='d', annot=True, linewidths=0.02,
                    cmap=pal, yticklabels=names, xticklabels=names)
        plt.tight_layout()
        plt.yticks(rotation=0)
        plt.subplots_adjust(wspace=.3)
        plt.savefig('compare_network_dbs.png', dpi=300, bbox_inches='tight')
        plt.close()

    full_network = nt.compose_all(network_list)
    nt.standardize_edge_types(full_network)
    full_network = nt.delete_disconnected_network(full_network)

    n_nodes = len(full_network.nodes)
    n_edges = len(full_network.edges)
    print("Background network {} nodes and {} edges".format(n_nodes, n_edges))
    # NOTE(review): nx.write_gpickle was removed in networkx 3.0 - confirm
    # the networkx version this project pins.
    nx.write_gpickle(full_network, '{}.p.gz'.format(save_name))
    nx.write_gml(full_network, '{}.gml'.format(save_name))
    return full_network
def download_network_dbs():
    """
    Run every network-database loader/downloader in turn.

    Calls, in order, ``nd.load_reactome_fi``, ``nd.download_signor``,
    ``nd.load_biogrid_network`` and ``dl.HMDB``. Their return values are
    discarded.
    """
    steps = (
        nd.load_reactome_fi,
        nd.download_signor,
        nd.load_biogrid_network,
        dl.HMDB,
    )
    for step in steps:
        step()