def run(args):
    """Create a diffusion matrix and re-indexed network files from an edge list.

    Reads an integer edge list, cleans the graph (self-loops and zero-degree
    nodes removed, largest connected component kept), passes ``-time * L``
    (``L`` being the graph Laplacian) to ``expm_eig`` and saves the result.
    It then writes an index-to-gene file and an edge list that both use the
    row order of the saved matrix.

    Args:
        args: Namespace-like object providing:
            edgelist_file: path; the first two whitespace-separated columns
                of every line are parsed as integer node ids.
            gene_index_file: path handed to ``hnio.load_index``.
            output_dir: directory for all outputs (created if missing).
            prefix: file-name prefix shared by all outputs.
            time: factor applied to the negated Laplacian.
            format: ``'hdf5'`` or ``'npy'``; any other value writes no
                matrix file.
            start_index: offset added to 0-based node positions in the
                index and edge-list files.

    Returns:
        None. All results are written under ``args.output_dir``.
    """
    # Load input graph
    print("* Loading input graph...")
    with open(args.edgelist_file) as infile:
        # Blank lines (e.g. a trailing newline) carry no edge; skip them
        # instead of failing while unpacking an empty field list.
        edges = [tuple(int(x) for x in line.split()[:2])
                 for line in infile if line.strip()]
    G = nx.Graph()
    G.add_edges_from(edges)
    print("\t{} nodes with {} edges".format(G.number_of_nodes(),
                                            G.number_of_edges()))

    # Remove self-loops and zero degree nodes, and
    # restrict to the largest connected component
    print(" ".join(["* Removing self-loops, zero degree nodes, and ",
                    "restricting to the largest connected component"]))
    G.remove_edges_from([(u, v) for u, v in G.edges() if u == v])
    G.remove_nodes_from([n for n in G.nodes() if G.degree(n) == 0])
    # max() returns the first component of maximal size, the same one a
    # stable descending sort would put at position 0.
    G = G.subgraph(max(nx.connected_components(G), key=len))
    print("\t{} nodes with {} edges remaining".format(G.number_of_nodes(),
                                                      G.number_of_edges()))

    # Load gene index
    # NOTE(review): hnio.load_index is defined elsewhere; its result is only
    # used as a lookup keyed by node id (see the index file below).
    indexToGene = hnio.load_index(args.gene_index_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Freeze one node ordering and use it for the matrix rows, the gene
    # index file and the edge list so the three outputs always agree.
    nodes = list(G.nodes())

    # Compute Laplacian
    print("* Computing Laplacian...")
    L = nx.laplacian_matrix(G, nodelist=nodes)

    # Exponentiate the Laplacian for the given time and save it
    print("* Computing diffusion matrix...")
    # NOTE(review): expm_eig is defined elsewhere - presumably a dense
    # matrix exponential; confirm against its definition.
    Li = expm_eig(-args.time * L.todense())
    output_prefix = "{}/{}_inf_{}".format(args.output_dir, args.prefix,
                                          args.time)
    if args.format == 'hdf5':
        hnio.save_hdf5(output_prefix + ".h5", dict(Li=Li))
    elif args.format == 'npy':
        np.save(output_prefix + ".npy", Li)
    # Any other format value falls through without writing the matrix.

    # Save the index to gene mapping
    indexOutputFile = "{}/{}_index_genes".format(args.output_dir, args.prefix)
    geneIndexOutput = ["{} {}".format(i + args.start_index, indexToGene[node])
                       for i, node in enumerate(nodes)]
    hnio.write_file(indexOutputFile, "\n".join(geneIndexOutput))

    # Create edge list with revised indices. A dict gives O(1) lookups;
    # calling list.index() per endpoint would cost O(N) for every edge.
    position = {node: i + args.start_index for i, node in enumerate(nodes)}
    edgeOutput = []
    for u, v in G.edges():
        i, j = sorted((position[u], position[v]))
        edgeOutput.append("{} {} 1".format(i, j))
    edgeOutputFile = "{}/{}_edge_list".format(args.output_dir, args.prefix)
    hnio.write_file(edgeOutputFile, "\n".join(edgeOutput))
def run(args):
    """Create a personalized-PageRank (PPR) matrix and re-indexed network files.

    Loads a gene index and an integer edge list, builds the gene-name graph,
    removes self-loops and keeps the largest connected component. It then
    writes an index-to-gene file, an edge list over the new indices, and the
    matrix ``beta * inv(I - (1 - beta) * W)``, where ``W`` is the
    degree-normalized adjacency matrix.

    Args:
        args: Namespace-like object providing:
            gene_index_file: path; each line holds an integer index followed
                by a gene name, whitespace-separated.
            edgelist_file: path; the first two whitespace-separated columns
                of every line are integer indices found in the gene index.
            output_dir: output directory, relative to the current working
                directory or absolute (created if missing).
            prefix: file-name prefix shared by all outputs.
            beta: factor used in the PPR formula above.
            format: ``'hdf5'``, ``'npy'`` or ``'matlab'``; any other value
                writes no matrix file.
            start_index: offset added to 0-based node positions in the
                index and edge-list files.

    Returns:
        None. All results are written under ``args.output_dir``.

    Raises:
        OSError: if the output directory cannot be created.
    """
    # Load gene-index map. Blank lines (e.g. a trailing newline) are skipped
    # rather than failing on a missing column.
    with open(args.gene_index_file) as infile:
        arrs = [l.split() for l in infile if l.strip()]
    indexToGene = dict((int(arr[0]), arr[1]) for arr in arrs)

    G = nx.Graph()
    G.add_nodes_from(indexToGene.values())  # in case any nodes have degree zero

    # Load graph
    print("* Loading PPI...")
    with open(args.edgelist_file) as infile:
        edges = [tuple(int(x) for x in l.split()[:2])
                 for l in infile if l.strip()]
    G.add_edges_from([(indexToGene[u], indexToGene[v]) for u, v in edges])

    print("\t- Edges: {}".format(G.number_of_edges()))
    print("\t- Nodes: {}".format(G.number_of_nodes()))

    # Remove self-loops and restrict to largest connected component
    print(" ".join(["* Removing self-loops, multi-edges, and restricting to",
                    "largest connected component..."]))
    selfLoops = [(u, v) for u, v in G.edges() if u == v]
    G.remove_edges_from(selfLoops)
    # max() returns the first component of maximal size, the same one a
    # stable descending sort would put at position 0.
    G = G.subgraph(max(nx.connected_components(G), key=len))
    nodes = sorted(G.nodes())
    n = len(nodes)
    print("\t- Largest CC Edges: {}".format(G.number_of_edges()))
    print("\t- Largest CC Nodes: {}".format(G.number_of_nodes()))

    # Set up output directory
    print("* Saving updated graph to file...")
    # abspath() resolves relative paths against the working directory and
    # leaves absolute ones intact; creating the directory through os keeps
    # the path out of a shell command line (spaces and metacharacters safe).
    output_dir = os.path.abspath(args.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_prefix = "{}/{}".format(output_dir, args.prefix)

    # File extension for the matrix; unknown formats are used verbatim.
    ext = {'hdf5': 'h5', 'matlab': 'mat'}.get(args.format, args.format)
    pprfile = "{}_ppr_{:g}.{}".format(output_prefix, args.beta, ext)

    # Index mapping for genes
    index_map = ["{} {}".format(i + args.start_index, gene)
                 for i, gene in enumerate(nodes)]
    with open("{}_index_genes".format(output_prefix), 'w') as outfile:
        outfile.write("\n".join(index_map))

    # Edge list. A dict gives O(1) lookups; calling list.index() per
    # endpoint would cost O(N) for every edge.
    position = {gene: i + args.start_index for i, gene in enumerate(nodes)}
    edgelist = []
    for u, v in G.edges():
        i, j = sorted((position[u], position[v]))
        edgelist.append("{} {} 1".format(i, j))
    with open("{}_edge_list".format(output_prefix), 'w') as outfile:
        outfile.write("\n".join(edgelist))

    ## Create the PPR matrix
    # Create "walk" matrix (normalized adjacency matrix)
    print("* Creating PPR matrix...")
    W = np.asarray(nx.to_numpy_matrix(G, nodelist=nodes, dtype=np.float64))
    # W.sum(axis=1) is a length-n vector, so broadcasting divides column j
    # by the degree of node j. W is symmetric (undirected graph), hence
    # every column of the result sums to 1.
    W = W / W.sum(axis=1)

    ## Create PPR matrix using Python
    from scipy.linalg import inv
    PPR = args.beta * inv(sp.eye(n) - (1. - args.beta) * W)

    if args.format == 'hdf5':
        hnio.save_hdf5(pprfile, dict(PPR=PPR))
    elif args.format == 'npy':
        np.save(pprfile, PPR)
    elif args.format == 'matlab':
        scipy.io.savemat(pprfile, dict(PPR=PPR))
    # Any other format value falls through without writing the matrix.
def run(args):
    """Create a personalized-PageRank (PPR) matrix and re-indexed network files.

    Loads a gene index and an integer edge list, builds the gene-name graph,
    removes self-loops and keeps the largest connected component. It then
    writes an index-to-gene file, an edge list over the new indices, and the
    matrix ``beta * inv(I - (1 - beta) * W)``, where ``W`` is the
    degree-normalized adjacency matrix.

    Args:
        args: Namespace-like object providing:
            gene_index_file: path; each line holds an integer index followed
                by a gene name, whitespace-separated.
            edgelist_file: path; the first two whitespace-separated columns
                of every line are integer indices found in the gene index.
            output_dir: output directory, relative to the current working
                directory or absolute (created if missing).
            prefix: file-name prefix shared by all outputs.
            beta: factor used in the PPR formula above.
            format: ``'hdf5'``, ``'npy'`` or ``'matlab'``; any other value
                writes no matrix file.
            start_index: offset added to 0-based node positions in the
                index and edge-list files.

    Returns:
        None. All results are written under ``args.output_dir``.

    Raises:
        OSError: if the output directory cannot be created.
    """
    # Load gene-index map. Blank lines (e.g. a trailing newline) are skipped
    # rather than failing on a missing column.
    with open(args.gene_index_file) as infile:
        arrs = [l.split() for l in infile if l.strip()]
    indexToGene = dict((int(arr[0]), arr[1]) for arr in arrs)

    G = nx.Graph()
    G.add_nodes_from(indexToGene.values())  # in case any nodes have degree zero

    # Load graph
    print("* Loading PPI...")
    with open(args.edgelist_file) as infile:
        edges = [tuple(int(x) for x in l.split()[:2])
                 for l in infile if l.strip()]
    G.add_edges_from([(indexToGene[u], indexToGene[v]) for u, v in edges])

    print("\t- Edges: {}".format(G.number_of_edges()))
    print("\t- Nodes: {}".format(G.number_of_nodes()))

    # Remove self-loops and restrict to largest connected component
    print(" ".join(["* Removing self-loops, multi-edges, and restricting to",
                    "largest connected component..."]))
    selfLoops = [(u, v) for u, v in G.edges() if u == v]
    G.remove_edges_from(selfLoops)
    # max() returns the first component of maximal size, the same one a
    # stable descending sort would put at position 0.
    G = G.subgraph(max(nx.connected_components(G), key=len))
    nodes = sorted(G.nodes())
    n = len(nodes)
    print("\t- Largest CC Edges: {}".format(G.number_of_edges()))
    print("\t- Largest CC Nodes: {}".format(G.number_of_nodes()))

    # Set up output directory
    print("* Saving updated graph to file...")
    # abspath() resolves relative paths against the working directory and
    # leaves absolute ones intact; creating the directory through os keeps
    # the path out of a shell command line (spaces and metacharacters safe).
    output_dir = os.path.abspath(args.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_prefix = "{}/{}".format(output_dir, args.prefix)

    # File extension for the matrix; unknown formats are used verbatim.
    ext = {'hdf5': 'h5', 'matlab': 'mat'}.get(args.format, args.format)
    pprfile = "{}_ppr_{:g}.{}".format(output_prefix, args.beta, ext)

    # Index mapping for genes
    index_map = ["{} {}".format(i + args.start_index, gene)
                 for i, gene in enumerate(nodes)]
    with open("{}_index_genes".format(output_prefix), 'w') as outfile:
        outfile.write("\n".join(index_map))

    # Edge list. A dict gives O(1) lookups; calling list.index() per
    # endpoint would cost O(N) for every edge.
    position = {gene: i + args.start_index for i, gene in enumerate(nodes)}
    edgelist = []
    for u, v in G.edges():
        i, j = sorted((position[u], position[v]))
        edgelist.append("{} {} 1".format(i, j))
    with open("{}_edge_list".format(output_prefix), 'w') as outfile:
        outfile.write("\n".join(edgelist))

    ## Create the PPR matrix
    # Create "walk" matrix (normalized adjacency matrix)
    print("* Creating PPR matrix...")
    W = np.asarray(nx.to_numpy_matrix(G, nodelist=nodes, dtype=np.float64))
    # W.sum(axis=1) is a length-n vector, so broadcasting divides column j
    # by the degree of node j. W is symmetric (undirected graph), hence
    # every column of the result sums to 1.
    W = W / W.sum(axis=1)

    ## Create PPR matrix using Python
    from scipy.linalg import inv
    PPR = args.beta * inv(sp.eye(n) - (1. - args.beta) * W)

    if args.format == 'hdf5':
        hnio.save_hdf5(pprfile, dict(PPR=PPR))
    elif args.format == 'npy':
        np.save(pprfile, PPR)
    elif args.format == 'matlab':
        scipy.io.savemat(pprfile, dict(PPR=PPR))
    # Any other format value falls through without writing the matrix.