Beispiel #1
0
def run(args):
    """Compute a diffusion matrix for a graph and write it with index files.

    Reads a whitespace-separated edge list of integer node IDs, removes
    self-loops and zero-degree nodes, keeps only the largest connected
    component, and hands ``-args.time * L`` (``L`` being the Laplacian of
    that component) to ``expm_eig``.  Outputs, all under
    ``args.output_dir`` (created if missing):

    * ``<prefix>_inf_<time>.h5`` / ``.npy`` -- the resulting matrix; it is
      only written when ``args.format`` is ``'hdf5'`` or ``'npy'``.
    * ``<prefix>_index_genes`` -- one ``"<index> <gene>"`` line per node.
    * ``<prefix>_edge_list`` -- one ``"<i> <j> 1"`` line per edge, expressed
      in the new indices with ``i <= j``.

    New indices are the node's position in the component's node order plus
    ``args.start_index``; the matrix rows/columns follow the same order.

    Args:
        args: Namespace-like object providing ``edgelist_file``,
            ``gene_index_file``, ``output_dir``, ``prefix``, ``time``,
            ``format`` and ``start_index``.
    """
    # Load input graph
    print("* Loading input graph...")
    edges = []
    with open(args.edgelist_file) as infile:
        for line in infile:
            fields = line.split()
            if fields:  # tolerate blank lines (e.g. a trailing newline)
                # Only the first two columns are node IDs.
                edges.append(tuple(int(tok) for tok in fields[:2]))
    G = nx.Graph()
    G.add_edges_from(edges)
    print("\t{} nodes with {} edges".format(len(G.nodes()), len(G.edges())))

    # Remove self-loops and zero degree nodes, and
    # restrict to the largest connected component.
    # (The double space reproduces what the original pair of Python 2
    # print statements emitted.)
    print("* Removing self-loops, zero degree nodes, and  "
          "restricting to the largest connected component")
    G.remove_edges_from([(u, v) for u, v in G.edges() if u == v])
    G.remove_nodes_from([n for n in G.nodes() if G.degree(n) == 0])
    largest_cc = sorted(nx.connected_components(G), key=len, reverse=True)[0]
    G = G.subgraph(largest_cc)

    print("\t{} nodes with {} edges remaining".format(len(G.nodes()),
                                                      len(G.edges())))

    # Load gene index
    indexToGene = hnio.load_index(args.gene_index_file)

    # Make sure the output directory exists
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Freeze one node order and use it for the matrix, the index file and
    # the edge list so that all three outputs agree.
    nodes = list(G.nodes())

    print("* Computing Laplacian...")
    L = nx.laplacian_matrix(G, nodelist=nodes)

    # Exponentiate the Laplacian for the given time and save it.
    # NOTE(review): expm_eig is defined elsewhere; presumably a matrix
    # exponential -- confirm against its definition.
    print("* Computing diffusion matrix...")
    Li = expm_eig(-args.time * L.todense())
    output_prefix = "{}/{}_inf_{}".format(args.output_dir, args.prefix,
                                          args.time)
    if args.format == 'hdf5':
        hnio.save_hdf5(output_prefix + ".h5", dict(Li=Li))
    elif args.format == 'npy':
        np.save(output_prefix + ".npy", Li)

    # Save the index to gene mapping
    indexOutputFile = "{}/{}_index_genes".format(args.output_dir, args.prefix)
    geneIndexOutput = ["{} {}".format(i + args.start_index, indexToGene[node])
                       for i, node in enumerate(nodes)]
    hnio.write_file(indexOutputFile, "\n".join(geneIndexOutput))

    # Create edge list with revised indices.  A dict gives O(1) lookups
    # instead of a linear list.index() scan per edge endpoint.
    newIndex = {node: i + args.start_index for i, node in enumerate(nodes)}
    edgeOutput = []
    for u, v in G.edges():
        i, j = sorted((newIndex[u], newIndex[v]))
        edgeOutput.append("{} {} 1".format(i, j))
    edgeOutputFile = "{}/{}_edge_list".format(args.output_dir, args.prefix)
    hnio.write_file(edgeOutputFile, "\n".join(edgeOutput))
Beispiel #2
0
def run(args):
    """Build the PPR matrix of a gene network and write it with index files.

    Reads a gene index file (``"<index> <gene>"`` per line) and an edge list
    of integer indices, builds an undirected graph on gene names, removes
    self-loops, and keeps only the largest connected component.  Outputs,
    all under ``args.output_dir`` (created if missing):

    * ``<prefix>_index_genes`` -- one ``"<index> <gene>"`` line per node of
      the component, genes in sorted order, indices starting at
      ``args.start_index``.
    * ``<prefix>_edge_list`` -- one ``"<i> <j> 1"`` line per edge in the new
      indices, with ``i <= j``.
    * ``<prefix>_ppr_<beta>.<ext>`` -- the matrix
      ``beta * inv(I - (1 - beta) * W)``; written only when ``args.format``
      is ``'hdf5'``, ``'npy'`` or ``'matlab'``.

    Args:
        args: Namespace-like object providing ``gene_index_file``,
            ``edgelist_file``, ``output_dir``, ``prefix``, ``format``,
            ``beta`` and ``start_index``.
    """
    # Load gene-index map (blank lines are ignored)
    with open(args.gene_index_file) as infile:
        arrs = [l.split() for l in infile]
    indexToGene = {int(arr[0]): arr[1] for arr in arrs if arr}

    G = nx.Graph()
    G.add_nodes_from(
        indexToGene.values())  # in case any nodes have degree zero

    # Load graph; only the first two columns of each line are node indices.
    print("* Loading PPI...")
    with open(args.edgelist_file) as infile:
        rows = [l.split() for l in infile]
    edges = [tuple(int(tok) for tok in row[:2]) for row in rows if row]
    G.add_edges_from([(indexToGene[u], indexToGene[v]) for u, v in edges])

    print("\t- Edges: {}".format(len(G.edges())))
    print("\t- Nodes: {}".format(len(G.nodes())))

    # Remove self-loops and restrict to largest connected component
    print("* Removing self-loops, multi-edges, and restricting to "
          "largest connected component...")
    selfLoops = [(u, v) for u, v in G.edges() if u == v]
    G.remove_edges_from(selfLoops)
    largest_cc = sorted(nx.connected_components(G), key=len, reverse=True)[0]
    G = G.subgraph(largest_cc)
    nodes = sorted(G.nodes())
    n = len(nodes)
    print("\t- Largest CC Edges: {}".format(len(G.edges())))
    print("\t- Largest CC Nodes: {}".format(len(G.nodes())))

    # Set up output directory.  os.makedirs avoids passing user input
    # through a shell, and abspath resolves relative paths against the
    # current directory while leaving absolute paths intact, so the
    # directory created is always the one written to.
    print("* Saving updated graph to file...")
    output_dir = os.path.abspath(args.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_prefix = "{}/{}".format(output_dir, args.prefix)

    if args.format == 'hdf5':
        ext = 'h5'
    elif args.format == 'matlab':
        ext = 'mat'
    else:
        ext = args.format

    pprfile = "{}_ppr_{:g}.{}".format(output_prefix, args.beta, ext)

    # Index mapping for genes
    index_map = [
        "{} {}".format(i + args.start_index, gene)
        for i, gene in enumerate(nodes)
    ]
    with open("{}_index_genes".format(output_prefix), 'w') as outfile:
        outfile.write("\n".join(index_map))

    # Edge list in the new indices.  A dict gives O(1) lookups instead of a
    # linear list.index() scan per edge endpoint.
    newIndex = {gene: i + args.start_index for i, gene in enumerate(nodes)}
    edgelist = []
    for u, v in G.edges():
        i, j = sorted((newIndex[u], newIndex[v]))
        edgelist.append("{} {} 1".format(i, j))

    with open("{}_edge_list".format(output_prefix), 'w') as outfile:
        outfile.write("\n".join(edgelist))

    # Create "walk" matrix (normalized adjacency matrix)
    print("* Creating PPR  matrix...")
    W = nx.to_numpy_matrix(G, nodelist=nodes, dtype=np.float64)
    W = np.asarray(W)
    # W.sum(axis=1) is a length-n vector that broadcasts along the last
    # axis, so column j is divided by the sum of row j.  G is undirected,
    # hence W is symmetric and this leaves every column summing to 1.
    W = W / W.sum(axis=1)  # normalization step

    # Create PPR matrix using Python
    from scipy.linalg import inv
    PPR = args.beta * inv(np.eye(n) - (1. - args.beta) * W)
    # Any other format value falls through and no matrix file is written.
    if args.format == 'hdf5':
        hnio.save_hdf5(pprfile, dict(PPR=PPR))
    elif args.format == 'npy':
        np.save(pprfile, PPR)
    elif args.format == 'matlab':
        scipy.io.savemat(pprfile, dict(PPR=PPR))
Beispiel #3
0
def run(args):
    """Build the PPR matrix of a gene network and write it with index files.

    Reads a gene index file (``"<index> <gene>"`` per line) and an edge list
    of integer indices, builds an undirected graph on gene names, removes
    self-loops, and keeps only the largest connected component.  Outputs,
    all under ``args.output_dir`` (created if missing):

    * ``<prefix>_index_genes`` -- one ``"<index> <gene>"`` line per node of
      the component, genes in sorted order, indices starting at
      ``args.start_index``.
    * ``<prefix>_edge_list`` -- one ``"<i> <j> 1"`` line per edge in the new
      indices, with ``i <= j``.
    * ``<prefix>_ppr_<beta>.<ext>`` -- the matrix
      ``beta * inv(I - (1 - beta) * W)``; written only when ``args.format``
      is ``'hdf5'``, ``'npy'`` or ``'matlab'``.

    Args:
        args: Namespace-like object providing ``gene_index_file``,
            ``edgelist_file``, ``output_dir``, ``prefix``, ``format``,
            ``beta`` and ``start_index``.
    """
    # Load gene-index map (blank lines are ignored)
    with open(args.gene_index_file) as infile:
        arrs = [ l.split() for l in infile ]
    indexToGene = { int(arr[0]): arr[1] for arr in arrs if arr }

    G = nx.Graph()
    G.add_nodes_from( indexToGene.values() ) # in case any nodes have degree zero

    # Load graph; only the first two columns of each line are node indices.
    print("* Loading PPI...")
    with open(args.edgelist_file) as infile:
        rows = [ l.split() for l in infile ]
    edges = [ tuple(int(tok) for tok in row[:2]) for row in rows if row ]
    G.add_edges_from( [(indexToGene[u], indexToGene[v]) for u, v in edges] )

    print("\t- Edges: {}".format(len(G.edges())))
    print("\t- Nodes: {}".format(len(G.nodes())))

    # Remove self-loops and restrict to largest connected component
    print("* Removing self-loops, multi-edges, and restricting to "
          "largest connected component...")
    selfLoops = [(u, v) for u, v in G.edges() if u == v]
    G.remove_edges_from( selfLoops )
    largest_cc = sorted(nx.connected_components( G ), key=len,
                        reverse=True)[0]
    G = G.subgraph( largest_cc )
    nodes = sorted(G.nodes())
    n = len(nodes)
    print("\t- Largest CC Edges: {}".format(len( G.edges() )))
    print("\t- Largest CC Nodes: {}".format(len( G.nodes() )))

    # Set up output directory.  os.makedirs avoids passing user input
    # through a shell, and abspath resolves relative paths against the
    # current directory while leaving absolute paths intact, so the
    # directory created is always the one written to.
    print("* Saving updated graph to file...")
    output_dir = os.path.abspath(args.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_prefix = "{}/{}".format(output_dir, args.prefix)

    if args.format == 'hdf5':
        ext = 'h5'
    elif args.format == 'matlab':
        ext = 'mat'
    else:
        ext = args.format

    pprfile = "{}_ppr_{:g}.{}".format(output_prefix, args.beta, ext)

    # Index mapping for genes
    index_map = [ "{} {}".format(i+args.start_index, gene)
                  for i, gene in enumerate(nodes) ]
    with open("{}_index_genes".format(output_prefix), 'w') as outfile:
        outfile.write( "\n".join(index_map) )

    # Edge list in the new indices.  A dict gives O(1) lookups instead of a
    # linear list.index() scan per edge endpoint.
    newIndex = { gene: i+args.start_index for i, gene in enumerate(nodes) }
    edgelist = []
    for u, v in G.edges():
        i, j = sorted((newIndex[u], newIndex[v]))
        edgelist.append( "{} {} 1".format(i, j) )

    with open("{}_edge_list".format(output_prefix), 'w') as outfile:
        outfile.write( "\n".join(edgelist) )

    # Create "walk" matrix (normalized adjacency matrix)
    print("* Creating PPR  matrix...")
    W = nx.to_numpy_matrix( G , nodelist=nodes, dtype=np.float64 )
    W = np.asarray(W)
    # W.sum(axis=1) is a length-n vector that broadcasts along the last
    # axis, so column j is divided by the sum of row j.  G is undirected,
    # hence W is symmetric and this leaves every column summing to 1.
    W = W / W.sum(axis=1) # normalization step

    # Create PPR matrix using Python
    from scipy.linalg import inv
    PPR = args.beta*inv(np.eye(n)-(1.-args.beta)*W)
    # Any other format value falls through and no matrix file is written.
    if args.format == 'hdf5':
        hnio.save_hdf5(pprfile, dict(PPR=PPR))
    elif args.format == 'npy':
        np.save(pprfile, PPR)
    elif args.format == 'matlab':
        scipy.io.savemat(pprfile, dict(PPR=PPR))