Beispiel #1
0
def read_graph(bestedges, maxerr=100, directed=False):
    """
    Build the best-overlap graph from a `best.edges` table, cache it on disk,
    and return the cached graph.

    Each non-comment row must hold eight whitespace-separated columns:
    read id, library id, best 5' partner, partner end, best 3' partner,
    partner end, and one error value per end.  A partner id of 0 is treated
    as "no partner".  (Column meanings beyond what the code does with them
    are inferred from the variable names -- confirm against the producer.)

    Args:
        bestedges: path of the input table; the text before its first "."
            is used as the prefix of the cache file name.
        maxerr: an end is kept only when its error value is <= maxerr.
        directed: when true, build a plain dict mapping "<id>-5'"/"<id>-3'"
            to "<partner>-<end>"; otherwise build an undirected nx.Graph
            whose edges carry weight 10 (11 for mutual best pairs).

    Returns:
        The object loaded back from the cache file (dict when directed,
        networkx graph otherwise).

    Side effects:
        Writes the cache file; for undirected graphs also writes
        `degrees.txt` and `hubs.txt` into the current working directory.
    """
    logging.debug("Max error = {0}%".format(maxerr))
    tag = "dir." if directed else ""
    bestgraph = bestedges.split(".")[0] + ".err{0}.{1}graph".format(
        maxerr, tag)
    if need_update(bestedges, bestgraph):
        G = {} if directed else nx.Graph()
        best_store = {}
        with open(bestedges) as fp:
            for row in fp:
                # Skip comment lines and (defensively) blank lines
                if row[0] == "#" or not row.strip():
                    continue
                id1, lib_id, best5, o5, best3, o3, j1, j2 = row.split()
                id1, best5, best3 = int(id1), int(best5), int(best3)
                j1, j2 = float(j1), float(j2)
                keep5, keep3 = j1 <= maxerr, j2 <= maxerr
                if not (keep5 or keep3):
                    continue

                if not directed:
                    G.add_node(id1)
                id1p5, id1p3 = "{0}-5'".format(id1), "{0}-3'".format(id1)
                best5o5 = "{0}-{1}".format(best5, o5)
                best3o3 = "{0}-{1}".format(best3, o3)
                best_store[id1p5] = best5o5
                best_store[id1p3] = best3o3

                if best5 and keep5:
                    if directed:
                        G[id1p5] = best5o5
                    else:
                        G.add_edge(best5, id1, weight=10)
                if best3 and keep3:
                    if directed:
                        G[id1p3] = best3o3
                    else:
                        G.add_edge(id1, best3, weight=10)

        # Annotate edge weight for mutual best link, note that edge weights are
        # (11) set close to 10, to minimize impact to layout (Yifan Hu's
        # multilevel).  `k < v` makes sure each mutual pair is seen only once.
        nmutuals = 0
        for k, v in best_store.items():
            if best_store.get(v) == k and k < v:
                nmutuals += 1
                if directed:
                    # The directed structure is a flat dict: no edge
                    # attributes to annotate.
                    continue
                a, b = int(k.split("-")[0]), int(v.split("-")[0])
                # The edge may be absent when both supporting ends were
                # filtered out by `maxerr`.
                if G.has_edge(a, b):
                    G[a][b]["weight"] = 11
        logging.debug("Mutual best edges: {0}".format(nmutuals))

        if directed:
            # Pickle streams are bytes: the file must be opened in binary mode
            with open(bestgraph, "wb") as fw:
                dump(G, fw)
        else:
            nx.write_gpickle(G, bestgraph)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

        # Degree reports only make sense for the networkx graph; the directed
        # dict has no degree() method.
        if not directed:
            # dict() accepts both the dict returned by older networkx and the
            # (node, degree) view returned by newer releases.
            degrees = dict(G.degree())

            # Compute node degree histogram and save in (degree, counts) tab
            # file
            degree_counter = Counter(degrees.values())
            degreesfile = "degrees.txt"
            with open(degreesfile, "w") as fw:
                for degree, count in sorted(degree_counter.items()):
                    print("{0}\t{1}".format(degree, count), file=fw)
            logging.debug(
                "Node degree distribution saved to `{0}`".format(degreesfile))

            # Save high degree (top 0.1%) nodes in (node, degree) tab file
            ranked = sorted(degrees.values(), reverse=True)
            hubs = []
            if ranked:  # an empty graph has no percentile to look up
                # Floor division: the result is used as a list index
                percentile = ranked[len(ranked) // 1000]
                logging.debug(
                    "Top 0.1% has degree of at least {0}".format(percentile))
                hubs = [(k, v) for k, v in degrees.items() if v >= percentile]
                hubs.sort(key=lambda x: x[1], reverse=True)  # degrees desc.
            hubsfile = "hubs.txt"
            with open(hubsfile, "w") as fw:
                for node, degree in hubs:
                    print("{0}\t{1}".format(node, degree), file=fw)
            logging.debug("Hubs saved to `{0}`".format(hubsfile))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    if directed:
        # NOTE: unpickling executes arbitrary code; only load cache files
        # produced by this function.
        with open(bestgraph, "rb") as fp:
            G = load(fp)
    else:
        G = nx.read_gpickle(bestgraph)
        graph_stats(G)
    return G
Beispiel #2
0
def read_graph(bestedges, maxerr=100, directed=False):
    """
    Parse a `best.edges` table into a best-overlap graph, pickle it next to
    the input, and return the graph read back from that pickle.

    Rows starting with "#" are skipped; every other row must split into
    eight fields: read id, library id, best 5' partner, partner end, best 3'
    partner, partner end, 5' error, 3' error.  A partner id of 0 means the
    end has no partner.  (Field semantics are inferred from the variable
    names -- confirm against the tool that writes the table.)

    Args:
        bestedges: input path; the part before the first "." prefixes the
            cache file name.
        maxerr: inclusive upper bound on the per-end error value.
        directed: if true the graph is a dict mapping "<id>-5'"/"<id>-3'" to
            "<partner>-<end>"; otherwise it is an undirected nx.Graph with
            edge weight 10, raised to 11 on mutual best pairs.

    Returns:
        The unpickled cache content (dict when directed, else a networkx
        graph).

    Side effects:
        Writes the cache file and, for undirected graphs, `degrees.txt` and
        `hubs.txt` in the current working directory.
    """
    logging.debug("Max error = {0}%".format(maxerr))
    tag = "dir." if directed else ""
    bestgraph = bestedges.split(".")[0] + ".err{0}.{1}graph".format(maxerr, tag)
    if need_update(bestedges, bestgraph):
        G = {} if directed else nx.Graph()
        best_store = {}
        with open(bestedges) as fp:
            for row in fp:
                # Ignore comments and (defensively) blank lines
                if row[0] == "#" or not row.strip():
                    continue
                id1, lib_id, best5, o5, best3, o3, j1, j2 = row.split()
                id1, best5, best3 = int(id1), int(best5), int(best3)
                j1, j2 = float(j1), float(j2)
                ok5, ok3 = j1 <= maxerr, j2 <= maxerr
                if not (ok5 or ok3):
                    continue

                if not directed:
                    G.add_node(id1)
                id1p5, id1p3 = "{0}-5'".format(id1), "{0}-3'".format(id1)
                best5o5 = "{0}-{1}".format(best5, o5)
                best3o3 = "{0}-{1}".format(best3, o3)
                best_store[id1p5] = best5o5
                best_store[id1p3] = best3o3

                if best5 and ok5:
                    if directed:
                        G[id1p5] = best5o5
                    else:
                        G.add_edge(best5, id1, weight=10)
                if best3 and ok3:
                    if directed:
                        G[id1p3] = best3o3
                    else:
                        G.add_edge(id1, best3, weight=10)

        # Annotate edge weight for mutual best link, note that edge weights are
        # (11) set close to 10, to minimize impact to layout (Yifan Hu's
        # multilevel).  The `k < v` test visits each mutual pair once.
        nmutuals = 0
        for k, v in best_store.items():
            if best_store.get(v) != k or not k < v:
                continue
            nmutuals += 1
            if directed:
                continue  # a flat dict carries no edge attributes
            left, right = int(k.split("-")[0]), int(v.split("-")[0])
            # Both supporting ends may have been dropped by `maxerr`
            if G.has_edge(left, right):
                G[left][right]["weight"] = 11
        logging.debug("Mutual best edges: {0}".format(nmutuals))

        if directed:
            # Binary mode keeps the pickle stream intact on every platform
            with open(bestgraph, "wb") as fw:
                cPickle.dump(G, fw)
        else:
            nx.write_gpickle(G, bestgraph)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

        # The degree reports need a networkx graph; the directed dict has no
        # degree() method, so they are skipped in that mode.
        if not directed:
            # dict() normalizes both the dict (older networkx) and the
            # (node, degree) view (newer networkx) returned by degree().
            degrees = dict(G.degree())

            # Compute node degree histogram and save in (degree, counts) tab
            # file
            degree_counter = Counter(degrees.values())
            degreesfile = "degrees.txt"
            with open(degreesfile, "w") as fw:
                for degree, count in sorted(degree_counter.items()):
                    fw.write("{0}\t{1}\n".format(degree, count))
            logging.debug("Node degree distribution saved to `{0}`".format(degreesfile))

            # Save high degree (top 0.1%) nodes in (node, degree) tab file
            ranked = sorted(degrees.values(), reverse=True)
            hubs = []
            if ranked:  # nothing to rank in an empty graph
                # Floor division so the index stays an int on any Python
                percentile = ranked[len(ranked) // 1000]
                logging.debug("Top 0.1% has degree of at least {0}".format(percentile))
                hubs = [(k, v) for k, v in degrees.items() if v >= percentile]
                hubs.sort(key=lambda x: x[1], reverse=True)  # degrees descending
            hubsfile = "hubs.txt"
            with open(hubsfile, "w") as fw:
                for node, degree in hubs:
                    fw.write("{0}\t{1}\n".format(node, degree))
            logging.debug("Hubs saved to `{0}`".format(hubsfile))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    if directed:
        # NOTE: unpickling executes arbitrary code; only load cache files
        # written by this function.
        with open(bestgraph, "rb") as fp:
            G = cPickle.load(fp)
    else:
        G = nx.read_gpickle(bestgraph)
        graph_stats(G)
    return G
Beispiel #3
0
def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF which can be used to
    feed into Gephi to check the topology of the best overlapping graph.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser below), so keep it user-facing.
    import networkx as nx
    from jcvi.algorithms.graph import graph_stats, graph_local_neighborhood

    p = OptionParser(graph.__doc__)
    p.add_option("--maxerr", default=100, type="int", help="Maximum error rate")
    p.add_option("--query", default=-1, type="int", help="Search from node")
    p.add_option("--largest", default=1, type="int", help="Only show largest components")
    p.add_option("--maxsize", default=100, type="int", help="Max graph size")
    p.add_option("--contigs", help="Annotate graph with contig membership, "
                    " typically from `asm.posmap.frgctg`")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    maxerr = opts.maxerr
    query = opts.query
    largest = opts.largest
    logging.debug("Max error = {0}%".format(maxerr))

    # The parsed graph is cached as a pickle named after the input prefix and
    # the error cutoff; it is rebuilt only when need_update() says so.
    bestgraph = bestedges.split(".")[0] + ".err{0}.graph".format(maxerr)
    if need_update(bestedges, bestgraph):
        G = nx.Graph()
        with open(bestedges) as fp:
            for row in fp:
                if row[0] == '#':
                    continue
                # Eight columns are required; only the ids and the two error
                # values are used here.
                id1, _lib_id, best5, _o5, best3, _o3, j1, j2 = row.split()
                id1, best5, best3 = int(id1), int(best5), int(best3)
                j1, j2 = float(j1), float(j2)
                if j1 < maxerr or j2 < maxerr:
                    G.add_node(id1)
                # Partner id 0 means "no best edge".  The ids are ints at this
                # point, so they must be compared with the int 0 -- comparing
                # with the string '0' is always true and links every such
                # read to a bogus node 0.
                if best5 != 0 and j1 < maxerr:
                    G.add_edge(best5, id1)
                if best3 != 0 and j2 < maxerr:
                    G.add_edge(id1, best3)
        nx.write_gpickle(G, bestgraph)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    G = nx.read_gpickle(bestgraph)
    graph_stats(G)

    # Large graphs are reduced to the neighborhood returned by
    # graph_local_neighborhood(); only its edges are carried over.
    if len(G) > 10000:
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=query,
                                     maxsize=opts.maxsize)
        SG.add_edges_from(H.edges())
        G = SG

    if largest > 1:  # only works for un-directed graph
        # connected_components() may return a generator, so materialize it,
        # largest component first, before calling len() or slicing.
        components = sorted(nx.connected_components(G), key=len, reverse=True)
        c = min(len(components), largest)
        logging.debug("{0} components found, {1} retained".format(
            len(components), c))

        full = G
        G = nx.Graph()
        for nodes in components[:c]:
            G.add_edges_from(full.subgraph(nodes).edges())

    if opts.contigs:
        reads_to_ctgs = parse_ctgs(bestedges, opts.contigs)
        annotate_contigs(G, reads_to_ctgs)

    # Output name: best.gexf, or best.<query>.gexf when a query node is given
    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug("Graph written to `{0}` (|V|={1}, |E|={2})".format(
        gexf, len(G), G.size()))