def read_graph(bestedges, maxerr=100, directed=False):
    """
    Build, cache and return the best-overlap graph stored in `bestedges`.

    Each non-comment row of `bestedges` must hold eight whitespace-separated
    columns: id1, lib_id, best5, o5, best3, o3, j1, j2. A row is used when
    j1 or j2 is <= `maxerr`; a link to best5 (resp. best3) is recorded when
    that id is non-zero and j1 (resp. j2) is <= `maxerr`.

    The graph is cached next to `bestedges` (prefix before the first ".",
    plus ".err<maxerr>.[dir.]graph") and only rebuilt when `need_update`
    says so. When an undirected graph is rebuilt, `degrees.txt` (degree,
    count) and `hubs.txt` (node, degree) are written to the current
    working directory.

    Args:
        bestedges: path to the "best.edges" file.
        maxerr: largest accepted value for the j1/j2 columns (logged as a
            percentage).
        directed: if True, return a plain dict mapping "<id>-5'" /
            "<id>-3'" to "<best id>-<orientation>"; otherwise return the
            networkx graph with edge attribute `weight` (10, or 11 for
            mutual best links).

    Returns:
        The graph object as re-loaded from the cache file.
    """
    logging.debug("Max error = {0}%".format(maxerr))
    tag = "dir." if directed else ""
    bestgraph = bestedges.split(".")[0] + ".err{0}.{1}graph".format(
        maxerr, tag)

    if need_update(bestedges, bestgraph):
        G = {} if directed else nx.Graph()
        best_store = {}
        with open(bestedges) as fp:
            for row in fp:
                if row[0] == "#":
                    continue
                id1, lib_id, best5, o5, best3, o3, j1, j2 = row.split()
                id1, best5, best3 = int(id1), int(best5), int(best3)
                j1, j2 = float(j1), float(j2)
                if not (j1 <= maxerr or j2 <= maxerr):
                    continue
                if not directed:
                    G.add_node(id1)
                id1p5, id1p3 = "{0}-5'".format(id1), "{0}-3'".format(id1)
                best5o5 = "{0}-{1}".format(best5, o5)
                best3o3 = "{0}-{1}".format(best3, o3)
                best_store[id1p5] = best5o5
                best_store[id1p3] = best3o3
                # An id of 0 means there is no partner on that end
                if best5 and j1 <= maxerr:
                    if directed:
                        G[id1p5] = best5o5
                    else:
                        G.add_edge(best5, id1, weight=10)
                if best3 and j2 <= maxerr:
                    if directed:
                        G[id1p3] = best3o3
                    else:
                        G.add_edge(id1, best3, weight=10)

        # Annotate edge weight for mutual best link, note that edge weights
        # are (11) set close to 10, to minimize impact to layout (Yifan Hu's
        # multilevel)
        nmutuals = 0
        for end, partner in best_store.items():
            # `end < partner` makes each reciprocal pair count only once
            if best_store.get(partner) != end or not end < partner:
                continue
            nmutuals += 1
            if directed:
                # The directed graph is a plain dict without edge attributes
                continue
            a, b = int(end.split("-")[0]), int(partner.split("-")[0])
            # The edge is absent when both rows failed the maxerr filter on
            # the mutual end, so check before touching its attributes
            if G.has_edge(a, b):
                G[a][b]["weight"] = 11
        logging.debug("Mutual best edges: {0}".format(nmutuals))

        if directed:
            # Pickle streams are bytes: the file must be opened in binary mode
            with open(bestgraph, "wb") as fw:
                dump(G, fw)
        else:
            nx.write_gpickle(G, bestgraph)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

        # Degree statistics need a networkx graph, so skip the dict form
        if not directed:
            # Compute node degree histogram and save in (degree, counts) tab
            # file. dict() accepts both the dict returned by networkx 1.x and
            # the (node, degree) view returned by networkx >= 2.
            degrees = dict(G.degree())
            degree_counter = Counter(degrees.values())
            degreesfile = "degrees.txt"
            with open(degreesfile, "w") as fw:
                for degree, count in sorted(degree_counter.items()):
                    print("{0}\t{1}".format(degree, count), file=fw)
            logging.debug(
                "Node degree distribution saved to `{0}`".format(degreesfile))

            # Save high degree (top 0.1%) nodes in (node, degree) tab file
            ranked = sorted(degrees.values(), reverse=True)
            hubs = []
            if ranked:  # an empty graph has no threshold to look up
                # Floor division: the result is used as a list index
                percentile = ranked[len(ranked) // 1000]
                logging.debug(
                    "Top 0.1% has degree of at least {0}".format(percentile))
                hubs = [(k, v) for k, v in degrees.items() if v >= percentile]
                hubs.sort(key=lambda x: x[1], reverse=True)  # descending
            hubsfile = "hubs.txt"
            with open(hubsfile, "w") as fw:
                for node, degree in hubs:
                    print("{0}\t{1}".format(node, degree), file=fw)
            logging.debug("Hubs saved to `{0}`".format(hubsfile))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    if directed:
        # NOTE: unpickling executes arbitrary code; only load cache files
        # that were produced by this function.
        with open(bestgraph, "rb") as fp:
            G = load(fp)
    else:
        G = nx.read_gpickle(bestgraph)
    graph_stats(G)
    return G
def read_graph(bestedges, maxerr=100, directed=False):
    """
    Build, cache and return the best-overlap graph stored in `bestedges`.

    Each non-comment row of `bestedges` must hold eight whitespace-separated
    columns: id1, lib_id, best5, o5, best3, o3, j1, j2. A row is used when
    j1 or j2 is <= `maxerr`; a link to best5 (resp. best3) is recorded when
    that id is non-zero and j1 (resp. j2) is <= `maxerr`.

    The graph is cached next to `bestedges` (prefix before the first ".",
    plus ".err<maxerr>.[dir.]graph") and only rebuilt when `need_update`
    says so. When an undirected graph is rebuilt, `degrees.txt` (degree,
    count) and `hubs.txt` (node, degree) are written to the current
    working directory.

    Args:
        bestedges: path to the "best.edges" file.
        maxerr: largest accepted value for the j1/j2 columns (logged as a
            percentage).
        directed: if True, return a plain dict mapping "<id>-5'" /
            "<id>-3'" to "<best id>-<orientation>"; otherwise return the
            networkx graph with edge attribute `weight` (10, or 11 for
            mutual best links).

    Returns:
        The graph object as re-loaded from the cache file.
    """
    logging.debug("Max error = {0}%".format(maxerr))
    tag = "dir." if directed else ""
    bestgraph = bestedges.split(".")[0] + \
        ".err{0}.{1}graph".format(maxerr, tag)

    if need_update(bestedges, bestgraph):
        G = {} if directed else nx.Graph()
        best_store = {}
        with open(bestedges) as fp:
            for row in fp:
                if row[0] == "#":
                    continue
                id1, lib_id, best5, o5, best3, o3, j1, j2 = row.split()
                id1, best5, best3 = int(id1), int(best5), int(best3)
                j1, j2 = float(j1), float(j2)
                if not (j1 <= maxerr or j2 <= maxerr):
                    continue
                if not directed:
                    G.add_node(id1)
                id1p5, id1p3 = "{0}-5'".format(id1), "{0}-3'".format(id1)
                best5o5 = "{0}-{1}".format(best5, o5)
                best3o3 = "{0}-{1}".format(best3, o3)
                best_store[id1p5] = best5o5
                best_store[id1p3] = best3o3
                # An id of 0 means there is no partner on that end
                if best5 and j1 <= maxerr:
                    if directed:
                        G[id1p5] = best5o5
                    else:
                        G.add_edge(best5, id1, weight=10)
                if best3 and j2 <= maxerr:
                    if directed:
                        G[id1p3] = best3o3
                    else:
                        G.add_edge(id1, best3, weight=10)

        # Annotate edge weight for mutual best link, note that edge weights
        # are (11) set close to 10, to minimize impact to layout (Yifan Hu's
        # multilevel)
        nmutuals = 0
        for end, partner in best_store.items():
            # `end < partner` makes each reciprocal pair count only once
            if best_store.get(partner) != end or not end < partner:
                continue
            nmutuals += 1
            if directed:
                # The directed graph is a plain dict without edge attributes
                continue
            a, b = int(end.split("-")[0]), int(partner.split("-")[0])
            # The edge is absent when both rows failed the maxerr filter on
            # the mutual end, so check before touching its attributes
            if G.has_edge(a, b):
                G[a][b]["weight"] = 11
        logging.debug("Mutual best edges: {0}".format(nmutuals))

        if directed:
            # Pickle streams are bytes: the file must be opened in binary mode
            with open(bestgraph, "wb") as fw:
                cPickle.dump(G, fw)
        else:
            nx.write_gpickle(G, bestgraph)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

        # Degree statistics need a networkx graph, so skip the dict form
        if not directed:
            # Compute node degree histogram and save in (degree, counts) tab
            # file. dict() accepts both the dict returned by networkx 1.x and
            # the (node, degree) view returned by networkx >= 2.
            degrees = dict(G.degree())
            degree_counter = Counter(degrees.values())
            degreesfile = "degrees.txt"
            # fw.write() behaves the same whether `print` is a statement or
            # a function in this module
            with open(degreesfile, "w") as fw:
                for degree, count in sorted(degree_counter.items()):
                    fw.write("{0}\t{1}\n".format(degree, count))
            logging.debug(
                "Node degree distribution saved to `{0}`".format(degreesfile))

            # Save high degree (top 0.1%) nodes in (node, degree) tab file
            ranked = sorted(degrees.values(), reverse=True)
            hubs = []
            if ranked:  # an empty graph has no threshold to look up
                # Floor division: the result is used as a list index
                percentile = ranked[len(ranked) // 1000]
                logging.debug(
                    "Top 0.1% has degree of at least {0}".format(percentile))
                hubs = [(k, v) for k, v in degrees.items() if v >= percentile]
                hubs.sort(key=lambda x: x[1], reverse=True)  # descending
            hubsfile = "hubs.txt"
            with open(hubsfile, "w") as fw:
                for node, degree in hubs:
                    fw.write("{0}\t{1}\n".format(node, degree))
            logging.debug("Hubs saved to `{0}`".format(hubsfile))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    if directed:
        # NOTE: unpickling executes arbitrary code; only load cache files
        # that were produced by this function.
        with open(bestgraph, "rb") as fp:
            G = cPickle.load(fp)
    else:
        G = nx.read_gpickle(bestgraph)
    graph_stats(G)
    return G
def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF which can be used to
    feed into Gephi to check the topology of the best overlapping graph.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    # The docstring above doubles as the usage text handed to OptionParser,
    # so implementation notes live in comments:
    #   1. build the undirected best-overlap graph, or load its pickle cache
    #   2. above 10000 nodes, keep only the neighborhood of --query
    #   3. with --largest > 1, keep only that many largest components
    #   4. optionally annotate nodes with contig membership
    #   5. write best[.<query>].gexf to the current working directory
    import networkx as nx
    from jcvi.algorithms.graph import graph_stats, graph_local_neighborhood

    p = OptionParser(graph.__doc__)
    p.add_option("--maxerr", default=100, type="int",
                 help="Maximum error rate")
    p.add_option("--query", default=-1, type="int", help="Search from node")
    p.add_option("--largest", default=1, type="int",
                 help="Only show largest components")
    p.add_option("--maxsize", default=100, type="int", help="Max graph size")
    p.add_option("--contigs", help="Annotate graph with contig membership, "
                 " typically from `asm.posmap.frgctg`")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    maxerr = opts.maxerr
    query = opts.query
    largest = opts.largest

    logging.debug("Max error = {0}%".format(maxerr))
    bestgraph = bestedges.split(".")[0] + ".err{0}.graph".format(maxerr)
    if need_update(bestedges, bestgraph):
        G = nx.Graph()
        with open(bestedges) as fp:
            for row in fp:
                if row[0] == '#':
                    continue
                id1, lib_id, best5, o1, best3, o3, j1, j2 = row.split()
                id1, best5, best3 = int(id1), int(best5), int(best3)
                j1, j2 = float(j1), float(j2)
                # NOTE(review): read_graph() filters with `<=` and uses the
                # same cache file name; confirm which comparison is intended.
                if j1 < maxerr or j2 < maxerr:
                    G.add_node(id1)
                # The ids are ints at this point, so test them for non-zero;
                # comparing against the string '0' is always true and would
                # link every read without a best overlap to a bogus node 0.
                if best5 and j1 < maxerr:
                    G.add_edge(best5, id1)
                if best3 and j2 < maxerr:
                    G.add_edge(id1, best3)
        nx.write_gpickle(G, bestgraph)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    G = nx.read_gpickle(bestgraph)
    graph_stats(G)

    if len(G) > 10000:
        # Too large to display: keep only the edges around the query node
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=query, maxsize=opts.maxsize)
        SG.add_edges_from(H.edges())
        G = SG

    if largest > 1:  # only works for un-directed graph
        # connected_components() exists in every networkx release, unlike
        # connected_component_subgraphs(); sort explicitly so that the
        # largest components come first whatever the library returns.
        components = sorted(nx.connected_components(G), key=len, reverse=True)
        c = min(len(components), largest)
        logging.debug("{0} components found, {1} retained".format(
            len(components), c))
        SG = nx.Graph()
        for nodes in components[:c]:
            # Every edge touching a component's node lies in that component
            SG.add_edges_from(G.edges(nodes))
        G = SG

    if opts.contigs:
        reads_to_ctgs = parse_ctgs(bestedges, opts.contigs)
        annotate_contigs(G, reads_to_ctgs)

    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug("Graph written to `{0}` (|V|={1}, |E|={2})".
                  format(gexf, len(G), G.size()))