def calc_weights(text, keyword):
    """Return a ``{word: weight}`` dict for the words of ``text``.

    ``tokenize`` and ``create_mst`` are defined elsewhere; this function only
    relies on ``create_mst`` returning ``(graph, root)`` where the graph has a
    ``'weights'`` edge property and a ``'words'`` vertex property.

    If tokenization leaves no keyword, every word gets weight 1.0.  Otherwise
    the root vertex gets 1.0 and, during a depth-first search from the root,
    each vertex whose weight is still zero receives
    ``weight(source) * (1 - edge weight)`` when an edge reaching it is
    examined.  Afterwards every weight not above ``EPSILON`` is replaced by
    the smallest positive weight raised to the power 1.5.
    """
    words, keyword = tokenize(text, keyword)
    if not keyword:
        return dict.fromkeys(words, 1.0)

    mst, root = create_mst(words, keyword)
    vertex_weight = mst.new_vertex_property('double')
    vertex_weight[root] = 1.0

    class _Propagate(gt.DFSVisitor):
        """Push weights from a source vertex to not-yet-weighted targets."""

        def __init__(self, w_vp, w_ep):
            self.w_vp = w_vp
            self.w_ep = w_ep

        def examine_edge(self, e):
            src = e.source()
            dst = e.target()
            if self.w_vp[dst]:
                # already weighted; keep the first value assigned
                return
            self.w_vp[dst] = self.w_vp[src] * (1.0 - self.w_ep[e])

    gt.dfs_search(mst, root, _Propagate(vertex_weight, mst.ep['weights']))

    vertex_word = mst.vp['words']
    weights = {}
    for vertex in mst.vertices():
        weights[vertex_word[vertex]] = vertex_weight[vertex]

    # substitute for weights that ended up (near) zero
    floor = min(value for value in weights.values() if value > 0) ** 1.5
    for word, value in weights.items():
        if value <= EPSILON:
            weights[word] = floor
    return weights
def _filter(g): # higher taxa that should be removed (nodes collapsed), and their # immmediate children flagged incertae sedis and linked back to # the parent of collapsed node incertae_keywords = [ 'endophyte','scgc','libraries','samples','metagenome','unclassified', 'other','unidentified','mitosporic','uncultured','incertae', 'environmental'] # taxa that are not clades, and should be removed (collapsed) - # children linked to parent of collapsed node collapse_keywords = ['basal ','stem ','early diverging '] # higher taxa that should be removed along with all of their children remove_keywords = ['viroids','virus','viruses','viral','artificial'] print 'removing' rm = g.collapsed def f(x): rm[x] = 1 T = Traverser(post=f) for v in ifilter(lambda x:x.out_degree(), g.vertices()): name = g.vertex_name[v].lower() s = name.split() for kw in remove_keywords: if kw in s: gt.dfs_search(g, v, T) break for kw in incertae_keywords: if kw in s: gt.dfs_search(g, v, T) ## rm[v] = 1 ## for c in v.out_neighbours(): ## g.incertae_sedis[c] = 1 break s = name.replace('-', ' ') for w in collapse_keywords: if s.startswith(w): rm[v] = 1 break g.set_vertex_filter(rm, inverted=True) # assume root == vertex 0 outer = [ v for v in g.vertices() if int(v) and v.in_degree()==0 ] g.set_vertex_filter(None) for v in outer: p = v.in_neighbours().next() while rm[p]: p = p.in_neighbours().next() g.edge_in_taxonomy[g.add_edge(p, v)] = 1 print 'done' g.set_vertex_filter(rm, inverted=True) for v in g.vertices(): if int(v): assert v.in_degree()==1
def _do_visit(self, graph_view): if self.order == "bfs": bfs_search(graph_view, visitor=self) elif self.order == "dfs": dfs_search(graph_view, visitor=self) else: raise ValueError("Invalid visit order %s" % self.order) return self.finalize(graph_view)
def descendents(G, S):
    """Return the indices of vertices flagged by a DFS started from ``S``.

    A temporary vertex with an edge to every vertex in ``S`` is added so that
    a single ``gt.dfs_search`` covers all of them.  The temporary vertex is
    always removed again, even if adding the edges or the search raises, so
    the caller's graph is not left with a stray vertex.

    Args:
        G: graph to search; it is mutated temporarily.
        S: iterable of vertices (or vertex indices) used as search sources.

    Returns:
        A set of the indices whose entry in the ``visited`` property map is
        non-zero after the search.  Which vertices ``RADSVisitor`` flags is
        decided by that class (defined elsewhere).
    """
    new_vert = G.add_vertex()
    try:
        for v in S:
            G.add_edge(new_vert, v)
        visited = G.new_vertex_property("int")
        gt.dfs_search(G, new_vert, RADSVisitor(visited))
    finally:
        G.remove_vertex(new_vert)
    # read the array only after the temporary vertex has been removed, as
    # the original code did
    return set(np.nonzero(np.array(visited.a))[0])
def prune_to_clade(g, v):
    """
    reduce taxonomy graph g to the vertices discovered by a depth-first
    search from v (v itself and everything reachable from it)

    important: g is assumed to be *unfiltered* (create_*_taxonomy_graph
    now returns an unfiltered graph)

    after pruning, the graph is re-indexed via index_graph, g.taxid_vertex
    is rebuilt from g.vertex_taxid, and g.root is set to the vertex that
    carries v's original taxid
    """
    tid = g.vertex_taxid[v]
    keep = g.new_vertex_property('bool')

    class _MarkDiscovered(gt.DFSVisitor):
        def discover_vertex(self, u):
            keep[u] = 1

    gt.dfs_search(g, v, _MarkDiscovered())

    # drop everything that was not discovered, then remove the filter
    g.set_vertex_filter(keep)
    g.purge_vertices()
    g.purge_edges()
    g.clear_filters()

    index_graph(g, reindex=True)
    taxid_vertex = {}
    for x in g.vertices():
        taxid_vertex[g.vertex_taxid[x]] = x
    g.taxid_vertex = taxid_vertex
    g.root = g.taxid_vertex[tid]
def _filter(g): # higher taxa that should be removed (nodes collapsed), and their # immmediate children flagged incertae sedis and linked back to # the parent of collapsed node incertae_keywords = [ 'endophyte', 'scgc', 'libraries', 'samples', 'metagenome', 'unclassified', 'other', 'unidentified', 'mitosporic', 'uncultured', 'incertae', 'environmental' ] # taxa that are not clades, and should be removed (collapsed) - # children linked to parent of collapsed node collapse_keywords = ['basal ', 'stem ', 'early diverging '] # higher taxa that should be removed along with all of their children remove_keywords = ['viroids', 'virus', 'viruses', 'viral', 'artificial'] print 'removing' rm = g.collapsed def f(x): rm[x] = 1 T = Traverser(post=f) for v in ifilter(lambda x: x.out_degree(), g.vertices()): name = g.vertex_name[v].lower() s = name.split() for kw in remove_keywords: if kw in s: gt.dfs_search(g, v, T) break for kw in incertae_keywords: if kw in s: gt.dfs_search(g, v, T) ## rm[v] = 1 ## for c in v.out_neighbours(): ## g.incertae_sedis[c] = 1 break s = name.replace('-', ' ') for w in collapse_keywords: if s.startswith(w): rm[v] = 1 break g.set_vertex_filter(rm, inverted=True) # assume root == vertex 0 outer = [v for v in g.vertices() if int(v) and v.in_degree() == 0] g.set_vertex_filter(None) for v in outer: p = v.in_neighbours().next() while rm[p]: p = p.in_neighbours().next() g.edge_in_taxonomy[g.add_edge(p, v)] = 1 print 'done' g.set_vertex_filter(rm, inverted=True) for v in g.vertices(): if int(v): assert v.in_degree() == 1
p = np.sum(mask)/mask.shape[0] vals = mask else: vals = binomial(1,p,num_bonds).astype('bool') prop = g.new_edge_property('bool',vals=vals) g.set_edge_filter(prop) num_vert = g.num_vertices() g.add_vertex(2) g.add_edge_list([[num_vert,n] for n in range(Lx*Ly)]) g.add_edge_list([[num_vert+1,n] for n in range(l-Lx*Ly,l)]) log.info("checking for percolation") vcc_path = set() gt.dfs_search(g,num_vert,visitor=MyVisitor(vcc_path)) if not num_vert+1 in vcc_path: log.warn("not percolating") else: vcc_path.remove(num_vert+1) vcc_path.remove(num_vert) g.remove_vertex(num_vert+1) g.remove_vertex(num_vert) plane = [i for i in range(Lx*Ly,2*Lx*Ly+1) if i in g.vertex(i-Lx*Ly).all_neighbours() and i in vcc_path] top = range(Lx*Ly)
def generate_graph(): """ brew tap homebrew/science brew install graph-tool """ from graph_tool.all import price_network, sfdp_layout, graph_draw from graph_tool.all import dfs_search, DFSVisitor, seed_rng from numpy.random import seed class AnnotationVisitor(DFSVisitor): def __init__(self, pred, dist): self.pred = pred self.dist = dist self.roots = {} def tree_edge(self, e): depth = self.dist[e.source()] if depth == 1: genre = int(e.source()) if genre not in self.roots: self.roots[genre] = len(self.roots) else: genre = self.pred[e.source()] self.pred[e.target()] = genre self.dist[e.target()] = depth + 1 # For run-to-run stability, provide a constant seed: seed(SEED) seed_rng(SEED) print 'Generating graph...' g = price_network(2000) print 'Performing layout...' pos = sfdp_layout(g) print 'Adding depths...' dist = g.new_vertex_property("int") pred = g.new_vertex_property("int64_t") g.set_directed(False) visitor = AnnotationVisitor(pred, dist) dfs_search(g, g.vertex(0), visitor) print 'Iterating over verts...' 
flattened = [] maxp = [-9999, -9999] minp = [+9999, +9999] maxd = 0 for v in g.vertices(): root_id = pred.a[v] if root_id not in visitor.roots: continue x, y, z = pos[v].a[0], pos[v].a[1], visitor.roots[root_id] minp[0] = min(minp[0], x) minp[1] = min(minp[1], y) maxp[0] = max(maxp[0], x) maxp[1] = max(maxp[1], y) maxd = max(maxd, dist.a[v]) flattened += [x, y, z] print 'max depth is', maxd print 'nroots is', len(visitor.roots) print 'ncolors is', len(COLORS) extent = (maxp[0] - minp[0], maxp[1] - minp[1]) padding = extent[0] * PADDING_FRACTION minp[0] -= padding minp[1] -= padding maxp[0] += padding maxp[1] += padding scale = [ 1.0 / (maxp[0] - minp[0]), 1.0 / (maxp[1] - minp[1])] scale = min(scale[0], scale[1]) midp = [ 0.5 * (maxp[0] + minp[0]), 0.5 * (maxp[1] + minp[1])] flatarray = [] for v in g.vertices(): root_id = pred.a[v] if root_id not in visitor.roots: continue x, y, root = pos[v].a[0], pos[v].a[1], visitor.roots[root_id] x = (0.5 + (x - midp[0]) * scale) y = (0.5 + (y - midp[1]) * scale) prom = int(dist.a[v]) flatarray += [x, y, root, prom] return flatarray
def dfs_tree(g):
    """Return a ``GraphView`` of ``g`` filtered by a DFS-filled edge mask.

    A boolean edge property is handed to ``TreeVisitor`` (defined elsewhere),
    a depth-first search is run from vertex 0, and the property is used as
    the edge filter of the returned view.  Presumably the visitor sets the
    flag on DFS tree edges -- confirm against ``TreeVisitor``.
    """
    tree_edges = g.new_edge_property("bool")
    start = g.vertex(0)
    visitor = TreeVisitor(tree_edges)
    gt.dfs_search(g, start, visitor)
    return gt.GraphView(g, efilt=tree_edges)
def _filter(g): # higher taxa that should be removed (nodes collapsed), and their # immmediate children flagged incertae sedis and linked back to # the parent of collapsed node incertae_keywords = [ 'endophyte','scgc','samples','metagenome', 'unclassified', 'other','unidentified','mitosporic','uncultured', 'incertae','environmental','other'] # taxa that are not clades, and should be removed (collapsed) - # children linked to parent of collapsed node collapse_keywords = ['basal ','stem ','early diverging '] # higher taxa that should be removed along with all of their children remove_keywords = ['viroids','virus','viruses','viral','artificial', 'phage','plasmid','plasmids','vector','vectors', 'recombinant','synthetic','cloning','EST','mixed', 'library','libraries','transposons','midivariant', 'sequences','enrichment','miscellaneous'] logging.info('removing vertices that are not real taxa (clades)') rm = g.collapsed def f(x): rm[x] = 1 T = Traverser(post=f) for v in ifilter(lambda x:x.out_degree(), g.vertices()): name = g.vertex_name[v].lower() s = name.split() for kw in remove_keywords: if kw in s: print 'remove:', name gt.dfs_search(g, v, T) break for kw in incertae_keywords: if kw in s: print 'incertae sedis:', name rm[v] = 1 for c in v.out_neighbours(): g.incertae_sedis[c] = 1 break s = name.replace('-', ' ') for w in collapse_keywords: if s.startswith(w): print 'collapse:', name rm[v] = 1 break p = g.vp.get('hidden') if p: for i in p.a.nonzero()[0]: rm[g.vertex(i)] = 1 g.set_vertex_filter(rm, inverted=True) # assume root == vertex 0 outer = [ v for v in g.vertices() if int(v) and v.in_degree()==0 ] g.set_vertex_filter(None) print 'connecting nodes orphaned from collapsing' while outer: v = outer.pop() print len(outer), '\r', p = v.in_neighbours().next() while rm[p]: q = p.in_neighbours().next() assert int(q) != int(p) p = q g.edge_in_taxonomy[g.add_edge(p, v)] = 1 print 'done' g.set_vertex_filter(rm, inverted=True) g.purge_vertices() g.purge_edges() 
g.clear_filters() for v in g.vertices(): if int(v): assert v.in_degree()==1
def generate_graph(): """ brew tap homebrew/science brew install graph-tool """ from graph_tool.all import price_network, sfdp_layout, graph_draw from graph_tool.all import dfs_search, DFSVisitor, seed_rng from numpy.random import seed class AnnotationVisitor(DFSVisitor): def __init__(self, pred, dist): self.pred = pred self.dist = dist self.roots = {} def tree_edge(self, e): depth = self.dist[e.source()] if depth == 1: genre = int(e.source()) if genre not in self.roots: self.roots[genre] = len(self.roots) else: genre = self.pred[e.source()] self.pred[e.target()] = genre self.dist[e.target()] = depth + 1 # For run-to-run stability, provide a constant seed: seed(SEED) seed_rng(SEED) print 'Generating graph...' g = price_network(2000) print 'Performing layout...' pos = sfdp_layout(g) print 'Adding depths...' dist = g.new_vertex_property("int") pred = g.new_vertex_property("int64_t") g.set_directed(False) visitor = AnnotationVisitor(pred, dist) dfs_search(g, g.vertex(0), visitor) print 'Iterating over verts...' 
flattened = [] maxp = [-9999, -9999] minp = [+9999, +9999] maxd = 0 for v in g.vertices(): root_id = pred.a[v] if root_id not in visitor.roots: continue x, y, z = pos[v].a[0], pos[v].a[1], visitor.roots[root_id] minp[0] = min(minp[0], x) minp[1] = min(minp[1], y) maxp[0] = max(maxp[0], x) maxp[1] = max(maxp[1], y) maxd = max(maxd, dist.a[v]) flattened += [x, y, z] print 'max depth is', maxd print 'nroots is', len(visitor.roots) print 'ncolors is', len(COLORS) extent = (maxp[0] - minp[0], maxp[1] - minp[1]) padding = extent[0] * PADDING_FRACTION minp[0] -= padding minp[1] -= padding maxp[0] += padding maxp[1] += padding scale = [1.0 / (maxp[0] - minp[0]), 1.0 / (maxp[1] - minp[1])] scale = min(scale[0], scale[1]) midp = [0.5 * (maxp[0] + minp[0]), 0.5 * (maxp[1] + minp[1])] flatarray = [] for v in g.vertices(): root_id = pred.a[v] if root_id not in visitor.roots: continue x, y, root = pos[v].a[0], pos[v].a[1], visitor.roots[root_id] x = (0.5 + (x - midp[0]) * scale) y = (0.5 + (y - midp[1]) * scale) prom = int(dist.a[v]) flatarray += [x, y, root, prom] return flatarray
def raf_components(G):
    """Return the vertex property map filled in by ``SCCVisitor``.

    An ``int`` vertex property is created on ``G``, passed to ``SCCVisitor``
    (defined elsewhere) together with the graph, and a depth-first search is
    run from vertex 0.  The meaning of the stored values is determined by
    ``SCCVisitor``; presumably they are component labels -- confirm there.
    """
    component_of = G.new_vertex_property("int")
    start = G.vertex(0)
    visitor = SCCVisitor(G, component_of)
    gt.dfs_search(G, start, visitor)
    return component_of
def graph_mis_dfs(G): mis = G.new_vertex_property("int") vis = MISVisitor(G,mis) gt.dfs_search(G, G.vertex(0), vis) print 'vis.mis_ancestor.a', vis.mis_ancestor.a return np.nonzero(np.array(mis.a))[0]