Exemple #1
0
def calc_weights(text, keyword):
    """Assign a weight to every word of ``text`` relative to ``keyword``.

    ``tokenize`` is applied to both arguments first. If that leaves no
    keyword, every word simply gets the weight 1.0.

    Otherwise ``create_mst`` supplies a graph and a root vertex
    (presumably a minimum spanning tree rooted at the keyword -- confirm
    against ``create_mst``). The root starts with weight 1.0 and a
    depth-first search hands weights on: a target vertex that has no
    weight yet receives its source's weight times ``1.0 - edge weight``.

    Finally, words whose weight is at or below ``EPSILON`` are lifted to
    a floor value: the smallest strictly positive weight raised to 1.5.

    Returns a dict mapping each word to its weight.
    """
    words, keyword = tokenize(text, keyword)
    if not keyword:
        return dict.fromkeys(words, 1.0)

    tree, root = create_mst(words, keyword)
    vertex_weight = tree.new_vertex_property('double')
    vertex_weight[root] = 1.0
    edge_weight = tree.ep['weights']

    class WeightPropagator(gt.DFSVisitor):
        def examine_edge(self, edge):
            target = edge.target()
            # A weight of zero doubles as the "not weighted yet" marker.
            if vertex_weight[target]:
                return
            vertex_weight[target] = (
                vertex_weight[edge.source()] * (1.0 - edge_weight[edge]))

    gt.dfs_search(tree, root, WeightPropagator())

    word_of = tree.vp['words']
    weights = dict((word_of[v], vertex_weight[v]) for v in tree.vertices())
    floor = min(w for w in weights.values() if w > 0) ** 1.5
    for word, weight in list(weights.items()):
        if weight <= EPSILON:
            weights[word] = floor
    return weights
def _filter(g):
    """Flag taxa that are not real clades in ``g.collapsed`` and hide them.

    Only vertices with at least one out-edge are examined; the decision
    is based on the lower-cased vertex name:

    * if one of its whitespace-separated words is a removal or an
      incertae keyword, a DFS is started at the vertex with a
      ``Traverser`` whose ``post`` callback sets ``g.collapsed``
      (presumably flagging the whole subtree -- confirm in ``Traverser``);
    * if the name, with hyphens replaced by spaces, starts with one of
      the collapse prefixes, only the vertex itself is flagged.

    Afterwards every visible vertex other than vertex 0 (assumed to be
    the root) that lost its visible parent gets a new edge from its
    nearest unflagged ancestor; that edge is marked in
    ``g.edge_in_taxonomy``. The vertex filter is left active on return,
    and each visible non-root vertex is asserted to have in-degree 1.
    """
    # Higher taxa whose names contain one of these words; the code below
    # treats them exactly like the removal keywords.
    incertae_words = [
        'endophyte', 'scgc', 'libraries', 'samples', 'metagenome',
        'unclassified', 'other', 'unidentified', 'mitosporic', 'uncultured',
        'incertae', 'environmental',
    ]

    # Taxa that are not clades and should be collapsed: their children
    # are linked to the parent of the collapsed node.
    collapse_prefixes = ('basal ', 'stem ', 'early diverging ')

    # Higher taxa that should be removed along with all of their children.
    removal_words = ['viroids', 'virus', 'viruses', 'viral', 'artificial']

    print('removing')
    collapsed = g.collapsed

    def mark(x):
        collapsed[x] = 1

    marker = Traverser(post=mark)
    for v in g.vertices():
        if not v.out_degree():
            continue
        name = g.vertex_name[v].lower()
        tokens = name.split()

        if any(word in tokens for word in removal_words):
            gt.dfs_search(g, v, marker)

        if any(word in tokens for word in incertae_words):
            gt.dfs_search(g, v, marker)

        if name.replace('-', ' ').startswith(collapse_prefixes):
            collapsed[v] = 1

    # With the flagged vertices hidden, orphans show up as non-root
    # vertices without any incoming edge (root is assumed to be vertex 0).
    g.set_vertex_filter(collapsed, inverted=True)
    orphans = [v for v in g.vertices() if int(v) and v.in_degree() == 0]
    g.set_vertex_filter(None)

    for orphan in orphans:
        # Climb through flagged ancestors until an unflagged one is found.
        ancestor = next(orphan.in_neighbours())
        while collapsed[ancestor]:
            ancestor = next(ancestor.in_neighbours())
        g.edge_in_taxonomy[g.add_edge(ancestor, orphan)] = 1
    print('done')

    g.set_vertex_filter(collapsed, inverted=True)

    for v in g.vertices():
        if int(v):
            assert v.in_degree() == 1
Exemple #3
0
 def _do_visit(self, graph_view):
     """Traverse ``graph_view`` in the configured order, then finalize.

     ``self.order`` picks the search: ``"bfs"`` runs ``bfs_search`` and
     ``"dfs"`` runs ``dfs_search``, each with this object as the visitor.
     Any other value raises ``ValueError``. Returns the result of
     ``self.finalize(graph_view)``.
     """
     order = self.order
     if order not in ("bfs", "dfs"):
         raise ValueError("Invalid visit order %s" % order)
     if order == "bfs":
         bfs_search(graph_view, visitor=self)
     else:
         dfs_search(graph_view, visitor=self)
     return self.finalize(graph_view)
Exemple #4
0
def descendents(G, S):
    """Return the indices of the vertices a DFS started from ``S`` marks.

    A temporary vertex with an edge to every vertex in ``S`` is added to
    ``G`` so that one depth-first search covers all of ``S``. The search
    runs with ``RADSVisitor(visited)``; the result is the set of vertex
    indices whose ``visited`` entry is non-zero afterwards (which vertices
    get marked is decided by ``RADSVisitor``).

    The temporary vertex is removed before returning. This also happens
    when adding the edges or running the search raises, so ``G`` is never
    left with the extra vertex attached.
    """
    helper = G.add_vertex()
    try:
        for v in S:
            G.add_edge(helper, v)
        visited = G.new_vertex_property("int")
        gt.dfs_search(G, helper, RADSVisitor(visited))
    finally:
        # Undo the temporary modification of the caller's graph.
        G.remove_vertex(helper)
    return set(np.nonzero(np.array(visited.a))[0])
Exemple #5
0
def prune_to_clade(g, v):
    """
    shrink taxonomy graph g, in place, to vertex v and the vertices a
    depth-first search from v discovers

    important: g is assumed to be *unfiltered*
    (create_*_taxonomy_graph now returns an unfiltered graph)

    afterwards the graph is re-indexed via index_graph, g.taxid_vertex is
    rebuilt from g.vertex_taxid, and g.root is set to the vertex that
    carries the taxid v had on entry
    """
    clade_taxid = g.vertex_taxid[v]
    keep = g.new_vertex_property('bool')

    class CladeMarker(gt.DFSVisitor):
        def discover_vertex(self, u):
            keep[u] = True

    gt.dfs_search(g, v, CladeMarker())

    # Drop everything the search did not reach, then lift the filter.
    g.set_vertex_filter(keep)
    g.purge_vertices()
    g.purge_edges()
    g.clear_filters()

    index_graph(g, reindex=True)
    g.taxid_vertex = dict(
        (g.vertex_taxid[vertex], vertex) for vertex in g.vertices())
    g.root = g.taxid_vertex[clade_taxid]
def _filter(g):
    """Hide taxa that are not real clades and re-attach what they orphan.

    Each vertex with outgoing edges is classified by its lower-cased
    name. A name containing a removal or incertae keyword as a separate
    word triggers a DFS from that vertex using a ``Traverser`` whose
    ``post`` callback flags vertices in ``g.collapsed`` (presumably the
    entire subtree -- confirm in ``Traverser``). A name that, after
    replacing hyphens with spaces, begins with a collapse prefix flags
    just that vertex.

    Visible vertices other than vertex 0 (assumed root) that end up
    without a visible parent are then linked to their closest unflagged
    ancestor by a new edge marked in ``g.edge_in_taxonomy``. On return
    the inverted ``g.collapsed`` filter is active and every visible
    non-root vertex has been asserted to have exactly one in-edge.
    """
    # Words marking higher taxa of uncertain placement; in this version
    # they are handled the same way as the removal words.
    uncertain = frozenset([
        'endophyte', 'scgc', 'libraries', 'samples', 'metagenome',
        'unclassified', 'other', 'unidentified', 'mitosporic', 'uncultured',
        'incertae', 'environmental',
    ])

    # Words marking higher taxa to be removed with all of their children.
    unwanted = frozenset(
        ['viroids', 'virus', 'viruses', 'viral', 'artificial'])

    # Name prefixes of taxa that are not clades; only the node itself is
    # collapsed and its children are linked to its parent.
    non_clade_prefixes = ('basal ', 'stem ', 'early diverging ')

    print('removing')
    flagged = g.collapsed

    def flag(x):
        flagged[x] = 1

    subtree_flagger = Traverser(post=flag)
    inner_vertices = (u for u in g.vertices() if u.out_degree())
    for v in inner_vertices:
        name = g.vertex_name[v].lower()
        words = set(name.split())

        if not words.isdisjoint(unwanted):
            gt.dfs_search(g, v, subtree_flagger)

        if not words.isdisjoint(uncertain):
            gt.dfs_search(g, v, subtree_flagger)

        dehyphenated = name.replace('-', ' ')
        if dehyphenated.startswith(non_clade_prefixes):
            flagged[v] = 1

    # assume root == vertex 0
    g.set_vertex_filter(flagged, inverted=True)
    parentless = [
        u for u in g.vertices() if int(u) and u.in_degree() == 0]
    g.set_vertex_filter(None)

    for child in parentless:
        parent = next(child.in_neighbours())
        while flagged[parent]:
            parent = next(parent.in_neighbours())
        new_edge = g.add_edge(parent, child)
        g.edge_in_taxonomy[new_edge] = 1
    print('done')

    g.set_vertex_filter(flagged, inverted=True)

    for u in g.vertices():
        if int(u):
            assert u.in_degree() == 1
Exemple #7
0
    p = np.sum(mask)/mask.shape[0]
    vals = mask
else:
    vals = binomial(1,p,num_bonds).astype('bool')

# Filter the graph's edges through a boolean edge property built from vals.
prop = g.new_edge_property('bool',vals=vals)
g.set_edge_filter(prop)

# Add two auxiliary vertices, addressed below as num_vert and num_vert+1.
# The first is linked to vertices 0 .. Lx*Ly-1, the second to vertices
# l-Lx*Ly .. l-1 (presumably the two opposite faces of the lattice --
# confirm against the code that builds g).
num_vert = g.num_vertices()
g.add_vertex(2)
g.add_edge_list([[num_vert,n] for n in range(Lx*Ly)])
g.add_edge_list([[num_vert+1,n] for n in range(l-Lx*Ly,l)])

# Depth-first search from the first auxiliary vertex. MyVisitor is handed
# vcc_path; presumably it records the index of every vertex it reaches --
# confirm in MyVisitor.
log.info("checking for percolation")
vcc_path = set()
gt.dfs_search(g,num_vert,visitor=MyVisitor(vcc_path))

# The second auxiliary vertex being in vcc_path is taken as percolation;
# in that case it is dropped from the set again.
if not num_vert+1 in vcc_path:
    log.warn("not percolating")
else:
    vcc_path.remove(num_vert+1)

# set.remove raises KeyError if num_vert is absent, so this relies on the
# visitor having recorded the start vertex.
vcc_path.remove(num_vert)


# Remove the auxiliary vertices again, higher index first.
g.remove_vertex(num_vert+1)
g.remove_vertex(num_vert)

# Indices i in [Lx*Ly, 2*Lx*Ly] that are in vcc_path and are found among
# the neighbours of vertex i-Lx*Ly.
# NOTE(review): the membership test compares an int with the objects
# yielded by all_neighbours(); confirm that comparison behaves as intended.
plane = [i for i in range(Lx*Ly,2*Lx*Ly+1) if i in g.vertex(i-Lx*Ly).all_neighbours() and i in vcc_path]

# Indices of the first Lx*Ly vertices.
top = range(Lx*Ly)
def generate_graph():
    """Build a random graph and return flattened, normalised vertex data.

    Requires graph-tool, e.g.:

        brew tap homebrew/science
        brew install graph-tool

    A 2000-vertex ``price_network`` is generated and laid out with
    ``sfdp_layout`` (both seeded with ``SEED``). The graph is then made
    undirected and searched depth-first from vertex 0 to record, for each
    reached vertex, its depth and the depth-1 vertex it descends from.

    Only vertices that descend from a registered depth-1 vertex are
    emitted (so vertex 0, the depth-1 vertices themselves and unreached
    vertices are skipped). Their positions are scaled uniformly, using the
    larger of the two padded extents, and centred on 0.5.

    Returns a flat list ``[x, y, root, depth, x, y, root, depth, ...]``
    where ``root`` is the ordinal assigned to the vertex's depth-1
    ancestor.
    """
    from graph_tool.all import price_network, sfdp_layout
    from graph_tool.all import dfs_search, DFSVisitor, seed_rng
    from numpy.random import seed

    class AnnotationVisitor(DFSVisitor):
        """Track DFS depth and the depth-1 ancestor of each tree vertex."""

        def __init__(self, pred, dist):
            self.pred = pred
            self.dist = dist
            # Vertex index of each depth-1 ancestor -> ordinal, assigned in
            # the order the ancestors are first seen as an edge source.
            self.roots = {}

        def tree_edge(self, e):
            depth = self.dist[e.source()]
            if depth == 1:
                genre = int(e.source())
                if genre not in self.roots:
                    self.roots[genre] = len(self.roots)
            else:
                # Deeper vertices inherit the ancestor of their parent.
                genre = self.pred[e.source()]
            self.pred[e.target()] = genre
            self.dist[e.target()] = depth + 1

    # For run-to-run stability, provide a constant seed:
    seed(SEED)
    seed_rng(SEED)

    print('Generating graph...')
    g = price_network(2000)

    print('Performing layout...')
    pos = sfdp_layout(g)

    print('Adding depths...')
    dist = g.new_vertex_property("int")
    pred = g.new_vertex_property("int64_t")
    g.set_directed(False)
    visitor = AnnotationVisitor(pred, dist)
    dfs_search(g, g.vertex(0), visitor)

    print('Iterating over verts...')
    # Select the vertices once, collecting everything the output needs,
    # while tracking the bounding box of their positions. Infinite
    # sentinels keep the box correct for coordinates of any magnitude.
    infinity = float('inf')
    minp = [infinity, infinity]
    maxp = [-infinity, -infinity]
    maxd = 0
    selected = []
    for v in g.vertices():
        root_id = pred.a[v]
        if root_id not in visitor.roots:
            continue
        x, y = pos[v].a[0], pos[v].a[1]
        depth = dist.a[v]
        minp[0] = min(minp[0], x)
        minp[1] = min(minp[1], y)
        maxp[0] = max(maxp[0], x)
        maxp[1] = max(maxp[1], y)
        maxd = max(maxd, depth)
        selected.append((x, y, visitor.roots[root_id], int(depth)))

    print('max depth is %s' % (maxd,))
    print('nroots is %s' % (len(visitor.roots),))
    print('ncolors is %s' % (len(COLORS),))

    # The same padding, derived from the horizontal extent, is applied on
    # all four sides.
    padding = (maxp[0] - minp[0]) * PADDING_FRACTION
    minp[0] -= padding
    minp[1] -= padding
    maxp[0] += padding
    maxp[1] += padding
    scale = min(1.0 / (maxp[0] - minp[0]), 1.0 / (maxp[1] - minp[1]))
    midp = [0.5 * (maxp[0] + minp[0]), 0.5 * (maxp[1] + minp[1])]

    flatarray = []
    for x, y, root, prom in selected:
        x = 0.5 + (x - midp[0]) * scale
        y = 0.5 + (y - midp[1]) * scale
        flatarray += [x, y, root, prom]
    return flatarray
Exemple #9
0
def dfs_tree(g):
    """Return a view of ``g`` limited to edges flagged by a DFS from vertex 0.

    A fresh boolean edge property is handed to ``TreeVisitor`` (presumably
    it flags the tree edges of the search -- confirm in ``TreeVisitor``)
    and then used as the edge filter of the returned ``GraphView``.
    """
    in_tree = g.new_edge_property("bool")
    start = g.vertex(0)
    gt.dfs_search(g, start, TreeVisitor(in_tree))
    return gt.GraphView(g, efilt=in_tree)
Exemple #10
0
def _filter(g):
    """Purge taxa that are not real clades from ``g``, re-attaching orphans.

    Each vertex with outgoing edges is classified by its lower-cased name
    (all keywords are therefore lower-case):

    * a removal keyword appearing as a separate word starts a DFS from the
      vertex with a ``Traverser`` whose ``post`` callback flags vertices in
      ``g.collapsed`` (presumably the whole subtree -- confirm in
      ``Traverser``);
    * an incertae keyword appearing as a separate word flags the vertex
      itself and marks its direct children in ``g.incertae_sedis``;
    * a name that, with hyphens replaced by spaces, starts with a collapse
      prefix flags the vertex itself.

    Vertices set in the optional ``hidden`` vertex property are flagged as
    well. Every visible vertex other than vertex 0 (assumed root) that
    lost its visible parent is then linked to its nearest unflagged
    ancestor by a new edge marked in ``g.edge_in_taxonomy``. Finally the
    flagged vertices are purged, filters are cleared, and every non-root
    vertex is asserted to have exactly one in-edge.
    """
    import sys

    # higher taxa that should be removed (nodes collapsed), and their
    # immediate children flagged incertae sedis and linked back to
    # the parent of collapsed node
    incertae_keywords = [
        'endophyte', 'scgc', 'samples', 'metagenome', 'unclassified',
        'other', 'unidentified', 'mitosporic', 'uncultured', 'incertae',
        'environmental']

    # taxa that are not clades, and should be removed (collapsed) -
    # children linked to parent of collapsed node
    collapse_keywords = ('basal ', 'stem ', 'early diverging ')

    # higher taxa that should be removed along with all of their children;
    # 'est' must be lower-case because names are lower-cased before the
    # comparison (an upper-case keyword could never match)
    remove_keywords = ['viroids', 'virus', 'viruses', 'viral', 'artificial',
                       'phage', 'plasmid', 'plasmids', 'vector', 'vectors',
                       'recombinant', 'synthetic', 'cloning', 'est', 'mixed',
                       'library', 'libraries', 'transposons', 'midivariant',
                       'sequences', 'enrichment', 'miscellaneous']

    logging.info('removing vertices that are not real taxa (clades)')
    rm = g.collapsed

    def mark_collapsed(x):
        rm[x] = 1

    T = Traverser(post=mark_collapsed)
    for v in g.vertices():
        if not v.out_degree():
            continue
        name = g.vertex_name[v].lower()
        words = name.split()

        if any(kw in words for kw in remove_keywords):
            print('remove: %s' % name)
            gt.dfs_search(g, v, T)

        if any(kw in words for kw in incertae_keywords):
            print('incertae sedis: %s' % name)
            rm[v] = 1
            for c in v.out_neighbours():
                g.incertae_sedis[c] = 1

        if name.replace('-', ' ').startswith(collapse_keywords):
            print('collapse: %s' % name)
            rm[v] = 1

    # Explicit None check: the decision must not depend on the truth value
    # of the property map object itself.
    hidden = g.vp.get('hidden')
    if hidden is not None:
        for i in hidden.a.nonzero()[0]:
            rm[g.vertex(i)] = 1

    g.set_vertex_filter(rm, inverted=True)
    # assume root == vertex 0
    outer = [v for v in g.vertices() if int(v) and v.in_degree() == 0]
    g.set_vertex_filter(None)

    print('connecting nodes orphaned from collapsing')
    while outer:
        v = outer.pop()
        # Progress counter, rewritten in place on the same terminal line.
        sys.stdout.write('%d \r' % len(outer))
        p = next(v.in_neighbours())
        while rm[p]:
            q = next(p.in_neighbours())
            # Guard against a self-loop, which would never terminate.
            assert int(q) != int(p)
            p = q
        g.edge_in_taxonomy[g.add_edge(p, v)] = 1
    print('done')

    g.set_vertex_filter(rm, inverted=True)
    g.purge_vertices()
    g.purge_edges()
    g.clear_filters()

    for v in g.vertices():
        if int(v):
            assert v.in_degree() == 1
def generate_graph():
    """Build a random graph and return flattened, normalised vertex data.

    Requires graph-tool, e.g.:

        brew tap homebrew/science
        brew install graph-tool

    A 2000-vertex ``price_network`` is generated and laid out with
    ``sfdp_layout`` (both seeded with ``SEED``). The graph is then made
    undirected and searched depth-first from vertex 0 to record, for each
    reached vertex, its depth and the depth-1 vertex it descends from.

    Only vertices that descend from a registered depth-1 vertex are
    emitted (so vertex 0, the depth-1 vertices themselves and unreached
    vertices are skipped). Their positions are scaled uniformly, using the
    larger of the two padded extents, and centred on 0.5.

    Returns a flat list ``[x, y, root, depth, x, y, root, depth, ...]``
    where ``root`` is the ordinal assigned to the vertex's depth-1
    ancestor.
    """
    from graph_tool.all import price_network, sfdp_layout
    from graph_tool.all import dfs_search, DFSVisitor, seed_rng
    from numpy.random import seed

    class AnnotationVisitor(DFSVisitor):
        """Track DFS depth and the depth-1 ancestor of each tree vertex."""

        def __init__(self, pred, dist):
            self.pred = pred
            self.dist = dist
            # Vertex index of each depth-1 ancestor -> ordinal, assigned in
            # the order the ancestors are first seen as an edge source.
            self.roots = {}

        def tree_edge(self, e):
            depth = self.dist[e.source()]
            if depth == 1:
                genre = int(e.source())
                if genre not in self.roots:
                    self.roots[genre] = len(self.roots)
            else:
                # Deeper vertices inherit the ancestor of their parent.
                genre = self.pred[e.source()]
            self.pred[e.target()] = genre
            self.dist[e.target()] = depth + 1

    # For run-to-run stability, provide a constant seed:
    seed(SEED)
    seed_rng(SEED)

    print('Generating graph...')
    g = price_network(2000)

    print('Performing layout...')
    pos = sfdp_layout(g)

    print('Adding depths...')
    dist = g.new_vertex_property("int")
    pred = g.new_vertex_property("int64_t")
    g.set_directed(False)
    visitor = AnnotationVisitor(pred, dist)
    dfs_search(g, g.vertex(0), visitor)

    print('Iterating over verts...')
    # Select the vertices once, collecting everything the output needs,
    # while tracking the bounding box of their positions. Infinite
    # sentinels keep the box correct for coordinates of any magnitude.
    infinity = float('inf')
    minp = [infinity, infinity]
    maxp = [-infinity, -infinity]
    maxd = 0
    selected = []
    for v in g.vertices():
        root_id = pred.a[v]
        if root_id not in visitor.roots:
            continue
        x, y = pos[v].a[0], pos[v].a[1]
        depth = dist.a[v]
        minp[0] = min(minp[0], x)
        minp[1] = min(minp[1], y)
        maxp[0] = max(maxp[0], x)
        maxp[1] = max(maxp[1], y)
        maxd = max(maxd, depth)
        selected.append((x, y, visitor.roots[root_id], int(depth)))

    print('max depth is %s' % (maxd,))
    print('nroots is %s' % (len(visitor.roots),))
    print('ncolors is %s' % (len(COLORS),))

    # The same padding, derived from the horizontal extent, is applied on
    # all four sides.
    padding = (maxp[0] - minp[0]) * PADDING_FRACTION
    minp[0] -= padding
    minp[1] -= padding
    maxp[0] += padding
    maxp[1] += padding
    scale = min(1.0 / (maxp[0] - minp[0]), 1.0 / (maxp[1] - minp[1]))
    midp = [0.5 * (maxp[0] + minp[0]), 0.5 * (maxp[1] + minp[1])]

    flatarray = []
    for x, y, root, prom in selected:
        x = 0.5 + (x - midp[0]) * scale
        y = 0.5 + (y - midp[1]) * scale
        flatarray += [x, y, root, prom]
    return flatarray
Exemple #12
0
def raf_components(G):
    """Run a DFS from vertex 0 with an ``SCCVisitor`` and return its labels.

    A fresh integer vertex property is passed to ``SCCVisitor`` together
    with ``G``; whatever the visitor writes into it during the search is
    returned (presumably a component label per vertex -- confirm in
    ``SCCVisitor``).
    """
    component_of = G.new_vertex_property("int")
    start = G.vertex(0)
    visitor = SCCVisitor(G, component_of)
    gt.dfs_search(G, start, visitor)
    return component_of
Exemple #13
0
def graph_mis_dfs(G):
    """Run a DFS from vertex 0 with a ``MISVisitor`` and return marked indices.

    The visitor receives ``G`` and a fresh integer vertex property. After
    the search its ``mis_ancestor`` array is printed for inspection, and
    the indices of all vertices with a non-zero entry in the property are
    returned as a NumPy array (presumably the members of an independent
    set -- confirm in ``MISVisitor``).
    """
    in_mis = G.new_vertex_property("int")
    visitor = MISVisitor(G, in_mis)
    start = G.vertex(0)
    gt.dfs_search(G, start, visitor)
    print('vis.mis_ancestor.a %s' % (visitor.mis_ancestor.a,))
    flags = np.array(in_mis.a)
    return np.nonzero(flags)[0]