コード例 #1
0
    def traverse(taxv):
        """
        `taxv` is a vertex in the taxonomy graph. This function checks whether
        it is convex in `treegraph`; if yes, stores the info in
        `convex`; if no, it recursively checks descendants of `taxv` for
        convexity
        """
        tid = taxidsubg.vertex_taxid[taxv]
        print 'checking', tid, taxidsubg.vertex_name[taxv]
        p, c = color_vertices(g, treegraph, tid)
        if len(c)==1 and len(c[1])==1: # taxv/tid is convex
            print '...success'
            rv = c[1][0] # rv is the root of the convex subtree
            treegraph.set_vertex_filter(p)
            ## lvs = [ x for x in treegraph.vertices() if x.out_degree()==1 ]
            lvs = [ x for x in treegraph_leaves if p[x] ]
            if len(lvs) > 2:
                # we are only interested in convex subgraphs having
                # more than 2 leaves
                rootpaths = []
                for lf in lvs:
                    ti = treegraph.vertex_taxid[lf]
                    tv = taxidsubg.taxid_vertex[ti]
                    if not taxidsubg.incertae_sedis[tv]:
                        rootpaths.append(tg.taxid_rootpath(taxidsubg, ti))
                if rootpaths:
                    mrca = tg.rootpath_mrca(rootpaths)
                    print 'traverse: mrca', mrca
                    ancv = [taxidsubg.taxid_vertex[mrca]]
                    while ancv[-1] != taxv:
                        # STRANGE EDGE CASES HERE
                        try: ancv.append(ancv[-1].in_neighbours().next())
                        except StopIteration: pass

                    k = '.'.join([ str(taxidsubg.vertex_taxid[x])
                                   for x in ancv ])
                    convex[k] = (rv, p)
            treegraph.set_vertex_filter(None)
        else:
            treegraph.set_vertex_filter(None)
            for n in taxv.out_neighbours():
                traverse(n)
コード例 #2
0
ファイル: find-outliers.py プロジェクト: lcoghill/phyloboost
from ivy import treegraph as tg
from collections import Counter
from glob import glob
import numpy as np

ncbi_graph_file = 'ncbi.gt.gz' # taxonomy graph file
cluster_dir = '' # glob pattern matching the vsearch-produced cluster files
cutoff = 3 # number of mean absolute deviations from median

g = tg.load_taxonomy_graph(ncbi_graph_file)
for clustfile in glob(cluster_dir):
    # Header lines start with '>' and hold two '_'-separated fields, each
    # with a two-character prefix before its integer id.  Strip only the
    # line terminator: slicing off the last character unconditionally
    # would drop a digit when the final line has no trailing newline.
    with open(clustfile) as fh:
        headers = [ x[1:].rstrip('\r\n').split('_')
                    for x in fh if x.startswith('>') ]
    gi2ti = dict([ (int(a[2:]), int(b[2:])) for a,b in headers ])
    tis = sorted(set(gi2ti.values()))
    rootpaths = [ tg.taxid_rootpath(g, ti) for ti in tis ]

    # number of rootpaths in this cluster that pass through each taxid
    counts = Counter()
    for rp in rootpaths:
        for ti in rp:
            counts[ti] += 1

    def f(rp):
        'steps to most recent common ancestor of any other ti in the cluster'
        # Index of the first taxid on `rp` shared with another rootpath;
        # if none is shared this is the last index, and 0 for an empty
        # rootpath (which previously raised UnboundLocalError).
        i = 0
        for i, ti in enumerate(rp):
            if counts[ti]>1:
                break
        return i

    steps = [ f(rp) for rp in rootpaths ]
    median = np.median(steps)
    absdev = [ abs(x-median) for x in steps ]
コード例 #3
0
def proc(g, line, merged, probfile, outfile):
    pbtree, s = line.split()
    print 'processing', pbtree
    r = ivy.newick.parse(s) # the root node of the tree of interest
    lvs = r.leaves()
    rps = [] # rootpaths of leaf nodes, where each rootpath is a list
             # of taxids from leaf to root
    leaf_tid_counts = tg.Counter()
    try:
        for lf in lvs:
            # assign/compute attributes of leaves
            w = lf.label.split('_')
            lf.gi = int(w[-2][2:])
            lf.taxid = int(w[-1][2:])
            leaf_tid_counts[lf.taxid] += 1
            if lf.taxid not in g.taxid_vertex and lf.taxid in merged:
                lf.taxid = merged[lf.taxid]
            ## lf.taxv = g.taxid_vertex[lf.taxid]
            taxv = g.taxid_vertex[lf.taxid]
            lf.taxid_next, lf.taxid_back = g.hindex[taxv]
            lf.taxid_rootpath = tg.taxid_rootpath(g, lf.taxid)
            for i, x in enumerate(lf.taxid_rootpath):
                if x not in g.taxid_vertex and x in merged:
                    lf.taxid_rootpath[i] = merged[x]
            rps.append(lf.taxid_rootpath)
    except:
        print '!!! problem assigning leaf taxids'
        probfile.write('%s\n' % pbtree)
        #return []

    r.mrca = tg.rootpath_mrca(rps) # taxid of mrca of all tree's leaves

    taxids = set()
    for rp in rps:
        # trim rootpaths: make them terminate with mrca
        while 1:
            if rp[-1] == r.mrca: break
            else: rp.pop()
        assert rp
        taxids.update(rp)

    # create a taxonomy (sub)graph of only those taxids represented in r
    ## taxidsubg = tg.taxid_subgraph(g, taxids)
    taxidsubg = tg.taxid_new_subgraph(g, taxids)
    taxidsubg.vfilt = taxidsubg.new_vertex_property('bool')

    ## r.taxv = taxidsubg.taxid_vertex[r.mrca]

    # no need to check for convexity for singleton tip taxa
    for x in [ taxidsubg.taxid_vertex[lf.taxid] for lf in lvs
               if leaf_tid_counts[lf.taxid]==1 ]:
        taxidsubg.vfilt[x] = 0
    
    # an undirected graph having the same topology as r, used for
    # checking convexity of taxa
    treegraph = tg.gt.Graph(directed=False)
    treegraph.mrca = r.mrca
    print 'mrca:', g.taxid_name(r.mrca)
    treegraph.vertex_taxid = tg.get_or_create_vp(treegraph, 'taxid', 'int')
    ## treegraph.vertex_taxv = tg.get_or_create_vp(treegraph, 'taxv', 'int')
    v2lf = {}
    N = len(r)
    verts = treegraph.add_vertex(N)
    for n in r: # for each node in r
        # store its treegraph vertex
        n.v = verts.next()
        if not n.children:
            treegraph.vertex_taxid[n.v] = n.taxid
            ## treegraph.vertex_taxv[n.v] = int(n.taxv)
            v2lf[n.v] = n
        if n.parent:
            treegraph.add_edge(n.parent.v, n.v)

    treegraph_leaves = [ x for x in treegraph.vertices() if x.out_degree()==1 ]
    convex = {} # for storing the convex subgraphs
    def traverse(taxv):
        """
        `taxv` is a vertex in the taxonomy graph. This function checks whether
        it is convex in `treegraph`; if yes, stores the info in
        `convex`; if no, it recursively checks descendants of `taxv` for
        convexity
        """
        tid = taxidsubg.vertex_taxid[taxv]
        print 'checking', tid, taxidsubg.vertex_name[taxv]
        p, c = color_vertices(g, treegraph, tid)
        if len(c)==1 and len(c[1])==1: # taxv/tid is convex
            print '...success'
            rv = c[1][0] # rv is the root of the convex subtree
            treegraph.set_vertex_filter(p)
            ## lvs = [ x for x in treegraph.vertices() if x.out_degree()==1 ]
            lvs = [ x for x in treegraph_leaves if p[x] ]
            if len(lvs) > 2:
                # we are only interested in convex subgraphs having
                # more than 2 leaves
                rootpaths = []
                for lf in lvs:
                    ti = treegraph.vertex_taxid[lf]
                    tv = taxidsubg.taxid_vertex[ti]
                    if not taxidsubg.incertae_sedis[tv]:
                        rootpaths.append(tg.taxid_rootpath(taxidsubg, ti))
                if rootpaths:
                    mrca = tg.rootpath_mrca(rootpaths)
                    print 'traverse: mrca', mrca
                    ancv = [taxidsubg.taxid_vertex[mrca]]
                    while ancv[-1] != taxv:
                        # STRANGE EDGE CASES HERE
                        try: ancv.append(ancv[-1].in_neighbours().next())
                        except StopIteration: pass

                    k = '.'.join([ str(taxidsubg.vertex_taxid[x])
                                   for x in ancv ])
                    convex[k] = (rv, p)
            treegraph.set_vertex_filter(None)
        else:
            treegraph.set_vertex_filter(None)
            for n in taxv.out_neighbours():
                traverse(n)

    for v in taxidsubg.root.out_neighbours(): traverse(v)
    ## print 'done'

    def make_newick(root, seen):
        children = [ x for x in root.out_neighbours() if x not in seen ]
        if children:
            seen.update(children)
            s = '(%s)' % ','.join(
                [ make_newick(c, seen) for c in children ]
                )
        else:
            s = v2lf[root].label.replace(',','').replace('(','').replace(')','')
        return s
        
    newicks = []

    for k, (root, p) in convex.items():
        treegraph.set_vertex_filter(p)
        s = make_newick(root, set([root]))
        treegraph.set_vertex_filter(None)
        names = ','.join([ g.taxid_name(int(x)) for x in k.split('.') ])
        outfile.write('%s\t%s\t%s\t%s;\n' % (pbtree, k, names, s))
        print 'wrote subtree:', names

    for n in r.postiter():
        n.parent = None; del n.children
コード例 #4
0
        strees.append(r)
        i += 1

# Map each tree's `stree` value to an entry of tg.color20; the index
# wraps modulo 20, so colours repeat when there are more than 20 trees.
stree2color = {}
for i, r in enumerate(strees):
    stree2color[r.stree] = tg.color20[i % 20]

# Gather every taxid that lies on the rootpath of any leaf of any tree.
taxids = set()
for r in strees:
    # NOTE(review): map_stree presumably sets `taxid_rootpath` on the
    # leaves and `taxid` on the root, both read below -- confirm in tg.
    tg.map_stree(g, r)
    for lf in r.leaves():
        taxids.update(lf.taxid_rootpath)

# If the trees do not all share one root taxid, also add the part of
# each root taxon's rootpath that runs from its start through the mrca
# of those rootpaths.
root_taxa = set([r.taxid for r in strees])
if len(root_taxa) > 1:
    rps = [tg.taxid_rootpath(g, x) for x in root_taxa]
    mrca_taxid = tg.rootpath_mrca(rps)
    for x in rps:
        # slice ends at (and includes) the mrca's position in the path
        taxids.update(x[: x.index(mrca_taxid) + 1])

# Taxonomy subgraph built from the collected taxids.
taxg = tg.taxid_new_subgraph(g, taxids)

# Boolean vertex/edge properties handed to merge_stree for every tree
# and then used as the filters of the graph view below.
verts = taxg.new_vertex_property("bool")
edges = taxg.new_edge_property("bool")
for r in strees:
    tg.merge_stree(taxg, r, r.stree, verts, edges)

root = taxg.root

# View of taxg filtered by the vertex and edge properties above.
gv = tg.graph_view(taxg, vfilt=verts, efilt=edges)
コード例 #5
0
        strees.append(r)
        i += 1

# Map each tree's `stree` value to an entry of tg.color20; the index
# wraps modulo 20, so colours repeat when there are more than 20 trees.
stree2color = {}
for i, r in enumerate(strees):
    stree2color[r.stree] = tg.color20[i % 20]

# Gather every taxid that lies on the rootpath of any leaf of any tree.
taxids = set()
for r in strees:
    # NOTE(review): map_stree presumably sets `taxid_rootpath` on the
    # leaves and `taxid` on the root, both read below -- confirm in tg.
    tg.map_stree(g, r)
    for lf in r.leaves():
        taxids.update(lf.taxid_rootpath)

# If the trees do not all share one root taxid, also add the part of
# each root taxon's rootpath that runs from its start through the mrca
# of those rootpaths.
root_taxa = set([r.taxid for r in strees])
if len(root_taxa) > 1:
    rps = [tg.taxid_rootpath(g, x) for x in root_taxa]
    mrca_taxid = tg.rootpath_mrca(rps)
    for x in rps:
        # slice ends at (and includes) the mrca's position in the path
        taxids.update(x[:x.index(mrca_taxid) + 1])

# Taxonomy subgraph built from the collected taxids.
taxg = tg.taxid_new_subgraph(g, taxids)

# Boolean vertex/edge properties handed to merge_stree for every tree
# and then used as the filters of the graph view below.
verts = taxg.new_vertex_property('bool')
edges = taxg.new_edge_property('bool')
for r in strees:
    tg.merge_stree(taxg, r, r.stree, verts, edges)

root = taxg.root

# View of taxg filtered by the vertex and edge properties above.
gv = tg.graph_view(taxg, vfilt=verts, efilt=edges)