Example #1
0
def searchRegraft(conf, distmat, labels, stree, gene2species, params,
                  initTree=None, visited=None, proposeFunc=proposeTree2):
    """Search tree space by repeatedly applying regraft proposals.

    The search starts from initTree when one is given.  Otherwise a
    neighbor-joining tree is built from distmat/labels, rooted with
    phylo.reconRoot and given branch lengths by Spidir.setTreeDistances.
    The current tree is then replaced conf["regrafts"] times by the result
    of proposeTree3.

    Arguments:
    conf        -- settings dict; conf["regrafts"] is the iteration count
    visited     -- dict of already evaluated trees, handed on to addVisited,
                   printMCMC and proposeTree3; a new dict is used if omitted
    proposeFunc -- NOTE(review): accepted but never used; the loop always
                   calls proposeTree3.  Confirm which proposal function is
                   intended before wiring this argument in.

    Returns the tuple (tree, tree.data["logl"]) for the tree held after the
    last iteration.
    """
    if visited is None:
        visited = {}

    # init with NJ
    if initTree is not None:
        tree = initTree
    else:
        tree = phylo.neighborjoin(distmat, labels)
        tree = phylo.reconRoot(tree, stree, gene2species)
        Spidir.setTreeDistances(conf, tree, distmat, labels)

    # init likelihood score
    logl = Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params)

    # store tree in visited
    addVisited(conf, visited, tree, gene2species)

    # show initial tree
    printMCMC(conf, 0, tree, stree, gene2species, visited)

    for i in xrange(conf["regrafts"]):
        tree = proposeTree3(conf, tree, distmat, labels,
                            stree, gene2species, params, visited)

        # only report trees that beat the best score printed so far
        if tree.data["logl"] > logl:
            printMCMC(conf, i, tree, stree, gene2species, visited)
            logl = tree.data["logl"]

    return tree, tree.data["logl"]
Example #2
0
 def propose(chain, tree):
     """Propose a successor of tree for chain and return (tree, logl).

     Relies on names from the enclosing scope: conf, distmat, labels, stree,
     gene2species, params, visited, proposeFunc and the shared state object
     this (toplogl, toptree, nold, iter).
     """
     candidate = proposeFunc(conf, tree, distmat, labels,
                             stree, gene2species, params, visited)

     # score the candidate, reusing the cached entry when its hash is known
     thash = phylo.hash_tree(candidate)
     if thash not in visited:
         Spidir.setTreeDistances(conf, candidate, distmat, labels)
         logl = Spidir.treeLogLikelihood(conf, candidate, stree,
                                         gene2species, params)
         this.nold = 0
     else:
         # the cached tree object replaces the freshly proposed one
         logl, candidate, _count = visited[thash]
         #this.nold += 1

     addVisited(conf, visited, candidate, gene2species, thash)

     # best yet tree
     if logl > this.toplogl:
         label = "%d:%d" % (chain.name, this.iter)
         printMCMC(conf, label, candidate, stree, gene2species, visited)
         this.toplogl = logl
         this.toptree = candidate.copy()

         # move some other chains to best state
         #chains2 = sorted(chains, key=lambda x: x.logl)
         #for chain in chains2[:1]:
         #    chain.state = this.toptree.copy()
         #    chain.logl = this.toplogl

     # alter logl to influence search only
     #chain.relax = conf["speedup"] * this.nold

     return candidate, logl
Example #3
0
def getProposals(conf, tree, distmat, labels, stree, 
                 gene2species, params, visited, stuck=False):
    """Score every single-NNI neighbor of tree and return them best first.

    Each candidate edge is tried with change 0 and 1: the NNI is applied to
    tree in place, a re-rooted copy is scored (or looked up in visited), and
    the NNI is applied a second time to switch the branch back.

    Arguments:
    visited -- dict keyed by tree hash holding [logl, tree, count]; new
               topologies are inserted, known ones get their count bumped
    stuck   -- when true, topologies already present in visited are left
               out of the returned list

    Returns a list of [logl, edge, change] sorted by descending logl.
    """
    # TODO: handle root edges

    # edges for NNI: every internal node below the root's children paired
    # with its parent, plus one entry made of the root's children
    root = tree.root
    edges = [(node, node.parent)
             for node in tree.nodes.values()
             if not node.isLeaf()
             and node != root
             and node not in root.children]
    edges.append(tuple(root.children))

    # debugging output
    treelib.drawTreeNames(tree, minlen=5, maxlen=5, out=sys.stderr)
    util.printcols(util.map2(lambda x: x.name, edges), out=sys.stderr)

    proposals = []
    for edge in edges:
        for change in (0, 1):
            proposeNni(tree, edge[0], edge[1], change)
            tree2 = phylo.reconRoot(tree, stree, gene2species, newCopy=True)

            thash = phylo.hash_tree(tree2)
            unseen = thash not in visited
            if unseen:
                Spidir.setTreeDistances(conf, tree2, distmat, labels)
                logl = Spidir.treeLogLikelihood(conf, tree2, stree,
                                                gene2species, params)
                visited[thash] = [logl, tree2, 1]
            else:
                entry = visited[thash]
                entry[2] += 1
                logl = entry[0]

            # previously seen topologies are only offered while not stuck
            if unseen or not stuck:
                proposals.append([logl, edge, change])

            # switch branch back
            proposeNni(tree, edge[0], edge[1], change)

    return sorted(proposals, key=lambda item: item[0], reverse=True)
Example #4
0
#!/usr/bin/env python

from rasmus.common import *
from rasmus.bio import phylo
import Spidir
import Spidir.Likelihood

tree = readTree("../test/0.nt.tree")
stree = readTree("../test/flies.stree")
gene2species = genomeutil.readGene2species("../test/flies.smap")
params = Spidir.readParams("../test/flies.nt.param")

drawTree(tree)

print sum(x.dist for x in tree)
print Spidir.estGeneRate(tree, stree, params, gene2species)
print Spidir.estGeneRate(tree, stree, params, gene2species)
print Spidir.estGeneRate(tree, stree, params, gene2species)
print Spidir.Likelihood.getBaserate(tree, stree, params, gene2species=gene2species)

conf = {"python_only": True, 
        "famprob": True}
print Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params)

conf = {}
#generate = Spidir.estGeneRate(tree, stree, params, gene2species)

for generate in frange(1.5, 2.3, .05):
    print generate, Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params, 
                                             baserate=generate)
Example #5
0
def searchExhaustive(conf, distmat, labels, tree, stree, gene2species, params,
                     depth=2, visited=None, visited2=None, topDepth=True,
                     toplogl=None, short=False):
    """Recursively explore the NNI neighborhood of tree up to depth levels.

    The search works on a copy of tree.  Every topology reached is scored
    once and cached in visited as [logl, tree copy, 1], keyed by
    phylo.hash_tree.  Each NNI is applied in place and applied again
    afterwards to switch the branch back.

    Arguments:
    conf     -- settings dict; conf["eprune"] is read here: a neighbor is
                only expanded when logl - toplogl[0] >= conf["eprune"]
    depth    -- number of NNI levels still to explore
    visited  -- score cache shared by the whole recursion (new dict if
                omitted)
    visited2 -- maps tree hash to the depth at which that topology was last
                expanded; a topology is re-expanded only from a larger depth
                (new dict if omitted)
    topDepth -- True for the outermost call; recursive calls pass False
    toplogl  -- one-element list with the best logl so far, shared by the
                recursion (initialised from the starting tree if omitted)
    short    -- when true, return as soon as a tree beating toplogl[0] is
                found

    Returns (tree, logl).  The outermost call returns the best entry of
    visited.  A recursive call returns (None, None) unless it stops early
    because of short or because depth < 1.
    """
    if visited is None:
        visited = {}
    if visited2 is None:
        visited2 = {}

    tree = tree.copy()

    # find initial logl
    thash = phylo.hash_tree(tree)
    if thash not in visited:
        Spidir.setTreeDistances(conf, tree, distmat, labels)
        logl = Spidir.treeLogLikelihood(conf, tree, stree,
                                        gene2species, params)
        visited[thash] = [logl, tree.copy(), 1]

        drawTreeLogl(tree)
    else:
        logl = visited[thash][0]

    if toplogl is None:
        toplogl = [logl]

    # progress output: indentation reflects the remaining depth
    debug(" " * (depth*2), "(%d)" % len(visited))
    sys.stdout.flush()

    if depth < 1:
        return tree, logl

    # try all NNI
    # find edges for NNI: internal nodes that are neither the root nor a
    # child of the root, each paired with its parent
    nodes = tree.nodes.values()
    nodes = filter(lambda x: not x.isLeaf() and
                             x != tree.root and
                             x.parent != tree.root, nodes)
    edges = [(node, node.parent) for node in nodes]

    for edge in edges:
        for change in (0,1):
            proposeNni(tree, edge[0], edge[1], change)

            thash = phylo.hash_tree(tree)
            if thash not in visited:
                Spidir.setTreeDistances(conf, tree, distmat, labels)
                logl = Spidir.treeLogLikelihood(conf, tree, stree,
                                                gene2species, params)
                visited[thash] = [logl, tree.copy(), 1]
            else:
                logl = visited[thash][0]

            if logl > toplogl[0]:
                toplogl[0] = logl

                if short:
                    # tree is a private copy, so the NNI need not be undone
                    return tree, logl
                else:
                    printMCMC(conf, "N/A", tree, stree, gene2species, visited)

            # expand this neighbor only if it was not already expanded from
            # at least this depth and its score passes the eprune threshold
            if (thash not in visited2 or
                depth > visited2[thash]) and \
                logl - toplogl[0] >= conf["eprune"]:
                visited2[thash] = depth

                # dig deeper
                if depth > 1:
                    tree2, logl2 = searchExhaustive(conf, distmat, labels,
                                     tree, stree, gene2species, params,
                                     depth=depth-1, visited=visited,
                                     visited2=visited2,
                                     topDepth=False,
                                     toplogl=toplogl, short=short)

                    if short and tree2 is not None:
                        return tree2, logl2

            # switch branch back
            proposeNni(tree, edge[0], edge[1], change)

    # the outermost call reports the best cached tree
    if topDepth:
        items = visited.items()
        i = util.argmaxfunc(lambda x: x[1][0], items)

        thash, (logl, tree, count) = items[i]

        return tree, logl
    else:
        return None, None
Example #6
0
def searchGreedy(conf, distmat, labels, stree, gene2species, params, visited=None):
    """Build a tree by adding one gene at a time at its best-scoring position.

    The search starts from a two-leaf tree of labels[0] and labels[1].  Each
    further label is tried on every branch (and above the root) of the
    current tree, the best-scoring placement is kept, and the result is
    refined with searchExhaustive using depth conf["depth"].

    Arguments:
    conf    -- settings dict; conf["depth"] is read here
    visited -- dict of evaluated trees.  It is handed to searchExhaustive
               only for the final round, when all genes are in the tree;
               earlier rounds use throw-away dicts because their trees
               cover only a subset of the genes.  A new dict is used if
               omitted.

    Returns (tree, logl) for the final tree.
    """
    if visited is None:
        visited = {}

    totalgenes = len(labels)

    # create initial 2 gene tree (labels[0], labels[1])
    tree = treelib.Tree()
    tree.make_root()
    tree.add_child(tree.root, treelib.TreeNode(labels[0]))
    tree.add_child(tree.root, treelib.TreeNode(labels[1]))

    if totalgenes == 2:
        # No gene is left to insert, so the loop below would never run and
        # the final score would be unbound; score the two-gene tree directly.
        Spidir.setTreeDistances(conf, tree, distmat, labels)
        toplogl = Spidir.treeLogLikelihood(conf, tree, stree,
                                           gene2species, params)
        return tree, toplogl

    for ngenes in xrange(2, totalgenes):
        debug("adding", labels[ngenes])

        toplogl = -util.INF
        toptree = None

        # restrict the distance matrix and labels to the genes in the tree
        # once labels[ngenes] has been added
        distmat2 = matrixlib.submatrix(distmat, range(ngenes+1), range(ngenes+1))
        labels2  = labels[:ngenes+1]

        # place new gene on every branch
        for name in tree.nodes:
            tree2 = tree.copy()
            node = tree2.nodes[name]

            if node == tree2.root:
                # new root with the old root and the new gene as children
                newnode = treelib.TreeNode(tree2.new_name())
                tree2.add_child(newnode, tree2.root)
                tree2.root = newnode
                tree2.add_child(newnode, treelib.TreeNode(labels[ngenes]))
            else:
                # insert a new node on the branch above node
                parent = node.parent
                tree2.remove(node)
                newnode = treelib.TreeNode(tree2.new_name())
                tree2.add_child(parent, newnode)
                tree2.add_child(newnode, node)
                tree2.add_child(newnode, treelib.TreeNode(labels[ngenes]))

            #tree2 = phylo.reconRoot(tree2, stree, gene2species)
            Spidir.setTreeDistances(conf, tree2, distmat2, labels2)
            logl = Spidir.treeLogLikelihood(conf, tree2, stree, gene2species, params)

            if logl >= toplogl:
                toplogl = logl
                toptree = tree2
        tree = toptree

        # only use visited hash table if all genes are present; after this
        # round the tree holds ngenes + 1 genes (the old test compared
        # ngenes itself and therefore never matched)
        if ngenes + 1 == totalgenes:
            visited2 = visited
        else:
            # otherwise use a new temp hash table
            visited2 = {}

        tree, logl = searchExhaustive(conf, distmat2, labels2,
                                      tree, stree, gene2species, params,
                                      visited=visited2,
                                      depth=conf["depth"])

        if logl >= toplogl:
            toplogl = logl
            toptree = tree
        tree = toptree

        debug()

    # The final round wrote straight into visited, so no merge is needed.
    return tree, toplogl
Example #7
0
def searchMCMC(conf, distmat, labels, stree, gene2species, params,
               initTree=None, visited=None, proposeFunc=proposeTree2):
    """Run several McmcChain objects and return the best tree any of them saw.

    The search starts from initTree when one is given.  Otherwise a
    neighbor-joining tree is built from distmat/labels, rooted with
    phylo.reconRoot and given branch lengths by Spidir.setTreeDistances.

    Arguments:
    conf        -- settings dict; conf["nchains"] chains are created and the
                   outer loop runs over xrange(1, conf["iters"])
    visited     -- dict of evaluated trees keyed by tree hash; a new dict is
                   used if omitted
    proposeFunc -- function producing the next candidate tree; called as
                   proposeFunc(conf, tree, distmat, labels, stree,
                   gene2species, params, visited)

    Returns (toptree, toplogl), the best-scoring tree seen and its score.
    """
    if visited is None:
        visited = {}

    # mutable state shared with the nested propose() closure
    this = util.Bundle(
        nold=0,
        toplogl=-util.INF,
        toptree=None,
        iter=0)

    # init with NJ
    if initTree is not None:
        tree = initTree
    else:
        tree = phylo.neighborjoin(distmat, labels)
        tree = phylo.reconRoot(tree, stree, gene2species)
        Spidir.setTreeDistances(conf, tree, distmat, labels)

    # init likelihood score
    this.toplogl = Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params)
    this.toptree = tree

    # store tree in visited
    addVisited(conf, visited, tree, gene2species)

    # show initial tree
    printMCMC(conf, 0, tree, stree, gene2species, visited)

    # proposal function
    def propose(chain, tree):
        """Return (tree2, logl) for a proposed successor of tree."""
        tree2 = proposeFunc(conf, tree, distmat, labels,
                            stree, gene2species, params, visited)

        # check visited dict; a cached entry replaces the proposed tree
        thash = phylo.hash_tree(tree2)
        if thash in visited:
            logl, tree2, count = visited[thash]
            #this.nold += 1
        else:
            Spidir.setTreeDistances(conf, tree2, distmat, labels)
            logl = Spidir.treeLogLikelihood(conf, tree2, stree, gene2species, params)
            this.nold = 0

        addVisited(conf, visited, tree2, gene2species, thash)

        # best yet tree
        if logl > this.toplogl:
            printMCMC(conf, "%d:%d" % (chain.name, this.iter),
                      tree2, stree, gene2species, visited)
            this.toplogl = logl
            this.toptree = tree2.copy()

            # move some other chains to best state
            #chains2 = sorted(chains, key=lambda x: x.logl)
            #for chain in chains2[:1]:
            #    chain.state = this.toptree.copy()
            #    chain.logl = this.toplogl

        # alter logl to influence search only
        #chain.relax = conf["speedup"] * this.nold

        return tree2, logl

    # init chains; each one starts from its own copy of the initial tree
    chains = []
    for i in range(conf["nchains"]):
        chains.append(McmcChain(i, tree.copy(), this.toplogl, propose))

    # run chains
    for i in xrange(1, conf["iters"]):
        this.iter += 1

        for chain in chains:
            chain.step()

    return this.toptree, this.toplogl
Example #8
0
def searchHillClimb(conf, distmat, labels, stree, gene2species, params,
               initTree=None, visited=None):
    """Hill-climb over tree topologies by applying batches of NNI moves.

    The search starts from initTree when one is given.  Otherwise a
    neighbor-joining tree is built from distmat/labels, rooted with
    phylo.reconRoot and given branch lengths by Spidir.setTreeDistances.

    Each of the conf["hilliters"] iterations ranks all single-NNI neighbors
    with getProposals, keeps a subset whose edges do not collide, and
    applies the leading proposals together.  The batch size is halved
    ("heat") until the combined move improves the score or only one
    proposal is left; that single proposal is then accepted regardless.

    Arguments:
    conf    -- settings dict; conf["hilliters"] is read here
    visited -- dict of evaluated trees keyed by tree hash; a new dict is
               used if omitted

    Returns (tree, logl) for the best-scoring entry of visited.
    """
    if visited is None:
        visited = {}

    # init with NJ
    if initTree is not None:
        tree = initTree
    else:
        #tree = bionj.bionj(labels=labels, distmat=distmat, verbose=False)
        tree = phylo.neighborjoin(distmat, labels)
        tree = phylo.reconRoot(tree, stree, gene2species)
        Spidir.setTreeDistances(conf, tree, distmat, labels)

    # init likelihood score (qualified like every other call in this function)
    logl = Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params)

    # store tree in visited
    addVisited(conf, visited, tree, gene2species)

    stuck = False

    for i in range(conf["hilliters"]):
        printMCMC(conf, i, tree, stree, gene2species, visited)

        proposals = getProposals(conf, tree, distmat, labels,
                                 stree, gene2species, params, visited, stuck)

        util.printcols([[plogl, pnode.name, pparent.name, pchange]
                        for plogl, (pnode, pparent), pchange in proposals])
        print

        # determine which proposals to use: skip an edge once a better
        # ranked proposal has claimed it or one of its neighbors
        edgeset = set()
        proposals2 = []
        for logl2, edge, change in proposals:
            if edge in edgeset:
                continue
            proposals2.append([logl2, edge, change])

            edgeset.add((getNniUncle(tree, edge[0], edge[1]), edge[1]))
            edgeset.add((edge[0].children[change], edge[0]))
            edgeset.add((edge[0], edge[1]))

        util.printcols([[plogl, pnode.name, pparent.name, pchange]
                        for plogl, (pnode, pparent), pchange in proposals2])
        print

        heat = 1.0
        start = 0
        # keeps the debug output below defined when proposals2 is empty
        nproposals = 0
        # start is never advanced; the loop ends through one of the breaks
        while start < len(proposals2):
            nproposals = int(math.ceil(len(proposals2) * heat))

            # apply proposals
            for logl3, edge, change in proposals2[start:start+nproposals]:
                proposeNni(tree, edge[0], edge[1], change)
            tree2 = phylo.reconRoot(tree, stree, gene2species, newCopy=True)

            # calc likelihood
            thash = phylo.hash_tree(tree2)
            if thash not in visited:
                Spidir.setTreeDistances(conf, tree2, distmat, labels)
                logl2 = Spidir.treeLogLikelihood(conf, tree2, stree,
                                                 gene2species, params)
                stuck = False
            else:
                # NOTE(review): the cached score is not reused; the fresh
                # copy is scored again -- confirm whether addVisited needs
                # that before switching to the cached value.
                Spidir.setTreeDistances(conf, tree2, distmat, labels)
                logl2 = Spidir.treeLogLikelihood(conf, tree2, stree,
                                                 gene2species, params)

                if nproposals == 1:
                    stuck = True

            addVisited(conf, visited, tree2, gene2species, thash)

            debug("logl2", logl2)

            if logl2 > logl:
                logl = logl2
                tree = tree2
                break

            # a single remaining proposal is accepted even without a gain
            if nproposals == 1:
                logl = logl2
                tree = tree2
                break

            heat *= .5

            # undo reversals
            for logl3, edge, change in util.reverse(proposals2[start:start+nproposals]):
                proposeNni(tree, edge[0], edge[1], change)

        debug("start:", start)
        debug("swaps:", nproposals)
        debug("heat:", heat)
        debug("stuck:", stuck)

    # report the best tree recorded in visited
    items = visited.items()
    i = util.argmaxfunc(lambda x: x[1][0], items)
    thash, (logl, tree, count) = items[i]
    return tree, logl
Example #9
0
def proposeTree3(conf, tree,  distmat, labels, 
                  stree, gene2species, params, visited):
    """Try several regrafts of one randomly chosen subtree; return the best.

    A non-root node is picked with stats.sample using equal weights.  The
    tree is divided with splitTree at the edge above that node, and the
    second part is regrafted with regraftTree onto up to
    conf["regraftloop"] randomly ordered non-root nodes of the first part.
    Every resulting tree is scored (or looked up in visited) and recorded
    with addVisited.

    Arguments:
    tree    -- current tree; tree.data["logl"] must hold its score.  The
               input is copied, not modified.
    visited -- dict keyed by tree hash whose values unpack as
               (logl, tree, count)

    Returns the best-scoring tree found, or a copy of the input tree when
    no regraft scores strictly higher.
    """
    toplogl = tree.data["logl"]
    toptree = tree.copy()

    tree = tree.copy()

    nodes = tree.nodes.values()
    nodes.remove(tree.root)
    weights = [1 for x in nodes] #[x.data["error"] for x in nodes]
    badgene = nodes[stats.sample(weights)]

    # detemine distance from badgene to everyone else
    # NOTE: dists is currently only read by the commented-out sort below
    dists = util.Dict(default=-util.INF)
    def walk(node, dist):
        dists[node.name] = dist
        for child in node.children:
            walk(child, dist + child.dist)
    walk(badgene, 0)
    seen = set([badgene])
    node = badgene.parent
    dist = badgene.dist
    while node is not None:
        for child in node.children:
            if child not in seen:
                walk(child, dist)
        seen.add(node)
        dist +=  node.dist
        node = node.parent

    tree1, tree2 = splitTree(tree, badgene, badgene.parent)

    # candidate attachment points: every node of tree1 except its root
    names = tree1.nodes.keys()
    names.remove(tree1.root.name)
    #names.sort(key=lambda x: dists[x])
    random.shuffle(names)

    for name in names[:min(len(names), conf["regraftloop"])]:
        tree = tree1.copy()
        node = tree.nodes[name]

        #print "p3>>", node.name, node.parent.name
        regraftTree(tree, tree2.copy(), node, node.parent)

        thash = phylo.hash_tree(tree)

        if thash not in visited:
            Spidir.setTreeDistances(conf, tree, distmat, labels)
            logl = Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params)
        addVisited(conf, visited, tree, gene2species, thash)
        # continue with the recorded entry for this topology
        logl, tree, count = visited[thash]

        if logl > toplogl:
            toplogl = logl
            toptree = tree

            # try returning immediately
            #return toptree

    assert toptree is not None

    return toptree
Example #10
0
tree = readTree("../test/0.nt.tree")
stree = readTree("../test/flies.stree")
gene2species = genomeutil.readGene2species("../test/flies.smap")
params = Spidir.readParams("../test/flies.nt.param")

drawTree(tree)

print sum(x.dist for x in tree)
print Spidir.estGeneRate(tree, stree, params, gene2species)
print Spidir.estGeneRate(tree, stree, params, gene2species)
print Spidir.estGeneRate(tree, stree, params, gene2species)
print Spidir.Likelihood.getBaserate(tree,
                                    stree,
                                    params,
                                    gene2species=gene2species)

conf = {"python_only": True, "famprob": True}
print Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params)

conf = {}
#generate = Spidir.estGeneRate(tree, stree, params, gene2species)

for generate in frange(1.5, 2.3, .05):
    print generate, Spidir.treeLogLikelihood(conf,
                                             tree,
                                             stree,
                                             gene2species,
                                             params,
                                             baserate=generate)