def searchRegraft(conf, distmat, labels, stree, gene2species, params,
                  initTree=None, visited=None, proposeFunc=proposeTree2):
    """Search for a high-likelihood gene tree by repeated regraft proposals.

    Starting from ``initTree`` (or, when none is given, a neighbor-joining
    tree that is rerooted with ``phylo.reconRoot`` and given branch lengths
    by ``Spidir.setTreeDistances``), the search calls ``proposeTree3``
    ``conf["regrafts"]`` times, each time replacing the current tree with
    whatever the proposal returns.

    Parameters:
        conf         -- configuration dict; reads ``conf["regrafts"]``
        distmat      -- distance matrix passed to NJ / setTreeDistances
        labels       -- gene labels matching ``distmat``
        stree        -- species tree
        gene2species -- gene-name to species-name mapping
        params       -- model parameters for the likelihood
        initTree     -- optional starting tree, used as given
        visited      -- optional dict of already-scored trees; a fresh dict
                        is created when omitted
        proposeFunc  -- NOTE(review): accepted for signature parity with
                        searchMCMC but never used here; the loop always
                        calls proposeTree3.

    Returns:
        (tree, tree.data["logl"]) for the tree held after the last proposal.
    """
    if visited is None:
        visited = {}

    # init with NJ
    if initTree is not None:
        tree = initTree
    else:
        tree = phylo.neighborjoin(distmat, labels)
        tree = phylo.reconRoot(tree, stree, gene2species)
        Spidir.setTreeDistances(conf, tree, distmat, labels)

    # init likelihood score
    logl = Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params)

    # store tree in visited
    addVisited(conf, visited, tree, gene2species)

    # show initial tree
    printMCMC(conf, 0, tree, stree, gene2species, visited)

    for i in xrange(conf["regrafts"]):
        tree = proposeTree3(conf, tree, distmat, labels,
                            stree, gene2species, params, visited)

        # only report when the proposal improved on the best score so far
        if tree.data["logl"] > logl:
            printMCMC(conf, i, tree, stree, gene2species, visited)
            logl = tree.data["logl"]

    return tree, tree.data["logl"]
def propose(chain, tree):
    """Propose a successor of ``tree`` for ``chain`` and return it with its score.

    This function is not self-contained: it reads ``proposeFunc``, ``conf``,
    ``distmat``, ``labels``, ``stree``, ``gene2species``, ``params``,
    ``visited`` and the bookkeeping bundle ``this`` from the surrounding
    scope.

    Returns:
        (tree2, logl) -- the proposed tree (or the stored copy of it when its
        hash is already in ``visited``) and its log-likelihood.
    """
    candidate = proposeFunc(conf, tree, distmat, labels,
                            stree, gene2species, params, visited)

    # check visited dict
    thash = phylo.hash_tree(candidate)
    if thash not in visited:
        # unseen topology: fit branch lengths, score it, and record it
        Spidir.setTreeDistances(conf, candidate, distmat, labels)
        logl = Spidir.treeLogLikelihood(conf, candidate, stree,
                                        gene2species, params)
        this.nold = 0
        addVisited(conf, visited, candidate, gene2species, thash)
    else:
        # reuse the cached score and the cached tree object
        logl, candidate, count = visited[thash]

    # best yet tree
    if logl > this.toplogl:
        printMCMC(conf, "%d:%d" % (chain.name, this.iter),
                  candidate, stree, gene2species, visited)
        this.toplogl = logl
        this.toptree = candidate.copy()

        # (disabled experiment: move the worst chain to this.toptree /
        #  this.toplogl after sorting chains by logl)

    # (disabled experiment: chain.relax = conf["speedup"] * this.nold)

    return candidate, logl
def getProposals(conf, tree, distmat, labels, stree, gene2species, params,
                 visited, stuck=False):
    """Score every single-NNI neighbour of ``tree`` and rank the moves.

    Each candidate edge is tried with both NNI variants (``change`` 0 and 1).
    The move is applied to ``tree`` in place, a rerooted copy is hashed and,
    if unseen, scored and stored in ``visited`` as ``[logl, tree, 1]``; the
    same NNI call is then issued again to switch the branch back.

    Parameters:
        visited -- dict keyed by tree hash holding ``[logl, tree, count]``;
                   updated in place (new entries added, counts incremented)
        stuck   -- when true, moves leading to already-visited trees are
                   left out of the result

    Returns:
        list of ``[logl, edge, change]`` sorted by ``logl``, best first.
    """
    # TODO: handle root edges

    # find edges for NNI: internal nodes that are neither the root nor one
    # of its children, each paired with its parent ...
    root = tree.root
    edges = [(node, node.parent)
             for node in tree.nodes.values()
             if not node.isLeaf()
             and node != root
             and node not in root.children]
    # ... plus the edge joining the root's children
    edges.append(tuple(tree.root.children))

    # diagnostic dump of the tree and the candidate edges
    treelib.drawTreeNames(tree, minlen=5, maxlen=5, out=sys.stderr)
    util.printcols(util.map2(lambda x: x.name, edges), out=sys.stderr)

    proposals = []
    for edge in edges:
        for change in (0, 1):
            proposeNni(tree, edge[0], edge[1], change)
            rerooted = phylo.reconRoot(tree, stree, gene2species,
                                       newCopy=True)

            thash = phylo.hash_tree(rerooted)
            if thash in visited:
                visited[thash][2] += 1
                logl = visited[thash][0]
                keep = not stuck
            else:
                Spidir.setTreeDistances(conf, rerooted, distmat, labels)
                logl = Spidir.treeLogLikelihood(conf, rerooted, stree,
                                                gene2species, params)
                visited[thash] = [logl, rerooted, 1]
                keep = True

            if keep:
                proposals.append([logl, edge, change])

            # switch branch back
            proposeNni(tree, edge[0], edge[1], change)

    proposals.sort(key=lambda item: item[0], reverse=True)
    return proposals
#!/usr/bin/env python
# Ad-hoc driver: loads a test gene tree, species tree, gene-to-species map
# and parameter file, then prints gene-rate estimates and log-likelihoods,
# finishing with a sweep of the log-likelihood over a range of base rates.

from rasmus.common import *
from rasmus.bio import phylo
import Spidir
import Spidir.Likelihood


# --- inputs ---------------------------------------------------------------
tree = readTree("../test/0.nt.tree")
stree = readTree("../test/flies.stree")
gene2species = genomeutil.readGene2species("../test/flies.smap")
params = Spidir.readParams("../test/flies.nt.param")

# --- basic diagnostics ----------------------------------------------------
drawTree(tree)

# total branch length of the gene tree
print(sum(x.dist for x in tree))

# the estimate is printed three times in a row
for _repeat in (0, 1, 2):
    print(Spidir.estGeneRate(tree, stree, params, gene2species))

print(Spidir.Likelihood.getBaserate(tree, stree, params,
                                    gene2species=gene2species))

# --- likelihood with the python implementation ----------------------------
conf = {"python_only": True,
        "famprob": True}
print(Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params))

# --- likelihood as a function of the base rate ----------------------------
conf = {}
#generate = Spidir.estGeneRate(tree, stree, params, gene2species)
for generate in frange(1.5, 2.3, .05):
    print("%s %s" % (generate,
                     Spidir.treeLogLikelihood(conf, tree, stree,
                                              gene2species, params,
                                              baserate=generate)))
def searchExhaustive(conf, distmat, labels, tree, stree, gene2species, params,
                     depth=2, visited=None, visited2=None,
                     topDepth=True, toplogl=None, short=False):
    """Recursively explore all trees within ``depth`` NNI moves of ``tree``.

    Works on a copy of ``tree``.  Every topology reached is hashed; unseen
    ones get branch lengths and a log-likelihood and are stored in
    ``visited`` as ``[logl, tree_copy, 1]``.  A neighbour is expanded
    further only if it has not been expanded at this depth or deeper
    (tracked in ``visited2``) and its score is within ``conf["eprune"]`` of
    the best score seen so far.

    Parameters:
        depth    -- remaining NNI moves to explore; with ``depth < 1`` the
                    start tree is scored and returned immediately
        visited  -- dict hash -> [logl, tree, count], shared down the
                    recursion and updated in place
        visited2 -- dict hash -> greatest depth at which that tree has
                    been expanded
        topDepth -- true only for the outermost call
        toplogl  -- one-element list holding the best score so far, shared
                    down the recursion
        short    -- if true, return as soon as a tree beats ``toplogl``

    Returns:
        (tree, logl).  The outermost call returns the best entry of
        ``visited`` (or the first improving tree when ``short``); inner
        calls return ``(None, None)`` unless they short-circuit.
    """
    if visited is None:
        visited = {}
    if visited2 is None:
        visited2 = {}

    tree = tree.copy()

    # find initial logl
    thash = phylo.hash_tree(tree)
    if thash not in visited:
        Spidir.setTreeDistances(conf, tree, distmat, labels)
        logl = Spidir.treeLogLikelihood(conf, tree, stree,
                                        gene2species, params)
        visited[thash] = [logl, tree.copy(), 1]
        drawTreeLogl(tree)
    else:
        logl = visited[thash][0]

    if toplogl is None:
        toplogl = [logl]

    # progress trace, indented by remaining depth
    debug(" " * (depth*2), "(%d)" % len(visited))
    sys.stdout.flush()

    if depth < 1:
        return tree, logl

    # try all NNI
    # find edges for NNI: internal, non-root nodes whose parent is not the
    # root, each paired with its parent
    edges = [(node, node.parent)
             for node in tree.nodes.values()
             if not node.isLeaf()
             and node != tree.root
             and node.parent != tree.root]

    for edge in edges:
        for change in (0, 1):
            proposeNni(tree, edge[0], edge[1], change)

            thash = phylo.hash_tree(tree)
            if thash not in visited:
                Spidir.setTreeDistances(conf, tree, distmat, labels)
                logl = Spidir.treeLogLikelihood(conf, tree, stree,
                                                gene2species, params)
                visited[thash] = [logl, tree.copy(), 1]
            else:
                logl = visited[thash][0]

            if logl > toplogl[0]:
                toplogl[0] = logl

                if short:
                    return tree, logl
                else:
                    printMCMC(conf, "N/A", tree, stree, gene2species,
                              visited)

            if (thash not in visited2 or depth > visited2[thash]) and \
               logl - toplogl[0] >= conf["eprune"]:
                visited2[thash] = depth

                # dig deeper
                if depth > 1:
                    tree2, logl2 = searchExhaustive(
                        conf, distmat, labels, tree, stree, gene2species,
                        params, depth=depth-1,
                        visited=visited, visited2=visited2,
                        topDepth=False, toplogl=toplogl, short=short)

                    # a non-None tree from an inner call means it
                    # short-circuited on an improvement
                    if short and tree2 is not None:
                        return tree2, logl2

            # switch branch back
            proposeNni(tree, edge[0], edge[1], change)

    if topDepth:
        # report the best tree recorded anywhere in visited
        items = list(visited.items())
        i = util.argmaxfunc(lambda x: x[1][0], items)
        thash, (logl, tree, count) = items[i]
        return tree, logl
    else:
        return None, None
def searchGreedy(conf, distmat, labels, stree, gene2species, params,
                 visited=None):
    """Build a gene tree by adding one gene at a time, refining after each.

    Starts from the two-leaf tree on ``labels[0]`` and ``labels[1]``.  For
    every further label the new leaf is attached on each branch in turn
    (including above the root), each result is given branch lengths and
    scored on the sub-matrix of the genes present so far, and the best
    placement is then refined with ``searchExhaustive`` at depth
    ``conf["depth"]``.

    Parameters:
        conf         -- configuration dict; reads ``conf["depth"]``
        distmat      -- full distance matrix over ``labels``
        labels       -- gene labels; at least two are required
        stree, gene2species, params -- passed through to the likelihood
        visited      -- optional dict of scored trees.  It is handed to
                        ``searchExhaustive`` only on the final step, when
                        all genes are present; earlier steps use a
                        throw-away table.

    Returns:
        (tree, toplogl).  With exactly two labels no placement step runs,
        so the two-leaf tree is returned unscored with ``-util.INF``.
    """
    if visited is None:
        visited = {}

    totalgenes = len(labels)

    # Defined up front so the two-label case (empty loop below) does not
    # hit unbound names at the end of the function.
    toplogl = -util.INF
    visited2 = {}

    # create initial 2 gene tree (labels[0], labels[1])
    tree = treelib.Tree()
    tree.make_root()
    tree.add_child(tree.root, treelib.TreeNode(labels[0]))
    tree.add_child(tree.root, treelib.TreeNode(labels[1]))

    for ngenes in xrange(2, totalgenes):
        debug("adding", labels[ngenes])

        toplogl = -util.INF
        toptree = None

        # restrict the data to the genes present after this addition
        distmat2 = matrixlib.submatrix(distmat,
                                       range(ngenes+1),
                                       range(ngenes+1))
        labels2 = labels[:ngenes+1]

        # place new gene on every branch
        for name in tree.nodes:
            tree2 = tree.copy()
            node = tree2.nodes[name]

            if node == tree2.root:
                # attach above the root: new root with old root + new leaf
                newnode = treelib.TreeNode(tree2.new_name())
                tree2.add_child(newnode, tree2.root)
                tree2.root = newnode
                tree2.add_child(newnode, treelib.TreeNode(labels[ngenes]))
            else:
                # split the branch above node with a new internal node
                parent = node.parent
                tree2.remove(node)
                newnode = treelib.TreeNode(tree2.new_name())
                tree2.add_child(parent, newnode)
                tree2.add_child(newnode, node)
                tree2.add_child(newnode, treelib.TreeNode(labels[ngenes]))

            #tree2 = phylo.reconRoot(tree2, stree, gene2species)
            Spidir.setTreeDistances(conf, tree2, distmat2, labels2)
            logl = Spidir.treeLogLikelihood(conf, tree2, stree,
                                            gene2species, params)

            if logl >= toplogl:
                toplogl = logl
                toptree = tree2
        tree = toptree

        # only use visited hash table if all genes are present.
        # After this step the tree holds ngenes + 1 genes, so the last
        # step is the one where ngenes + 1 == totalgenes.
        if ngenes + 1 == totalgenes:
            visited2 = visited
        else:
            # otherwise use a new temp hash table
            visited2 = {}

        tree, logl = searchExhaustive(conf, distmat2, labels2,
                                      tree, stree, gene2species, params,
                                      visited=visited2,
                                      depth=conf["depth"])

        if logl >= toplogl:
            toplogl = logl
            toptree = tree
        tree = toptree

        debug()

    # no-op when visited2 already is visited (the normal final step)
    visited.update(visited2)

    return tree, toplogl
def searchMCMC(conf, distmat, labels, stree, gene2species, params,
               initTree=None, visited=None, proposeFunc=proposeTree2):
    """Search tree space with ``conf["nchains"]`` McmcChain instances.

    All chains start from copies of the same tree: ``initTree`` when given,
    otherwise a neighbor-joining tree that is rerooted with
    ``phylo.reconRoot`` and given branch lengths by
    ``Spidir.setTreeDistances``.  Each outer iteration calls ``step()`` once
    on every chain; the chains obtain new states through the nested
    ``propose`` callback, which also tracks the best tree seen.

    Parameters:
        conf         -- configuration dict; reads ``conf["nchains"]`` and
                        ``conf["iters"]``
        distmat, labels, stree, gene2species, params
                     -- passed through to tree construction and likelihood
        initTree     -- optional starting tree, used as given
        visited      -- optional dict hash -> [logl, tree, count] of scored
                        trees; created when omitted, updated in place
        proposeFunc  -- callable ``(conf, tree, distmat, labels, stree,
                        gene2species, params, visited) -> tree`` producing
                        a proposal from the current tree

    Returns:
        (toptree, toplogl) -- best tree found and its log-likelihood.
    """
    if visited is None:
        visited = {}

    # mutable search state shared with the propose() closure
    this = util.Bundle(nold=0,
                       toplogl=-util.INF,
                       toptree=None,
                       iter=0)

    # init with NJ
    if initTree is not None:
        tree = initTree
    else:
        tree = phylo.neighborjoin(distmat, labels)
        tree = phylo.reconRoot(tree, stree, gene2species)
        Spidir.setTreeDistances(conf, tree, distmat, labels)

    # init likelihood score
    this.toplogl = Spidir.treeLogLikelihood(conf, tree, stree,
                                            gene2species, params)
    this.toptree = tree

    # store tree in visited
    addVisited(conf, visited, tree, gene2species)

    # show initial tree
    printMCMC(conf, 0, tree, stree, gene2species, visited)

    # proposal function
    def propose(chain, tree):
        """Return (proposed tree, logl) for ``chain``'s current ``tree``."""
        tree2 = proposeFunc(conf, tree, distmat, labels,
                            stree, gene2species, params, visited)

        # check visited dict
        thash = phylo.hash_tree(tree2)
        if thash in visited:
            # reuse the cached score and the cached tree object
            logl, tree2, count = visited[thash]
            #this.nold += 1
        else:
            Spidir.setTreeDistances(conf, tree2, distmat, labels)
            logl = Spidir.treeLogLikelihood(conf, tree2, stree,
                                            gene2species, params)
            this.nold = 0
            addVisited(conf, visited, tree2, gene2species, thash)

        # best yet tree
        if logl > this.toplogl:
            printMCMC(conf, "%d:%d" % (chain.name, this.iter),
                      tree2, stree, gene2species, visited)
            this.toplogl = logl
            this.toptree = tree2.copy()

            # move some other chains to best state
            #chains2 = sorted(chains, key=lambda x: x.logl)
            #for chain in chains2[:1]:
            #    chain.state = this.toptree.copy()
            #    chain.logl = this.toplogl

        # alter logl to influence search only
        #chain.relax = conf["speedup"] * this.nold

        return tree2, logl

    # init chains
    chains = [McmcChain(i, tree.copy(), this.toplogl, propose)
              for i in range(conf["nchains"])]

    # run chains; the range starts at 1, so this performs
    # conf["iters"] - 1 rounds of steps
    for i in xrange(1, conf["iters"]):
        this.iter += 1

        for chain in chains:
            chain.step()

    return this.toptree, this.toplogl
def searchHillClimb(conf, distmat, labels, stree, gene2species, params,
                    initTree=None, visited=None):
    """Hill-climb over NNI moves, applying several compatible moves at once.

    Each of the ``conf["hilliters"]`` iterations:
      1. ranks all single NNI moves with ``getProposals``;
      2. walks that ranking and keeps a move only if its edge has not been
         claimed by an earlier kept move (each kept move claims its own
         edge, the uncle edge and the swapped child's edge);
      3. applies the kept moves together, scores the result, and if it does
         not beat the current score undoes them and retries with half as
         many, down to a single move, which is always accepted.

    Parameters:
        conf     -- configuration dict; reads ``conf["hilliters"]``
        initTree -- optional starting tree, used as given; otherwise an NJ
                    tree is built, rerooted and given branch lengths
        visited  -- optional dict hash -> [logl, tree, count]; created when
                    omitted, updated in place
        (remaining arguments are passed through to the likelihood)

    Returns:
        (tree, logl) -- the entry of ``visited`` with the highest logl.
    """
    if visited is None:
        visited = {}

    # init with NJ
    if initTree is not None:
        tree = initTree
    else:
        #tree = bionj.bionj(labels=labels, distmat=distmat, verbose=False)
        tree = phylo.neighborjoin(distmat, labels)
        tree = phylo.reconRoot(tree, stree, gene2species)
        Spidir.setTreeDistances(conf, tree, distmat, labels)

    # init likelihood score (qualified like every other call in this file)
    logl = Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params)

    # store tree in visited
    addVisited(conf, visited, tree, gene2species)

    stuck = False

    for i in range(conf["hilliters"]):
        printMCMC(conf, i, tree, stree, gene2species, visited)

        proposals = getProposals(conf, tree, distmat, labels,
                                 stree, gene2species, params, visited, stuck)

        util.printcols([[score, node1.name, node2.name, chg]
                        for score, (node1, node2), chg in proposals])
        print("")

        # determine which proposals to use
        edgeset = set()
        proposals2 = []
        for logl2, edge, change in proposals:
            if edge in edgeset:
                continue
            proposals2.append([logl2, edge, change])

            # claim the edges this move touches so later moves skip them
            edgeset.add((getNniUncle(tree, edge[0], edge[1]), edge[1]))
            edgeset.add((edge[0].children[change], edge[0]))
            edgeset.add((edge[0], edge[1]))

        util.printcols([[score, node1.name, node2.name, chg]
                        for score, (node1, node2), chg in proposals2])
        print("")

        heat = 1.0
        start = 0
        # bound before the loop so the debug output below is safe even
        # when there are no proposals at all
        nproposals = 0
        # start is never advanced: the loop ends through one of the breaks
        # below, which is guaranteed once heat has shrunk the batch to 1
        while start < len(proposals2):
            nproposals = int(math.ceil(len(proposals2) * heat))

            # apply proposals
            for logl3, edge, change in proposals2[start:start+nproposals]:
                proposeNni(tree, edge[0], edge[1], change)
            tree2 = phylo.reconRoot(tree, stree, gene2species, newCopy=True)

            # calc likelihood; branch lengths and score are computed
            # whether or not the topology has been seen before
            thash = phylo.hash_tree(tree2)
            seen_before = thash in visited
            Spidir.setTreeDistances(conf, tree2, distmat, labels)
            logl2 = Spidir.treeLogLikelihood(conf, tree2, stree,
                                             gene2species, params)
            if not seen_before:
                stuck = False
            elif nproposals == 1:
                # even the single best move leads somewhere already seen
                stuck = True

            addVisited(conf, visited, tree2, gene2species, thash)

            debug("logl2", logl2)

            if logl2 > logl:
                logl = logl2
                tree = tree2
                break

            # a single remaining move is taken even if it is not uphill
            if nproposals == 1:
                logl = logl2
                tree = tree2
                break

            heat *= .5

            # undo reversals
            for logl3, edge, change in util.reverse(
                    proposals2[start:start+nproposals]):
                proposeNni(tree, edge[0], edge[1], change)

        debug("start:", start)
        debug("swaps:", nproposals)
        debug("heat:", heat)
        debug("stuck:", stuck)

    # report the best tree recorded anywhere in visited
    items = list(visited.items())
    i = util.argmaxfunc(lambda x: x[1][0], items)
    thash, (logl, tree, count) = items[i]
    return tree, logl
def proposeTree3(conf, tree, distmat, labels, stree, gene2species, params,
                 visited):
    """Propose a tree by pruning one random subtree and regrafting it.

    A non-root node is drawn with uniform weights via ``stats.sample``; a
    copy of ``tree`` is split on the edge above that node with
    ``splitTree``, and the second piece is regrafted onto up to
    ``conf["regraftloop"]`` randomly ordered non-root nodes of the first
    piece.  Each resulting topology is looked up in ``visited`` (and scored
    and recorded first if new).

    Parameters:
        conf    -- configuration dict; reads ``conf["regraftloop"]``
        tree    -- current tree; ``tree.data["logl"]`` must hold its score.
                   The argument itself is not modified.
        visited -- dict hash -> (logl, tree, count); updated through
                   ``addVisited`` for topologies not seen before
        (remaining arguments are passed through to the likelihood)

    Returns:
        The best-scoring tree among a copy of the input and the regrafted
        candidates (candidates are the tree objects stored in ``visited``).
    """
    toplogl = tree.data["logl"]
    toptree = tree.copy()

    tree = tree.copy()

    # pick the subtree to move: any node except the root, equal weights
    nodes = list(tree.nodes.values())
    nodes.remove(tree.root)
    weights = [1] * len(nodes)  # [x.data["error"] for x in nodes]
    badgene = nodes[stats.sample(weights)]

    tree1, tree2 = splitTree(tree, badgene, badgene.parent)

    # candidate attachment points, tried in random order
    names = list(tree1.nodes.keys())
    names.remove(tree1.root.name)
    random.shuffle(names)

    # slicing already caps at len(names)
    for name in names[:conf["regraftloop"]]:
        tree = tree1.copy()
        node = tree.nodes[name]

        regraftTree(tree, tree2.copy(), node, node.parent)

        thash = phylo.hash_tree(tree)

        if thash not in visited:
            Spidir.setTreeDistances(conf, tree, distmat, labels)
            logl = Spidir.treeLogLikelihood(conf, tree, stree,
                                            gene2species, params)
            addVisited(conf, visited, tree, gene2species, thash)
        # from here on use the stored score and the stored tree object
        logl, tree, count = visited[thash]

        if logl > toplogl:
            toplogl = logl
            toptree = tree

    assert toptree is not None

    return toptree
# Ad-hoc driver statements: load the fly test data, print rate estimates and
# log-likelihoods, then sweep the log-likelihood over a range of base rates.

# test inputs
tree = readTree("../test/0.nt.tree")
stree = readTree("../test/flies.stree")
gene2species = genomeutil.readGene2species("../test/flies.smap")
params = Spidir.readParams("../test/flies.nt.param")

drawTree(tree)

# total branch length of the gene tree
print(sum(x.dist for x in tree))

# gene-rate estimate, printed three times in succession
print(Spidir.estGeneRate(tree, stree, params, gene2species))
print(Spidir.estGeneRate(tree, stree, params, gene2species))
print(Spidir.estGeneRate(tree, stree, params, gene2species))

print(Spidir.Likelihood.getBaserate(tree, stree, params,
                                    gene2species=gene2species))

# log-likelihood using the "python_only" / "famprob" settings
conf = {
    "python_only": True,
    "famprob": True,
}
print(Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params))

# log-likelihood for base rates from 1.5 towards 2.3 in steps of .05,
# using an empty configuration
conf = {}
#generate = Spidir.estGeneRate(tree, stree, params, gene2species)
for generate in frange(1.5, 2.3, .05):
    print("%s %s" % (
        generate,
        Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params,
                                 baserate=generate),
    ))