def _reroot_helper(self, gtree, newCopy=True, returnEdge=False): """ Yields rerooted trees. Adapted from phylo.recon_root. """ # make a consistent unrooted copy of gene tree if newCopy: gtree = gtree.copy() if len(gtree.leaves()) == 2: raise StopIteration oldroot = gtree.root.name treelib.unroot(gtree, newCopy=False) treelib.reroot(gtree, gtree.nodes[sorted(gtree.leaf_names())[0]].parent.name, onBranch=False, newCopy=False) # make rerooting order consistent using hash ordering phylo.hash_order_tree(gtree, self.gene2species) # get list of edges to root on edges = [] def walk(node): edges.append((node, node.parent)) if not node.is_leaf(): node.recurse(walk) edges.append((node, node.parent)) for child in gtree.root.children: walk(child) # try initial root treelib.reroot(gtree, edges[0][0].name, newCopy=False) gtree.rename(gtree.root.name, oldroot) if returnEdge: yield gtree, edges[0] else: yield gtree rootedge = sorted(edges[0]) # try rerooting on everything for edge in edges[1:]: if sorted(edge) == rootedge: continue rootedge = sorted(edge) node1, node2 = edge if node1.parent != node2: node1, node2 = node2, node1 assert node1.parent == node2, "%s %s" % (node1.name, node2.name) # new root and cost treelib.reroot(gtree, node1.name, newCopy=False, keepName=True) if returnEdge: yield gtree, edge else: yield gtree
def draw_raxml_tree(tr, adef):
    """Draw a RAxML tree on stdout as an unrooted tree.

    tr, adef -- RAxML handles passed straight to raxml.tree_to_string

    Both stages (string conversion and drawing) are wrapped in
    util.tic/util.toc calls.
    """
    util.tic("Tree to string...")
    newick = raxml.tree_to_string(tr, adef)
    util.toc()

    util.tic("Drawing tree...")
    # parse the newick text, then draw the unrooted form
    unrooted = treelib.unroot(treelib.parse_newick(newick))
    treelib.draw_tree(unrooted, out=sys.stdout, minlen=5, maxlen=5)
    util.toc()
def prob_alignment_nooptimize(alnfile, partfile, coal_tree,
                              rates, freqs, alphas,
                              threads=1, seed=ALIGNMENT_SEED, eps=0.1):
    """
    This function implements the pll function.
    It computes the log likelihood of alignment data given the coal_tree
    without optimizing parameters.
    Mathematically, it computes: P(A | T^G, t^G).

    rates, freqs, alphas -- parameters in pll computation, indexed by
                            partition (rates are only applied to partitions
                            for which pll.is_dna(i) is true)
    alnfile   -- alignment file
    partfile  -- partition file
    coal_tree -- coalescent tree (not modified; an unrooted copy is written
                 to a temporary file)
    threads, seed -- passed through to the pllpy.pll constructor
    eps       -- likelihood convergence threshold (pll.set_epsilon)

    Returns the value of pll.get_likelihood().
    The temporary tree file is removed even if an error occurs.
    """

    # unrooted tree required for ML
    tree = treelib.unroot(coal_tree, newCopy=True)

    # convert coal_tree to filename
    tree_temp = tempfile.NamedTemporaryFile(delete=False)
    tree_filename = tree_temp.name
    try:
        try:
            tree.write(tree_temp, oneline=True)
        finally:
            tree_temp.close()

        # initialize pll instance
        pll = pllpy.pll(alnfile, partfile, tree_filename, threads, seed)

        # initialize pll with previously optimized parameters
        for i in range(pll.get_number_of_partitions()):
            pll.set_alpha(alphas[i], i, True)
            pll.set_frequencies(freqs[i], i, True)
            if pll.is_dna(i):
                pll.set_rates(rates[i], i, True)

        # set likelihood convergence
        pll.set_epsilon(eps)

        # do not optimize any of the parameters
        # (pll.optimise is intentionally not called here)

        # get (log) likelihood
        return pll.get_likelihood()
    finally:
        # delete=False above means cleanup is our responsibility
        os.remove(tree_filename)
def prob_alignment(alnfile, partfile, coal_tree,
                   threads=1, seed=int("0xDEADBEEF", 16), eps=0.1,
                   opt_branches=False):
    """
    This function implements the pll function.
    It optimizes the alpha, rates and frequencies, and uses these parameters
    to compute the log likelihood of alignment data given the coal_tree.
    This function is not used because it is computationally inefficient to
    optimize the parameters.

    alnfile   -- alignment file
    partfile  -- partition file
    coal_tree -- coalescent tree (not modified; an unrooted copy is written
                 to a temporary file)
    threads, seed -- passed through to the pllpy.pll constructor
    eps       -- likelihood convergence threshold (pll.set_epsilon)
    opt_branches -- whether pll.optimise should also optimize branches

    Returns the value of pll.get_likelihood().
    The temporary tree file is removed even if an error occurs.
    """

    # unrooted tree required for ML
    tree = treelib.unroot(coal_tree, newCopy=True)

    # convert coal_tree to filename
    tree_temp = tempfile.NamedTemporaryFile(delete=False)
    tree_filename = tree_temp.name
    try:
        try:
            tree.write(tree_temp, oneline=True)
        finally:
            tree_temp.close()

        # initialize pll instance
        pll = pllpy.pll(alnfile, partfile, tree_filename, threads, seed)

        # tell pll to optimize all model parameters
        for i in range(pll.get_number_of_partitions()):
            pll.set_optimisable_alpha(i, True)
            pll.set_optimisable_frequencies(i, True)
            if pll.is_dna(i):
                pll.set_optimisable_rates(i, True)

        # set likelihood convergence
        pll.set_epsilon(eps)

        # optimize the model
        pll.optimise(True, True, True, opt_branches)  # rates, freqs, alphas, branches

        # get (log) likelihood
        return pll.get_likelihood()
    finally:
        # delete=False above means cleanup is our responsibility
        os.remove(tree_filename)
def optimize_parameters(alnfile, partfile, coal_tree,
                        threads=1, seed=ALIGNMENT_SEED, eps=0.1):
    """
    The function takes in alignment file, partitions file, coal_tree,
    and returns the rates, freqs, alphas after optimization.
    These parameters are used when the alignment probability is calculated.

    alnfile   -- alignment file
    partfile  -- partition file
    coal_tree -- coalescent tree (not modified; an unrooted copy is written
                 to a temporary file)
    threads, seed -- passed through to the pllpy.pll constructor
    eps       -- likelihood convergence threshold (pll.set_epsilon)

    Returns (rates, freqs, alphas), three lists with one entry per partition.
    The temporary tree file is removed even if an error occurs.
    """

    # unrooted tree required for ML
    tree = treelib.unroot(coal_tree, newCopy=True)

    # convert coal_tree to filename
    tree_temp = tempfile.NamedTemporaryFile(delete=False)
    tree_filename = tree_temp.name
    try:
        try:
            tree.write(tree_temp, oneline=True)
        finally:
            tree_temp.close()

        # initialize pll instance
        pll = pllpy.pll(alnfile, partfile, tree_filename, threads, seed)

        # set likelihood convergence
        pll.set_epsilon(eps)

        # optimize rates, freqs, alphas, and branches
        pll.optimise(True, True, True, True)

        # store optimal parameters
        rates = []
        freqs = []
        alphas = []
        for i in range(pll.get_number_of_partitions()):
            rates.append(pll.get_rates_vector(i))
            freqs.append(pll.get_frequencies_vector(i))
            alphas.append(pll.get_alpha(i))

        return rates, freqs, alphas
    finally:
        # delete=False above means cleanup is our responsibility
        os.remove(tree_filename)
def recon_root(gtree, stree, gene2species = gene2species,
               rootby = "duploss", newCopy=True):
    """Reroot a tree by minimizing the number of duplications/losses/both

    gtree        -- gene tree to reroot
    stree        -- species tree used for reconciliation
    gene2species -- gene-to-species mapping handed to reconcile() and
                    hash_order_tree()
    rootby       -- cost to minimize: "dup", "loss" or "duploss"
    newCopy      -- if True, work on gtree.copy(); otherwise gtree is
                    rerooted in place

    Returns the rerooted tree.  A tree with exactly two leaves is returned
    without rerooting.  Raises ValueError for an unknown rootby value.
    """

    # make a consistent unrooted copy of gene tree
    if newCopy:
        gtree = gtree.copy()

    if len(gtree.leaves()) == 2:
        # nothing to choose between; return the tree like every other path
        return gtree

    treelib.unroot(gtree, newCopy=False)
    treelib.reroot(gtree,
                   gtree.nodes[sorted(gtree.leaf_names())[0]].parent.name,
                   onBranch=False, newCopy=False)

    # make recon root consistent for rerooting tree of the same names
    # TODO: there is the possibility of ties, they are currently broken
    # arbitrarily.  In order to make comparison of reconRooted trees with
    # same gene names accurate, hashOrdering must be done, for now.
    hash_order_tree(gtree, gene2species)

    # get list of edges to root on
    # (internal nodes are recorded both before and after their subtree, so
    # consecutive candidate roots are adjacent branches)
    edges = []

    def walk(node):
        edges.append((node, node.parent))
        if not node.is_leaf():
            node.recurse(walk)
            edges.append((node, node.parent))
    for child in gtree.root.children:
        walk(child)

    # try initial root and recon
    treelib.reroot(gtree, edges[0][0].name, newCopy=False)
    recon = reconcile(gtree, stree, gene2species)
    events = label_events(gtree, recon)

    # find reconciliation that minimizes loss
    minroot = edges[0]
    rootedge = sorted(edges[0])
    if rootby == "dup":
        cost = count_dup(gtree, events)
    elif rootby == "loss":
        cost = len(find_loss(gtree, stree, recon))
    elif rootby == "duploss":
        cost = count_dup_loss(gtree, stree, recon, events)
    else:
        # string exceptions are not valid; raise a real exception instead
        raise ValueError("unknown rootby value '%s'" % rootby)
    mincost = cost

    # try rooting on everything
    for edge in edges[1:]:
        # skip an edge identical to the one the tree is currently rooted on
        if sorted(edge) == rootedge:
            continue
        rootedge = sorted(edge)

        # earlier reroots may have flipped the branch; orient it child->parent
        node1, node2 = edge
        if node1.parent != node2:
            node1, node2 = node2, node1
        assert node1.parent == node2, "%s %s" % (node1.name, node2.name)

        # uncount cost
        # (only the current root and node2 are re-reconciled below, so only
        # their contributions are removed and later re-added)
        if rootby in ["dup", "duploss"]:
            if events[gtree.root] == "dup":
                cost -= 1
            if events[node2] == "dup":
                cost -= 1
        if rootby in ["loss", "duploss"]:
            cost -= len(find_loss_under_node(gtree.root, recon))
            cost -= len(find_loss_under_node(node2, recon))

        # new root and recon
        treelib.reroot(gtree, node1.name, newCopy=False)

        recon[node2] = reconcile_node(node2, stree, recon)
        recon[gtree.root] = reconcile_node(gtree.root, stree, recon)
        events[node2] = label_events_node(node2, recon)
        events[gtree.root] = label_events_node(gtree.root, recon)

        if rootby in ["dup", "duploss"]:
            if events[node2] == "dup":
                cost += 1
            if events[gtree.root] == "dup":
                cost += 1
        if rootby in ["loss", "duploss"]:
            cost += len(find_loss_under_node(gtree.root, recon))
            cost += len(find_loss_under_node(node2, recon))

        # keep track of min cost
        if cost < mincost:
            mincost = cost
            minroot = edge

    # root tree by minroot
    # (the tree is currently rooted on the last edge tried)
    if edge != minroot:
        node1, node2 = minroot
        if node1.parent != node2:
            node1, node2 = node2, node1
        assert node1.parent == node2
        treelib.reroot(gtree, node1.name, newCopy=False)

    return gtree
def least_square_error(tree, distmat, genes, forcePos=True, weighting=False):
    """Least Squared Error algorithm for phylogenetic reconstruction

    tree      -- tree whose branch lengths are fitted; a rooted tree is
                 unrooted in place before fitting
    distmat   -- pairwise distance matrix, indexed like genes
    genes     -- gene names, in distmat order
    forcePos  -- if True, clamp negative fitted branch lengths to 0
    weighting -- if True, scale each equation by 1/sqrt(distance)

    Branch lengths are written back via setBranchLengths().  Returns a
    util.Bundle with resids, paths, edges and topmat.
    """

    # use SCIPY to perform LSE
    import scipy
    import scipy.linalg

    def makeVector(array):
        """convenience function for handling different configurations of scipy"""
        if len(array.shape) != 2:
            return array
        if array.shape[0] == 1:
            return array[0]
        return scipy.transpose(array)[0]

    # remember the root branch (by child names) before unrooting in place
    rootedge = None
    if treelib.is_rooted(tree):
        rootedge = sorted(child.name for child in tree.root.children)
        treelib.unroot(tree, newCopy=False)

    # create pairwise dist array (upper triangle, row by row)
    ngenes = len(genes)
    dists = [distmat[i][j]
             for i in xrange(ngenes)
             for j in xrange(i+1, ngenes)]

    # create topology matrix
    topmat, edges = makeTopologyMatrix(tree, genes)

    # setup matrix and vector
    if not weighting:
        topmat2 = scipy.array(topmat)
        paths = scipy.array(dists)
    else:
        weighted_rows = []
        for rownum, row in enumerate(topmat):
            weighted_rows.append(
                [util.safediv(entry, math.sqrt(dists[rownum]), 0)
                 for entry in row])
        topmat2 = scipy.array(weighted_rows)
        paths = scipy.array(map(math.sqrt, dists))

    # solve LSE
    solution, _resids, _rank, _singular = scipy.linalg.lstsq(topmat2, paths)

    # force non-negative branch lengths
    flat = makeVector(solution)
    if forcePos:
        edgelens = [max(float(x), 0) for x in flat]
    else:
        edgelens = [float(x) for x in flat]

    # calc path residuals (errors)
    fitted = makeVector(scipy.dot(topmat2, edgelens))
    resids = (fitted - paths).tolist()
    paths = paths.tolist()

    # set branch lengths
    setBranchLengths(tree, edges, edgelens, paths, resids,
                     topmat=topmat, rootedge=rootedge)

    return util.Bundle(resids=resids,
                       paths=paths,
                       edges=edges,
                       topmat=topmat)
def phyml(seqs, verbose=True, args=None, usertree=None, seqtype="pep",
          saveOutput="", bootiter=0, opttree=True, optbranches=True,
          nrates=4):
    """Run the external 'phyml' program on an alignment and return its tree.

    seqs        -- alignment (mapping of names to sequences)
    verbose     -- passed to phylip.exec_phylip
    args        -- full phyml argument string; if None it is built from
                   seqtype, bootiter, nrates, usertree, opttree, optbranches
    usertree    -- optional starting tree (written unrooted as "intree");
                   otherwise "BIONJ" is used
    seqtype     -- "dna" (HKY model) or "pep" (JTT model); only consulted
                   when args is None
    saveOutput  -- if non-empty, the working directory is saved there via
                   phylip.save_temp_dir; otherwise it is cleaned up
    bootiter    -- number of bootstrap iterations (1 is treated as 0)
    opttree     -- whether phyml should optimize the topology
    optbranches -- whether phyml should optimize branch lengths
    nrates      -- number of rate categories

    Returns the tree read from phyml's output, with tree.data["logl"] set
    to the reported log likelihood.  Raises AssertionError for an unknown
    seqtype when args is None.
    """

    # Fail fast, before any temp dir or file is created.  An explicit raise
    # (rather than an assert statement) still fires under "python -O".
    if args is None and seqtype not in ("dna", "pep"):
        raise AssertionError("unknown sequence type '%s'" % seqtype)

    phylip.validate_seqs(seqs)
    # NOTE(review): presumably switches into a fresh temp dir and returns
    # the previous working directory -- confirm in phylip.create_temp_dir
    cwd = phylip.create_temp_dir()

    util.tic("phyml on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input
    # (files are closed before phyml runs so their contents are flushed)
    with open("infile", "w") as out:
        labels = phylip.write_phylip_align(out, seqs)
    with open("labels", "w") as out:
        util.write_list(out, labels)

    options = "y"

    # only bootstrap when iterations are above 1
    if bootiter == 1:
        bootiter = 0

    if usertree is not None:
        usertree = treelib.unroot(usertree)
        phylip.write_in_tree("intree", usertree, labels)
        treefile = "intree"
    else:
        treefile = "BIONJ"

    optimize = ""
    if opttree:
        optimize += "y "
    else:
        optimize += "n "

    if optbranches:
        optimize += "y "
    else:
        optimize += "n "

    if args is None:
        if seqtype == "dna":
            args = "infile 0 s 1 %d HKY e e %d e %s %s" % \
                (bootiter, nrates, treefile, optimize)
        elif seqtype == "pep":
            args = "infile 1 s 1 %d JTT e %d e %s %s" % \
                (bootiter, nrates, treefile, optimize)
        else:
            # unreachable after the check above; kept as a safeguard
            raise AssertionError("unknown sequence type '%s'" % seqtype)

    phylip.exec_phylip("phyml %s" % args, options, verbose)

    # parse tree
    tree = phylip.read_out_tree("infile_phyml_tree.txt", labels)

    # parse likelihood
    with open("infile_phyml_lk.txt") as lkfile:
        tree.data["logl"] = float(lkfile.read())

    if saveOutput != "":
        phylip.save_temp_dir(cwd, saveOutput)
    else:
        phylip.cleanup_temp_dir(cwd)
    util.toc()

    return tree
# NOTE(review): this is an excerpt of a larger script; adef, tr, treefile,
# options and out are defined outside the visible lines.  Indentation was
# reconstructed from a whitespace-collapsed source -- confirm loop nesting
# against the original file.

# finish the model-optimization stage (its util.tic is not visible here)
raxml.optimize_model(adef, tr)
util.toc()

# draw_raxml_tree(tr, adef)

util.tic("Getting parameters for LH...")
bestVector, bestLH, weightSum = raxml.compute_best_LH(tr)
util.log("bestLH: %.3f" % bestLH)
util.toc()

# load the starting tree and strip branch lengths and bootstrap values
tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]

# hashes of the unrooted forms of every tree seen so far
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

for i in xrange(options.niter):
    # keep applying random SPR moves until the tree hashes to an unseen value
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))

    treehashes.add(treehash)

    # pipe endpoints wrapped as file objects; how they are used is not
    # visible in this excerpt
    r, w = os.pipe()
    fr, fw = os.fdopen(r, "r"), os.fdopen(w, "w")

    tree.write(out, oneline=True)
def draw_raxml_tree(self, *args, **kargs):
    """Draw raxml tr -- adef and tr must have been previously defined

    The tree held in self.tr is converted to a newick string, parsed,
    unrooted and drawn.  Extra positional and keyword arguments are
    forwarded to treelib.draw_tree.
    """
    newick = raxml.tree_to_string(self.tr, self.adef)
    unrooted = treelib.unroot(treelib.parse_newick(newick))
    treelib.draw_tree(unrooted, *args, **kargs)
# NOTE(review): this is an excerpt of a larger script; args and options are
# defined outside the visible lines.  Indentation was reconstructed from a
# whitespace-collapsed source -- confirm loop nesting against the original.

# the alignment file name is derived from the tree file name by swapping
# the tree extension for the alignment extension
treefile = args[0]
seqfile = util.replace_ext(treefile, options.treeext, options.alignext)
out = util.open_stream(options.output, 'w')

util.tic("Initializing RAXML and optimizing...")
module = raxml.RAxML()
module.optimize_model(treefile, seqfile, options.extra)
util.toc()

# load the starting tree and strip branch lengths and bootstrap values
tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]

# hashes of the unrooted forms of every tree seen so far
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

for i in xrange(options.niter):
    # keep applying random SPR moves until the tree hashes to an unseen value
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))

    treehashes.add(treehash)

    # emit the proposed tree on one line and flush it immediately
    tree.write(out, oneline=True); out.write('\n'); out.flush()

    # run the likelihood test on the proposed tree and log its results
    util.tic("Computing LH...")
    p, Dlnl = module.compute_lik_test(tree)
    util.log("pvalue: %.3f, Dlnl: %.3f" % (p, Dlnl))