def _test_bounded_multicoal_tree(stree, n, T, nsamples):
    """Sample bounded multicoal trees and tabulate the observed topologies.

    Draws `nsamples` trees with coal.sample_bounded_multicoal_tree, passing
    `stree`, `n` and `T` through unchanged, and groups the samples by their
    phylo.hash_tree value.

    Returns a pair (tab, tops):
      tab  -- Table with columns "top", "simple_top", "percent", "prob",
              one row per distinct topology, sorted by decreasing "prob".
      tops -- dict mapping topology hash to
              [count, first tree sampled with that hash, its recon].
    """
    tops = {}
    for _ in xrange(nsamples):
        # coal.sample_bounded_multicoal_tree_reject (rejection sampling)
        # accepts the same arguments and can be swapped in here.
        sampled_tree, sampled_recon = coal.sample_bounded_multicoal_tree(
            stree, n, T, namefunc=lambda x: x)

        key = phylo.hash_tree(sampled_tree)
        if key not in tops:
            tops[key] = [0, sampled_tree, sampled_recon]
        tops[key][0] += 1

    tab = Table(headers=["top", "simple_top", "percent", "prob"])
    for key, (count, tree, recon) in tops.items():
        # hash a copy with single-child nodes removed, leaving the stored
        # tree untouched for the probability computation below
        simplified = tree.copy()
        treelib.remove_single_children(simplified)
        simple_key = phylo.hash_tree(simplified)

        fraction = count / float(nsamples)
        log_prob = coal.prob_bounded_multicoal_recon_topology(
            tree, recon, stree, n, T)

        tab.add(top=key,
                simple_top=simple_key,
                percent=fraction,
                prob=exp(log_prob))

    tab.sort(col="prob", reverse=True)
    return tab, tops
def do_test_coal_sim(self, stree, gene2species, n, ntrees=10000, tabsize=30):
    """Simulate multicoal gene trees and score each observed topology.

    Samples `ntrees` trees with coal.sample_multicoal_tree(stree, n), builds
    a histogram of their topology hashes with histtab, and for every
    histogram row evaluates coal.prob_coal_recon_topology on the last tree
    sampled with that hash.

    `gene2species` and `tabsize` are accepted but not used in this body.

    Returns (hist, probs): the histogram table and a list of exp(p) values,
    one per histogram row, in row order.
    """
    tops = []
    lookup = {}

    util.tic("simulating %d trees" % ntrees)
    for _ in xrange(ntrees):
        tree, recon = coal.sample_multicoal_tree(stree, n,
                                                 namefunc=lambda x: x)
        top = phylo.hash_tree(tree)
        tops.append(top)
        # later samples with the same hash replace earlier ones
        lookup[top] = (tree, recon)
    util.toc()

    hist = histtab(tops)

    probs = []
    for row in hist:
        tree, recon = lookup[row["item"]]

        try:
            treelib.remove_single_children(tree)

            # drop reconciliation entries for nodes that were just removed
            remaining = set(tree.postorder())
            removed = [node for node in recon.keys()
                       if node not in remaining]
            for node in removed:
                del recon[node]

            p = coal.prob_coal_recon_topology(tree, recon, stree, n)
        except:
            # debugging aid: show the offending tree, then propagate
            draw_tree(tree, maxlen=5, minlen=5)
            raise

        probs.append(exp(p))

    return hist, probs
def process_tree(tree, stree, gene2species):
    """Handle one tree according to the mode selected in `options`.

    Reads the module-level `options` object and, depending on which flag is
    set (checked in this order), does exactly one of:

      hist / hashes -- append the tree's topology hash to `hashes`
      histsplit     -- rename leaves to their species and add the tree's
                       splits to `splits`
      dump          -- call dump_tree
      events        -- call phylo.count_dup_loss_tree (requires both
                       `stree` and `gene2species`)
      otherwise     -- call display_tree

    Returns None.
    """
    # topology histogram / hash listing
    if options.hist or options.hashes:
        hashes.append(phylo.hash_tree(tree, gene2species))
        return

    # split histogram
    if options.histsplit:
        for leaf in tree.leaves():
            tree.rename(leaf.name, gene2species(leaf.name))
        splits.extend(phylo.find_splits(tree))
        return

    # dump mode
    if options.dump:
        dump_tree(tree)
        return

    # event labelling
    if options.events:
        assert stree is not None and gene2species is not None
        phylo.count_dup_loss_tree(tree, stree, gene2species)
        return

    # default mode: display tree
    display_tree(tree, options, gene2species=gene2species, stree=stree)
def test_top(self):
    """Compare topology counts of the bounded sampler and rejection sampling.

    Draws 4000 trees from each of coal.sample_bounded_multicoal_tree_reject
    and coal.sample_bounded_multicoal_tree on a fixed five-leaf species
    tree, counts topologies per sampler, and asserts that the log counts
    correlate above 0.9.  A scatter plot of the log counts is then saved
    under the test output directory.
    """
    outdir = 'test/tmp/test_coal/BMC_test_top/'
    make_clean_dir(outdir)

    stree = treelib.parse_newick(
        "(((A:200, E:200):800, B:1000):500, (C:700, D:700):800);")
    n = 500
    T = 2000
    nsamples = 4000

    reject_counts = {}
    direct_counts = {}

    for _ in xrange(nsamples):
        # rejection sampler
        rtree, rrecon = coal.sample_bounded_multicoal_tree_reject(
            stree, n, T, namefunc=lambda x: x)

        # direct sampler
        dtree, drecon = coal.sample_bounded_multicoal_tree(
            stree, n, T, namefunc=lambda x: x)

        rtop = phylo.hash_tree(rtree)
        dtop = phylo.hash_tree(dtree)

        # Each sampler counts its own topology; the other sampler's
        # topology is registered with a zero count so that both tables
        # end up with the same key set.
        reject_counts.setdefault(rtop, [0, rtree, rrecon])[0] += 1
        reject_counts.setdefault(dtop, [0, dtree, drecon])

        direct_counts.setdefault(dtop, [0, dtree, drecon])[0] += 1
        direct_counts.setdefault(rtop, [0, rtree, rrecon])

    topologies = reject_counts.keys()
    x = [safelog(reject_counts[top][0], default=0) for top in topologies]
    y = [safelog(direct_counts[top][0], default=0) for top in topologies]

    self.assertTrue(stats.corr(x, y) > .9)

    # scatter of log counts plus the x = y reference line
    lo = min(x)
    hi = max(x)
    plot = Gnuplot()
    plot.enableOutput(False)
    plot.plot(x, y)
    plot.plot([lo, hi], [lo, hi], style="lines")
    plot.enableOutput(True)
    plot.save(outdir + 'plot.png')
def _test_prog_infsites(): make_clean_dir("test/tmp/test_prog_infsites") run_cmd("""bin/arg-sim \ -k 40 -L 200000 \ -N 1e4 -r 1.5e-8 -m 2.5e-8 --infsites \ --ntimes 20 --maxtime 400e3 \ -o test/tmp/test_prog_infsites/0""") make_clean_dir("test/tmp/test_prog_infsites/0.sample") run_cmd("""bin/arg-sample \ -s test/tmp/test_prog_infsites/0.sites \ -N 1e4 -r 1.5e-8 -m 2.5e-8 \ --ntimes 5 --maxtime 100e3 -c 1 \ --climb 0 -n 20 --infsites \ -x 1 \ -o test/tmp/test_prog_infsites/0.sample/out""") arg = argweaver.read_arg( "test/tmp/test_prog_infsites/0.sample/out.0.smc.gz") sites = argweaver.read_sites("test/tmp/test_prog_infsites/0.sites") print "names", sites.names print noncompats = [] for block, tree in arglib.iter_local_trees(arg): tree = tree.get_tree() treelib.remove_single_children(tree) phylo.hash_order_tree(tree) for pos, col in sites.iter_region(block[0] + 1, block[1] + 1): assert block[0] + 1 <= pos <= block[1] + 1, (block, pos) split = sites_split(sites.names, col) node = arglib.split_to_tree_branch(tree, split) if node is None: noncompats.append(pos) print "noncompat", block, pos, col print phylo.hash_tree(tree) print tree.leaf_names() print "".join(col[sites.names.index(name)] for name in tree.leaf_names()) print split print print "num noncompats", len(noncompats)
def _test_prog_infsites(): make_clean_dir("test/data/test_prog_infsites") run_cmd("""bin/arg-sim \ -k 40 -L 200000 \ -N 1e4 -r 1.5e-8 -m 2.5e-8 --infsites \ --ntimes 20 --maxtime 400e3 \ -o test/data/test_prog_infsites/0""") make_clean_dir("test/data/test_prog_infsites/0.sample") run_cmd("""bin/arg-sample \ -s test/data/test_prog_infsites/0.sites \ -N 1e4 -r 1.5e-8 -m 2.5e-8 \ --ntimes 5 --maxtime 100e3 -c 1 \ --climb 0 -n 20 --infsites \ -x 1 \ -o test/data/test_prog_infsites/0.sample/out""") arg = argweaver.read_arg( "test/data/test_prog_infsites/0.sample/out.0.smc.gz") sites = argweaver.read_sites("test/data/test_prog_infsites/0.sites") print "names", sites.names print noncompats = [] for block, tree in arglib.iter_local_trees(arg): tree = tree.get_tree() treelib.remove_single_children(tree) phylo.hash_order_tree(tree) for pos, col in sites.iter_region(block[0]+1, block[1]+1): assert block[0]+1 <= pos <= block[1]+1, (block, pos) split = sites_split(sites.names, col) node = arglib.split_to_tree_branch(tree, split) if node is None: noncompats.append(pos) print "noncompat", block, pos, col print phylo.hash_tree(tree) print tree.leaf_names() print "".join(col[sites.names.index(name)] for name in tree.leaf_names()) print split print print "num noncompats", len(noncompats)
def test(self):
    """Check that an SPR search step changes the tree and revert undoes it.

    Starts from a six-leaf tree, applies one random SPR move, validates the
    tree, then repeats 100 times: take a step with phylo.TreeSearchSpr,
    assert the topology hash changed, revert, and assert the hash is back
    to its value before the step.
    """
    tree = parse_newick("((a,b),((c,d),(e,f)))")

    # one random SPR up front so the search starts from a perturbed tree
    subtree, target = phylo.propose_random_spr(tree)
    phylo.perform_spr(tree, subtree, target)
    treelib.assert_tree(tree)

    for _ in xrange(100):
        before = phylo.hash_tree(tree)

        search = phylo.TreeSearchSpr(tree)
        search.next()
        after = phylo.hash_tree(tree)
        self.assertNotEqual(before, after)

        search.revert()
        self.assertEqual(phylo.hash_tree(tree), before)
def __repr__(self):
    """Return the repr of a dict summarising this object by node names.

    The dict holds the coal/locus reconciliations and locus events as lists
    of name pairs, the locus tree as one-line newick plus its topology
    hash, the daughter node names, and `self.data` as-is.
    """
    coal_pairs = [(node.name, lnode.name)
                  for node, lnode in self.coal_recon.iteritems()]
    newick = self.locus_tree.get_one_line_newick(root_data=True)
    topology = phylo.hash_tree(self.locus_tree)
    locus_pairs = [(lnode.name, snode.name)
                   for lnode, snode in self.locus_recon.iteritems()]
    event_pairs = [(lnode.name, event)
                   for lnode, event in self.locus_events.iteritems()]
    daughter_names = [lnode.name for lnode in self.daughters]

    summary = {"coal_recon": coal_pairs,
               "locus_tree": newick,
               "locus_top": topology,
               "locus_recon": locus_pairs,
               "locus_events": event_pairs,
               "daughters": daughter_names,
               "data": self.data}
    return repr(summary)
def walk(node):
    """Assign a topology hash to `node` and everything below it.

    Results are stored in the enclosing-scope dict `colors`.  A node found
    in `leaves` is hashed directly with phylo.hash_tree; any other node is
    hashed by composing the sorted hashes of its children.  Whenever a node
    has more than one child and all child hashes are equal, the counter
    `nmirrors[0]` is incremented.
    """
    if node in leaves:
        colors[node] = phylo.hash_tree(node, gene2species)
        return

    # children first, so their hashes are available below
    for child in node.children:
        walk(child)

    child_hashes = util.mget(colors, node.children)
    has_mirror = len(child_hashes) > 1 and util.equal(*child_hashes)
    if has_mirror:
        nmirrors[0] += 1

    # sort so the composed hash does not depend on child order
    child_hashes.sort()
    colors[node] = phylo.hash_tree_compose(child_hashes)
def arg_equal(arg, arg2):
    """Assert that two ARGs agree on recombinations, local trees and SPRs.

    Three checks are made with nose.tools.assert_equal, in this order:
      1. the sorted positions of all "recomb" nodes match;
      2. for every local block of `arg`, the topology hash of its local
         tree equals that of `arg2`'s marginal tree at the block midpoint
         (single lineages removed from both);
      3. the SPR sequences from arglib.iter_arg_sprs match pairwise on
         position, recombination (leaves, time) and coalescence
         (leaves, time).

    Returns None; raises AssertionError on the first mismatch.
    """
    def canonical(branch):
        # branch is (leaves, time); sort the leaves so order is irrelevant
        return (sorted(branch[0]), branch[1])

    # recombination positions
    positions1 = sorted(node.pos for node in arg if node.event == "recomb")
    positions2 = sorted(node.pos for node in arg2 if node.event == "recomb")
    nose.tools.assert_equal(positions1, positions2)

    # local tree topologies
    for (start, end), local in arglib.iter_local_trees(arg):
        midpoint = (start + end) / 2.0

        arglib.remove_single_lineages(local)
        tree1 = local.get_tree()

        marginal = arg2.get_marginal_tree(midpoint)
        arglib.remove_single_lineages(marginal)
        tree2 = marginal.get_tree()

        top1 = phylo.hash_tree(tree1)
        top2 = phylo.hash_tree(tree2)
        nose.tools.assert_equal(top1, top2)

    # SPR moves, compared pairwise
    sprs1 = arglib.iter_arg_sprs(arg, use_leaves=True)
    sprs2 = arglib.iter_arg_sprs(arg2, use_leaves=True)
    for spr1, spr2 in zip(sprs1, sprs2):
        pos1, recomb1, coal1 = spr1
        pos2, recomb2, coal2 = spr2

        recomb1 = canonical(recomb1)
        recomb2 = canonical(recomb2)
        coal1 = canonical(coal1)
        coal2 = canonical(coal2)

        nose.tools.assert_equal(pos1, pos2)
        nose.tools.assert_equal(recomb1, recomb2)
        nose.tools.assert_equal(coal1, coal2)
def arg_equal(arg, arg2):
    """Assert that `arg` and `arg2` describe the same ARG.

    Compares, via nose.tools.assert_equal:
      - the sorted recombination positions of both ARGs;
      - the topology hash of each local tree of `arg` against the marginal
        tree of `arg2` at the midpoint of the same block, after removing
        single lineages from both;
      - the SPR operations yielded by arglib.iter_arg_sprs, pairwise, on
        position, recombination branch and coalescence branch (leaf lists
        are sorted before comparison).

    Returns None; an AssertionError is raised on the first difference.
    """
    def sort_leaves(branch):
        leaves, time = branch[0], branch[1]
        return (sorted(leaves), time)

    # test recomb points
    recomb_pos = sorted(x.pos for x in arg if x.event == "recomb")
    recomb_pos2 = sorted(x.pos for x in arg2 if x.event == "recomb")
    nose.tools.assert_equal(recomb_pos, recomb_pos2)

    # check local tree topologies
    for (start, end), subarg in arglib.iter_local_trees(arg):
        mid = (start + end) / 2.0

        arglib.remove_single_lineages(subarg)
        local_tree = subarg.get_tree()

        subarg2 = arg2.get_marginal_tree(mid)
        arglib.remove_single_lineages(subarg2)
        local_tree2 = subarg2.get_tree()

        hash1 = phylo.hash_tree(local_tree)
        hash2 = phylo.hash_tree(local_tree2)
        nose.tools.assert_equal(hash1, hash2)

    # check sprs
    pairs = izip(arglib.iter_arg_sprs(arg, use_leaves=True),
                 arglib.iter_arg_sprs(arg2, use_leaves=True))
    for (pos1, recomb1, coal1), (pos2, recomb2, coal2) in pairs:
        recomb1 = sort_leaves(recomb1)
        recomb2 = sort_leaves(recomb2)
        coal1 = sort_leaves(coal1)
        coal2 = sort_leaves(coal2)

        # check pos, leaves, time
        nose.tools.assert_equal(pos1, pos2)
        nose.tools.assert_equal(recomb1, recomb2)
        nose.tools.assert_equal(coal1, coal2)
def __repr__(self):
    """Return repr() of a name-based dictionary view of this object.

    Node objects are replaced by their `.name`; the locus tree appears both
    as one-line newick (with root data) and as its phylo.hash_tree value.
    """
    def name_pairs(mapping):
        # (key name, value name) for a node -> node mapping
        return [(key.name, val.name) for key, val in mapping.iteritems()]

    fields = {
        "coal_recon": name_pairs(self.coal_recon),
        "locus_tree": self.locus_tree.get_one_line_newick(root_data=True),
        "locus_top": phylo.hash_tree(self.locus_tree),
        "locus_recon": name_pairs(self.locus_recon),
        "locus_events": [(lnode.name, event)
                         for lnode, event in self.locus_events.iteritems()],
        "daughters": [lnode.name for lnode in self.daughters],
        "data": self.data
    }
    return repr(fields)
def do_test_coal_sim(self, stree, gene2species, n, ntrees=10000, tabsize=30):
    """Run a multicoal simulation and compute a probability per topology.

    `ntrees` gene trees are sampled with coal.sample_multicoal_tree and
    hashed with phylo.hash_tree.  histtab turns the hashes into a histogram;
    for each histogram row the most recently sampled tree with that hash is
    simplified and passed to coal.prob_coal_recon_topology.

    `gene2species` and `tabsize` are not referenced in this body.

    Returns (hist, probs), where probs[i] is exp() of the value computed
    for the i-th row of hist.
    """
    util.tic("simulating %d trees" % ntrees)
    tops = []
    lookup = {}
    for _ in xrange(ntrees):
        sample = coal.sample_multicoal_tree(stree, n, namefunc=lambda x: x)
        tops.append(phylo.hash_tree(sample[0]))
        lookup[tops[-1]] = sample
    util.toc()

    hist = histtab(tops)

    probs = []
    for row in hist:
        tree, recon = lookup[row["item"]]

        try:
            treelib.remove_single_children(tree)

            # keep only reconciliation entries whose node is still in tree
            live = set(tree.postorder())
            for node in [key for key in recon.keys() if key not in live]:
                del recon[node]

            p = coal.prob_coal_recon_topology(tree, recon, stree, n)
        except:
            # show the tree that triggered the failure before re-raising
            draw_tree(tree, maxlen=5, minlen=5)
            raise

        probs.append(exp(p))

    return hist, probs
def __eq__(self, other):
    """x.__eq__(y) <==> x==y

    NOTE 1: Only the locus tree node names are allowed to change.
            (Internal nodes of coal_tree and stree must be identical!)
    NOTE 2: Data are not compared.

    Compared, in order: locus tree topology hash, locus_recon,
    locus_events, daughters, coal_recon.  The first mismatch is reported on
    stderr and False is returned; True is returned when all agree.
    """

    def error(msg):
        print >> sys.stderr, msg
        return False

    def index_by_leaves(tree):
        """return dict with key=sorted tuple of leaf names below node,
        val=node"""
        below = {}
        for node in tree.postorder():
            if node.is_leaf():
                below[node] = [node.name]
            else:
                names = []
                for child in node.children:
                    names.extend(below[child])
                below[node] = names

        index = {}
        for node in tree:
            index[tuple(sorted(below[node]))] = node
        return index

    def map_nodes(tree1, tree2):
        """map each tree1 node to the tree2 node with the same leaf set"""
        index1 = index_by_leaves(tree1)
        index2 = index_by_leaves(tree2)
        mapping = {}
        for leafset, node in index1.iteritems():
            mapping[node] = index2[leafset]
        return mapping

    def mapped_name(lnode):
        # name of the node in other.locus_tree matching a self locus node
        return locus_map[lnode].name

    def own_name(node):
        return node.name

    # locus tree topologies
    self_top = phylo.hash_tree(self.locus_tree)
    other_top = phylo.hash_tree(other.locus_tree)
    if self_top != other_top:
        return error("locus_tree mismatch")

    # translate self's locus nodes into other's locus nodes via leaf sets
    # TODO : more efficient to use hash_tree_names to map?
    locus_map = map_nodes(self.locus_tree, other.locus_tree)

    # locus_recon
    mine = util.mapdict(self.locus_recon, key=mapped_name, val=own_name)
    theirs = util.mapdict(other.locus_recon, key=own_name, val=own_name)
    if mine != theirs:
        return error("locus_recon mismatch")

    # locus_events
    mine = util.mapdict(self.locus_events, key=mapped_name)
    theirs = util.mapdict(other.locus_events, key=own_name)
    if mine != theirs:
        return error("locus_events mismatch")

    # daughters
    mine = set(mapped_name(lnode) for lnode in self.daughters)
    theirs = set(own_name(lnode) for lnode in other.daughters)
    if mine != theirs:
        return error("daughters mismatch")

    # coal_recon
    mine = util.mapdict(self.coal_recon, key=own_name, val=mapped_name)
    theirs = util.mapdict(other.coal_recon, key=own_name, val=own_name)
    if mine != theirs:
        return error("coal_recon mismatch")

    # everything identical
    return True
def __eq__(self, other):
    """x.__eq__(y) <==> x==y

    NOTE 1: Only the locus tree node names are allowed to change.
            (Internal nodes of coal_tree and stree must be identical!)
    NOTE 2: Data are not compared.

    Compared, in order: locus tree topology hash, locus_recon,
    locus_events, daughters, coal_recon.  The first mismatch is reported on
    stderr and False is returned; True is returned when all agree.
    """
    # Import first: a function-level import makes `sys` local to the whole
    # function, so it must be bound before the first mismatch report.
    # Importing later made the "locus_tree mismatch" path raise
    # UnboundLocalError instead of returning False.
    import sys

    # are locus_trees identical?
    if phylo.hash_tree(self.locus_tree) != phylo.hash_tree(other.locus_tree):
        print >>sys.stderr, "locus_tree mismatch"
        return False

    # are locus_recon, locus_events, daughters identical?
    # first remap nodes using leaf names
    # TODO : more efficient to use hash_tree_names to map?
    def get_leaf_dct(tree):
        """return dict with key=leaves, val=node"""
        # leaves[node] = names of all leaves below node (children first)
        leaves = {}
        for node in tree.postorder():
            if node.is_leaf():
                leaves[node] = [node.name]
            else:
                leaves[node] = []
                for child in node.children:
                    leaves[node].extend(leaves[child])

        # key each node by its sorted leaf-name tuple
        dct = {}
        for node in tree:
            dct[tuple(sorted(leaves[node]))] = node
        return dct

    def get_map(tree1, tree2):
        """remap tree1 nodes into tree2 nodes"""
        m = {}
        tree1_dict = get_leaf_dct(tree1)
        tree2_dict = get_leaf_dct(tree2)
        for leaves, node in tree1_dict.iteritems():
            m[node] = tree2_dict[leaves]
        return m

    locus_map = get_map(self.locus_tree, other.locus_tree)

    # self's locus nodes are translated through locus_map so that both
    # sides are expressed in other's node names
    locus_recon = util.mapdict(self.locus_recon,
                               key=lambda lnode: locus_map[lnode].name,
                               val=lambda snode: snode.name)
    other_locus_recon = util.mapdict(other.locus_recon,
                                     key=lambda lnode: lnode.name,
                                     val=lambda snode: snode.name)
    if locus_recon != other_locus_recon:
        print >>sys.stderr, "locus_recon mismatch"
        return False

    locus_events = util.mapdict(self.locus_events,
                                key=lambda lnode: locus_map[lnode].name)
    other_locus_events = util.mapdict(other.locus_events,
                                      key=lambda lnode: lnode.name)
    if locus_events != other_locus_events:
        print >>sys.stderr, "locus_events mismatch"
        return False

    daughters = set([locus_map[lnode].name for lnode in self.daughters])
    other_daughters = set([lnode.name for lnode in other.daughters])
    if daughters != other_daughters:
        print >>sys.stderr, "daughters mismatch"
        return False

    # are coal_recon identical?
    coal_recon = util.mapdict(self.coal_recon,
                              key=lambda node: node.name,
                              val=lambda lnode: locus_map[lnode].name)
    other_coal_recon = util.mapdict(other.coal_recon,
                                    key=lambda node: node.name,
                                    val=lambda lnode: lnode.name)
    if coal_recon != other_coal_recon:
        print >>sys.stderr, "coal_recon mismatch"
        return False

    return True
# Script fragment: optimise a RAxML model for one tree/alignment pair, then
# repeatedly perturb the tree with random SPR moves and run a likelihood
# test on each new topology.
# NOTE(review): the original indentation was lost; the nesting below is the
# most plausible reading and should be confirmed against the source file.

# input tree, matching alignment (same path with the extension swapped),
# and output stream
treefile = args[0]
seqfile = util.replace_ext(treefile, options.treeext, options.alignext)
out = util.open_stream(options.output, 'w')

util.tic("Initializing RAXML and optimizing...")
module = raxml.RAxML()
module.optimize_model(treefile, seqfile, options.extra)
util.toc()

# reload the tree, zeroing branch lengths and dropping "boot" data entries
tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]

# hashes of unrooted topologies visited so far, seeded with the input tree
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

for i in xrange(options.niter):
    # keep applying random SPR moves until an unseen topology is reached
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))

    treehashes.add(treehash)
    # emit the new tree on one line and flush immediately
    tree.write(out, oneline=True); out.write('\n'); out.flush()

    # NOTE(review): no matching util.toc() is visible in this fragment
    util.tic("Computing LH...")
    p, Dlnl = module.compute_lik_test(tree)
    util.log("pvalue: %.3f, Dlnl: %.3f" % (p, Dlnl))
# Script fragment (begins and ends mid-flow): finish RAxML model
# optimisation, fetch the best-likelihood parameters, then walk through
# previously unseen topologies using random SPR moves.
# NOTE(review): the original indentation was lost; the nesting below is the
# most plausible reading and should be confirmed against the source file.
raxml.optimize_model(adef, tr)
util.toc()

# draw_raxml_tree(tr, adef)

util.tic("Getting parameters for LH...")
bestVector, bestLH, weightSum = raxml.compute_best_LH(tr)
util.log("bestLH: %.3f" % bestLH)
util.toc()

# reload the tree, zeroing branch lengths and dropping "boot" data entries
tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]

# hashes of unrooted topologies visited so far, seeded with the input tree
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

for i in xrange(options.niter):
    # keep applying random SPR moves until an unseen topology is reached
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))

    treehashes.add(treehash)

    # pipe wrapped as file objects (read end, write end)
    # NOTE(review): how fr/fw are used is not visible in this fragment
    r, w = os.pipe()
    fr, fw = os.fdopen(r, "r"), os.fdopen(w, "w")

    tree.write(out, oneline=True)
def compute_cost(self, gtree):
    """Returns the duplication-loss-coalescence cost

    Searches over locus trees for `self.niter` iterations, starting from a
    locus tree with the same topology as `gtree`, and keeps the locus tree
    with the lowest cost found.  On return, `self.locustree` is set to that
    tree and written to `self.output`.

    A proposal is reverted without being scored when either
      - its topology has already been seen and at least 10% of the
        iterations so far produced new topologies, or
      - its Robinson-Foulds error against `gtree` exceeds `self.rf`.

    With probability `self.freconroot` per iteration the proposal is
    re-rooted with phylo.recon_root before scoring; otherwise it is scored
    as proposed with self._compute_cost_helper.

    Returns the minimum cost found.
    """
    # start with locus tree equal to gene tree
    ltree = self.locustree
    treelib.set_tree_topology(ltree, gtree)

    # initialize search
    self.search.set_tree(ltree)
    treehash = phylo.hash_tree(ltree)
    uniques = set([treehash])

    # initialize optimal locus tree and DLC cost
    ntrees = 0
    minltree = ltree
    mincost = self._compute_cost_helper(gtree, ltree)

    # random values: one uniform draw per iteration, reproducible from
    # self.seed.  The pure-Python draw is the default so this works without
    # NumPy; when NumPy is available its generator is seeded first and then
    # used.  (Previously nprnd.random() was called outside the NUMPY guard
    # and before seeding, and its result was discarded.)
    random.seed(self.seed)
    randvec = [random.random() for _ in xrange(self.niter)]
    if NUMPY:
        nprnd.seed(self.seed)
        randvec = nprnd.random(self.niter)

    # search locus trees
    for i in xrange(self.niter):
        # propose locus tree
        ltree = self.search.propose()
        treehash = phylo.hash_tree(ltree)

        # skip already-seen topologies once enough new ones were found
        if treehash in uniques and ntrees >= 0.1*i:
            self.search.revert()
            continue

        # skip locus trees too far from the gene tree
        if phylo.robinson_foulds_error(gtree, ltree) > self.rf:
            self.search.revert()
            continue

        # save tree
        if treehash not in uniques:
            uniques.add(treehash)
            ntrees += 1

        # reconroot (some percentage of the time depending on freconroot)
        rerooted = randvec[i] < self.freconroot
        if rerooted:
            ltree, dlcost = phylo.recon_root(ltree, self.stree,
                                             self.gene2species,
                                             newCopy=True,
                                             keepName=True,
                                             returnCost=True,
                                             dupcost=self.dupcost,
                                             losscost=self.losscost)
            coalcost = self._compute_coalcost(gtree, ltree)
            cost = dlcost + coalcost
        else:
            cost = self._compute_cost_helper(gtree, ltree)

        # update min cost and decide how to continue proposals from here
        if cost < mincost:
            # recon_root already returned a fresh copy (newCopy=True);
            # otherwise copy so later proposals do not mutate the optimum
            minltree = ltree if rerooted else ltree.copy()
            mincost = cost
        else:
            self.search.revert()

    # set optimal locus tree
    self.locustree = minltree
    self.locustree.write(self.output)

    return mincost