def test_scored_trees_collection_write(self):
    """writes a tree collection"""
    sct = ScoredTreeCollection(self.rooted_trees_lengths)
    # The first positional parameter of TemporaryDirectory is ``suffix``,
    # so the parent directory has to be given with the ``dir`` keyword.
    with TemporaryDirectory(dir=".") as dirname:
        out = pathlib.Path(dirname) / "collection.trees"
        sct.write(out)
        # without this check the test only proved that write() did not raise
        self.assertTrue(out.exists())
def test_consensus_from_scored_trees_collection_ii(self):
    """strict consensus should handle conflicting trees"""
    expected_newick = "(a,b,c,d);"
    scores = [1] * 3

    # conflicting unrooted trees, default consensus method
    unrooted = ScoredTreeCollection(
        list(zip(scores, self.unrooted_conflicting_trees))
    )
    consensus = unrooted.get_consensus_trees()[0]
    self.assertTrue(consensus.same_topology(Tree(expected_newick)))

    # conflicting rooted trees, "rooted" consensus method
    rooted = ScoredTreeCollection(
        list(zip(scores, self.rooted_conflicting_trees))
    )
    consensus = rooted.get_consensus_trees(method="rooted")[0]
    self.assertTrue(consensus.same_topology(Tree(expected_newick)))
def test_consensus_tree_branch_lengths(self):
    """consensus trees should average branch lengths properly"""

    def get_ac(tree):
        # First non-root edge whose children are named exactly "a" and "c";
        # None when the tree has no such edge.
        edges = tree.get_edge_vector(include_root=False)
        return next(
            (e for e in edges if {c.name for c in e.children} == {"a", "c"}),
            None,
        )

    tolerance = 1e-9

    # --- unrooted collection, default consensus method ---
    consensus = ScoredTreeCollection(
        self.unrooted_trees_lengths
    ).get_consensus_tree()
    majority = self.unrooted_trees_lengths[0][1]
    # Root and order the consensus the same way as the majority tree so
    # that the a/c edge is comparable between the two.
    tip_order = majority.get_tip_names()
    consensus = consensus.rooted_with_tip("d").sorted(tip_order)
    delta = abs(get_ac(consensus).length - get_ac(majority).length)
    self.assertTrue(delta < tolerance)

    # --- rooted collection, "rooted" consensus method ---
    consensus = ScoredTreeCollection(
        self.rooted_trees_lengths
    ).get_consensus_tree(method="rooted")
    majority = self.rooted_trees_lengths[0][1]
    delta = abs(get_ac(consensus).length - get_ac(majority).length)
    self.assertTrue(delta < tolerance)
def results2output(self, results):
    """Wrap ``results`` in a ScoredTreeCollection and return it."""
    collection = ScoredTreeCollection(results)
    return collection
def gnj(dists, keep=None, dkeep=0, ui=None):
    """Build trees by GNJ, retaining several partial trees per iteration.

    Arguments:
        - dists: dict of (name1, name2): distance, or any object offering a
          ``to_dict()`` method that returns such a dict.
        - keep: number of best partial trees to keep at each iteration,
          and therefore to return.  Same as Q parameter in original GNJ
          paper.  Defaults to five times the number of names.
        - dkeep: number of diverse partial trees to keep at each iteration,
          and therefore to return.  Same as D parameter in original GNJ
          paper.
        - ui: progress reporter; its ``display(msg=..., progress=...)``
          method is called as trees are selected.
          NOTE(review): the default of None would fail at ``ui.display``;
          presumably a decorator not visible here injects it -- confirm.
    Result:
        - a ScoredTreeCollection built from the sorted list of tuples
          produced by ``asScoreTreeTuple()`` on each final tree
    """
    # Accept objects exposing to_dict() as well as plain dicts.
    try:
        dists = dists.to_dict()
    except AttributeError:
        pass

    (names, d) = distance_dict_to_2D(dists)

    if keep is None:
        keep = len(names) * 5
    all_keep = keep + dkeep

    # For recognising duplicate topologies, encode partitions (ie: edges) as
    # frozensets of tip names, which should be quickly comparable.
    arbitrary_anchor = names[0]
    all_tips = frozenset(names)

    def encode_partition(tips):
        # Always return the side of the split that contains the anchor tip,
        # so both descriptions of one edge map to the same frozenset.
        # could also convert to long int, or cache, would be faster?
        included = frozenset(tips)
        if arbitrary_anchor not in included:
            included = all_tips - included
        return included

    # Start from a single star tree: one tip node per name, empty topology.
    tips = [frozenset([n]) for n in names]
    nodes = [LightweightTreeTip(name) for name in names]
    star_tree = PartialTree(d, nodes, tips, 0.0)
    star_tree.topology = frozenset([])
    trees = [star_tree]

    # Progress display auxiliary code
    template = " size %%s/%s trees %%%si" % (len(names), len(str(all_keep)))
    total_work = 0
    max_candidates = 1
    total_work_before = {}
    # For each size L, record the work accumulated before it and add a bound
    # on the trees kept at that size (capped at all_keep); L * (L - 1) // 2
    # is the number of pairs among L nodes.
    for L in range(len(names), 3, -1):
        total_work_before[L] = total_work
        max_candidates = min(all_keep, max_candidates * L * (L - 1) // 2)
        total_work += max_candidates

    def _show_progress():
        # Reads next_trees and L from the enclosing scope at call time, so
        # it always reports the state of the iteration currently running.
        t = len(next_trees)
        work_done = total_work_before[L] + t
        ui.display(msg=template % (L, t), progress=work_done / total_work)

    for L in range(len(names), 3, -1):
        # Generator of candidate joins, best first.
        # Note that with dkeep>0 this generator is used up a bit at a time
        # by 2 different interrupted 'for' loops below.
        candidates = uniq_neighbour_joins(trees, encode_partition)

        # First take up to 'keep' best ones
        next_trees = []
        _show_progress()
        for pair in candidates:
            next_trees.append(pair)
            if len(next_trees) == keep:
                break
        _show_progress()

        # The very best one is used as an anchor for measuring the
        # topological distance to others
        best_topology = next_trees[0].topology
        prior_td = [len(best_topology ^ tree.topology) for tree in trees]

        # Maintain a separate queue of joins for each possible
        # topological distance
        max_td = (max(prior_td) + 1) // 2
        queue = [deque() for g in range(max_td + 1)]
        queued = 0

        # Now take up to dkeep joins, an equal number of the best at each
        # topological distance, while not calculating any more TDs than
        # necessary.
        # Re-key the distances by id() of each source tree for lookup via
        # pair.tree below.
        prior_td = dict(list(zip(list(map(id, trees)), prior_td)))
        target_td = 1
        while (candidates or queued) and len(next_trees) < all_keep:
            if candidates and not queue[target_td]:
                # Pull candidates until one lands in the wanted queue; the
                # others pulled on the way stay queued under their own td.
                for pair in candidates:
                    # diff is a bool used as a 0/1 index: a new partition
                    # absent from best_topology adds one to the symmetric
                    # difference, one that is present removes one.
                    diff = pair.new_partition not in best_topology
                    td = (prior_td[id(pair.tree)] + [-1, +1][diff]) // 2
                    # equiv, slower: td = len(best_topology ^ topology) // 2
                    queue[td].append(pair)
                    queued += 1
                    if td == target_td:
                        break
                else:
                    # Generator exhausted without a break: stop polling it.
                    candidates = None
            if queue[target_td]:
                next_trees.append(queue[target_td].popleft())
                queued -= 1
                _show_progress()

            # Cycle the target distance through 1..max_td.
            target_td = target_td % max_td + 1

        # Perform the selected joins to obtain the trees for the next size.
        trees = [pair.joined() for pair in next_trees]

    result = [tree.asScoreTreeTuple() for tree in trees]
    result.sort()
    return ScoredTreeCollection(result)