def gnj(dists, keep=None, dkeep=0, ui=None):
    """Generalised Neighbour Joining: grow several partial trees at once.

    Arguments:
        - dists: dict of (name1, name2): distance
        - keep: number of best partial trees to keep at each iteration,
          and therefore to return. Same as Q parameter in original GNJ
          paper. Defaults to 5 times the number of tip names.
        - dkeep: number of diverse partial trees to keep at each iteration,
          and therefore to return. Same as D parameter in original GNJ
          paper.
        - ui: progress-reporting object; must provide a
          display(msg=..., progress=...) method (assumed from use below --
          confirm against caller).
    Result:
        - a sorted list of (tree length, tree) tuples
    """
    (names, d) = distanceDictTo2D(dists)

    if keep is None:
        keep = len(names) * 5
    all_keep = keep + dkeep

    # For recognising duplicate topologies, encode partitions (ie: edges) as
    # frozensets of tip names, which should be quickly comparable.
    arbitrary_anchor = names[0]
    all_tips = frozenset(names)

    def encode_partition(tips):
        # Canonicalise a bipartition: always return the side containing the
        # anchor tip, so both halves of the same edge encode identically.
        included = frozenset(tips)
        if arbitrary_anchor not in included:
            included = all_tips - included
        return included
        # could also convert to long int, or cache, would be faster?

    # Start from the star tree: every name is a tip off one central node,
    # with no internal edges yet (empty topology set).
    tips = [frozenset([n]) for n in names]
    nodes = [LightweightTreeTip(name) for name in names]
    star_tree = PartialTree(d, nodes, tips, 0.0)
    star_tree.topology = frozenset([])
    trees = [star_tree]

    # Progress display auxiliary code
    template = ' size %%s/%s trees %%%si' % (len(names), len(str(all_keep)))
    total_work = 0
    max_candidates = 1
    total_work_before = {}
    for L in range(len(names), 3, -1):
        total_work_before[L] = total_work
        max_candidates = min(all_keep, max_candidates*L*(L-1)//2)
        total_work += max_candidates

    def _show_progress():
        # Reads L and next_trees from the enclosing main loop via closure.
        t = len(next_trees)
        work_done = total_work_before[L] + t
        ui.display(msg=template % (L, t), progress=work_done/total_work)

    for L in range(len(names), 3, -1):
        # Generator of candidate joins, best first.
        # Note that with dkeep>0 this generator is used up a bit at a time
        # by 2 different interupted 'for' loops below.
        candidates = uniq_neighbour_joins(trees, encode_partition)

        # First take up to 'keep' best ones
        next_trees = []
        _show_progress()
        for pair in candidates:
            next_trees.append(pair)
            if len(next_trees) == keep:
                break
        _show_progress()

        # The very best one is used as an anchor for measuring the
        # topological distance to others
        best_topology = next_trees[0].topology
        prior_td = [len(best_topology ^ tree.topology) for tree in trees]

        # Maintain a separate queue of joins for each possible
        # topological distance
        max_td = (max(prior_td) + 1) // 2
        queue = [deque() for g in range(max_td+1)]
        queued = 0

        # Now take up to dkeep joins, an equal number of the best at each
        # topological distance, while not calculating any more TDs than
        # necessary.
        prior_td = dict(zip(map(id, trees), prior_td))
        target_td = 1
        while (candidates or queued) and len(next_trees) < all_keep:
            if candidates and not queue[target_td]:
                # Draw candidates (still best-first) until one lands in the
                # currently wanted distance bucket, queueing the others.
                for pair in candidates:
                    # A join alters the parent tree's TD to the anchor by
                    # exactly one partition, so the new TD follows from the
                    # parent's cached value without a full set comparison.
                    diff = pair.new_partition not in best_topology
                    td = (prior_td[id(pair.tree)] + [-1,+1][diff]) // 2
                    # equiv, slower: td = len(best_topology ^ topology) // 2
                    queue[td].append(pair)
                    queued += 1
                    if td == target_td:
                        break
                else:
                    # Generator exhausted: stop consulting it.
                    candidates = None
            if queue[target_td]:
                next_trees.append(queue[target_td].popleft())
                queued -= 1
                _show_progress()
            # Round-robin through the distance buckets 1..max_td so each
            # topological distance gets an equal share of the dkeep slots.
            target_td = target_td % max_td + 1

        trees = [pair.joined() for pair in next_trees]

    result = [tree.asScoreTreeTuple() for tree in trees]
    result.sort()
    return ScoredTreeCollection(result)
def rnj(dists, no_negatives=True, randomize=True):
    """Computes a tree using the relaxed neighbor joining method

    Arguments:
        - dists: dict of (name1, name2): distance
        - no_negatives: negative branch lengths will be set to 0
        - randomize: the algorithm will search nodes randomly until two
          neighbors are found.

    Returns a deep copy of the finished tree, rooted at a node named
    'root'.  Raises Exception if no mutually-closest pair can be found.
    """
    constructor = TreeBuilder(mutable=True).createEdge
    (names, d) = distanceDictTo2D(dists)

    nodes = [constructor([], name, {}) for name in names]

    while len(nodes) > 2:
        # Eliminate one node per iteration until 2 left
        num_nodes = len(nodes)

        # compute r (normalized), the sum of all pairwise distances
        # the normalization is over (num - 2), since later for a given i, j
        # distance(i, j) will be removed, and distance(i, i) = 0 always
        r = numpy.sum(d, 0) * 1./(num_nodes-2.)

        # find two nodes i, j that minimize each other's
        # transformed distance.
        # list() is required: random.shuffle needs a mutable sequence and
        # range objects are immutable on Python 3.
        node_indices = list(range(num_nodes))
        if randomize:
            shuffle(node_indices)

        chose_pair = False

        # coefficient used calculating transformed distances
        coef = num_nodes * 1./(num_nodes - 2.)
        for i in node_indices:
            # find i's closest, call it j
            # xformed_dists is a list of T_i,j for all j
            xformed_dists = coef*d[i] - r - r[i]
            # give distance to self a bogus but nonminimum value
            xformed_dists[i] = numpy.abs(xformed_dists[0])*2. +\
                    numpy.abs(xformed_dists[num_nodes - 1])*2.
            j = numpy.argmin(xformed_dists)

            # now find j's closest
            xformed_dists = coef*d[j] - r - r[j]
            xformed_dists[j] = numpy.abs(xformed_dists[0])*2. +\
                    numpy.abs(xformed_dists[num_nodes - 1])*2.

            # if i and j are each other's minimum, choose this (i, j) pair
            if i == numpy.argmin(xformed_dists):
                chose_pair = True
                break

        if not chose_pair:
            raise Exception("didn't choose a pair of nodes correctly")
        assert i != j, (i, j)

        # Branch lengths from i and j to new node
        nodes[i].Length = 0.5 * (d[i,j] + r[i] - r[j])
        nodes[j].Length = 0.5 * (d[i,j] + r[j] - r[i])

        # no negative branch lengths
        if no_negatives:
            nodes[i].Length = max(0.0, nodes[i].Length)
            nodes[j].Length = max(0.0, nodes[j].Length)

        # Join i and j to make new node
        new_node = constructor([nodes[i], nodes[j]], None, {})

        # Store new node at i; its distance to every other node k is
        # 0.5 * (d(i,k) + d(j,k) - d(i,j)) per standard NJ reduction.
        new_dists = 0.5 * (d[i] + d[j] - d[i,j])
        d[:, i] = new_dists
        d[i, :] = new_dists
        d[i, i] = 0.0
        nodes[i] = new_node

        # Eliminate j: overwrite its row/column with the last node's,
        # then shrink the matrix and node list by one.
        d[j, :] = d[num_nodes-1, :]
        d[:, j] = d[:, num_nodes-1]
        assert d[j, j] == 0.0, d
        d = d[0:num_nodes-1, 0:num_nodes-1]
        nodes[j] = nodes[num_nodes-1]
        nodes.pop()

    # Make the node with more children the root so the final tree is as
    # shallow as possible.
    if len(nodes[0].Children) < len(nodes[1].Children):
        nodes.reverse()

    # 2 remaining nodes will be [root, extra_child]
    nodes[1].Length = d[0,1]
    # no negative branch lengths
    if no_negatives:
        nodes[1].Length = max(0.0, nodes[1].Length)

    # Need to replace nodes[0] with new root
    nodes[1].Parent = nodes[0]
    return constructor(nodes[0].Children, 'root', {}).deepcopy()