def __init__(self, filename=None, interval=None, restore=True):
    """Configure the checkpointing file and time interval.

    Parameters
    ----------
    filename
        name of the file to which data will be written. If None, no
        checkpointing will be done.
    interval
        checkpointing time interval, expressed in seconds
    restore
        flag to restore from this filename or not. will be set to 0
        after restoration
    """
    # Build the checkpointer first, then attach both pieces of state.
    checkpointer = checkpointing.Checkpointer(filename, interval)
    self.checkpointer = checkpointer
    self.restore = restore
def trex(
    self,
    a=8,
    k=1000,
    start=None,
    order=None,
    return_all=False,
    filename=None,
    interval=None,
    ui=None,
):
    """TrexML policy for tree sampling (advanced step-wise addition).

    All trees up to size 'a' are built, and from then on no more than the
    'k' best trees are kept at each tree size.

    Parameters
    ----------
    a
        tree size up to which every tree is generated. Clamped so that
        4 <= a <= number of tip names.
    k
        maximum number of trees kept at each subsequent tree size.
    start
        optional initial tree, or list of initial trees. Each of the trees
        must contain the same tips.
    order
        optional list of tip names.
    return_all
        if True, every retained tree is passed to ``self.results2output``;
        otherwise only the first retained result is returned.
    filename, interval
        control checkpointing.
    ui
        object providing ``imap`` for progress display. If None, candidate
        trees are evaluated with the builtin ``map`` and no progress is
        reported.

    Raises
    ------
    AssertionError
        if a start tree's tip names are not a strict subset of
        ``self.names``, if a start tree's tip order differs from the first
        start tree's, or if there are fewer than 4 tip names.

    Notes
    -----
    M. J. Wolf, S. Easteal, M. Kahn, B. D. McKay, and L. S. Jermiin.
    Trexml: a maximum-likelihood approach for extensive tree-space
    exploration. Bioinformatics, 16(4):383-94, 2000.
    """
    checkpointer = checkpointing.Checkpointer(filename, interval)
    if checkpointer.available():
        # Resume from a previously recorded (tree size, names, trees) state.
        (init_tree_size, fixed_names, trees) = checkpointer.load()
        names = self._consistentNameOrder(fixed_names, order)
    elif start is not None:
        if not isinstance(start, list):
            start = [start]
        fixed_names = start[0].get_tip_names()
        names = self._consistentNameOrder(fixed_names, order)
        trees = []
        for tree in start:
            # check the start tree represents a subset of tips
            assert set(tree.get_tip_names()) < set(
                self.names
            ), "Starting tree names not a subset of the sequence names"
            (ancestry, fixed_names2, _lengths) = tree2ancestry(
                tree, order=fixed_names
            )
            assert fixed_names2 == fixed_names
            trees.append((None, None, ancestry))
        init_tree_size = len(fixed_names)
    else:
        # No checkpoint and no start trees: begin from a single 3-tip tree.
        trees = [(None, None, numpy.identity(3, int))]
        names = self._consistentNameOrder([], order)
        init_tree_size = 3

    tree_size = len(names)
    assert tree_size > 3
    a = max(4, min(a, tree_size))

    # All trees of size a-1, no need to compare them
    for n in range(init_tree_size + 1, a):
        trees = [
            (None, None, grown(ancestry, split_edge))
            for (_err, _lengths, ancestry) in trees
            for split_edge in range(len(ancestry))
        ]
        init_tree_size = n

    # Pre calculate how much work is to be done, for progress display.
    # work_done[n] is the cumulative work once size-n trees are finished;
    # the leading zeros make it indexable by tree size.
    tree_count = len(trees)
    total_work = 0
    work_done = [0] * (init_tree_size + 1)
    for n in range(init_tree_size + 1, tree_size + 1):
        evals = tree_count * (n * 2 - 5)
        total_work += evals * n
        tree_count = min(k, evals)
        work_done.append(total_work)

    # For each tree size, grow at each edge of each tree. Keep best k.
    for n in range(init_tree_size + 1, tree_size + 1):
        evaluate = self.make_tree_scorer(names[:n])

        def grown_tree(spec):
            (tree_ordinal, tree, split_edge) = spec
            (_old_err, _old_lengths, old_ancestry) = tree
            ancestry = grown(old_ancestry, split_edge)
            (err, lengths) = evaluate(ancestry)
            return (err, tree_ordinal, split_edge, lengths, ancestry)

        specs = [
            (i, tree, edge)
            for (i, tree) in enumerate(trees)
            for edge in range(n * 2 - 5)
        ]
        if ui is None:
            # No progress UI supplied: fall back to the builtin map rather
            # than failing on ``None.imap``. Assumes ui.imap yields
            # grown_tree(spec) for each spec -- TODO confirm.
            candidates = map(grown_tree, specs)
        else:
            candidates = ui.imap(
                grown_tree,
                specs,
                noun=("%s leaf tree" % n),
                start=work_done[n - 1] / total_work,
                end=work_done[n] / total_work,
            )
        best = ismallest(candidates, k)
        trees = [
            (err, lengths, ancestry)
            for (err, _parent_ordinal, _split_edge, lengths, ancestry) in best
        ]
        checkpointer.record((n, names[:n], trees))

    results = (
        self.result2output(err, ancestry, lengths, names)
        for (err, lengths, ancestry) in trees
    )
    if return_all:
        result = self.results2output(results)
    else:
        result = next(results)
    return result