예제 #1
0
    def __init__(self, filename=None, interval=None, restore=True):
        """Create the checkpointer and remember whether to restore from it.

        Parameters
        ----------
        filename
            Name of the checkpoint file, passed straight to
            ``checkpointing.Checkpointer``. Documented as: if None, no
            checkpointing is done.
        interval
            Checkpoint interval, expressed in seconds; also passed straight
            to ``checkpointing.Checkpointer``.
        restore
            Whether to restore state from ``filename``. Stored unchanged on
            ``self.restore``; documented as being reset to 0 once a
            restoration has happened (that reset is not done here).

        """
        checkpointer = checkpointing.Checkpointer(filename, interval)
        self.checkpointer = checkpointer
        self.restore = restore
예제 #2
0
    def trex(
        self,
        a=8,
        k=1000,
        start=None,
        order=None,
        return_all=False,
        filename=None,
        interval=None,
        ui=None,
    ):
        """Search tree space with the TrexML step-wise addition policy.

        Every tree is enumerated (without scoring) until trees hold ``a - 1``
        tips; from then on each tree is grown at every edge, each grown tree
        is scored, and at most ``k`` candidates (as selected by
        ``ismallest``) are carried over to the next tree size.

        Parameters
        ----------
        a
            Size bound for the exhaustive phase. Clamped to lie between 4
            and the total number of names.
        k
            Maximum number of trees kept at each tree size.
        start
            Optional initial tree, or list of initial trees. Each tree must
            contain the same tips. Ignored when a checkpoint is available.
        order
            Optional list of tip names, forwarded to
            ``self._consistentNameOrder``.
        return_all
            If true, return ``self.results2output(...)`` over all surviving
            trees; otherwise return only the first result.
        filename, interval
            Checkpointing controls, forwarded to
            ``checkpointing.Checkpointer``.
        ui
            Progress/mapping object whose ``imap`` method drives candidate
            evaluation.
            NOTE(review): the default is None but ``ui.imap`` is called
            unconditionally -- presumably a decorator outside this view
            injects it; confirm.

        Raises
        ------
        AssertionError
            If a start tree's tips are not a proper subset of ``self.names``,
            if ``tree2ancestry`` returns a different name order than the
            first start tree's, or if there are fewer than 4 names.

        References
        ----------
        M. J. Wolf, S. Easteal, M. Kahn, B. D. McKay, and L. S. Jermiin.
        Trexml: a maximum-likelihood approach for extensive tree-space
        exploration.
        Bioinformatics, 16(4):383-94, 2000.
        """

        checkpointer = checkpointing.Checkpointer(filename, interval)
        if checkpointer.available():
            # A saved state takes priority over any caller-supplied trees.
            (init_tree_size, fixed_names, trees) = checkpointer.load()
            names = self._consistentNameOrder(fixed_names, order)
        elif start is not None:
            start_trees = start if isinstance(start, list) else [start]
            fixed_names = start_trees[0].get_tip_names()
            names = self._consistentNameOrder(fixed_names, order)
            trees = []
            for start_tree in start_trees:
                # The start tree must cover a proper subset of the tips
                # (note the strict ``<`` comparison).
                tip_set = set(start_tree.get_tip_names())
                known_names = set(self.names)
                assert tip_set < known_names, (
                    "Starting tree names not a subset of the sequence names"
                )

                (start_ancestry, fixed_names2, _lengths) = tree2ancestry(
                    start_tree, order=fixed_names
                )
                assert fixed_names2 == fixed_names
                trees.append((None, None, start_ancestry))
            init_tree_size = len(fixed_names)
        else:
            # No checkpoint and no start trees: begin from the 3-tip tree.
            trees = [(None, None, numpy.identity(3, int))]
            names = self._consistentNameOrder([], order)
            init_tree_size = 3

        tree_size = len(names)
        assert tree_size > 3
        # Clamp the exhaustive bound: no larger than the full tree, no
        # smaller than 4.
        a = max(min(a, tree_size), 4)

        # Exhaustive phase: produce all trees of size a-1 without scoring,
        # since there is no need to compare them.
        for n in range(init_tree_size + 1, a):
            trees = [
                (None, None, grown(parent_ancestry, split_edge))
                for (_err, _len, parent_ancestry) in trees
                for split_edge in range(len(parent_ancestry))
            ]
            init_tree_size = n

        scored_sizes = range(init_tree_size + 1, tree_size + 1)

        # Work out the cumulative amount of work per tree size up front so
        # that progress fractions can be reported. work_done is indexed by
        # tree size, hence the zero padding up to init_tree_size.
        pending_trees = len(trees)
        total_work = 0
        work_done = [0] * (init_tree_size + 1)
        for n in scored_sizes:
            evals = pending_trees * (n * 2 - 5)
            total_work += evals * n
            pending_trees = min(k, evals)
            work_done.append(total_work)

        # Scored phase: at each size grow every tree at every edge and keep
        # the best k.
        for n in scored_sizes:
            evaluate = self.make_tree_scorer(names[:n])

            def grown_tree(spec):
                # Grow one parent tree at one edge and score the result.
                (tree_ordinal, parent, split_edge) = spec
                parent_ancestry = parent[2]
                ancestry = grown(parent_ancestry, split_edge)
                (err, lengths) = evaluate(ancestry)
                return (err, tree_ordinal, split_edge, lengths, ancestry)

            edge_count = n * 2 - 5
            specs = [
                (ordinal, parent, edge)
                for (ordinal, parent) in enumerate(trees)
                for edge in range(edge_count)
            ]

            progress_start = work_done[n - 1] / total_work
            progress_end = work_done[n] / total_work
            candidates = ui.imap(
                grown_tree,
                specs,
                noun=("%s leaf tree" % n),
                start=progress_start,
                end=progress_end,
            )

            best = ismallest(candidates, k)

            # Drop the bookkeeping fields (parent ordinal, split edge).
            trees = [(cand[0], cand[3], cand[4]) for cand in best]

            checkpointer.record((n, names[:n], trees))

        # Lazily convert surviving trees; only the first is materialised
        # here unless return_all is set.
        results = (
            self.result2output(err, ancestry, lengths, names)
            for (err, lengths, ancestry) in trees
        )
        if return_all:
            return self.results2output(results)
        return next(results)