Esempio n. 1
0
def represent_clades(tree, get_children, get_label):
    """Describe every clade of `tree` by the leaves found under each child.

    The nodes yielded by `rev_dfw_descendants` (started at ``tree.root``,
    leaves included) are processed in order:

    - a node without children is stored as ``((label,),)``;
    - a node with children is stored as a tuple with one entry per child,
      each entry being the sorted tuple of all labels gathered below that
      child. The entries are ordered by their smallest label.

    When the label of a node with children is already a key of the result,
    a numeric suffix is appended (``label.1``, ``label.2``, ...) until it is
    unique, and the final label is written to the node's ``name`` attribute.
    Labels of childless nodes are stored unchanged.

    Returns the ``{label: representation}`` dictionary.
    """
    representation = {}
    walk = rev_dfw_descendants(tree,
                               get_children,
                               include_leaves=True,
                               queue=[tree.root])
    for node, node_children in walk:
        label = get_label(tree, node)

        if not node_children:
            representation[label] = ((label, ), )
            continue

        # Disambiguate a label that is already taken.
        base_label, suffix = label, 0
        while label in representation:
            suffix += 1
            label = '%s.%d' % (base_label, suffix)
        node.name = label

        # One group per child; the grand-children groupings are flattened.
        groups = representation[label] = []
        for child in node_children:
            below_child = representation[get_label(tree, child)]
            groups.append(
                tuple(sorted(leaf for part in below_child for leaf in part)))
        representation[label] = tuple(sorted(groups, key=min))

    return representation
Esempio n. 2
0
    def walk_phylsubtree(self):
        """Build the leaves-to-root iterator over the species subtree.

        The subtree spanning `self.taxa` is extracted from `self.phyltree`,
        then ladderized so that the displayed tree shows the branches that
        are most distant from the root (in number of nodes) on one side.

        Taxa returned by the iterator are space-separated.
        """
        logger.info("Loading species tree")
        root, subtree = self.phyltree.getSubAncTree(self.taxa)

        # Reorder the branches in a visually nice manner.
        ladderize(subtree, root)

        def get_children(tree, node):
            # A node missing from the mapping has no children.
            return tree.get(node, [])

        return rev_dfw_descendants(subtree, get_children, queue=[root])
Esempio n. 3
0
def maplosses(phyltree, leaf_states, root=None):
    """Place losses of a binary character by simple parsimony.

    The character can only be lost, never regained: a parent carries it
    (state 1) as soon as any of its children does, and a loss is recorded on
    each child branch whose state differs from its parent's.

    `leaf_states` provides the known states; `root` defaults to
    ``phyltree.root``.

    Returns ``(loss_branches, node_states)``: the set of children on which a
    loss was placed, and the state of every visited node together with the
    input states.
    """
    start = phyltree.root if root is None else root

    node_states = dict(leaf_states)
    loss_branches = set()

    for parent, children in rev_dfw_descendants(phyltree,
                                                get_phylchildren,
                                                queue=[start]):
        # The parent state is 0 if and only if all its children are 0.
        parent_state = int(any(node_states[ch] for ch in children))
        node_states[parent] = parent_state

        # A differing child state marks a loss on that child's branch.
        loss_branches.update(ch for ch in children
                             if parent_state ^ node_states[ch])

    return loss_branches, node_states
Esempio n. 4
0
def leaf_sort(tree, root, get_children, assign_children=None, get_attribute=None,
              reverse=False):
    """Order sister branches by a leaf attribute (e.g. the leaf name).

    The children lists are sorted **inplace**. Each internal node is ranked
    among its sisters by the smallest attribute found below it.

    `get_attribute(tree, node)` defaults to returning the node itself.
    `assign_children(tree, node, children)` is called with the sorted list
    for every internal node and defaults to doing nothing.

    Example of `get_attribute` to get the 'gene_name' of a leaf in a
        Dyogen ProteinTree:

        >>> get_children = lambda tree, node_and_dist: tree.data.get(node_and_dist[0], [])
        >>> get_attribute = lambda tree, node_and_dist: tree.info[node_and_dist[0]].get('gene_name')
    """
    if get_attribute is None:
        get_attribute = lambda tree, node: node

    # Reassigning the children may be unnecessary: by default, do nothing.
    if assign_children is None:
        assign_children = lambda tree, node, children: None

    # Attribute of each visited node that still waits for its parent.
    pending = {}

    for node, children in rev_dfw_descendants(tree, get_children,
                                              include_leaves=True,
                                              queue=[root]):
        if children:
            children.sort(key=pending.__getitem__, reverse=reverse)
            assign_children(tree, node, children)
            # The node inherits the smallest attribute of its children,
            # whose own entries are no longer needed.
            pending[node] = min(pending.pop(ch) for ch in children)
        else:
            # This is a leaf: record its attribute.
            pending[node] = get_attribute(tree, node)
Esempio n. 5
0
def place_single_events(n: int,
                        phyltree,
                        get_phylchildren=get_phylchildren,
                        root=None,
                        leaf_counts=None):
    """Count the placements of up to `n` single events on a tree.

    Dynamic programming in a Maddison manner: the events are considered
    unique/irreversible, so once one happened in the ancestry, it cannot
    happen again in any descendant.

    The counts of each node are combined from those of its children while
    moving from the leaves to `root` (default ``phyltree.root``).
    `leaf_counts` optionally provides precomputed counts for some nodes.

    The tree is expected to be dichotomic; when the two-children helper
    raises ``ValueError``, the generic combination helper is used instead.

    Returns the array of counts at the root, indexed by number of events.
    If no event can be placed, a one-element array holding 1 is returned.
    """
    if root is None:
        root = phyltree.root

    # Intermediate results, keyed by node.
    node_counts = {} if leaf_counts is None else dict(leaf_counts)

    # There can't be more single events than there are leaves.
    maxn = min(n, len(phyltree.species[root]))
    if maxn == 0:
        return np.ones(1, dtype=int)

    # On a single branch there is exactly 1 way to place 0 event and 1 way
    # to place 1 event (on the branch itself); no way for larger numbers
    # since there is no subtree.
    init_count = np.array([1, 1] + [0] * (n - 1), dtype=int)
    event_numbers = range(maxn + 1)

    # Iterate from the leaves to the root.
    for parent, children in rev_dfw_descendants(phyltree,
                                                get_phylchildren,
                                                include_leaves=False,
                                                queue=[root]):
        logger.debug('* %r -> %s', parent, children)

        chcounts = np.array([init_count] * len(children))
        for i, ch in enumerate(children):
            # Known only if `ch` is not a leaf, or was given in `leaf_counts`.
            if ch in node_counts:
                chcounts[i] = node_counts.pop(ch)

        logger.debug('children counts: %s',
                     '; '.join('%s:%s' % t for t in zip(children, chcounts)))

        pcount = init_count.copy()

        try:
            for k in event_numbers:
                pcount[k] = combinations_of_2_children_summing_to_k(
                    chcounts, k)
        except ValueError:
            logger.info("Non dichotomic fork.")
            for k in event_numbers:
                pcount[k] = combinations_of_children_summing_to_k(chcounts, k)

        logger.debug("Counts at %r: %s", parent, pcount)

        # One more way to place 1 event: on the *leading* branch. Allowed
        # only if zero events are allowed below!
        if chcounts[:, 0].all():
            pcount[1] += 1

        node_counts[parent] = pcount

    root_counts = node_counts.setdefault(root, init_count)
    root_counts[1] -= 1  # Consider the root has no leading branch.
    return root_counts
Esempio n. 6
0
def tree_detach_toolong(tree, maxdist=MAXDIST):
    """Cut the branches of `tree` whose length reaches `maxdist`.

    The tree is edited **inplace**. It is walked with `rev_dfw_descendants`
    starting from ``(tree.root, 0)``. At each node, a child is removed from
    the node's children list (``tree.data[node]``) when its branch length is
    ``>= maxdist``, or when the child itself just lost all of its children
    (it is then a "new leaf"). The nodes of a detached subtree are popped
    from ``tree.info`` and gathered into a new `myProteinTree.ProteinTree`
    rooted at the detached child. The ``tree.data`` entries of the detached
    nodes are not deleted by this function.

    Returns:
        ``(root_counts, detached_subtrees)`` where `root_counts` is the list
        ``[detached, included, detached_leaves, leaves,
        detached_leafset_sizes]`` accumulated at the root, and
        `detached_subtrees` is the list of created trees.

    Side effects:
        ``tree.root`` is set to None when every child of the root was
        detached, to mark that this tree should not be output.

    Raises:
        AssertionError: if the leaf bookkeeping is inconsistent.
    """
    initial_Nleaves = len(list(iter_leaves(tree, get_children)))

    # Contains `node_counts` for the last visited children.
    # Recursively updated while moving from the leaves to the root.
    counts_by_child = {}

    #detached_leaves = set()
    leafset = set()        # ids of the original leaves
    detached_ids = set()   # ids of the children cut from their parent

    # Internal nodes that currently have no remaining child.
    new_leaves = set()
    # New leaves that were in turn cut from their parent (only written here).
    included_new_leaves = set()

    detached_subtrees = []
    #logger.debug("%d/%d\n%d/%d\n%d - %d = %d\n%d - %d = %d\nlen_iter_leaves = %d",
    #             len(tree.info), len(set(tree.info)),
    #             len(tree.data), len(set(tree.data)),
    #             len(tree.info), len(tree.data), len(tree.info) - len(tree.data),
    #             len(set(tree.info)),len(set(tree.data)),len(set(tree.info)-set(tree.data)),
    #             len(list(iter_leaves(tree, get_children))))
    #logger.debug([n for n,info in tree.info.items() if (n not in tree.data and not info.get('gene_name'))])
    for node_dist, children_dists in rev_dfw_descendants(tree,
                                                         get_data,
                                                         include_leaves=True,
                                                         queue=[(tree.root, 0)
                                                                ]):
        # (detached, included, detached_leaves, leaves) #, detached_leafset_sizes
        # TODO: namedtuple
        node_counts = [0, 0, 0, 0, []]
        if not children_dists:
            # Add 1 to the leaf count.
            node_counts[3] += 1
            leafset.add(node_dist[0])

        else:
            # Children that stay attached to this node.
            newdata = []
            for child, dist in children_dists:
                # Each child is consumed exactly once, by its parent.
                child_counts = counts_by_child.pop(child)
                # Add the leaf count.
                node_counts[3] += child_counts[3]

                # Conditionally on this child being detached, update the counts.
                if dist >= maxdist or child in new_leaves:

                    node_counts[0] += 1  # directly detached descendants
                    node_counts[1] += sum(
                        child_counts[:2])  # detached included
                    node_counts[2] += child_counts[
                        3]  # child_leaves -> detached_leaves
                    node_counts[4].append(
                        child_counts[3])  # detached_leafset_size
                    detached_ids.add(child)

                    if child in new_leaves:
                        new_leaves.remove(child)
                        included_new_leaves.add(child)
                        # NOTE(review): in this case the child is dropped from
                        # its parent but its `tree.info` entry is kept and no
                        # subtree is created, whereas a new leaf on a short
                        # branch is detached as a one-node subtree below.
                        # Confirm this is the intended condition.
                        if dist >= maxdist:
                            continue  # Do not yield this subtree, go to next child.

                    detached_data = {}
                    detached_info = {}
                    # This edits the tree inplace, but the outer iteration
                    # does not change because it is already precomputed
                    # (rev_dfw_ uses a list).
                    for (n, d), ndata in dfw_descendants_generalized(
                            tree,
                            get_data,
                            include_leaves=True,
                            queue=[(child, 0)]):
                        detached_info[n] = tree.info.pop(n)  # moved out of the original tree
                        if ndata:
                            detached_data[n] = ndata

                    detached_subtrees.append(
                        myProteinTree.ProteinTree(detached_data, detached_info,
                                                  child))
                    ### TODO: add a 'tree_name' at the subtree.info[subtree.root]
                    ### TODO: add a new suffix to the family_name
                    ### (to avoid crashing prune2family, which could
                    ### output several subtrees with the same name).
                else:
                    # The child stays: propagate its counts unchanged.
                    node_counts[0] += child_counts[0]
                    node_counts[1] += child_counts[1]
                    node_counts[2] += child_counts[2]
                    node_counts[4].extend(child_counts[4])
                    newdata.append((child, dist))

            #if len(tree.data[node_dist[0]]) != len(newdata):
            #if any((x!=y) for x,y in zip(sorted(tree.data[node_dist[0]]),
            #                             sorted(newdata))):
            #import ipdb; ipdb.set_trace()
            if not newdata:
                logger.warning(
                    "All children detached at node %d from tree %d.",
                    node_dist[0], tree.root)
                # This node becomes a leaf; its parent will detach it.
                new_leaves.add(node_dist[0])
                ### DO NOT output this tree if the root supports no extant species.

            tree.data[node_dist[0]] = newdata

        counts_by_child[node_dist[0]] = node_counts

    # Every entry but the root's has been popped by the node's parent.
    assert len(counts_by_child) == 1
    root_counts = counts_by_child[tree.root]

    #DEBUG

    #import ipdb; ipdb.set_trace()
    #logger.debug("%d/%d\n%d/%d\n%d - %d = %d\n%d - %d = %d\nlen_iter_leaves = %d",
    #         len(tree.info), len(set(tree.info)),
    #         len(tree.data), len(set(tree.data)),
    #         len(tree.info), len(tree.data), len(tree.info) - len(tree.data),
    #         len(set(tree.info)),len(set(tree.data)),len(set(tree.info)-set(tree.data)),
    #         len(list(iter_leaves(tree, get_children))))
    logger.debug("Tree %d: %s,\ndeleted ids: %s", tree.root, root_counts,
                 ' '.join(str(i) for i in detached_ids))

    # Sanity checks: the original leaves must all be accounted for, either
    # still in the edited tree or counted as detached.
    root_Nleaves = len(list(iter_leaves(tree, get_children, [tree.root])))
    assert len(leafset) == initial_Nleaves, \
            "leafset %d != initial_Nleaves %d" %(len(leafset), initial_Nleaves)
    assert len(leafset) == root_Nleaves - len(new_leaves) + root_counts[2], \
            "Tree %d: leafset %d != %d root_Nleaves - %d new_leaves + %d detached_leaves" %\
                (tree.root, len(leafset), root_Nleaves, len(new_leaves), root_counts[2])

    assert initial_Nleaves == root_counts[3], "%d != %d, %s" % (
        initial_Nleaves, root_counts[3], root_counts)
    # One leafset size was recorded per detached subtree.
    assert len(root_counts[4]) == root_counts[0]

    if tree.root in new_leaves:
        # We should not output this tree: mark it:
        logger.warning("The root %d does not support any tree.", tree.root)
        tree.root = None
    return root_counts, detached_subtrees
Esempio n. 7
0
def patristic_phyltree(phyltree, root=None, rootlength=0, point_loc=0):
    """Compute the normalised distances between every pair of branches.

    Each branch is named after the node it leads to, the root branch
    included.

    Args:
        phyltree: the tree; must provide `allDescendants`, and `root`,
            `rootlength`, `parent` when the defaults are resolved.
        root: node where the traversal starts (default ``phyltree.root``).
        rootlength: length of the branch leading to `root`. If None, it is
            read from ``phyltree.rootlength`` (when `root` is the tree root)
            or from ``phyltree.parent[root].distance``.
        point_loc: fraction of branch length to slide the reference point:
            - 0 creates the patristic matrix between nodes;
            - 0.5 creates the patristic matrix between branch centers.

    Returns:
        ``(patristic_dists, branch_names)``: the symmetric matrix of
        distances divided by the largest distance (left as zeros when no
        distance is positive), and the branch names in row/column order.
    """
    if root is None:
        root = phyltree.root
    if rootlength is None:
        rootlength = (phyltree.rootlength if root == phyltree.root
                      else phyltree.parent[root].distance)

    branch_names = [
        ch for _, ch in dfw_pairs_generalized(phyltree,
                                              get_phylchildren,
                                              queue=[(None, root)],
                                              include_root=True)
    ]
    branch_indices = {br: i for i, br in enumerate(branch_names)}
    n_branches = len(branch_names)
    patristic_dists = np.zeros((n_branches, n_branches))

    for (parent, dist), items in rev_dfw_descendants(phyltree,
                                                     get_phylitems,
                                                     queue=[(root, rootlength)
                                                            ]):
        p_i = branch_indices[parent]

        # Parent to each descendant, one child at a time, so that a node
        # with a single child is filled as well.
        for ch, d in items:
            ch_i = branch_indices[ch]
            if ch_i < p_i:
                # Diagnostic: the upper-triangle assumption is broken.
                print('ch1 %s %d < parent %s %d' % (ch, ch_i, parent, p_i))
            to_child = dist * point_loc + d * (1 - point_loc)
            for descendant in phyltree.allDescendants[ch]:
                desc_i = branch_indices[descendant]
                patristic_dists[p_i, desc_i] = (
                    to_child + patristic_dists[ch_i, desc_i])

        # Descendants of two distinct children are joined through `parent`.
        for (ch1, d1), (ch2, d2) in it.combinations(items, 2):
            ch1_i = branch_indices[ch1]
            ch2_i = branch_indices[ch2]
            for descendant1 in phyltree.allDescendants[ch1]:
                desc1_i = branch_indices[descendant1]
                for descendant2 in phyltree.allDescendants[ch2]:
                    desc2_i = branch_indices[descendant2]
                    if desc2_i < desc1_i:
                        # Diagnostic: this value lands in the lower triangle
                        # and will be overwritten by the symmetrisation.
                        print('desc2 %s %d < desc1 %s %d' %
                              (descendant2, desc2_i, descendant1, desc1_i))
                    patristic_dists[desc1_i, desc2_i] = (
                        d1 * (1 - point_loc) + d2 * (1 - point_loc) +
                        patristic_dists[ch1_i, desc1_i] +
                        patristic_dists[ch2_i, desc2_i])

    # Only one half was filled (older in rows, younger in columns).
    # Because rev_dfw iterates from young to old after visiting all
    # descendants (young indices (column) > old indices (row)), it is the
    # upper triangle: mirror it into the lower one.
    i_up, j_up = np.triu_indices(n_branches, k=1)
    patristic_dists[j_up, i_up] = patristic_dists[i_up, j_up]

    # Normalise by the largest distance, unless all distances are zero
    # (e.g. a single-node tree), which would yield NaN.
    max_dist = patristic_dists.max()
    if max_dist > 0:
        patristic_dists /= max_dist

    return patristic_dists, branch_names