コード例 #1
0
def select_less_leaves(nodedata, tree):  #keep=1
    """Pick the child whose subtree covers the fewest distinct species.

    `nodedata` is a sequence of `(child, distance)` pairs. For each child, the
    leaves below it are collected with `iter_leaves` and the number of distinct
    `tree.info[leaf]['taxon_name']` values is counted.

    Returns `(child, index)` where `index` is the position of the selected
    pair in `nodedata`. On ties, the first child with the minimum count wins.
    """
    def children_of(tr, n):
        # Children are the first items of the (child, dist) pairs in tr.data.
        return [c for c, _ in tr.data.get(n, [])]

    # One distinct-species count per child, in the same order as `nodedata`.
    n_species = [
        len({tree.info[leaf]['taxon_name']
             for leaf in iter_leaves(tree, children_of, queue=[ch])})
        for ch, _ in nodedata
    ]

    # `min` returns the first index reaching the smallest count.
    selected = min(range(len(n_species)), key=n_species.__getitem__)
    return nodedata[selected][0], selected
コード例 #2
0
def detach_toolongbranch(tree, node, maxdist=MAXDIST):
    """Drop, in place, the children of `node` attached by a too long branch.

    A child is kept when its branch length is strictly below `maxdist`;
    otherwise it is removed from `tree.data[node]`.

    Returns:
        `(n_detached, n_leaves_detached)`: the number of detached children,
        and the list of leaf counts found under each detached child.

    Note: `tree.data[node]` is always (re)assigned, so an entry holding an
    empty list is created when `node` had no entry in `tree.data`.
    """
    n_leaves_detached = []

    newdata = []
    for ch, dist in tree.data.get(node, []):
        if dist < maxdist:
            newdata.append((ch, dist))
        else:
            # Count the leaves under the detached child. The original code
            # referenced an undefined name `child` here instead of `ch`.
            n_leaves_detached.append(
                sum(1 for _ in iter_leaves(tree, get_children, [ch])))
    tree.data[node] = newdata
    # One leaf count is recorded per detached child.
    return len(n_leaves_detached), n_leaves_detached
コード例 #3
0
def prottree_extract_genecounts(proteintrees,
                                ancestor,
                                phyltree,
                                speciesset=set(('H**o sapiens', )),
                                keeponly=None):
    """Walk each tree to find the ancestor taxon, then output the gene counts
    per descendant species, as well as the genes of the species in `speciesset`.

    Params:
        proteintrees: iterable of trees exposing `.root`, `.info` (node ->
            dict of annotations) and `.data` (node -> list of
            `(child, distance)` pairs).
        ancestor: taxon name at which ancestral genes are selected.
        phyltree: species tree object; this function reads its `dicLinks`,
            `root`, `allDescendants`, `outgroupSpecies`, `getTargetsAnc`,
            `officialName`, `listSpecies` and `allNames` members.
        speciesset: species whose leaf gene names are collected. The default
            set is only read here, never mutated.
        keeponly: should be None, "stem" or "crown".
            - "crown": selected nodes with `Duplication > 1` are skipped.
            - "stem": the children of each selected node are removed from
              `tree.data` so that nodes below it are not selected again.

    Returns:
        Four lists of equal length (one item per selected node):
        - ancestors: `(taxon_parent, taxon_node)` of the selected branch;
        - ancestor_ancgenes: the `family_name` of the selected node;
        - ancestor_genecounts: `defaultdict(int)` species -> number of leaves;
        - ancestor_spgenes: dict species -> tuple of gene names, for the
          species in `speciesset`.

    Raises:
        RuntimeError: if a leaf taxon is neither in `phyltree.listSpecies`
            nor in `phyltree.allNames`.

    Side effects: entries of `tree.data` are popped in place (see above and
    the outgroup/ingroup pruning at the end of the loop body).
    """

    # for myProteinTree class
    def get_children(tree, node):
        return [c for c, _ in tree.data.get(node, [])]

    ancestor_ancgenes = []
    ancestor_genecounts = []  # In each species
    ancestor_spgenes = []  # For genes from speciesset.
    ancestors = []  # Ancestors at which we got the node.

    clades_before_ancestor = set(
        phyltree.dicLinks[phyltree.root][ancestor])  # includes anc
    clades_after_ancestor = phyltree.allDescendants[ancestor]  # includes anc
    # NOTE(review): `clades_outside_ancestor` is computed but never used in
    # the rest of this function.
    clades_outside_ancestor = (
        phyltree.outgroupSpecies[ancestor]
        | (phyltree.getTargetsAnc("/" + ancestor) - clades_before_ancestor))

    for tree in proteintrees:
        info = tree.info
        #if tree.root == 16401:
        #    import ipdb; ipdb.set_trace()

        # Only trees rooted at (or before) the ancestor are examined.
        if info[tree.root]['taxon_name'] in clades_before_ancestor:
            for parent, node in dfw_pairs_generalized(tree,
                                                      get_children,
                                                      include_root=True):
                taxon_node = info[node]['taxon_name']
                if parent is not None:
                    taxon_parent = info[parent]['taxon_name']
                    isdup = (info[parent]['Duplication'] > 1)
                else:
                    # then the child node (i.e the root) should be kept if it
                    # is exactly equal to ancestor.
                    taxon_parent = taxon_node  # We know it's before (or equal).
                    isdup = False

                # The branch parent->node crosses (or sits on) the ancestor.
                if taxon_parent in clades_before_ancestor and \
                        taxon_node in clades_after_ancestor:

                    if taxon_node != ancestor and taxon_parent != ancestor:
                        # The branch "jumps" over this ancestral species
                        # Process the most basal node. WHY? NO!
                        #node = parent
                        #taxon_node = taxon_parent
                        pass
                    elif taxon_node != ancestor and not isdup:
                        # The parent is an 'ancestor' speciation so this node should already
                        # have been taken into account.
                        # Except if the parent is the root.
                        #assert info[parent]['family_name'] in ancestor_ancgenes,\
                        #    "At %d->%d: %s %s ->..." % (parent, node,
                        #                            taxon_parent,
                        #                            info[parent]['family_name'])
                        #
                        continue

                    if keeponly == 'crown' and info[node]['Duplication'] > 1:
                        # node is at the ancestor.
                        # We don't want to keep it if it is a duplication.
                        continue

                    nodename = info[node]['family_name']
                    #assert nodename not in ancestor_ancgenes
                    ancestor_ancgenes.append(nodename)
                    ancestors.append((taxon_parent, taxon_node))

                    # Count the leaves under the selected node, per species.
                    spgenes = {sp: [] for sp in speciesset}
                    gene_counts = defaultdict(int)
                    for leaf in iter_leaves(tree, get_children, queue=[node]):
                        taxon_leaf = phyltree.officialName[info[leaf]
                                                           ['taxon_name']]
                        if taxon_leaf not in phyltree.listSpecies:
                            # Error because of tree.data.pop
                            # (a popped internal node looks like a leaf).
                            #import ipdb; ipdb.set_trace()
                            errmsg = "%d '%s' is not a species! (tree %d, node %d)"\
                                          % (leaf, taxon_leaf, tree.root, node)
                            if taxon_leaf in phyltree.allNames:
                                # Known taxon, but not an extant species: skip
                                # it. Only logged when not in 'stem' mode.
                                if keeponly != 'stem':
                                    logger.error(errmsg)
                                continue
                            else:
                                raise RuntimeError(errmsg)
                        gene_counts[taxon_leaf] += 1
                        #if taxon_leaf == 'H**o sapiens':
                        #    spgenes.append(info[leaf]['gene_name'])
                        if taxon_leaf in speciesset:
                            spgenes[taxon_leaf].append(info[leaf]['gene_name'])

                    ancestor_genecounts.append(gene_counts)
                    #ancestor_spgenes.append(tuple(spgenes))  # tuple: Important for finding `()` in Series.
                    ancestor_spgenes.append(
                        {sp: tuple(genes)
                         for sp, genes in spgenes.items()})

                    # Now do we want to score descendant ancestor nodes?
                    if keeponly == 'stem':
                        # Forget the children of this node so that nothing
                        # below it gets selected.
                        try:
                            tree.data.pop(node)
                        except KeyError:
                            # This node is not in data because it is a leaf:
                            assert node in tree.info
                elif taxon_node not in clades_before_ancestor:
                    # Avoid visiting outgroups and strict ingroups.
                    try:
                        # I don't know how, but this messes up the dfw iteration.
                        # NOTE(review): `tree.data` is mutated while the
                        # traversal is running — confirm the iterator
                        # tolerates it.
                        tree.data.pop(node)
                        logger.debug("pop data of %d (%s)", node, taxon_node)
                    except KeyError:
                        #assert 'gene_name' in info[node]
                        logger.debug("Ignore data of %d (%s)", node,
                                     taxon_node)
                        pass  # It's a leaf.

    # The four outputs must stay parallel.
    assert len(ancestor_ancgenes) == len(ancestors)
    assert len(ancestor_ancgenes) == len(ancestor_spgenes)
    assert len(ancestor_ancgenes) == len(ancestor_genecounts)

    return ancestors, ancestor_ancgenes, ancestor_genecounts, ancestor_spgenes
コード例 #4
0
def tree_detach_toolong(tree, maxdist=MAXDIST):
    """Detach, in place, every subtree hanging on a branch >= `maxdist`.

    The tree is walked from the leaves to the root. At each internal node,
    `tree.data[node]` is rewritten to keep only the children whose branch
    length is below `maxdist` and that still have descendants. A node that
    loses all of its children is recorded in `new_leaves` and is in turn
    treated as detached when its own parent is processed.

    Returns:
        `(root_counts, detached_subtrees)` where
        - root_counts is the list
          `[detached, included, detached_leaves, leaves, detached_leafset_sizes]`
          accumulated up to the root;
        - detached_subtrees is a list of `myProteinTree.ProteinTree` objects
          built from the detached nodes.

    Side effects:
        - `tree.data` is edited and `tree.info` entries of detached nodes are
          popped;
        - `tree.root` is set to None when all children of the root have been
          detached (the caller should then not output this tree).
    """
    initial_Nleaves = len(list(iter_leaves(tree, get_children)))

    # Contains `node_counts` for the last visited children.
    # Recursively updated while moving from the leaves to the root.
    counts_by_child = {}

    #detached_leaves = set()
    leafset = set()  # All original leaves encountered.
    detached_ids = set()  # Direct children that were cut off.

    # Internal nodes left without any child, not yet handled by their parent.
    new_leaves = set()
    # Such nodes, once their parent has been processed.
    included_new_leaves = set()

    detached_subtrees = []
    #logger.debug("%d/%d\n%d/%d\n%d - %d = %d\n%d - %d = %d\nlen_iter_leaves = %d",
    #             len(tree.info), len(set(tree.info)),
    #             len(tree.data), len(set(tree.data)),
    #             len(tree.info), len(tree.data), len(tree.info) - len(tree.data),
    #             len(set(tree.info)),len(set(tree.data)),len(set(tree.info)-set(tree.data)),
    #             len(list(iter_leaves(tree, get_children))))
    #logger.debug([n for n,info in tree.info.items() if (n not in tree.data and not info.get('gene_name'))])
    for node_dist, children_dists in rev_dfw_descendants(tree,
                                                         get_data,
                                                         include_leaves=True,
                                                         queue=[(tree.root, 0)
                                                                ]):
        # (detached, included, detached_leaves, leaves) #, detached_leafset_sizes
        # TODO: namedtuple
        node_counts = [0, 0, 0, 0, []]
        if not children_dists:
            # Add 1 to the leaf count.
            node_counts[3] += 1
            leafset.add(node_dist[0])

        else:
            newdata = []  # Children that stay attached to this node.
            for child, dist in children_dists:
                # The child was visited earlier (leaves-to-root order).
                child_counts = counts_by_child.pop(child)
                # Add the leaf count.
                node_counts[3] += child_counts[3]

                # Conditionally on this child being detached, update the counts.
                if dist >= maxdist or child in new_leaves:

                    node_counts[0] += 1  # directly detached descendants
                    node_counts[1] += sum(
                        child_counts[:2])  # detached included
                    node_counts[2] += child_counts[
                        3]  # child_leaves -> detached_leaves
                    node_counts[4].append(
                        child_counts[3])  # detached_leafset_size
                    detached_ids.add(child)

                    if child in new_leaves:
                        new_leaves.remove(child)
                        included_new_leaves.add(child)
                        if dist >= maxdist:
                            continue  # Do not yield this subtree, go to next child.

                    detached_data = {}
                    detached_info = {}
                    # This edits the tree inplace, but the outer iteration
                    # does not change because it is already precomputed
                    # (rev_dfw_ uses a list).
                    for (n, d), ndata in dfw_descendants_generalized(
                            tree,
                            get_data,
                            include_leaves=True,
                            queue=[(child, 0)]):
                        detached_info[n] = tree.info.pop(n)  #.pop
                        if ndata:
                            detached_data[n] = ndata

                    detached_subtrees.append(
                        myProteinTree.ProteinTree(detached_data, detached_info,
                                                  child))
                    ### TODO: add a 'tree_name' at the subtree.info[subtree.root]
                    ### TODO: add a new suffix to the family_name
                    ### (so that prune2family does not crash, as it could
                    ### output several subtrees with the same name).
                else:
                    # The child stays: propagate its counts unchanged.
                    node_counts[0] += child_counts[0]
                    node_counts[1] += child_counts[1]
                    node_counts[2] += child_counts[2]
                    node_counts[4].extend(child_counts[4])
                    newdata.append((child, dist))

            #if len(tree.data[node_dist[0]]) != len(newdata):
            #if any((x!=y) for x,y in zip(sorted(tree.data[node_dist[0]]),
            #                             sorted(newdata))):
            #import ipdb; ipdb.set_trace()
            if not newdata:
                logger.warning(
                    "All children detached at node %d from tree %d.",
                    node_dist[0], tree.root)
                new_leaves.add(node_dist[0])
                ### DO NOT output this tree if the root supports no extant species.

            tree.data[node_dist[0]] = newdata

        counts_by_child[node_dist[0]] = node_counts

    # Every node but the root has been consumed by its parent.
    assert len(counts_by_child) == 1
    root_counts = counts_by_child[tree.root]

    #DEBUG

    #import ipdb; ipdb.set_trace()
    #logger.debug("%d/%d\n%d/%d\n%d - %d = %d\n%d - %d = %d\nlen_iter_leaves = %d",
    #         len(tree.info), len(set(tree.info)),
    #         len(tree.data), len(set(tree.data)),
    #         len(tree.info), len(tree.data), len(tree.info) - len(tree.data),
    #         len(set(tree.info)),len(set(tree.data)),len(set(tree.info)-set(tree.data)),
    #         len(list(iter_leaves(tree, get_children))))
    logger.debug("Tree %d: %s,\ndeleted ids: %s", tree.root, root_counts,
                 ' '.join(str(i) for i in detached_ids))

    # Consistency checks between the leaf bookkeeping and the edited tree.
    root_Nleaves = len(list(iter_leaves(tree, get_children, [tree.root])))
    assert len(leafset) == initial_Nleaves, \
            "leafset %d != initial_Nleaves %d" %(len(leafset), initial_Nleaves)
    assert len(leafset) == root_Nleaves - len(new_leaves) + root_counts[2], \
            "Tree %d: leafset %d != %d root_Nleaves - %d new_leaves + %d detached_leaves" %\
                (tree.root, len(leafset), root_Nleaves, len(new_leaves), root_counts[2])

    assert initial_Nleaves == root_counts[3], "%d != %d, %s" % (
        initial_Nleaves, root_counts[3], root_counts)
    assert len(root_counts[4]) == root_counts[0]

    if tree.root in new_leaves:
        # We should not output this tree: mark it:
        logger.warning("The root %d does not support any tree.", tree.root)
        tree.root = None
    return root_counts, detached_subtrees
コード例 #5
0
def edit_from_selection(proteintrees, badnodes):
    """Keep only one child, the one leading to the closest leaves.

    Generator. For each tree of `proteintrees`, the nodes listed in `badnodes`
    that are internal nodes of this tree are edited with `keep_closest_leaf`.
    When several bad nodes are nested, only the most basal one is edited; the
    included ones are only counted.

    Params:
        proteintrees: iterable of trees exposing `.root` and `.data`.
        badnodes: set of node ids. It is mutated: the nodes found in each
            edited tree are removed from it.

    Yields:
        `(edited, tree)`: `edited` is False when the tree contains none of
        the bad nodes, True otherwise.

    Raises:
        NotImplementedError: if the root of a tree is listed in `badnodes`.

    The final summary (nodes not found, total counts) is only logged once the
    generator is exhausted. The counters are cumulative over all trees.
    """

    n_edits = 0
    n_included = 0  # Number of nodes not edited, because already in a larger
    # edited subtree
    n_morethan2 = 0  # Number of nodes with >2 leaves

    for tree in proteintrees:
        tree_badnodes = badnodes.intersection(tree.data)
        if not tree_badnodes:
            logger.debug("No nodes to remove.")
            yield False, tree
            continue

        if tree.root in tree_badnodes:
            logger.error("Root node %d listed as a node to remove!!!",
                         tree.root)
            raise NotImplementedError("Root node %d marked for removal: "
                                      "don't know what to do." % tree.root)
        # First the bad nodes must be sorted according to their rootwardness.
        # (in order to edit only the most basal in case of inclusion, and
        # ignore the included ones)
        # All leaf sets are computed before any edit is applied.
        badnode_leaves = sorted(
            ((badnode, set(iter_leaves(tree, get_children, [badnode])))
             for badnode in tree_badnodes),
            key=lambda x: len(x[1]),
            reverse=True)

        edited_leafsets = []
        # Index in `edited_leafsets` before which all leafsets are strictly
        # larger than the current size.
        larger_size_i = 0
        # Current size: must start from the LARGEST leaf set. The original
        # code read it from the first element of the unsorted list, i.e. an
        # arbitrary bad node, which could disable the inclusion check below.
        size = len(badnode_leaves[0][1])

        for badnode, leaves in badnode_leaves:

            # Check leaf number: then only check intersection with strictly larger sets.
            new_size = len(leaves)
            if new_size < size:
                larger_size_i = len(edited_leafsets)
                size = new_size

            # Edit only if no ancestral node has already been edited.
            if not any((leaves & edited_l)
                       for edited_l in edited_leafsets[:larger_size_i]):
                edited_leafsets.append(leaves)
                keep_closest_leaf(tree, badnode, indicator=True)
                n_edits += 1
            else:
                n_included += 1

        # The smallest bad node is expected to hold exactly 2 leaves.
        assert size == 2
        n_morethan2 += larger_size_i

        badnodes.difference_update(tree_badnodes)
        logger.debug(
            "Tree: %d: %d edited nodes; %d implicitely edited; %d with >2 leaves",
            tree.root, n_edits, n_included, n_morethan2)
        yield True, tree

    if badnodes:
        logger.warning("%d nodes not found: %s", len(badnodes),
                       ' '.join(str(n) for n in badnodes))
    logger.info(
        "\n  %9d edited nodes\n"
        "  +%8d implicitely edited\n"
        "  %9d with >2 leaves", n_edits, n_included, n_morethan2)
コード例 #6
0
def fuse_subspecies(forest, species2seq, delete_distant_orthologs=False):
    """Fuse the sequences of redundant subspecies into their parent node.

    Generator. For each tree of `forest`, every speciation node
    (`Duplication == 0`) whose taxon is a key of `species2seq` is replaced, in
    place, by one of its children: the other children are deleted from
    `tree.info`/`tree.data`, and the kept child's info and data are merged
    into the parent.

    Params:
        forest: iterable of trees exposing `.root`, `.info` and `.data`.
        species2seq: dict taxon name -> sequence of gene name prefixes
            (it is sliced and concatenated, so each value must be a list or
            tuple).
        delete_distant_orthologs: currently unused.

    Yields:
        Each tree, after editing.

    Raises:
        ValueError: when the children of a fused node cannot be matched to
            the expected gene name prefixes.

    A summary of the counts is logged once the generator is exhausted.
    """
    n_fused = 0
    n_2ch = 0
    n_1ch = 0
    n_single = 0  # Sequence from the given redundant set, without ortholog in the tree.
    n_separated = 0  # Do not share a MRCA in the given species with its apparent orthologs.

    for tree in forest:

        info = tree.info
        data = tree.data
        # Backup 'tree_name' in case the root is deleted
        #tree_name = tree.info[tree.root]['tree_name']

        kept_children = set()  # Check variable.
        removed_children = set()
        edited_parents = set()
        for (parent, dist), childrendists in dfw_descendants_generalized(
                tree, get_data, queue=[(tree.root, 0)]):
            # Only speciation nodes are considered.
            if info[parent]['Duplication'] != 0:
                continue

            parent_taxon = info[parent]['taxon_name']
            if parent_taxon in species2seq:

                assert 'gene_name' not in info[parent]
                data.pop(parent)
                assert len(childrendists) <= len(species2seq[parent_taxon]), \
                        "Too many descendant sequences at node %d (%s)" \
                        % (parent, parent_taxon)
                if len(childrendists) > 1:
                    # Test each sequence start, and quit at first match.
                    # NOTE(review): the `else` below belongs to the INNER
                    # loop, so the outer loop does not stop at the first
                    # match: every `seqstart` must be matched by a child
                    # (otherwise ValueError), and the kept child is the one
                    # matching the LAST `seqstart`. Confirm this is intended.
                    for seqstart in species2seq[parent_taxon]:
                        for ch_i, (ch, chdist) in enumerate(childrendists):
                            gene_names, gene_dists = zip(*(
                                (info[tip]['gene_name'], gdist)
                                for tip, gdist in iter_distleaves(tree, get_data,
                                                                  root=ch)
                                          ))

                            if len(gene_names) > 1:
                                assert info[ch]['Duplication'] != 0

                            if gene_names[0].startswith(seqstart):
                                # No inner speciation should be possible.
                                assert all(gn.startswith(seqstart) for gn in gene_names)
                                break
                        else:
                            raise ValueError("%s not matched by any of %s" % \
                                             (gene_names, species2seq[parent_taxon]))

                    kept_ch, kept_dist = ch, chdist
                    # All siblings of the kept child are removed. The
                    # original code sliced with `i`, a name only bound in the
                    # other branch of this function, instead of `ch_i`.
                    removed_ch = [c for c, _ in (childrendists[:ch_i]
                                                 + childrendists[(ch_i+1):])]
                    for rc in removed_ch:
                        info.pop(rc)
                        rdat = data.pop(rc, None)
                        if rdat:
                            logger.warning('Removed child %d had descendants: %s',
                                           rc, rdat)
                    removed_children.update(removed_ch)
                    n_2ch += 1

                else:
                    kept_ch, kept_dist = childrendists[0]
                    n_1ch += 1

                if kept_dist > 0:
                    logger.warning("%d %r sequence distance > 0!",
                                   kept_ch, info[kept_ch].get('gene_name'))

                # Replace with the correct child.
                info[parent].update(info.pop(kept_ch))  # So that 'tree_name' or 'Bootstrap' fields are conserved.

                try:
                    data[parent] = data.pop(kept_ch)
                except KeyError:
                    # The kept child has no entry in `data` (a leaf): the
                    # parent then has no entry either (popped above).
                    pass
                #info[parent].update(taxon_name=parent_taxon)

                kept_children.add(kept_ch)
                edited_parents.add(parent)
                n_fused += 1
            else:
                # The parent is not a redundant taxon: look for leaves of a
                # redundant species directly attached to it.
                for i, (ch, _) in enumerate(childrendists):
                    for sp, spseqs in species2seq.items():
                        for j, seq in enumerate(spseqs):
                            if info[ch].get('gene_name', '').startswith(seq):
                                logger.warning("Unexpected parent: %d (%s) at leaf %d %s",
                                               parent, parent_taxon,
                                               ch, info[ch]['gene_name'])
                                break
                        else:
                            # No prefix of this species matches this child.
                            continue

                        # If match, Need to check that there is no sister
                        # sequence in the other child
                        sister_childrendists = childrendists[:i] + childrendists[(i+1):]
                        sister_seqs = spseqs[:j] + spseqs[(j+1):]
                        has_orthologs = 0
                        for sister_ch, _ in sister_childrendists:
                            sister_matched = [info[tip]['gene_name']
                                              for tip in iter_leaves(tree,
                                                    get_children, [sister_ch])
                                              for sseq in sister_seqs
                                              if info[tip]['gene_name'].startswith(sseq)]
                            if sister_matched:
                                has_orthologs += len(sister_matched)
                                logger.warning('Potential subspecies orthologs found: '
                                               '%s', sister_matched)

                        # But exclude those if we find paralogs
                        paralogs = [info[tip]['gene_name'] for tip in
                                        iter_leaves(tree, get_children, [ch])
                                    if info[tip]['gene_name'].startswith(seq)]

                        if has_orthologs and not paralogs:
                            n_separated += 1
                            # Comment: there was none of those in Ensembl 93.
                        else:
                            n_single += 1
        # Sanity checks: removed and fused children are gone, parents remain.
        assert not removed_children.intersection(info)
        assert not removed_children.intersection(data)
        assert not kept_children.intersection(info)
        assert not kept_children.intersection(data)
        assert len(edited_parents) == len(edited_parents.intersection(info))

        yield tree

    logger.info("\n%d fusions (%d from >2 sequences, %d from 1 sequence).\n"
                "%d singles (only one of the two subspecies was found)\n"
                "%d separated (1 or more distant orthologs in the sister "
                "subspecies were found)",
                n_fused, n_2ch, n_1ch, n_single, n_separated)