Esempio n. 1
0
def delete_polyphylies(tree, conversion):
    """Delete, in place, every node whose converted name contains '+'.

    Each node name is looked up in `conversion` (falling back to the name
    itself when absent). A '+' in the resulting name marks the clade as
    polyphyletic: the node is deleted after its branch length has been added
    to each of its children, so that its parent becomes a multifurcation.

    After each deletion, the leaf distances measured from the parent (or from
    the node itself when it has no parent) are compared with those measured
    before the deletion.

    Args:
        tree: object providing `traverse('postorder')`; nodes must expose
            `name`, `dist`, `up`, `children` and `delete(...)`
            (presumably an ete3 tree -- confirm against the caller).
        conversion: mapping from node name to converted name.

    Raises:
        AssertionError: if the leaves or their distances differ (by 1e-5 or
            more, in either direction) before and after a deletion.
    """
    for node in tree.traverse('postorder'):
        newname = conversion.get(node.name, node.name)
        if '+' in newname:
            # This clade is polyphyletic in the reference tree.
            # Transform the parent into a multifurcation.

            # Memorize the original distances to leaves, to check that the
            # deletion preserves lengths.
            parent = node.up if node.up else node
            distleaves0 = sorted(iter_distleaves(parent, get_data),
                                 key=lambda dl: dl[0].name)

            # Propagate the deleted branch length to its children.
            for child in node.children:
                child.dist += node.dist
            node.delete(prevent_nondicotomic=False, preserve_branch_length=False)
            #TODO: check that newname.split('+') matches the node.children.
            logger.info("Delete node %r:%g (%s)", node.name, node.dist, newname)

            distleaves = sorted(iter_distleaves(parent, get_data),
                                key=lambda dl: dl[0].name)

            # Same leaves, in the same (name-sorted) order.
            assert all((l0 == l) for (l0, dl0), (l, dl) in zip(distleaves0, distleaves)), \
                ('\n'.join('%15s\t%15s' % (l0.name, l.name)
                           for (l0,_), (l,_) in zip(distleaves0, distleaves)))
            # Compare the absolute difference: a signed difference would let
            # any leaf that became *farther* pass the check unnoticed.
            assert all((abs(dl0 - dl) < 1e-5)
                       for (l0, dl0), (l, dl) in zip(distleaves0, distleaves)),\
                ('Leaf distances were not preserved after deletion!!!\n'
                 'At node %r:%g\n' % (node.name, node.dist)
                 + '\n'.join('%15s:%g\t%15s:%g' % (l0.name, dl0, l.name, dl)
                             for (l0,dl0), (l,dl) in zip(distleaves0, distleaves)))
Esempio n. 2
0
def keep_closest_leaf(tree, node, indicator=False):
    """Make `node` point only to the leaf entry with the smallest distance.

    The (leaf, distance) entries yielded by `iter_distleaves` below `node`
    are ranked by their second element, and `tree.data[node]` is replaced by
    a one-element list holding the first of them.

    Args:
        tree: object exposing the `data` and `info` mappings.
        node: key of the node to edit.
        indicator: when true, also set `tree.info[node]['Duplication']` to 10.
    """
    def second_element(entry):
        return entry[1]

    ranked = list(iter_distleaves(tree, get_data, root=node))
    ranked.sort(key=second_element)
    closest = ranked[0]

    # Intermediate nodes are not kept.
    # WARNING: the info of deleted nodes is kept, so don't rely on it!!!
    tree.data[node] = [closest]

    if indicator:
        tree.info[node]['Duplication'] = 10
Esempio n. 3
0
def main(inputtree, format=1, to_table=False):
    """Compute an 'age' for the nodes of a tree and print the result.

    The tree is loaded with `ete3.Tree` and asserted to be ultrametric.
    Every node lacking a truthy `age` attribute gets one: the sum of the
    distances yielded by `iter_distleaves` divided by `len(node)` for nodes
    with children, and 0 otherwise.

    Args:
        inputtree: passed as is to `ete3.Tree`.
        format: passed to `ete3.Tree` and to `tree.write`.
        to_table: if true, print one "name<TAB>age" line per processed node
            and do not modify the tree; otherwise store the age as a node
            feature and print the whole tree once at the end.
    """
    tree = ete3.Tree(inputtree, format=format)
    assert is_ultrametric(tree)

    for node in tree.traverse('postorder'):
        if getattr(node, 'age', None):
            # A truthy age is already present: leave this node alone.
            continue

        if node.children:
            total = sum(dist for _, dist in iter_distleaves(node, get_items))
            age = float(total) / len(node)
        else:
            age = 0

        if to_table:
            print('%s\t%s' % (node.name, age))
        else:
            node.add_feature('age', age)

    if to_table:
        return
    print(tree.write(features=['age'], format=format, format_root_node=True))
Esempio n. 4
0
def collapse_clades(tree, get_items, set_items, clades, make_new_clade=None):
    """Collapse each listed clade into a single replacement item.

    Note: this modifies tree **inplace**. Make a copy accordingly.

    For every visited node found in `clades`, the items of that node are
    replaced (through `set_items`) by one `(new_clade, maxdist)` pair, where
    `maxdist` is the largest distance yielded by `iter_distleaves` below it.

    make_new_clade: function building the replacement node, called with
    (clade, cladesize). The default prefixes the number of leaves to the
    clade name: on a `str` it returns the new string, on an object with a
    `name` attribute it returns a renamed copy.
    Useful for PhylTree which does not support duplicated node names.

    Handles nested clades by collapsing the most basal.

    Returns a list parallel to `clades` with the number of leaves found under
    each clade (0 for the clades that were not encountered).
    """
    if make_new_clade is None:

        def make_new_clade(clade, cladesize):
            if isinstance(clade, str):
                return '%d %s' % (cladesize, clade)
            if hasattr(clade, 'name'):
                renamed = copy(clade)
                renamed.name = '%d %s' % (cladesize, clade.name)
                return renamed

    leaf_numbers = [0] * len(clades)

    # Iterate from root to leaves. The traversal is materialized first, since
    # the tree is edited while looping.
    start = [((None, 0), (tree.root, 0))]
    visited = list(dfw_pairs_generalized(tree, get_items, queue=start))

    for _parent, (clade, _dist) in visited:
        try:
            position = clades.index(clade)
        except ValueError:
            # Not one of the clades to collapse.
            continue

        leafdists = list(iter_distleaves(tree, get_items, root=clade))
        n_leaves = len(leafdists)
        leaf_numbers[position] = n_leaves
        farthest = max(dist for _leaf, dist in leafdists)
        #TODO: add the mindist information -> draw non ultrametric triangles
        replacement = make_new_clade(clade, n_leaves)
        set_items(tree, (clade, None), [(replacement, farthest)])  # (clade, mindist)
        #TODO: some cleanup is needed in PhylTree (parents, dicLinks, ages)

    return leaf_numbers
Esempio n. 5
0
def fuse_subspecies(forest, species2seq, delete_distant_orthologs=False):
    """Fuse the sequences of redundant (sub)species into their parent node.

    Generator: each tree of `forest` has its `info` and `data` mappings
    edited in place and is then yielded. Only the nodes whose
    `info[node]['Duplication']` equals 0 are considered:

    - if the node taxon is a key of `species2seq`, one child is kept and
      merged into the node (its `info` updates the node's, its `data` entry
      replaces the node's), and the other children are dropped from `info`
      and `data`;
    - otherwise, children whose 'gene_name' starts with one of the listed
      sequence prefixes are reported and counted as "single" or "separated".

    A summary of the counters is logged once the forest is exhausted.

    Args:
        forest: iterable of trees exposing `root`, `info` and `data`.
        species2seq: mapping from taxon name to a sequence of gene name
            prefixes.
        delete_distant_orthologs: currently unused.

    Yields:
        Each input tree, after in-place edition.

    Raises:
        ValueError: if no child matches one of the tested sequence prefixes.
        AssertionError: if one of the consistency checks fails.
    """
    n_fused = 0
    n_2ch = 0
    n_1ch = 0
    n_single = 0  # Sequence from the given redundant set, without ortholog in the tree.
    n_separated = 0  # Do not share a MRCA in the given species with its apparent orthologs.

    for tree in forest:

        info = tree.info
        data = tree.data
        # Backup 'tree_name' in case the root is deleted
        #tree_name = tree.info[tree.root]['tree_name']

        kept_children = set()  # Check variable.
        removed_children = set()
        edited_parents = set()
        for (parent,dist), childrendists in dfw_descendants_generalized(tree, get_data,
                                                queue=[(tree.root, 0)]):
            if info[parent]['Duplication'] != 0:
                continue

            parent_taxon = info[parent]['taxon_name']
            if parent_taxon in species2seq:

                assert 'gene_name' not in info[parent]
                data.pop(parent)
                assert len(childrendists) <= len(species2seq[parent_taxon]), \
                        "Too many descendant sequences at node %d (%s)" \
                        % (parent, parent_taxon)
                if len(childrendists) > 1:
                    # Test each sequence start, and quit at first match.
                    # NOTE(review): the `break` only leaves the inner loop, so
                    # every sequence start is tested and the kept child is the
                    # one matching the last of them -- confirm the intent.
                    for seqstart in species2seq[parent_taxon]:
                        for ch_i, (ch, chdist) in enumerate(childrendists):
                            gene_names, gene_dists = zip(*(
                                (info[tip]['gene_name'], gdist)
                                for tip, gdist in iter_distleaves(tree, get_data,
                                                                  root=ch)
                                          ))

                            if len(gene_names) > 1:
                                assert info[ch]['Duplication'] != 0

                            if gene_names[0].startswith(seqstart):
                                # No inner speciation should be possible.
                                assert all(gn.startswith(seqstart) for gn in gene_names)
                                break
                        else:
                            raise ValueError("%s not matched by any of %s" % \
                                             (gene_names, species2seq[parent_taxon]))

                    #except KeyError as err:
                    #    err.args += ("at %s" % [ch for ch,_ in childrendists],)
                    #    raise

                    kept_ch, kept_dist = ch, chdist
                    # Every child except the kept one, located at index `ch_i`.
                    removed_ch = [c for c,_ in (childrendists[:ch_i]
                                                + childrendists[(ch_i+1):])]
                    for rc in removed_ch:
                        info.pop(rc)
                        rdat = data.pop(rc, None)
                        if rdat:
                            logger.warning('Removed child %d had descendants: %s',
                                           rc, rdat)
                    removed_children.update(removed_ch)
                    n_2ch += 1

                else:
                    kept_ch, kept_dist = childrendists[0]
                    n_1ch += 1

                if kept_dist > 0:
                    logger.warning("%d %r sequence distance > 0!",
                                   kept_ch, info[kept_ch].get('gene_name'))
                #if 850 in removed_children:
                #    import ipdb; ipdb.set_trace()

                # Replace with the correct child.
                info[parent].update(info.pop(kept_ch))  # So that 'tree_name' or 'Bootstrap' fields are conserved.

                try:
                    data[parent] = data.pop(kept_ch)
                except KeyError as err:
                    # The kept child has no entry in `data`: the parent is
                    # then left without one as well.
                    #if err.args[0] != kept_ch:
                    #    err.args += ('parent = %s ; kept_ch = %s' % (parent, kept_ch),)
                    #    raise
                    pass
                #info[parent].update(taxon_name=parent_taxon)

                kept_children.add(kept_ch)
                edited_parents.add(parent)
                n_fused += 1
            else:
                for i, (ch,_) in enumerate(childrendists):
                    for sp, spseqs in species2seq.items():
                        for j, seq in enumerate(spseqs):
                            if info[ch].get('gene_name', '').startswith(seq):
                                logger.warning("Unexpected parent: %d (%s) at leaf %d %s",
                                               parent, parent_taxon,
                                               ch, info[ch]['gene_name'])
                                break
                        else:
                            # No prefix of this species matches the child.
                            continue

                        # If match, Need to check that there is no sister
                        # sequence in the other child
                        sister_childrendists = childrendists[:i] + childrendists[(i+1):]
                        sister_seqs = spseqs[:j] + spseqs[(j+1):]
                        has_orthologs = 0
                        for sister_ch,_ in sister_childrendists:
                            sister_matched = [info[tip]['gene_name']
                                              for tip in iter_leaves(tree,
                                                    get_children, [sister_ch])
                                              for sseq in sister_seqs
                                              if info[tip]['gene_name'].startswith(sseq)]
                            if sister_matched:
                                has_orthologs += len(sister_matched)
                                logger.warning('Potential subspecies orthologs found: '
                                               '%s', sister_matched)

                        # But exclude those if we find paralogs
                        paralogs = [info[tip]['gene_name'] for tip in
                                        iter_leaves(tree, get_children, [ch])
                                    if info[tip]['gene_name'].startswith(seq)]

                        if has_orthologs and not paralogs:
                            n_separated += 1
                            # Comment: there was none of those in Ensembl 93.
                        else:
                            n_single += 1
        # Consistency checks: removed and kept children must have left both
        # mappings, and every edited parent must still be in `info`.
        assert not removed_children.intersection(info)
        assert not removed_children.intersection(data)
        assert not kept_children.intersection(info)
        assert not kept_children.intersection(data)
        assert len(edited_parents) == len(edited_parents.intersection(info))

        yield tree

    logger.info("\n%d fusions (%d from >2 sequences, %d from 1 sequence).\n"
                "%d singles (only one of the two subspecies was found)\n"
                "%d separated (1 or more distant orthologs in the sister "
                "subspecies were found)",
                n_fused, n_2ch, n_1ch, n_single, n_separated)
Esempio n. 6
0
 def is_ultrametric(tree, thresh=0.01):
     """Tell whether the leaf distances of `tree` span less than `thresh`.

     The distances are the second elements of the pairs yielded by
     `iter_distleaves(tree, get_data)`; the result is true when the largest
     minus the smallest is strictly below `thresh`.
     """
     distances = []
     for _, dist in iter_distleaves(tree, get_data):
         distances.append(dist)
     spread = max(distances) - min(distances)
     return spread < thresh
Esempio n. 7
0
def main(infile, subtreesfile=None, check_ultrametricity=-1,
         format=1, quoted_node_names=False):
    """Insert subtrees into a main tree and print the resulting newick.

    Each subtree is a ';'-terminated newick whose first child must be a leaf
    named after an existing node of the main tree (the "anchor"). The
    matching node of the main tree is detached, the subtree takes its place
    under the same parent, and the original node is re-attached in the
    subtree where the anchor leaf was.

    Args:
        infile: main tree, passed to `ete3.Tree`.
        subtreesfile: path of the file holding the subtrees; standard input
            is read when None.
        check_ultrametricity: negative to disable the checks; otherwise the
            threshold given to the ultrametricity tests. The detailed age
            comparison only runs for strictly positive values.
        format: newick format given to `ete3.Tree` and to `tree.write`.
        quoted_node_names: passed to `ete3.Tree` when parsing the subtrees.

    Raises:
        LookupError: if an anchor name is not found in the main tree.
        AssertionError: if one of the consistency checks fails.
    """
    if check_ultrametricity < 0:
        is_ultrametric = lambda tree, thresh: True
    else:
        from dendro.bates import iter_distleaves
        get_data = lambda tree,dat: [(ch, ch.dist) for ch in dat[0].children]
        def is_ultrametric(tree, thresh=0.01):
            leaf_dists = [d for _,d in iter_distleaves(tree, get_data)]
            return max(leaf_dists) - min(leaf_dists) < thresh

    tree = ete3.Tree(infile, format=format)
    if not is_ultrametric(tree, check_ultrametricity):
        logger.warning("Requested ultrametricity check but the input tree is not ultrametric.")

    if subtreesfile is None:
        # Standard input is not ours to close.
        subtrees_text = stdin.read()
    else:
        # Close the file we opened, even if reading fails.
        with open(subtreesfile) as stream:
            subtrees_text = stream.read()
    subtrees = [s + ';' for s in subtrees_text.rstrip('; \t\n').split(';')]

    for subtreetxt in subtrees:
        newsubtree = ete3.Tree(subtreetxt, format=format, quoted_node_names=quoted_node_names)
        logger.info("Inserting " + newsubtree.name)
        # First leaf of newsubtree should be an existing node in the main tree.
        anchor_node = newsubtree.get_leaves()[0]
        assert anchor_node == newsubtree.children[0], \
            "Incorrect subtree specification: anchor_node should be a leaf. %r VS\n%s" % \
                (anchor_node.name, newsubtree.get_ascii())
        try:
            orig_node = tree.search_nodes(name=anchor_node.name)[0]
        except IndexError:
            raise LookupError('Node %r not found in the source tree' % anchor_node.name)

        if check_ultrametricity > 0:
            # Compare the age reached through the anchor with the age
            # reached through the other (inserted) children.
            age_anchor = orig_node.get_farthest_leaf()[1]
            age_via_anchor = age_anchor + anchor_node.dist
            inserted_leafdists = [ch.dist + d
                                  for ch in newsubtree.children[1:]
                                  for _,d in iter_distleaves(ch, get_data)]
            if max(inserted_leafdists) - min(inserted_leafdists) >= check_ultrametricity:
                logger.error("Requested ultrametricity but an input subtree is not ultrametric.")
            age_via_inserted = max(inserted_leafdists)
            # NOTE(review): signed difference, so only an anchor path longer
            # than the inserted one is rejected -- confirm this is intended.
            assert age_via_anchor - age_via_inserted < check_ultrametricity, \
                "The combination of inserted leaf ages and anchor age cannot"\
                " be made ultrametric, at %r:\n" % newsubtree.name \
                + "%r(age = %g):dist=%g\n" % (anchor_node.name, age_anchor, anchor_node.dist) \
                + "\n".join("%r(age = %g):dist=%g" % (ch.name, age_via_inserted - ch.dist, ch.dist)
                            for ch in newsubtree.children[1:])

        parent = orig_node.up
        orig_dist = orig_node.dist
        anchor_dist = anchor_node.dist
        inserted_dists_diffs = [nch.dist - anchor_dist for nch in newsubtree.children]

        # Put the subtree where the original node was...
        orig_node.detach()
        new_dist = orig_dist - anchor_dist

        parent.add_child(child=newsubtree, dist=new_dist)
        parent.swap_children()

        #for ref_child in orig_node.children:
        #    anchor_node.add_child(child=ref_child)
        #Plus all other features

        # ...and the original node where the anchor leaf was.
        anchor_node.detach()
        newsubtree.add_child(child=orig_node, dist=anchor_dist)
        newsubtree.swap_children()

        if new_dist < 0:
            logger.warning("New branch to %r longer than the original (%g > %g), fixing.",
                           anchor_node.name, anchor_dist, orig_dist)
            ## ALL CHILDREN of newsubtree must be shortened!
            nfixes = fix_negative_lengths(newsubtree)
            logger.info("Fixed %d branch(es).", nfixes)

        assert newsubtree.dist + orig_node.dist == orig_dist,\
                "%r:%g + %r:%g != %g" % (newsubtree.name, newsubtree.dist,
                                         orig_node.name, orig_node.dist,
                                         orig_dist)
        assert all((nch.dist - orig_node.dist - inserted_diff < 1e-12)
                   for nch, inserted_diff \
                   in zip(newsubtree.children, inserted_dists_diffs)), \
                   ("Changed children dists differences, at %s.\n" % newsubtree.name
                    + "\n".join("%s: %s -> %s" % (nch.name,
                                                  nch.dist - orig_node.dist,
                                                  inserted_diff)
                                for nch, inserted_diff in
                                zip(newsubtree.children, inserted_dists_diffs)))
        assert is_ultrametric(parent, check_ultrametricity)

    assert is_ultrametric(tree, check_ultrametricity)
    logger.debug("Ultrametric = %s:\ndist to %s = %s;\ndist to %s = %s",
                 is_ultrametric(tree, check_ultrametricity),
                 *(tree.get_closest_leaf() + tree.get_farthest_leaf()))

    print(tree.write(format=format, format_root_node=True))