Example #1
0
def reroot(history_path, tree_path):
    """Re-insert the root-adjacent node that a history tree lacks relative to its tree.

    The history file is parsed with ``format=1`` and the tree file with the
    default format. If the tree's root has more than two children the tree is
    reported as unrooted and nothing is changed. Otherwise the difference
    ``size(tree) - size(history)`` is taken as the length of the missing
    branch, the root child of the tree whose ``dist`` is closest to that
    length is taken as the missing node, and the children of the history root
    whose ``dist`` equals one of that node's child distances are moved under a
    new ``missing_node{<label>}`` child of the history root.

    The repaired history is written back to ``history_path`` only after its
    size matches the tree both in memory and after a Newick round trip.

    :param history_path: Newick file of the history tree; overwritten on success
    :param tree_path: Newick file of the original tree
    :raises SystemExit: with status 1 when the size check fails
    """
    # NOTE(review): ``size`` is defined elsewhere; presumably it returns the
    # total branch length of a tree -- confirm against its definition.
    tolerance = 0.00001
    history = Tree(history_path, format=1)
    tree = Tree(tree_path)
    tree_children = tree.get_children()
    if len(tree_children) > 2:
        print("original tree is not rooted, no need to fix history")
        return

    missing_length = size(tree) - size(history)
    # On a tie the second child wins, exactly as in the strict '<' comparison.
    if abs(tree_children[0].dist - missing_length) < abs(tree_children[1].dist - missing_length):
        missing_child = tree_children[0]
    else:
        missing_child = tree_children[1]
    grandchildren_dists = [child.dist for child in missing_child.get_children()]

    children_to_detach = []
    labels = []
    for child in history.get_children():
        # Branch lengths are matched by exact float equality.
        if child.dist in grandchildren_dists:
            labels.append(0 if "{0}" in child.name else 1)
            children_to_detach.append(child.detach())
    remaining_child = history.get_children()[-1]
    labels.append(0 if "{0}" in remaining_child.name else 1)

    # The new node is labelled 1 only when at least two of the votes are 1.
    chosen_label = 0 if sum(labels) < 2 else 1
    new_child = history.get_tree_root().add_child(
        name="missing_node{" + str(chosen_label) + "}", dist=missing_length)
    for child in children_to_detach:
        new_child.add_child(child)

    # make sure that now the tree and the history are of the same length
    if abs(size(history) - size(tree)) > tolerance:
        print("Error! failed to fix history tree")
        print("size(history) = ", size(history), "\n size(tree) = ", size(tree))
        raise SystemExit(1)

    # make sure that all the lengths were written to tree string
    history_str = history.write(outfile=None, format=1)
    new_history = Tree(history_str, format=1)
    if abs(size(new_history) - size(tree)) > tolerance:
        print("Error! failed to fix history tree newick format")
        # Report the re-parsed history: that is the tree whose size mismatched.
        print("size(history) = ", size(new_history), "\n size(tree) = ", size(tree))
        raise SystemExit(1)

    # if all went well, write the fixed history to its original file
    history.write(outfile=history_path, format=1)
def print_random_tree(num_nodes=5):
    """
    Create a tree, fill it with ``populate(num_nodes)`` and print a tour of
    its basic attributes and accessors, followed by the name of every leaf.
    """
    tree = Tree()
    tree.populate(num_nodes)

    # Each value is produced lazily, right before it is printed, so the
    # output order matches a plain sequence of print calls.
    probes = (
        ("t", lambda: tree),
        ("children", lambda: tree.children),
        ("get_children", lambda: tree.get_children()),
        ("up", lambda: tree.up),
        ("name", lambda: tree.name),
        ("dist", lambda: tree.dist),
        ("is_leaf", lambda: tree.is_leaf()),
        ("get_tree_root", lambda: tree.get_tree_root()),
        ("children[0].get_tree_root",
         lambda: tree.children[0].get_tree_root()),
        ("children[0].children[0].get_tree_root",
         lambda: tree.children[0].children[0].get_tree_root()),
    )
    for label, probe in probes:
        print(label, probe())

    for tip in tree:
        print(tip.name)
Example #3
0
class exponential_mixture:
    """ML search PTP, to use: __init__(), search() and count_species()"""
    def __init__(
        self,
        tree,
        sp_rate=0,
        fix_sp_rate=False,
        max_iters=20000,
        min_br=0.0001,
    ):
        """Parse *tree*, resolve its polytomies and fit the null model.

        :param tree: passed to ``Tree(tree, format=1)``
        :param sp_rate: stored as ``fix_spe`` and handed to every species setting
        :param fix_sp_rate: stored as ``fix_spe_rate`` and handed to every species setting
        :param max_iters: upper bound on the setting count accepted by ``Brutal``
        :param min_br: branches not longer than this are ignored by ``null_model``
        """
        # Plain configuration values.
        self.min_brl = min_br
        self.fix_spe = sp_rate
        self.fix_spe_rate = fix_sp_rate

        # The working tree: binary, with a zero-length root branch.
        parsed = Tree(tree, format=1)
        parsed.resolve_polytomy(recursive=True)
        parsed.dist = 0.0
        self.tree = parsed

        # Best result found so far by any search strategy.
        self.max_logl = float("-inf")
        self.max_setting = None

        # null_model() overwrites null_logl with the fitted value.
        self.null_logl = 0.0
        self.null_model()

        # Search bookkeeping.
        self.species_list = None
        self.counter = 0
        self.setting_set = set()
        self.max_num_search = max_iters

    def null_model(self):
        """Fit one exponential distribution to all sufficiently long branches.

        Collects ``dist`` of every descendant whose branch is longer than
        ``self.min_brl``, stores the summed log-likelihood of the fit in
        ``self.null_logl`` and returns the fitted rate.
        """
        threshold = self.min_brl
        long_branches = [
            desc.dist
            for desc in self.tree.get_descendants()
            if desc.dist > threshold
        ]
        fitted = exp_distribution(long_branches)
        self.null_logl = fitted.sum_log_l()
        return fitted.rate

    def __compare_node(self, node):
        return node.dist

    def re_rooting(self):
        node_list = self.tree.get_descendants()
        node_list.sort(key=self.__compare_node)
        node_list.reverse()
        rootnode = node_list[0]
        self.tree.set_outgroup(rootnode)
        self.tree.dist = 0.0

    def comp_num_comb(self):
        for node in self.tree.traverse(strategy="postorder"):
            if node.is_leaf():
                node.add_feature("cnt", 1.0)
            else:
                acum = 1.0
                for child in node.get_children():
                    acum = acum * child.cnt
                acum = acum + 1.0
                node.add_feature("cnt", acum)
        return self.tree.cnt

    def next(self, sp_setting):
        """Recursively explore every setting reachable from *sp_setting*.

        The setting's node set is recorded in ``self.setting_set``, the best
        log-likelihood seen so far is tracked in ``self.max_logl`` /
        ``self.max_setting``, and each non-leaf active node is expanded into
        a new setting that is visited unless its node set was seen before.
        """
        self.setting_set.add(frozenset(sp_setting.spe_nodes))
        candidate_logl = sp_setting.get_log_l()
        if candidate_logl > self.max_logl:
            self.max_logl = candidate_logl
            self.max_setting = sp_setting

        for active in sp_setting.active_nodes:
            if active.is_leaf():
                continue
            # Children of the expanded node first, then the current nodes.
            expanded = [*active.get_children(), *sp_setting.spe_nodes]
            child_setting = species_setting(
                spe_nodes=expanded,
                root=sp_setting.root,
                sp_rate=sp_setting.spe_rate,
                fix_sp_rate=sp_setting.fix_spe_rate,
                minbr=self.min_brl,
            )
            if frozenset(expanded) not in self.setting_set:
                self.next(child_setting)

    def H0(self, reroot=True):
        self.H1(reroot)
        self.H2(reroot=False)
        self.run_h3(reroot=False)

    def H1(self, reroot=True):
        """Heuristic search that adds nodes in order of decreasing branch length.

        Starting from the root and its children, every descendant (longest
        branch first) that is not yet part of the current setting is added
        together with the children of its ancestors, walking upwards until an
        ancestor already in the setting (or the root) is reached. The setting
        with the highest log-likelihood updates ``self.max_logl`` and
        ``self.max_setting`` if it beats the stored value.
        """
        if reroot:
            self.re_rooting()

        longest_first = sorted(self.tree.get_descendants(), key=self.__compare_node)
        longest_first.reverse()

        def make_setting(nodes):
            # Every setting of this search shares root and rate options.
            return species_setting(
                spe_nodes=nodes,
                root=self.tree,
                sp_rate=self.fix_spe,
                fix_sp_rate=self.fix_spe_rate,
                minbr=self.min_brl,
            )

        def absorb_children(parent, into):
            # Append the children of *parent* that *into* does not hold yet.
            for kid in parent.get_children():
                if kid not in into:
                    into.append(kid)

        current = make_setting([self.tree, *self.tree.get_children()])
        best_logl = current.get_log_l()
        best = current

        for node in longest_first:
            if node in current.spe_nodes:
                # node already is a speciation node, do nothing
                continue

            grown = list(current.spe_nodes)
            ancestor = node.up  # find the father of this new node
            father_known = ancestor in current.spe_nodes
            absorb_children(ancestor, grown)
            if not father_known:
                # Climb until an ancestor that is already a speciation node.
                while not ancestor.is_root():
                    ancestor = ancestor.up
                    absorb_children(ancestor, grown)
                    if ancestor in current.spe_nodes:
                        break

            current = make_setting(grown)
            logl = current.get_log_l()
            if logl > best_logl:
                best_logl = logl
                best = current

        if best_logl > self.max_logl:
            self.max_logl = best_logl
            self.max_setting = best

    def H2(self, reroot=True):
        """Greedy search: repeatedly expand the single best active node.

        Starting from the root and its children, each round builds one
        candidate setting per non-leaf active node (that node's children plus
        the current nodes) and continues from the candidate with the highest
        log-likelihood. The search ends when no active node can be expanded.
        The best setting seen updates ``self.max_logl`` / ``self.max_setting``
        if it beats the stored value.

        :param reroot: re-root the tree on its longest branch first
        """
        if reroot:
            self.re_rooting()

        last_setting = species_setting(
            spe_nodes=[self.tree, *self.tree.get_children()],
            root=self.tree,
            sp_rate=self.fix_spe,
            fix_sp_rate=self.fix_spe_rate,
            minbr=self.min_brl,
        )
        max_logl = last_setting.get_log_l()
        max_setting = last_setting

        while True:
            curr_max_logl = float("-inf")
            curr_max_setting = None
            for node in last_setting.active_nodes:
                if node.is_leaf():
                    continue
                sp_nodes = [*node.get_children(), *last_setting.spe_nodes]
                new_sp_setting = species_setting(
                    spe_nodes=sp_nodes,
                    root=self.tree,
                    sp_rate=self.fix_spe,
                    fix_sp_rate=self.fix_spe_rate,
                    minbr=self.min_brl,
                )
                logl = new_sp_setting.get_log_l()
                if logl > curr_max_logl:
                    curr_max_logl = logl
                    curr_max_setting = new_sp_setting

            # Stop when nothing could be expanded, and also when no candidate
            # produced a log-likelihood above -inf: continuing from a missing
            # setting would fail on its attributes.
            if curr_max_setting is None:
                break

            if curr_max_logl > max_logl:
                max_setting = curr_max_setting
                max_logl = curr_max_logl

            last_setting = curr_max_setting

        if max_logl > self.max_logl:
            self.max_logl = max_logl
            self.max_setting = max_setting

    def run_h3(self, reroot=True):
        """Greedy search guided by a two-class split of the branch lengths.

        First the descendants are sorted by decreasing branch length and the
        split index is chosen that maximises the summed log-likelihood of two
        exponential fits (longer branches / shorter branches). The nodes
        before that index are the "target" nodes. The greedy expansion then
        prefers active nodes that have a target node among their children and
        falls back to an unrestricted expansion round when none does. The
        best setting seen updates ``self.max_logl`` / ``self.max_setting`` if
        it beats the stored value.

        :param reroot: re-root the tree on its longest branch first
        """
        if reroot:
            self.re_rooting()
        # Descendants ordered from longest to shortest branch.
        sorted_node_list = self.tree.get_descendants()
        sorted_node_list.sort(key=self.__compare_node)
        sorted_node_list.reverse()
        sorted_br = []
        for node in sorted_node_list:
            sorted_br.append(node.dist)
        # Try every split point 1..n-1 and keep the most likely one.
        maxlogl = float("-inf")
        maxidx = -1
        for i in range(len(sorted_node_list))[1:]:
            l1 = sorted_br[0:i]
            l2 = sorted_br[i:]
            e1 = exp_distribution(l1)
            e2 = exp_distribution(l2)
            logl = e1.sum_log_l() + e2.sum_log_l()
            if logl > maxlogl:
                maxidx = i
                maxlogl = logl

        # NOTE(review): if no split improves on -inf (e.g. fewer than two
        # descendants) maxidx stays -1 and this slice drops the last node.
        target_nodes = sorted_node_list[0:maxidx]

        # Initial setting: the root together with its children.
        first_node_list = []
        first_node_list.append(self.tree)
        first_childs = self.tree.get_children()
        for child in first_childs:
            first_node_list.append(child)
        first_setting = species_setting(
            spe_nodes=first_node_list,
            root=self.tree,
            sp_rate=self.fix_spe,
            fix_sp_rate=self.fix_spe_rate,
            minbr=self.min_brl,
        )
        last_setting = first_setting
        max_logl = last_setting.get_log_l()
        max_setting = last_setting
        contin_flag = True
        target_node_cnt = 0
        while contin_flag:
            curr_max_logl = float("-inf")
            curr_max_setting = None
            # contin_flag: some active node is internal (can still expand).
            # unchanged_flag: no active node had a target node as a child.
            contin_flag = False
            unchanged_flag = True
            # Phase 1: only expand nodes that have a target node as a child.
            for node in last_setting.active_nodes:
                if node.is_leaf():
                    pass
                else:
                    contin_flag = True
                    childs = node.get_children()
                    sp_nodes = []
                    flag = False
                    for child in childs:
                        if child in target_nodes:
                            flag = True
                    # target_nodes.remove(child)
                    if flag:
                        unchanged_flag = False
                        for child in childs:
                            sp_nodes.append(child)
                        for nod in last_setting.spe_nodes:
                            sp_nodes.append(nod)
                        new_sp_setting = species_setting(
                            spe_nodes=sp_nodes,
                            root=self.tree,
                            sp_rate=self.fix_spe,
                            fix_sp_rate=self.fix_spe_rate,
                            minbr=self.min_brl,
                        )
                        logl = new_sp_setting.get_log_l()
                        if logl > curr_max_logl:
                            curr_max_logl = logl
                            curr_max_setting = new_sp_setting
            if not unchanged_flag:
                # One targeted expansion round was performed.
                target_node_cnt = target_node_cnt + 1
                if curr_max_logl > max_logl:
                    max_setting = curr_max_setting
                    max_logl = curr_max_logl
                last_setting = curr_max_setting

            # Stop once as many targeted rounds ran as there are target nodes.
            if len(target_nodes) == target_node_cnt:
                contin_flag = False
            # Phase 2 (fallback): nothing targeted was expandable, so expand
            # every internal active node and continue from the best candidate.
            if contin_flag and unchanged_flag and last_setting != None:
                for node in last_setting.active_nodes:
                    if node.is_leaf():
                        pass
                    else:
                        childs = node.get_children()
                        sp_nodes = []
                        for child in childs:
                            sp_nodes.append(child)
                        for nod in last_setting.spe_nodes:
                            sp_nodes.append(nod)
                        new_sp_setting = species_setting(
                            spe_nodes=sp_nodes,
                            root=self.tree,
                            sp_rate=self.fix_spe,
                            fix_sp_rate=self.fix_spe_rate,
                            minbr=self.min_brl,
                        )
                        logl = new_sp_setting.get_log_l()
                        if logl > curr_max_logl:
                            curr_max_logl = logl
                            curr_max_setting = new_sp_setting
                if curr_max_logl > max_logl:
                    max_setting = curr_max_setting
                    max_logl = curr_max_logl
                last_setting = curr_max_setting

        # Publish the result only if it beats what earlier searches found.
        if max_logl > self.max_logl:
            self.max_logl = max_logl
            self.max_setting = max_setting

    def Brutal(self, reroot=False):
        if reroot:
            self.re_rooting()
        first_node_list = []
        first_node_list.append(self.tree)
        first_childs = self.tree.get_children()
        for child in first_childs:
            first_node_list.append(child)
        num_s = self.comp_num_comb()
        if num_s > self.max_num_search:
            print("Too many search iterations: " + repr(num_s) +
                  ", using H0 instead!!!")
            self.H0(reroot=False)
        else:
            first_setting = species_setting(
                spe_nodes=first_node_list,
                root=self.tree,
                sp_rate=self.fix_spe,
                fix_sp_rate=self.fix_spe_rate,
                minbr=self.min_brl,
            )
            self.next(first_setting)

    def search(self, strategy="H1", reroot=False):
        if strategy == "H1":
            self.H1(reroot)
        elif strategy == "H2":
            self.H2(reroot)
        elif strategy == "H3":
            self.run_h3(reroot)
        elif strategy == "Brutal":
            self.Brutal(reroot)
        else:
            self.H0(reroot)

    def count_species(self, print_log=True, pv=0.001):
        """Return the number of species supported by the best setting.

        A likelihood-ratio test compares ``self.null_logl`` with
        ``self.max_logl``. When its p-value is below *pv* the species of
        ``self.max_setting`` are stored in ``self.species_list`` and their
        number is returned; otherwise all leaves form one species and 1 is
        returned.

        :param print_log: print rates, log-likelihoods, the p-value and the
            Kolmogorov-Smirnov statistics of the best setting
        :param pv: significance threshold for the likelihood-ratio test
        """
        ratio_test = lh_ratio_test(self.null_logl, self.max_logl, 1)
        pvalue = ratio_test.get_p_value()

        if print_log:
            print("Speciation rate: " +
                  "{0:.3f}".format(self.max_setting.rate2))
            print("Coalesecnt rate: " +
                  "{0:.3f}".format(self.max_setting.rate1))
            print("Null logl: " + "{0:.3f}".format(self.null_logl))
            print("MAX logl: " + "{0:.3f}".format(self.max_logl))
            print("P-value: " + "{0:.3f}".format(pvalue))
            spefit, speaw = self.max_setting.e2.ks_statistic()
            coafit, coaaw = self.max_setting.e1.ks_statistic()
            print("Kolmogorov-Smirnov test for model fitting:")
            print("Speciation: " + "Dtest = {0:.3f}".format(spefit) + " " +
                  speaw)
            print("Coalescent: " + "Dtest = {0:.3f}".format(coafit) + " " +
                  coaaw)

        if not pvalue < pv:
            # Not significant: treat the whole tree as a single species.
            self.species_list = [self.tree.get_leaf_names()]
            return 1

        num_sp, self.species_list = self.max_setting.count_species()
        return num_sp

    def whitening_search(self, strategy="H1", reroot=False, pv=0.001):
        self.search(strategy, reroot, pv)
        num_sp, self.species_list = self.max_setting.count_species()
        spekeep = self.max_setting.whiten_species()
        self.tree.prune(spekeep)
        self.max_logl = float("-inf")
        self.max_setting = None
        self.null_logl = 0.0
        self.null_model()
        self.species_list = None
        self.counter = 0
        self.setting_set = set([])
        self.search(strategy, reroot, pv)

    def print_species(self):
        cnt = 1
        for sp in self.species_list:
            print("Species " + repr(cnt) + ":")
            for leaf in sp:
                print("          " + leaf)
            cnt = cnt + 1

    def output_species(self, taxa_order=[]):
        """taxa_order is a list of taxa names, the paritions will be output as the same order"""
        if len(taxa_order) == 0:
            taxa_order = self.tree.get_leaf_names()

        num_taxa = 0
        for sp in self.species_list:
            for leaf in sp:
                num_taxa = num_taxa + 1
        if not len(taxa_order) == num_taxa:
            print("error error, taxa_order != num_taxa!")
            return None, None
        else:
            partion = [-1] * num_taxa
            cnt = 1
            for sp in self.species_list:
                for leaf in sp:
                    idx = taxa_order.index(leaf)
                    partion[idx] = cnt
                cnt = cnt + 1
            return taxa_order, partion
Example #4
0
from ete3 import Tree
import sys

# Load the tree given as the first command-line argument.
t = Tree(sys.argv[1])

# print(dir(t))
# Show the name and the children of the root's first child.
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
tnode = t.get_children()[0]
print(tnode.name)
print(tnode.children)
Example #5
0
def main(arg1,arg2):
    """Build a supertree Newick string from the trees listed in ``arg1`` and print it.

    Each line of the file ``arg1`` is parsed as one tree. The trees are
    merged pairwise with ``scm_last.scm``, a matrix is accumulated with
    ``make_mrp``, a ``Graph`` is built from its columns, split into clades and
    each clade is turned into a Newick fragment by ``BCD``. The concatenated
    fragments are printed wrapped in ``( ... );``.

    NOTE(review): ``arg2`` is never read in this function.
    """

    with open(arg1) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content]
    tree2=Tree(content[0])
    # NOTE(review): ``triplets`` is never used below.
    triplets=[]
    taxa=[]
    leaf_sets=[]
    # Pass 1: collect the leaf names of every tree and fold all trees into
    # tree2. The first iteration merges the first tree with itself.
    for i in range(0,len(content)):

        taxa_tree=[]
        t2=Tree(content[i])
        for leaf in t2:
            taxa_tree.append(leaf.name)
        leaf_sets.append(taxa_tree)
        taxa+=list(taxa_tree)
        # The same line is parsed a second time to get a fresh tree object.
        tree1=Tree(content[i])
        tree2=Tree(scm_last.scm(tree1,tree2))
    # For every non-root internal node of the merged tree with more than two
    # children, record the names of its leaf children.
    # NOTE(review): merge_taxa / merge_lookup are only referenced by
    # commented-out code further down.
    merge_taxa=[]
    merge_count=0
    merge_lookup={}
    for node in tree2.traverse("postorder"):

        if (node.is_root() is False) and (node.is_leaf() is False):
            children=node.get_children()
            if len(children) >2:
                merge_taxa.append([])
                for c in children:
                    if c.is_leaf():
                        merge_taxa[merge_count].append(c.name)
                        merge_lookup[c.name]=merge_count
                merge_count +=1
    # d maps taxon name -> row index; inv_map is the reverse mapping.
    taxa=set(taxa)
    d = {ni: indi for indi, ni in enumerate(set(taxa))}
    inv_map = {v: k for k, v in d.items()}

    # Pass 2: thread the accumulator ``a`` and ``counter`` through make_mrp
    # for both root children of every input tree.
    # NOTE(review): ``a`` starts as the int 0 and is later used as ``a.T``;
    # presumably make_mrp returns a matrix -- confirm against make_mrp.
    a=0
    counter=0
    #a[:] = '?'
    for i in range(0,len(content)):

        t2=Tree(content[i])
        missing_taxa=set(taxa)-set(leaf_sets[i])
        tree_right=t2.get_children()[0]
        tree_left=t2.get_children()[1]
        # temp_r / temp_l are not used afterwards.
        temp_r,a,counter=make_mrp(tree_right,a,d,missing_taxa,counter)
        temp_l,a,counter=make_mrp(tree_left,a,d,missing_taxa,counter)

    # Translate every column of the matrix into graph edges. Columns are
    # numbered from 1; rows are taxa (via inv_map).
    connections=[]
    missing=[]
    bad_connect=[]
    counter=0
    int_nodes=range(1, len( a.T)+1)
    node=[]
    for i in int_nodes:
        node.append(str(i))
    for column in a.T:
       counter +=1
       for i in range(len(column)):
            if column[i]== 1:
                #if inv_map[i] in merge_lookup:
                    ##print("look up is", merge_taxa[merge_lookup[inv_map[i]]])

                # A 1 links column and taxon in both directions.
                connections.append([str(counter),inv_map[i]])
                connections.append([inv_map[i],str(counter)])

            elif column[i]== 2:
                # A 2 is recorded in the ``missing`` list.
                missing.append([str(counter),inv_map[i]])
            else:
                # Any other value is recorded in the ``bad_connect`` list.
                bad_connect.append([str(counter),inv_map[i]])
    ''' semi=[]
    counter=0
    for column in a.T:
        counter +=1
        ##print(column)
        if(np.prod(column)!=0):
            semi.append(counter)'''
    g=Graph(connections,missing,bad_connect,node)
    #hs_size=len(d)+counter-len(semi)
    # NOTE(review): tree_new is never populated before tree_new.show() below.
    tree_new=Tree()
    # Remove every node reported by find_semiUni from the graph.
    semi=find_semiUni(g)
    for sem in semi:
        g.delete_semi(str(sem))
    clades,clades_connects,all_clades=connections_div(g,taxa)
    if len(clades)==1:

        # Only one clade was found: recompute the split with plot_HS on a
        # copy of the graph. Note that plot_HS is unpacked in a different
        # order than connections_div.
        g_HS=deepcopy(g)
        all_clades,clades,clades_connects=plot_HS(g_HS,taxa)
        g_HS=None

    # Solve every clade on its own copy of the graph and concatenate the
    # Newick fragments returned by BCD.
    newick=""
    for ci in range(len(clades_connects)):
        clade=clades_connects[ci]
        ci_taxa=clades[ci]
        g_ci=deepcopy(g)

        # Everything that does not belong to this clade is removed.
        delete_clade=set(all_clades)-clade[0]
        delete_leaves=taxa-ci_taxa[0]

        for c in delete_clade:
             g_ci.delete_connections(c)

        for l in delete_leaves:
            g_ci.delete_leaf_complete(l)

        #tree=BCD(g_ci,ci_taxa[0])
        newick+=BCD(g_ci,ci_taxa[0])
        #if tree != "error":
            #tree_new.add_child(tree)
    newick="("+ newick+");"
    print(newick)
    tree_new.show()
Example #6
0
class Phylo_Tree_Drawer():
    # Ranks we always want to show
    sig_ranks = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'subfamily', 'genus', 'species']

    def __init__(self, ncbi):
        '''
        Start with an empty tree and a tree style that hides leaf names and the scale.
        :param ncbi: object stored as self.ncbi; its annotate_tree() is called before drawing
        '''
        self.t = Tree()

        tree_style = TreeStyle()
        tree_style.show_leaf_name = False
        tree_style.show_scale = False
        self.style = tree_style

        self.ncbi = ncbi
        # Set once the tree has been collapsed; lineages are rejected afterwards
        self.is_collapsed = False

    def add_new_lineage(self, lineage, num_reads):
        '''
        Add lineage to tree
        :param lineage: lineage array as returned by NCBITaxa.get_lineage()
        :param num_reads: the number of reads you are classifying as part of the lineage (additive)
        '''
        if self.is_collapsed:
            raise RuntimeError("You may not add new lineages after drawing or creating directories")

        def recursive_helper(parent, lineage):
            if lineage[0] in [x.name for x in parent.get_children()]:
                child = parent.search_nodes(name=lineage[0])[0]
            else:
                child = parent.add_child(name=lineage[0])

            if len(lineage) > 1:
                recursive_helper(child, lineage[1:])
            elif 'num_reads' not in child.features:
                child.add_feature('num_reads', num_reads)
            else:
                child.num_reads = child.num_reads + num_reads

        recursive_helper(self.t, lineage)

    def draw(self, full_tree_path, simplified_tree_path, significance_ratio):
        '''
        Output full tree image at full_tree_path.
        Output simplified tree image at simplified_tree_path.
        Significance_ratio is the percent abundance needed to be considered significant. (Significant nodes are
        highlighted and the is a simplified tree image to show significant nodes.)
        Only call after you are done adding lineages.
        Requires X server
        '''
        self.ncbi.annotate_tree(self.t)
        if not self.is_collapsed:
            self._collapse_tree()
            self.is_collapsed = True

        # calculate total # of reads
        total_reads = 0
        for node in self.t.traverse():
            if 'num_reads' in node.features:
                total_reads += node.num_reads

        sig_threshold = round(total_reads * significance_ratio)
        self._add_text_faces(sig_threshold, total_reads)

        # Draw full tree image
        self.t.render(full_tree_path, w=35, units='in', tree_style=self.style)

        # Draw simplified tree image
        self._collapse_tree(sig_threshold)
        self.t.render(simplified_tree_path, w=35, units='in', tree_style=self.style)

    def create_directories(self, path, sequence_dict):
        '''
        This function will create a directory tree in the shape of the phylogenetic tree and deposit sequences there
        :param path: Root directory path
        '''
        self.ncbi.annotate_tree(self.t)
        if not self.is_collapsed:
            self._collapse_tree()
            self.is_collapsed = True

        def create_folder(path):
            head, tail = os.path.split(path)
            if head and not os.path.isdir(head):
                create_folder(head)

            if not os.path.isdir(path):
                os.mkdir(path)

        def slugify(value):
            """
            Convert spaces to underscores. Remove characters that aren't alphanumerics, underscores, or hyphens.
            Convert to lowercase. Also strip leading and trailing whitespace.
            """
            value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
            value = re.sub(r'[^\w\s-]', '', value).strip().lower()
            return re.sub(r'[-\s]+', '_', value)

        def recursive_helper(node, *folders):
            name_slug = slugify(node.sci_name)
            full_path = os.path.join(path, *folders, name_slug)
            create_folder(full_path)

            # If sequences belong to this taxon, deposit them here
            if node.taxid in sequence_dict:
                SeqIO.write(
                    sequence_dict[node.taxid], os.path.join(full_path, '%s_sequences.fasta' % name_slug), 'fasta'
                )

            for child in node.get_children():
                recursive_helper(child, *folders, name_slug)

        for top_level_node in self.t.get_children():
            recursive_helper(top_level_node)

    def _add_text_faces(self, highlight_treshold, total_reads):
        '''
        Add labels to the image
        :param highlight_treshold: If node has at least this number of reads, highlight the textface
        :param total_reads: total number of reads in the tree, used for the abundance percentage
        '''
        for node in self.t.traverse():
            label_text = '%s\nRank: %s' % (node.sci_name, node.rank)
            has_reads = 'num_reads' in node.features
            num_reads = node.num_reads if has_reads else 0
            if has_reads:
                label_text = '%s\nNumber of Reads: %i' % (label_text, num_reads)

            highlighted = num_reads >= highlight_treshold
            if highlighted:
                # With no reads at all the threshold is 0 and every node
                # qualifies; report 0% instead of dividing by zero
                abundance = num_reads / total_reads if total_reads else 0.0
                label_text = '%s\nAbundance: %s' % (label_text, '{:.2%}'.format(abundance))

            face = TextFace(label_text)
            if highlighted:
                face.background.color = "Moccasin"

            node.add_face(face, column=0)

    def _collapse_tree(self, min_reads=1):
        '''
        Remove nodes which do not have at least 'min_reads'(int) assigned and are not a significant rank
        '''
        def recursive_helper(node):
            children = node.get_children()

            num_reads = 0
            if 'num_reads' in node.features:
                num_reads = node.num_reads

            has_sig_ancestor = False
            for child in children:
                if recursive_helper(child):
                    has_sig_ancestor = True

            if num_reads >= min_reads:
                return True

            if not (node.rank in self.sig_ranks and has_sig_ancestor):
                node.delete(prevent_nondicotomic=False)

            return has_sig_ancestor

        for child in self.t.get_children():
            recursive_helper(child)
Example #7
0
from ete3 import Tree
t = Tree()
# We create a random tree topology
t.populate(15)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(t)
print(t.children)
print(t.get_children())
print(t.up)
print(t.name)
print(t.dist)
print(t.is_leaf())
print(t.get_tree_root())
print(t.children[0].get_tree_root())
print(t.children[0].children[0].get_tree_root())
# You can also iterate over tree leaves using a simple syntax
for leaf in t:
    print(leaf.name)
Example #8
0
from ete3 import Tree
t = Tree()
# We create a random tree topology
t.populate(15)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(t)
print(t.children)
print(t.get_children())
print(t.up)
print(t.name)
print(t.dist)
print(t.is_leaf())
print(t.get_tree_root())
print(t.children[0].get_tree_root())
print(t.children[0].children[0].get_tree_root())
# You can also iterate over tree leaves using a simple syntax
for leaf in t:
    print(leaf.name)
Example #9
0
					# NB. using the node.prune() function is too slow

					# actualize "mergedInd" feature of new leaf
					newLeaf[0].mergedInd = mergedLeaves

				else:

					# populate "mergedInd" feature for future SFS
					mergedLeaves = ""
					for l in node.iter_leaves():
						mergedLeaves = mergedLeaves+" "+l.name

					# collapse the subtree
					newLeaf = t.get_farthest_leaf()
					for child in t.get_children():
						child.detach()
					node.add_child(newLeaf[0], newLeaf[0].name, newLeaf[1])

					# actualize "mergedInd" feature of new leaf
					newLeaf[0].mergedInd = mergedLeaves

# NOTE(review): t, osfs and nPops are defined outside this excerpt.
# Record len(t) after the collapsing step and emit a 'C' progress marker.
nTrueSpecies = len(t)
sys.stdout.write('C')

#======================================================#
# COMPUTE & EXPORT the SFS (full names & numerical)

# Two output files: "<osfs>_allinds.txt" and "<osfs>" itself.
# NOTE(review): neither handle is closed within the visible lines.
f = open(osfs+"_allinds.txt", 'w+')
fn = open(osfs, 'w+')
# Header row of the second file: "Tip_label" followed by one "Isl_<i>" column
# for each i in range(nPops), tab-separated.
fn.write("Tip_label\t" + "\t".join("Isl_"+str(x) for x in range(0, nPops)) + "\n")
        f.write('BINX, sc=' + str(x + 1) + '-' + str(x + y) + '\n')
    fCcScMtx = pd.concat([fCcMtx, fScMtx], axis=1)
    with open('families/' + fam + '.cc_sc.phy', 'w') as f:
        f.write(' '.join(map(str, np.shape(fCcScMtx))) + '\n')
        for l in fCcScMtx.index:
            f.write(l.ljust(pad))
            rw = np.array(fCcScMtx.loc[l].values, str)
            rw[rw == '-1'] = '-'
            f.write(''.join(rw) + '\n')
    cmd = raxmlCommand + " -T 20 -g " + fam
    cmd += ".tre -c 4 -m BINGAMMAX -s " + fam
    cmd += ".cc_sc.phy -q " + fam + ".part.txt -n " + fam + "_rooted -p 12345"
    p = subprocess.Popen(cmd, shell=True, cwd=os.getcwd() + '/families/')
    os.waitpid(p.pid, 0)

    fTree = Tree('families/RAxML_bestTree.' + fam + '_rooted')
    fTree.resolve_polytomy()
    fTree.set_outgroup(fTree & outgroup)
    (fTree & outgroup).delete(preserve_branch_length=True)
    p = subprocess.Popen("rm -f RAxML*",
                         shell=True,
                         cwd=os.getcwd() + '/families/')
    os.waitpid(p.pid, 0)
    p = subprocess.Popen("rm -f " + fam + ".cc_sc.phy* " + fam + ".part.txt",
                         shell=True,
                         cwd=os.getcwd() + '/families/')
    os.waitpid(p.pid, 0)
    fTree.get_children()[0].write(outfile="outgroupRooted/" + fam +
                                  ".outgroupRooted.tre",
                                  format=1)
# Command-line inputs:
#   argv[1] -- IQ tree output to be rooted
#   argv[2] -- nhx tree, subtree generated by ensembl_api.py
repeat_tree_file, gene_tree_file = sys.argv[1], sys.argv[2]

# Output paths mirror the gene-tree path, moved from the input folder to the
# output folder, with the '.nhx' extension swapped for each product.
input_folder = 'genetrees_nhx'
output_folder = 'denovo/treefix'
outputs_file = gene_tree_file.replace(input_folder, output_folder)
smap_file, stree_file = (outputs_file.replace('.nhx', suffix)
                         for suffix in ('.smap', '.stree'))
rooted_file = repeat_tree_file + '.rooted'

# Root the repeat tree on the first child of its root and save it.
# Only the first line of the input file is parsed.
with open(repeat_tree_file, 'r') as repeat_handle:
    tree_string = repeat_handle.readline()
repeat_tree = Tree(tree_string, format=1)
topnode = repeat_tree.get_children()[0]
repeat_tree.set_outgroup(topnode)
repeat_tree.write(format=1, outfile=rooted_file)

# Load the gene tree; the raw text is kept because it is re-used below.
with open(gene_tree_file, 'r') as gene_handle:
    gene_tree_str = gene_handle.read()
gene_tree = Tree(gene_tree_str, format=1)

# smap file: one "<leaf>*<TAB><leaf>" line per gene-tree leaf.
with open(smap_file, 'w') as smap_out:
    smap_out.writelines(leaf.name + '*\t' + leaf.name + '\n'
                        for leaf in gene_tree.get_leaves())

# stree file: the gene-tree text with every '[&&NHX:D=D]' tag removed.
stripped_tree_str = gene_tree_str.replace('[&&NHX:D=D]', '')
with open(stree_file, 'w') as stree_out:
    stree_out.write(stripped_tree_str)
Example #12
0
def parse_biopp_history(history_path, base_tree_path):
    """Load a history tree and rename/label its nodes after the base tree.

    Every non-root node name in the history tree is expected to match
    ``<name>{<digit>}``. The ``<name>`` part becomes the node name and the
    digit is stored as a ``label`` feature ("0" when the digit is 0,
    otherwise "1").

    Args:
        history_path: passed to ``Tree(history_path, format=1)``.
        base_tree_path: passed to ``Tree(base_tree_path, format=1)``.

    Returns:
        The history tree, with its root named "root" and a ``label``
        feature on every node.

    Raises:
        AttributeError: if a non-root node name does not match the
            ``<name>{<digit>}`` pattern.
        IndexError: if a node looked up in the base tree is not found.
    """
    base_tree = Tree(base_tree_path, format=1)
    # Raw string: "\{" and "\d" are regex escapes, not string escapes.
    node_data_regex = re.compile(r"([^(|)]*?)\{(\d)\}")
    # read the tree from the file
    history = Tree(history_path, format=1)

    # root the history in the same location as the base tree
    root_child = base_tree.get_children()[0]
    for node in history.traverse("postorder"):
        if root_child.name in node.name:
            history.set_outgroup(node)
            break

    for node in history.traverse("postorder"):
        if node != history.get_tree_root():
            # Parse the name once and reuse the match for both groups.
            name_match = node_data_regex.search(node.name)
            node_name = name_match.group(1)
            node_state = name_match.group(2)
            if "missing_node" in node_name:
                # Placeholder node: take its name from the base-tree parent
                # of its first child.
                parent = base_tree.search_nodes(
                    name=node.get_children()[0].name)[0].up
                node_name = parent.name
            node.name = node_name
            node.add_feature("label", "0" if node_state == "0" else "1")
        elif len(node.get_children()) > 2:
            # The root has more than 2 children: reroot the tree in
            # accordance with the base tree.
            original_children = base_tree.get_tree_root().get_children()
            current_children = history.get_children()
            # The base-tree root child with no counterpart among the current
            # root children (the last such child wins; defaults to the first).
            missing_node = original_children[0]
            for orig_node in original_children:
                if not any(orig_node.name in curr_node.name
                           for curr_node in current_children):
                    missing_node = orig_node
            # Comprehension variables are deliberately NOT called `node`, so
            # the traversal variable is never rebound (it would be on Py2).
            apparent_node = [
                orig_node for orig_node in original_children
                if not orig_node == missing_node
            ][0]
            # Collected before add_child() below, so the new node itself is
            # never part of this list.
            missing_children = [
                curr_node for curr_node in current_children
                if apparent_node.name not in curr_node.name
            ]
            new = history.get_tree_root().add_child(child=None,
                                                    name=missing_node.name,
                                                    dist=missing_node.dist,
                                                    support=None)
            # now make the two extra children of the current root the
            # children of the new node
            for child in missing_children:
                child.detach()
                new.add_child(child=child)
            # set the label as per the mp solution
            if new.get_children()[0].label == new.get_children()[1].label:
                new.add_feature("label", new.get_children()[0].label)
            else:
                apparent_node_in_history = [
                    root_kid
                    for root_kid in history.get_tree_root().get_children()
                    if apparent_node.name in root_kid.name
                ][0]
                new.add_feature("label", apparent_node_in_history.label)
            node.add_feature("label", new.label)  # root is always BG
        else:
            node.add_feature("label", node.get_children()[0].label)
    history.get_tree_root().name = "root"

    # lastly, set base internal nodes names according to the base tree
    for node in history.traverse("postorder"):
        if "base" in node.name:
            # get the respective base node from the base tree based on the
            # already treated children in the history
            child = node.get_children()[0]
            while "mapping" in child.name:
                child = child.get_children()[0]
            base_parent = base_tree.search_nodes(
                name=child.name.rstrip())[0].up
            node.name = base_parent.name

    return history
Example #13
0
class exponential_mixture:
    """ML search PTP, to use: __init__(), search() and count_species()"""
    def __init__(self, tree, sp_rate = 0, fix_sp_rate = False, max_iters = 20000, min_br = 0.0001):
        """Load the tree, reset the search state and fit the null model.

        Args:
            tree: forwarded to ``Tree(tree, format = 1)``.
            sp_rate: stored as ``fix_spe`` and handed to every species_setting.
            fix_sp_rate: stored as ``fix_spe_rate`` and handed to every species_setting.
            max_iters: upper bound checked by ``Brutal`` before an exhaustive search.
            min_br: branches not longer than this are ignored by ``null_model``.
        """
        # Search configuration.
        self.min_brl = min_br
        self.max_num_search = max_iters
        self.fix_spe = sp_rate
        self.fix_spe_rate = fix_sp_rate

        # Input tree: polytomies resolved, root branch length zeroed.
        loaded = Tree(tree, format = 1)
        loaded.resolve_polytomy(recursive=True)
        loaded.dist = 0.0
        self.tree = loaded

        # Best result found so far and bookkeeping for the searches.
        self.max_logl = float("-inf")
        self.max_setting = None
        self.species_list = None
        self.counter = 0
        self.setting_set = set()

        # Null model; null_model() overwrites null_logl with the fitted value.
        self.null_logl = 0.0
        self.null_model()


    def null_model(self):
        """Fit one exp_distribution to all branches longer than ``min_brl``.

        Stores the fitted distribution's ``sum_log_l()`` in ``self.null_logl``
        and returns its ``rate``.
        """
        long_branches = [
            descendant.dist
            for descendant in self.tree.get_descendants()
            if descendant.dist > self.min_brl
        ]
        fitted = exp_distribution(long_branches)
        self.null_logl = fitted.sum_log_l()
        return fitted.rate


    def __compare_node(self, node):
        """Sort key: return the ``dist`` attribute of *node*."""
        return node.dist


    def re_rooting(self):
        node_list = self.tree.get_descendants()
        node_list.sort(key=self.__compare_node)
        node_list.reverse()
        rootnode = node_list[0]
        self.tree.set_outgroup(rootnode)
        self.tree.dist = 0.0


    def comp_num_comb(self):
        for node in self.tree.traverse(strategy='postorder'):
            if node.is_leaf():
                node.add_feature("cnt", 1.0)
            else:
                acum = 1.0
                for child in node.get_children():
                    acum = acum * child.cnt
                acum = acum + 1.0
                node.add_feature("cnt", acum)
        return self.tree.cnt


    def next(self, sp_setting):
        """Recursively evaluate *sp_setting* and every setting reachable from it.

        The setting's node set is recorded in ``self.setting_set``, and
        ``max_logl`` / ``max_setting`` are updated when it scores higher.
        Each non-leaf active node is then expanded into a new setting, which
        is explored unless its node set has already been recorded.
        """
        self.setting_set.add(frozenset(sp_setting.spe_nodes))
        candidate_logl = sp_setting.get_log_l()
        if candidate_logl > self.max_logl:
            self.max_logl = candidate_logl
            self.max_setting = sp_setting
        for active in sp_setting.active_nodes:
            if active.is_leaf():
                continue
            # Children of the expanded node first, then the current nodes.
            expanded = list(active.get_children())
            expanded.extend(sp_setting.spe_nodes)
            child_setting = species_setting(spe_nodes = expanded, root = sp_setting.root, sp_rate = sp_setting.spe_rate, fix_sp_rate = sp_setting.fix_spe_rate, minbr = self.min_brl)
            if frozenset(expanded) not in self.setting_set:
                self.next(child_setting)


    def H0(self, reroot = True):
        self.H1(reroot)
        self.H2(reroot = False)
        self.H3(reroot = False)


    def H1(self, reroot = True):
        """Heuristic search that adds nodes in order of decreasing branch length.

        Starting from the root and its children, each descendant not yet in
        the current setting is added together with its siblings, climbing
        towards the root (adding each ancestor's children) until an ancestor
        already in the setting, or the root, is reached. The best-scoring
        setting replaces ``max_setting`` / ``max_logl`` if it beats them.
        """
        if reroot:
            self.re_rooting()

        def absorb_children(parent, collected):
            # Append the children of `parent` that are not collected yet.
            for kid in parent.get_children():
                if kid not in collected:
                    collected.append(kid)

        #self.init_tree()
        # Reversing an ascending stable sort keeps the tie order of sort()+reverse().
        by_length_desc = sorted(self.tree.get_descendants(), key=self.__compare_node)[::-1]

        seed_nodes = [self.tree]
        seed_nodes.extend(self.tree.get_children())
        current = species_setting(spe_nodes = seed_nodes, root = self.tree, sp_rate = self.fix_spe, fix_sp_rate = self.fix_spe_rate, minbr = self.min_brl)
        best_logl = current.get_log_l()
        best_setting = current

        for candidate in by_length_desc:
            if candidate in current.spe_nodes:
                # node already is a speciation node, do nothing
                continue

            grown = list(current.spe_nodes)
            ancestor = candidate.up  # find the father of this new node
            absorb_children(ancestor, grown)
            # Climb until an ancestor is already in the setting, or the root is reached.
            while ancestor not in current.spe_nodes and not ancestor.is_root():
                ancestor = ancestor.up
                absorb_children(ancestor, grown)

            trial = species_setting(spe_nodes = grown, root = self.tree, sp_rate = self.fix_spe, fix_sp_rate = self.fix_spe_rate, minbr = self.min_brl)
            trial_logl = trial.get_log_l()
            if trial_logl > best_logl:
                best_logl, best_setting = trial_logl, trial
            current = trial

        if best_logl > self.max_logl:
            self.max_logl = best_logl
            self.max_setting = best_setting


    def H2(self, reroot = True):
        """Greedy search: each round keeps the best single-node expansion.

        Every round tries expanding each non-leaf active node of the current
        setting (its children are added to the node set) and moves on to the
        highest-scoring trial. The rounds stop once all active nodes are
        leaves. The best setting seen replaces ``max_setting`` / ``max_logl``
        if it beats them.
        """
        if reroot:
            self.re_rooting()

        #self.init_tree()
        # NOTE: this ordering is computed but not used anywhere below.
        by_length_desc = sorted(self.tree.get_descendants(), key=self.__compare_node)[::-1]

        seed_nodes = [self.tree]
        seed_nodes.extend(self.tree.get_children())
        current = species_setting(spe_nodes = seed_nodes, root = self.tree, sp_rate = self.fix_spe, fix_sp_rate = self.fix_spe_rate, minbr = self.min_brl)
        best_logl = current.get_log_l()
        best_setting = current

        expandable = True
        while expandable:
            round_logl = float("-inf")
            round_setting = None
            expandable = False
            for active in current.active_nodes:
                if active.is_leaf():
                    continue
                expandable = True
                # Children of the expanded node first, then the current nodes.
                trial_nodes = list(active.get_children())
                trial_nodes.extend(current.spe_nodes)
                trial = species_setting(spe_nodes = trial_nodes, root = self.tree, sp_rate = self.fix_spe, fix_sp_rate = self.fix_spe_rate, minbr = self.min_brl)
                trial_logl = trial.get_log_l()
                if trial_logl > round_logl:
                    round_logl, round_setting = trial_logl, trial

            if round_logl > best_logl:
                best_setting, best_logl = round_setting, round_logl

            current = round_setting

        if best_logl > self.max_logl:
            self.max_logl = best_logl
            self.max_setting = best_setting


    def H3(self, reroot = True):
        """Greedy search steered towards a pre-selected set of long branches.

        Phase 1 sorts all descendants by decreasing ``dist`` and picks the
        split index that maximises the summed log-likelihood of two
        exp_distribution fits (one per side of the split); the nodes before
        the split become ``target_nodes``.

        Phase 2 is a greedy expansion like H2, but a non-leaf active node is
        only expanded when at least one of its children is a target node.
        When no active node qualifies in a round, every non-leaf active node
        is tried instead. The best setting seen replaces ``max_setting`` /
        ``max_logl`` if it beats them.

        Args:
            reroot: when true, call ``re_rooting()`` before searching.
        """
        if reroot:
            self.re_rooting()
        # Descendants ordered by decreasing branch length.
        sorted_node_list = self.tree.get_descendants()
        sorted_node_list.sort(key=self.__compare_node)
        sorted_node_list.reverse()
        sorted_br = []
        for node in sorted_node_list:
            sorted_br.append(node.dist)
        # Phase 1: find the split of the sorted branch lengths that maximises
        # the combined log-likelihood of two independent fits.
        maxlogl = float("-inf") 
        maxidx = -1
        for i in range(len(sorted_node_list))[1:]:
            l1 = sorted_br[0:i]
            l2 = sorted_br[i:]
            e1 = exp_distribution(l1)
            e2 = exp_distribution(l2)
            logl = e1.sum_log_l() + e2.sum_log_l()
            if logl > maxlogl:
                maxidx = i
                maxlogl = logl
        
        # NOTE(review): if no split ever scores above -inf, maxidx stays -1 and
        # this slice yields every node except the last -- confirm intended.
        target_nodes = sorted_node_list[0:maxidx]
        
        # Phase 2: greedy expansion starting from the root and its children.
        first_node_list = []
        first_node_list.append(self.tree)
        first_childs = self.tree.get_children()
        for child in first_childs:
            first_node_list.append(child)
        first_setting = species_setting(spe_nodes = first_node_list, root = self.tree, sp_rate = self.fix_spe, fix_sp_rate = self.fix_spe_rate, minbr = self.min_brl)
        last_setting = first_setting
        max_logl = last_setting.get_log_l()
        max_setting = last_setting
        contin_flag = True 
        # Number of rounds in which a target-guided expansion was available.
        target_node_cnt = 0
        while contin_flag:
            curr_max_logl = float("-inf") 
            curr_max_setting = None
            # contin_flag: set again if any active node is not a leaf.
            contin_flag = False
            # unchanged_flag: stays True if no active node has a target child.
            unchanged_flag = True
            for node in last_setting.active_nodes:
                if node.is_leaf():
                    pass
                else:
                    contin_flag = True 
                    childs = node.get_children()
                    sp_nodes = []
                    # flag: does this node have at least one target child?
                    flag = False
                    for child in childs:
                        if child in target_nodes:
                            flag = True
                            #target_nodes.remove(child)
                    if flag:
                        unchanged_flag = False
                        for child in childs:
                            sp_nodes.append(child)
                        for nod in last_setting.spe_nodes:
                            sp_nodes.append(nod)
                        new_sp_setting = species_setting(spe_nodes = sp_nodes, root = self.tree, sp_rate = self.fix_spe, fix_sp_rate = self.fix_spe_rate, minbr = self.min_brl)
                        logl = new_sp_setting.get_log_l()
                        if logl > curr_max_logl:
                            curr_max_logl = logl
                            curr_max_setting = new_sp_setting
            if not unchanged_flag:
                # A target-guided expansion happened: count it and advance.
                target_node_cnt = target_node_cnt + 1
                if curr_max_logl > max_logl:
                    max_setting = curr_max_setting
                    max_logl = curr_max_logl
                last_setting = curr_max_setting
            
            # Stop once as many guided rounds as there are target nodes have run.
            if len(target_nodes) == target_node_cnt:
                contin_flag = False
            # Fallback round: nothing was target-guided, so try expanding every
            # non-leaf active node, exactly as H2 does.
            if contin_flag and unchanged_flag and last_setting!= None:
                for node in last_setting.active_nodes:
                    if node.is_leaf():
                        pass
                    else:
                        childs = node.get_children()
                        sp_nodes = []
                        for child in childs:
                            sp_nodes.append(child)
                        for nod in last_setting.spe_nodes:
                            sp_nodes.append(nod)
                        new_sp_setting = species_setting(spe_nodes = sp_nodes, root = self.tree, sp_rate = self.fix_spe, fix_sp_rate = self.fix_spe_rate, minbr = self.min_brl)
                        logl = new_sp_setting.get_log_l()
                        if logl > curr_max_logl:
                            curr_max_logl = logl
                            curr_max_setting = new_sp_setting
                if curr_max_logl > max_logl:
                    max_setting = curr_max_setting
                    max_logl = curr_max_logl
                last_setting = curr_max_setting
                
        # Publish the result only if it beats what earlier searches found.
        if max_logl > self.max_logl:
            self.max_logl = max_logl
            self.max_setting = max_setting


    def Brutal(self, reroot = False):
        if reroot:
            self.re_rooting()
        first_node_list = []
        first_node_list.append(self.tree)
        first_childs = self.tree.get_children()
        for child in first_childs:
            first_node_list.append(child)
        num_s = self.comp_num_comb()
        if num_s > self.max_num_search:
            print("Too many search iterations: " + repr(num_s) + ", using H0 instead!!!")
            self.H0(reroot = False)
        else:
            first_setting = species_setting(spe_nodes = first_node_list, root = self.tree, sp_rate = self.fix_spe, fix_sp_rate = self.fix_spe_rate, minbr = self.min_brl)
            self.next(first_setting)


    def search(self, strategy = "H1", reroot = False):
        if strategy == "H1":
            self.H1(reroot)
        elif strategy == "H2":
            self.H2(reroot)
        elif strategy == "H3":
            self.H3(reroot)
        elif strategy == "Brutal":
            self.Brutal(reroot)
        else:
            self.H0(reroot)


    def count_species(self, print_log = True, pv = 0.001):
        """Compare the best setting against the null model and count species.

        A p-value is taken from ``lh_ratio_test(null_logl, max_logl, 1)``.
        If it is below *pv*, the best setting's own ``count_species()``
        supplies the count and ``species_list``. Otherwise all leaves are
        placed in a single species and 1 is returned.

        Args:
            print_log: print rates, log-likelihoods, the p-value and the
                ``ks_statistic()`` results of both fitted distributions.
            pv: p-value threshold.
        """
        pvalue = lh_ratio_test(self.null_logl, self.max_logl, 1).get_p_value()
        if print_log:
            three_dp = "{0:.3f}".format
            print("Speciation rate: " + three_dp(self.max_setting.rate2))
            print("Coalesecnt rate: " + three_dp(self.max_setting.rate1))
            print("Null logl: " + three_dp(self.null_logl))
            print("MAX logl: " + three_dp(self.max_logl))
            print("P-value: " + three_dp(pvalue))
            spefit, speaw = self.max_setting.e2.ks_statistic()
            coafit, coaaw = self.max_setting.e1.ks_statistic()
            print("Kolmogorov-Smirnov test for model fitting:")
            print("Speciation: " + "Dtest = {0:.3f}".format(spefit) + " " + speaw)
            print("Coalescent: " + "Dtest = {0:.3f}".format(coafit) + " " + coaaw)
        if not pvalue < pv:
            # Not significant: one species holding every leaf.
            self.species_list = [self.tree.get_leaf_names()]
            return 1
        num_sp, self.species_list = self.max_setting.count_species()
        return num_sp


    def whitening_search(self, strategy = "H1", reroot = False, pv = 0.001):
        self.search(strategy, reroot, pv)
        num_sp, self.species_list = self.max_setting.count_species()
        spekeep = self.max_setting.whiten_species()
        self.tree.prune(spekeep)
        self.max_logl = float("-inf") 
        self.max_setting = None
        self.null_logl = 0.0
        self.null_model()
        self.species_list = None
        self.counter = 0
        self.setting_set = set([])
        self.search(strategy, reroot, pv)


    def print_species(self):
        cnt = 1
        for sp in self.species_list:
            print("Species " + repr(cnt) + ":")
            for leaf in sp:
                print("          " + leaf)
            cnt = cnt + 1


    def output_species(self, taxa_order = []):
        """taxa_order is a list of taxa names, the paritions will be output as the same order"""
        if len(taxa_order) == 0:
            taxa_order = self.tree.get_leaf_names()
        
        num_taxa = 0
        for sp in self.species_list:
            for leaf in sp:
                num_taxa = num_taxa + 1
        if not len(taxa_order) == num_taxa:
            print("error error, taxa_order != num_taxa!")
            return None, None
        else: 
            partion = [-1] * num_taxa
            cnt = 1
            for sp in self.species_list:
                for leaf in sp:
                    idx = taxa_order.index(leaf)
                    partion[idx] = cnt
                cnt = cnt + 1
            return taxa_order, partion