def reroot(history_path, tree_path):
    """Restore the root child that is missing from a history tree file.

    The history at ``history_path`` is compared with the tree at
    ``tree_path``.  When the tree is rooted (its root has at most two
    children), the root child whose branch length best matches the size
    difference between the two trees is treated as missing from the history.
    A new ``missing_node{<label>}`` child is added under the history root and
    the history children whose branch lengths match that child's own children
    are moved beneath it.  The repaired history is written back to
    ``history_path``.

    ``size`` is a helper defined elsewhere in this file; it is presumably the
    total branch length of a tree -- TODO confirm.

    :param history_path: path of the history tree (read with ``format=1``);
        overwritten in place on success.
    :param tree_path: path of the original tree.
    :return: None.  Calls ``exit(1)`` when the repaired history does not
        match the original tree size.
    """
    history = Tree(history_path, format=1)
    tree = Tree(tree_path)
    if len(tree.get_children()) > 2:
        print("original tree is not rooted, no need to fix history")
        return

    # The branch that is absent from the history accounts for the difference.
    missing_length = size(tree) - size(history)
    tree_children = tree.get_children()
    if abs(tree_children[0].dist - missing_length) < abs(tree_children[1].dist - missing_length):
        missing_child = tree_children[0]
    else:
        missing_child = tree_children[1]
    grandchildren_dists = [child.dist for child in missing_child.get_children()]

    # Detach the history children that belong under the missing node and
    # remember their states ("{0}" in the name means state 0, otherwise 1).
    children_to_detach = []
    labels = []
    for child in history.get_children():
        if child.dist in grandchildren_dists:
            labels.append(0 if "{0}" in child.name else 1)
            children_to_detach.append(child.detach())
    remaining_child = history.get_children()[-1]
    labels.append(0 if "{0}" in remaining_child.name else 1)

    # The new node gets state 1 only when at least two collected states are 1.
    chosen_label = 0 if sum(labels) < 2 else 1

    new_child = history.get_tree_root().add_child(
        name="missing_node{" + str(chosen_label) + "}", dist=missing_length
    )
    for child in children_to_detach:
        new_child.add_child(child)

    # make sure that now the tree and the history are of the same length
    if abs(size(history) - size(tree)) > 0.00001:
        print("Error! failed to fix history tree")
        print("size(history) = ", size(history), "\n size(tree) = ", size(tree))
        exit(1)

    # make sure that all the lengths were written to tree string
    history_str = history.write(outfile=None, format=1)
    new_history = Tree(history_str, format=1)
    if abs(size(new_history) - size(tree)) > 0.00001:
        print("Error! failed to fix history tree newick format")
        # Report the re-parsed history, which is the tree that failed the
        # check (the original referenced an undefined name here).
        print("size(history) = ", size(new_history), "\n size(tree) = ", size(tree))
        exit(1)

    # if all went well, write the fixed history to its original file
    history.write(outfile=history_path, format=1)
def print_random_tree(num_nodes=5):
    """Populate a fresh tree and print a tour of its basic node attributes.

    A new ``Tree`` is populated with ``num_nodes`` and a series of labelled
    values (children, parent, name, branch length, leaf status and the tree
    root seen from several nodes) is printed, followed by every leaf name.

    :param num_nodes: value passed to ``Tree.populate``.
    """
    tree = Tree()
    tree.populate(num_nodes)

    # Each probe is evaluated lazily, right before it is printed, so the
    # output order (and the point of any failure) matches a plain sequence
    # of print calls.
    probes = (
        ("t", lambda: tree),
        ("children", lambda: tree.children),
        ("get_children", lambda: tree.get_children()),
        ("up", lambda: tree.up),
        ("name", lambda: tree.name),
        ("dist", lambda: tree.dist),
        ("is_leaf", lambda: tree.is_leaf()),
        ("get_tree_root", lambda: tree.get_tree_root()),
        ("children[0].get_tree_root",
         lambda: tree.children[0].get_tree_root()),
        ("children[0].children[0].get_tree_root",
         lambda: tree.children[0].children[0].get_tree_root()),
    )
    for label, probe in probes:
        print(label, probe())

    # Iterating a tree yields its leaves.
    for tip in tree:
        print(tip.name)
class exponential_mixture:
    """ML search PTP, to use: __init__(), search() and count_species()"""

    def __init__(
        self,
        tree,
        sp_rate=0,
        fix_sp_rate=False,
        max_iters=20000,
        min_br=0.0001,
    ):
        """Load the tree and compute the null-model likelihood.

        :param tree: tree source handed to ``Tree(tree, format=1)``.
        :param sp_rate: speciation rate forwarded to every ``species_setting``.
        :param fix_sp_rate: forwarded to every ``species_setting``.
        :param max_iters: upper bound on the number of settings ``Brutal``
            is willing to enumerate before falling back to ``H0``.
        :param min_br: branches not longer than this are ignored by the
            null model; also forwarded to ``species_setting`` as ``minbr``.
        """
        self.min_brl = min_br
        self.tree = Tree(tree, format=1)
        self.tree.resolve_polytomy(recursive=True)
        self.tree.dist = 0.0
        self.fix_spe_rate = fix_sp_rate
        self.fix_spe = sp_rate
        self.max_logl = float("-inf")
        self.max_setting = None
        self.null_logl = 0.0
        self.null_model()
        self.species_list = None
        self.counter = 0
        self.setting_set = set()
        self.max_num_search = max_iters

    def null_model(self):
        """Fit one exponential to all branches longer than ``min_brl``.

        Stores the summed log-likelihood in ``self.null_logl`` and returns
        the fitted rate.
        """
        coa_br = [
            node.dist
            for node in self.tree.get_descendants()
            if node.dist > self.min_brl
        ]
        e1 = exp_distribution(coa_br)
        self.null_logl = e1.sum_log_l()
        return e1.rate

    def __compare_node(self, node):
        """Sort key: the node's branch length."""
        return node.dist

    def _nodes_by_branch_length(self):
        """Return all descendants ordered from longest to shortest branch."""
        nodes = self.tree.get_descendants()
        # sort() followed by reverse() (rather than reverse=True) is kept on
        # purpose: it determines the order of equal-length branches.
        nodes.sort(key=self.__compare_node)
        nodes.reverse()
        return nodes

    def _initial_setting(self):
        """Return the starting setting: the root plus its direct children."""
        first_node_list = [self.tree]
        first_node_list.extend(self.tree.get_children())
        return species_setting(
            spe_nodes=first_node_list,
            root=self.tree,
            sp_rate=self.fix_spe,
            fix_sp_rate=self.fix_spe_rate,
            minbr=self.min_brl,
        )

    def _split_setting(self, node, base_setting):
        """Return ``base_setting`` extended with the children of ``node``."""
        sp_nodes = list(node.get_children())
        sp_nodes.extend(base_setting.spe_nodes)
        return species_setting(
            spe_nodes=sp_nodes,
            root=self.tree,
            sp_rate=self.fix_spe,
            fix_sp_rate=self.fix_spe_rate,
            minbr=self.min_brl,
        )

    def _record_best(self, max_logl, max_setting):
        """Keep ``max_setting`` if it beats the best setting found so far."""
        if max_logl > self.max_logl:
            self.max_logl = max_logl
            self.max_setting = max_setting

    def re_rooting(self):
        """Re-root the tree on the descendant with the longest branch."""
        rootnode = self._nodes_by_branch_length()[0]
        self.tree.set_outgroup(rootnode)
        self.tree.dist = 0.0

    def comp_num_comb(self):
        """Annotate every node with ``cnt`` and return the root's value.

        Leaves get 1.0; an internal node gets the product of its children's
        counts plus one.  ``Brutal`` compares the root value against
        ``max_num_search``.
        """
        for node in self.tree.traverse(strategy="postorder"):
            if node.is_leaf():
                node.add_feature("cnt", 1.0)
            else:
                acum = 1.0
                for child in node.get_children():
                    acum = acum * child.cnt
                node.add_feature("cnt", acum + 1.0)
        return self.tree.cnt

    def next(self, sp_setting):
        """Recursively explore every setting reachable from ``sp_setting``.

        Visited settings are remembered in ``self.setting_set`` (as frozen
        sets of speciation nodes) so each one is expanded only once.
        """
        self.setting_set.add(frozenset(sp_setting.spe_nodes))
        logl = sp_setting.get_log_l()
        if logl > self.max_logl:
            self.max_logl = logl
            self.max_setting = sp_setting
        for node in sp_setting.active_nodes:
            if node.is_leaf():
                continue
            sp_nodes = list(node.get_children())
            sp_nodes.extend(sp_setting.spe_nodes)
            new_sp_setting = species_setting(
                spe_nodes=sp_nodes,
                root=sp_setting.root,
                sp_rate=sp_setting.spe_rate,
                fix_sp_rate=sp_setting.fix_spe_rate,
                minbr=self.min_brl,
            )
            if frozenset(sp_nodes) not in self.setting_set:
                self.next(new_sp_setting)

    def H0(self, reroot=True):
        """Run H1, H2 and H3 in turn; the best setting of the three is kept."""
        self.H1(reroot)
        self.H2(reroot=False)
        self.run_h3(reroot=False)

    def H1(self, reroot=True):
        """Heuristic 1: add nodes as speciation nodes, longest branch first.

        For each node that is not yet a speciation node, the children of its
        ancestors are added up to the first ancestor that already is one (or
        up to the root).
        """
        if reroot:
            self.re_rooting()
        sorted_node_list = self._nodes_by_branch_length()
        last_setting = self._initial_setting()
        max_logl = last_setting.get_log_l()
        max_setting = last_setting

        for node in sorted_node_list:
            if node in last_setting.spe_nodes:
                # node already is a speciation node, do nothing
                continue
            curr_sp_nodes = list(last_setting.spe_nodes)

            def add_children_of(parent):
                for nod in parent.get_children():
                    if nod not in curr_sp_nodes:
                        curr_sp_nodes.append(nod)

            # find the father of this new node
            chosen_branching_node = node.up
            add_children_of(chosen_branching_node)
            if chosen_branching_node not in last_setting.spe_nodes:
                # Climb until an existing speciation node (or the root).
                while not chosen_branching_node.is_root():
                    chosen_branching_node = chosen_branching_node.up
                    add_children_of(chosen_branching_node)
                    if chosen_branching_node in last_setting.spe_nodes:
                        break

            new_setting = species_setting(
                spe_nodes=curr_sp_nodes,
                root=self.tree,
                sp_rate=self.fix_spe,
                fix_sp_rate=self.fix_spe_rate,
                minbr=self.min_brl,
            )
            new_logl = new_setting.get_log_l()
            if new_logl > max_logl:
                max_logl = new_logl
                max_setting = new_setting
            last_setting = new_setting

        self._record_best(max_logl, max_setting)

    def H2(self, reroot=True):
        """Greedy"""
        if reroot:
            self.re_rooting()
        last_setting = self._initial_setting()
        max_logl = last_setting.get_log_l()
        max_setting = last_setting
        contin_flag = True
        while contin_flag:
            curr_max_logl = float("-inf")
            curr_max_setting = None
            contin_flag = False
            # Try splitting every active internal node; keep the best split.
            for node in last_setting.active_nodes:
                if node.is_leaf():
                    continue
                contin_flag = True
                new_sp_setting = self._split_setting(node, last_setting)
                logl = new_sp_setting.get_log_l()
                if logl > curr_max_logl:
                    curr_max_logl = logl
                    curr_max_setting = new_sp_setting
            if curr_max_logl > max_logl:
                max_setting = curr_max_setting
                max_logl = curr_max_logl
            last_setting = curr_max_setting

        self._record_best(max_logl, max_setting)

    def run_h3(self, reroot=True):
        """Heuristic 3: greedy search guided by a two-exponential split.

        The branch lengths (longest first) are cut at the index that
        maximises the summed log-likelihood of two exponential fits; the
        nodes before the cut are the targets.  The greedy search then prefers
        splits whose children include a target, and falls back to an
        unrestricted greedy step when no such split exists.
        """
        if reroot:
            self.re_rooting()
        sorted_node_list = self._nodes_by_branch_length()
        sorted_br = [node.dist for node in sorted_node_list]

        maxlogl = float("-inf")
        maxidx = -1
        for i in range(1, len(sorted_node_list)):
            e1 = exp_distribution(sorted_br[0:i])
            e2 = exp_distribution(sorted_br[i:])
            logl = e1.sum_log_l() + e2.sum_log_l()
            if logl > maxlogl:
                maxidx = i
                maxlogl = logl
        # NOTE(review): if no cut is selected, maxidx stays -1 and this slice
        # keeps every node except the last one -- confirm this is intended.
        target_nodes = sorted_node_list[0:maxidx]

        last_setting = self._initial_setting()
        max_logl = last_setting.get_log_l()
        max_setting = last_setting
        contin_flag = True
        target_node_cnt = 0
        while contin_flag:
            curr_max_logl = float("-inf")
            curr_max_setting = None
            contin_flag = False
            unchanged_flag = True
            for node in last_setting.active_nodes:
                if node.is_leaf():
                    continue
                contin_flag = True
                childs = node.get_children()
                if any(child in target_nodes for child in childs):
                    unchanged_flag = False
                    new_sp_setting = self._split_setting(node, last_setting)
                    logl = new_sp_setting.get_log_l()
                    if logl > curr_max_logl:
                        curr_max_logl = logl
                        curr_max_setting = new_sp_setting
            if not unchanged_flag:
                target_node_cnt = target_node_cnt + 1
                if curr_max_logl > max_logl:
                    max_setting = curr_max_setting
                    max_logl = curr_max_logl
                last_setting = curr_max_setting

            if len(target_nodes) == target_node_cnt:
                contin_flag = False
            if contin_flag and unchanged_flag and last_setting is not None:
                # No split touched a target node: take a plain greedy step.
                for node in last_setting.active_nodes:
                    if node.is_leaf():
                        continue
                    new_sp_setting = self._split_setting(node, last_setting)
                    logl = new_sp_setting.get_log_l()
                    if logl > curr_max_logl:
                        curr_max_logl = logl
                        curr_max_setting = new_sp_setting
                if curr_max_logl > max_logl:
                    max_setting = curr_max_setting
                    max_logl = curr_max_logl
                last_setting = curr_max_setting

        self._record_best(max_logl, max_setting)

    def Brutal(self, reroot=False):
        """Exhaustive search, falling back to ``H0`` on large trees."""
        if reroot:
            self.re_rooting()
        num_s = self.comp_num_comb()
        if num_s > self.max_num_search:
            print("Too many search iterations: " + repr(num_s) + ", using H0 instead!!!")
            self.H0(reroot=False)
        else:
            self.next(self._initial_setting())

    def search(self, strategy="H1", reroot=False):
        """Run the named strategy; unknown names select ``H0``.

        :param strategy: one of ``"H1"``, ``"H2"``, ``"H3"``, ``"Brutal"``.
        :param reroot: whether to re-root on the longest branch first.
        """
        if strategy == "H1":
            self.H1(reroot)
        elif strategy == "H2":
            self.H2(reroot)
        elif strategy == "H3":
            self.run_h3(reroot)
        elif strategy == "Brutal":
            self.Brutal(reroot)
        else:
            self.H0(reroot)

    def count_species(self, print_log=True, pv=0.001):
        """Return the number of species supported by the best setting.

        A likelihood-ratio test compares the null model with the best
        setting.  When its p-value is below ``pv`` the species of the best
        setting are stored in ``self.species_list`` and counted; otherwise
        all leaves form a single species and 1 is returned.

        :param print_log: print rates, likelihoods, the p-value and the
            Kolmogorov-Smirnov fit of both exponentials.
        :param pv: significance threshold for the likelihood-ratio test.
        """
        lhr = lh_ratio_test(self.null_logl, self.max_logl, 1)
        pvalue = lhr.get_p_value()
        if print_log:
            print("Speciation rate: " + "{0:.3f}".format(self.max_setting.rate2))
            print("Coalesecnt rate: " + "{0:.3f}".format(self.max_setting.rate1))
            print("Null logl: " + "{0:.3f}".format(self.null_logl))
            print("MAX logl: " + "{0:.3f}".format(self.max_logl))
            print("P-value: " + "{0:.3f}".format(pvalue))
            spefit, speaw = self.max_setting.e2.ks_statistic()
            coafit, coaaw = self.max_setting.e1.ks_statistic()
            print("Kolmogorov-Smirnov test for model fitting:")
            print("Speciation: " + "Dtest = {0:.3f}".format(spefit) + " " + speaw)
            print("Coalescent: " + "Dtest = {0:.3f}".format(coafit) + " " + coaaw)
        if pvalue < pv:
            num_sp, self.species_list = self.max_setting.count_species()
            return num_sp
        self.species_list = [self.tree.get_leaf_names()]
        return 1

    def whitening_search(self, strategy="H1", reroot=False, pv=0.001):
        """Search, prune the tree to the whitened species, then search again.

        :param strategy: strategy name, as for :meth:`search`.
        :param reroot: as for :meth:`search`.
        :param pv: accepted for backward compatibility; ``search`` takes no
            p-value, so it is not forwarded.
        """
        self.search(strategy, reroot)
        num_sp, self.species_list = self.max_setting.count_species()
        spekeep = self.max_setting.whiten_species()
        self.tree.prune(spekeep)
        # Reset the search state before searching the pruned tree.
        self.max_logl = float("-inf")
        self.max_setting = None
        self.null_logl = 0.0
        self.null_model()
        self.species_list = None
        self.counter = 0
        self.setting_set = set()
        self.search(strategy, reroot)

    def print_species(self):
        """Print every species in ``self.species_list`` with its leaves."""
        for cnt, sp in enumerate(self.species_list, 1):
            print("Species " + repr(cnt) + ":")
            for leaf in sp:
                print(" " + leaf)

    def output_species(self, taxa_order=None):
        """Return the species partition in the order given by ``taxa_order``.

        :param taxa_order: list of taxa names; the partition is output in
            the same order.  When omitted or empty, the tree's leaf order is
            used.
        :return: ``(taxa_order, partition)`` where ``partition[i]`` is the
            1-based species number of ``taxa_order[i]``, or ``(None, None)``
            when ``taxa_order`` does not have one entry per assigned taxon.
        """
        if taxa_order is None or len(taxa_order) == 0:
            taxa_order = self.tree.get_leaf_names()
        num_taxa = sum(len(sp) for sp in self.species_list)
        if len(taxa_order) != num_taxa:
            print("error error, taxa_order != num_taxa!")
            return None, None
        partion = [-1] * num_taxa
        for cnt, sp in enumerate(self.species_list, 1):
            for leaf in sp:
                partion[taxa_order.index(leaf)] = cnt
        return taxa_order, partion
from ete3 import Tree
import sys

# Load the tree named on the command line and inspect the first child of
# its root.
t = Tree(sys.argv[1])
tnode = t.get_children()[0]
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the former print statements were a syntax error on Python 3.
print(tnode.name)
print(tnode.children)
def main(arg1,arg2):
    """Merge the trees listed in a file and print a combined Newick string.

    Reads one tree string per line from ``arg1``, folds all trees together
    with ``scm_last.scm``, builds a matrix over all taxa with ``make_mrp``,
    turns its columns into a ``Graph``, splits the graph into clades and
    concatenates the ``BCD`` result of every clade into one Newick string,
    which is printed.

    The helpers ``scm_last.scm``, ``make_mrp``, ``Graph``, ``find_semiUni``,
    ``connections_div``, ``plot_HS`` and ``BCD`` are defined outside this
    block; their exact semantics are not visible here.

    :param arg1: path of the input file, one tree per line.
    :param arg2: not used by this function.
    """
    with open(arg1) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content]
    tree2=Tree(content[0])
    triplets=[]  # never used below
    taxa=[]
    leaf_sets=[]
    # Collect the leaf names of every input tree and fold each tree into the
    # running merge result `tree2`.
    for i in range(0,len(content)):
        taxa_tree=[]
        t2=Tree(content[i])
        for leaf in t2:
            taxa_tree.append(leaf.name)
        leaf_sets.append(taxa_tree)
        taxa+=list(taxa_tree)
        tree1=Tree(content[i])
        tree2=Tree(scm_last.scm(tree1,tree2))
    # Record the leaf children of every internal, non-root node of the merged
    # tree that has more than two children.
    # NOTE(review): merge_taxa / merge_lookup are filled but not read again
    # in this function.
    merge_taxa=[]
    merge_count=0
    merge_lookup={}
    for node in tree2.traverse("postorder"):
        # Do some analysis on node
        if (node.is_root() is False) and (node.is_leaf() is False):
            children=node.get_children()
            if len(children) >2:
                merge_taxa.append([])
                for c in children:
                    if c.is_leaf():
                        merge_taxa[merge_count].append(c.name)
                        merge_lookup[c.name]=merge_count
                merge_count +=1
    # Map every taxon name to a row index (d) and back (inv_map).
    taxa=set(taxa)
    d = {ni: indi for indi, ni in enumerate(set(taxa))}
    inv_map = {v: k for k, v in d.items()}
    # `a` starts as a placeholder and is replaced by whatever make_mrp
    # returns; it is later accessed through `a.T`, so it is presumably a
    # 2-D array with one row per taxon -- TODO confirm.
    a=0
    counter=0
    for i in range(0,len(content)):
        t2=Tree(content[i])
        missing_taxa=set(taxa)-set(leaf_sets[i])
        # Both subtrees below the root are processed separately.
        tree_right=t2.get_children()[0]
        tree_left=t2.get_children()[1]
        temp_r,a,counter=make_mrp(tree_right,a,d,missing_taxa,counter)
        temp_l,a,counter=make_mrp(tree_left,a,d,missing_taxa,counter)
    connections=[]
    missing=[]
    bad_connect=[]
    counter=0
    # Graph node labels are the 1-based column numbers as strings.
    int_nodes=range(1, len( a.T)+1)
    node=[]
    for i in int_nodes:
        node.append(str(i))
    # Classify every (column, taxon) cell: 1 -> connection (stored in both
    # directions), 2 -> missing, anything else -> bad connection.
    for column in a.T:
        counter +=1
        for i in range(len(column)):
            if column[i]== 1:
                connections.append([str(counter),inv_map[i]])
                connections.append([inv_map[i],str(counter)])
            elif column[i]== 2:
                missing.append([str(counter),inv_map[i]])
            else:
                bad_connect.append([str(counter),inv_map[i]])
    # The string literal below is disabled code kept by the original author;
    # as an expression statement it has no effect.
    ''' semi=[] counter=0 for column in a.T: counter +=1 ##print(column) if(np.prod(column)!=0): semi.append(counter)'''
    g=Graph(connections,missing,bad_connect,node)
    tree_new=Tree()
    # Remove every node reported by find_semiUni from the graph.
    semi=find_semiUni(g)
    for sem in semi:
        g.delete_semi(str(sem))
    clades,clades_connects,all_clades=connections_div(g,taxa)
    # With a single clade, recompute the split with plot_HS on a copy of the
    # graph (note the different order of the returned values).
    if len(clades)==1:
        g_HS=deepcopy(g)
        all_clades,clades,clades_connects=plot_HS(g_HS,taxa)
        g_HS=None
    newick=""
    # For every clade, restrict a copy of the graph to that clade's
    # connections and leaves and append the BCD result.
    for ci in range(len(clades_connects)):
        clade=clades_connects[ci]
        ci_taxa=clades[ci]
        g_ci=deepcopy(g)
        delete_clade=set(all_clades)-clade[0]
        delete_leaves=taxa-ci_taxa[0]
        for c in delete_clade:
            g_ci.delete_connections(c)
        for l in delete_leaves:
            g_ci.delete_leaf_complete(l)
        newick+=BCD(g_ci,ci_taxa[0])
    newick="("+ newick+");"
    print(newick)
    # NOTE(review): tree_new is still the empty Tree() created above; the
    # assembled newick string is never loaded into it.
    tree_new.show()
class Phylo_Tree_Drawer():
    """Accumulate read counts on a taxonomy tree, then draw or export it.

    Lineages are added with :meth:`add_new_lineage`.  :meth:`draw` and
    :meth:`create_directories` annotate the tree through the supplied
    ``ncbi`` object and collapse it; after that no lineage may be added.
    """

    # Ranks we always want to show
    sig_ranks = ['superkingdom', 'kingdom', 'phylum', 'class', 'order',
                 'family', 'subfamily', 'genus', 'species']

    def __init__(self, ncbi):
        """
        :param ncbi: object providing ``annotate_tree(tree)``
        """
        self.t = Tree()
        self.style = TreeStyle()
        self.style.show_leaf_name = False
        self.style.show_scale = False
        self.ncbi = ncbi
        self.is_collapsed = False

    def add_new_lineage(self, lineage, num_reads):
        '''
        Add lineage to tree
        :param lineage: lineage array as returned by NCBITaxa.get_lineage()
        :param num_reads: the number of reads you are classifying as part of
            the lineage (additive)
        :raises RuntimeError: if the tree has already been collapsed
        :raises IndexError: if lineage is empty
        '''
        if self.is_collapsed:
            raise RuntimeError("You may not add new lineages after drawing or creating directories")
        if len(lineage) == 0:
            raise IndexError("lineage must contain at least one entry")

        # Walk down the tree, reusing the direct child with a matching name
        # and creating it when it does not exist yet.
        node = self.t
        for name in lineage:
            match = None
            for candidate in node.get_children():
                if candidate.name == name:
                    match = candidate
                    break
            node = match if match is not None else node.add_child(name=name)

        # The reads are credited to the last node of the lineage only.
        if 'num_reads' not in node.features:
            node.add_feature('num_reads', num_reads)
        else:
            node.num_reads = node.num_reads + num_reads

    def draw(self, full_tree_path, simplified_tree_path, significance_ratio):
        '''
        Output full tree image at full_tree_path.
        Output simplified tree image at simplified_tree_path.
        Significance_ratio is the fraction of all reads a node needs to be
        considered significant. (Significant nodes are highlighted, and the
        simplified tree image shows only the nodes kept for that threshold.)
        Only call after you are done adding lineages.
        Requires X server
        '''
        self.ncbi.annotate_tree(self.t)
        if not self.is_collapsed:
            self._collapse_tree()
            self.is_collapsed = True

        # calculate total # of reads
        total_reads = sum(
            node.num_reads
            for node in self.t.traverse()
            if 'num_reads' in node.features
        )
        sig_threshold = round(total_reads * significance_ratio)
        self._add_text_faces(sig_threshold, total_reads)

        # Draw full tree image
        self.t.render(full_tree_path, w=35, units='in', tree_style=self.style)

        # Draw simplified tree image
        self._collapse_tree(sig_threshold)
        self.t.render(simplified_tree_path, w=35, units='in', tree_style=self.style)

    def create_directories(self, path, sequence_dict):
        '''
        This function will create a directory tree in the shape of the
        phylogenetic tree and deposit sequences there
        :param path: Root directory path
        :param sequence_dict: maps a node's taxid to the sequences written
            into that node's directory as FASTA
        '''
        self.ncbi.annotate_tree(self.t)
        if not self.is_collapsed:
            self._collapse_tree()
            self.is_collapsed = True

        def slugify(value):
            """
            Convert spaces to underscores. Remove characters that aren't
            alphanumerics, underscores, or hyphens. Convert to lowercase.
            Also strip leading and trailing whitespace.
            """
            value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
            value = re.sub(r'[^\w\s-]', '', value).strip().lower()
            return re.sub(r'[-\s]+', '_', value)

        def recursive_helper(node, *folders):
            name_slug = slugify(node.sci_name)
            full_path = os.path.join(path, *folders, name_slug)
            # Creates missing parent directories too; an existing directory
            # is left untouched.
            os.makedirs(full_path, exist_ok=True)
            # If sequences belong to this taxon, deposit them here
            if node.taxid in sequence_dict:
                SeqIO.write(
                    sequence_dict[node.taxid],
                    os.path.join(full_path, '%s_sequences.fasta' % name_slug),
                    'fasta'
                )
            for child in node.get_children():
                recursive_helper(child, *folders, name_slug)

        for top_level_node in self.t.get_children():
            recursive_helper(top_level_node)

    @staticmethod
    def _format_label(sci_name, rank, num_reads, highlight_treshold, total_reads):
        '''
        Build the label text of one node.
        :param num_reads: reads assigned to the node, or None if it has none
        :return: (label_text, highlighted)
        '''
        label_text = '%s\nRank: %s' % (sci_name, rank)
        reads = 0
        if num_reads is not None:
            reads = num_reads
            label_text = '%s\nNumber of Reads: %i' % (label_text, reads)
        highlighted = reads >= highlight_treshold
        if highlighted:
            # With no reads at all the threshold is 0 and every node is
            # highlighted; report 0% instead of dividing by zero.
            abundance = reads / total_reads if total_reads else 0.0
            label_text = '%s\nAbundance: %s' % (label_text, '{:.2%}'.format(abundance))
        return label_text, highlighted

    def _add_text_faces(self, highlight_treshold, total_reads):
        '''
        Add labels to the image
        :param highlight_treshold: If node has at least this number of reads,
            highlight the textface
        :param total_reads: total number of reads in the tree, used to
            compute the abundance shown on highlighted nodes
        '''
        for node in self.t.traverse():
            num_reads = node.num_reads if 'num_reads' in node.features else None
            label_text, highlighted = self._format_label(
                node.sci_name, node.rank, num_reads, highlight_treshold, total_reads)
            face = TextFace(label_text)
            if highlighted:
                face.background.color = "Moccasin"
            node.add_face(face, column=0)

    def _collapse_tree(self, min_reads=1):
        '''
        Remove nodes which do not have at least 'min_reads'(int) assigned
        and are not a significant rank
        '''
        def recursive_helper(node):
            # Returns True when this node or something below it has at least
            # min_reads reads.
            num_reads = 0
            if 'num_reads' in node.features:
                num_reads = node.num_reads

            # Every child must be visited (no short-circuit): the recursion
            # is what removes nodes further down.
            has_sig_descendant = False
            for child in node.get_children():
                if recursive_helper(child):
                    has_sig_descendant = True

            if num_reads >= min_reads:
                return True
            if not (node.rank in self.sig_ranks and has_sig_descendant):
                node.delete(prevent_nondicotomic=False)
            return has_sig_descendant

        for child in self.t.get_children():
            recursive_helper(child)
from ete3 import Tree

t = Tree()
# We create a random tree topology
t.populate(15)

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the former print statements were a syntax error on Python 3.
print(t)
print(t.children)
print(t.get_children())
print(t.up)
print(t.name)
print(t.dist)
print(t.is_leaf())
print(t.get_tree_root())
print(t.children[0].get_tree_root())
print(t.children[0].children[0].get_tree_root())

# You can also iterate over tree leaves using a simple syntax
for leaf in t:
    print(leaf.name)
# NB. using the node.prune() function is too slow # actualize "mergedInd" feature of new leaf newLeaf[0].mergedInd = mergedLeaves else: # populate "mergedInd" feature for future SFS mergedLeaves = "" for l in node.iter_leaves(): mergedLeaves = mergedLeaves+" "+l.name # collapse the subtree newLeaf = t.get_farthest_leaf() for child in t.get_children(): child.detach() node.add_child(newLeaf[0], newLeaf[0].name, newLeaf[1]) # actualize "mergedInd" feature of new leaf newLeaf[0].mergedInd = mergedLeaves nTrueSpecies = len(t) sys.stdout.write('C') #======================================================# # COMPUTE & EXPORT the SFS (full names & numerical) f = open(osfs+"_allinds.txt", 'w+') fn = open(osfs, 'w+') fn.write("Tip_label\t" + "\t".join("Isl_"+str(x) for x in range(0, nPops)) + "\n")
f.write('BINX, sc=' + str(x + 1) + '-' + str(x + y) + '\n') fCcScMtx = pd.concat([fCcMtx, fScMtx], axis=1) with open('families/' + fam + '.cc_sc.phy', 'w') as f: f.write(' '.join(map(str, np.shape(fCcScMtx))) + '\n') for l in fCcScMtx.index: f.write(l.ljust(pad)) rw = np.array(fCcScMtx.loc[l].values, str) rw[rw == '-1'] = '-' f.write(''.join(rw) + '\n') cmd = raxmlCommand + " -T 20 -g " + fam cmd += ".tre -c 4 -m BINGAMMAX -s " + fam cmd += ".cc_sc.phy -q " + fam + ".part.txt -n " + fam + "_rooted -p 12345" p = subprocess.Popen(cmd, shell=True, cwd=os.getcwd() + '/families/') os.waitpid(p.pid, 0) fTree = Tree('families/RAxML_bestTree.' + fam + '_rooted') fTree.resolve_polytomy() fTree.set_outgroup(fTree & outgroup) (fTree & outgroup).delete(preserve_branch_length=True) p = subprocess.Popen("rm -f RAxML*", shell=True, cwd=os.getcwd() + '/families/') os.waitpid(p.pid, 0) p = subprocess.Popen("rm -f " + fam + ".cc_sc.phy* " + fam + ".part.txt", shell=True, cwd=os.getcwd() + '/families/') os.waitpid(p.pid, 0) fTree.get_children()[0].write(outfile="outgroupRooted/" + fam + ".outgroupRooted.tre", format=1)
# Command-line inputs: the IQ tree output to be rooted, and the nhx gene
# (sub)tree generated by ensembl_api.py.
repeat_tree_file, gene_tree_file = sys.argv[1], sys.argv[2]

# Output files mirror the gene tree path, moved from the input folder to the
# output folder, with the .nhx extension swapped.
input_folder = 'genetrees_nhx'
output_folder = 'denovo/treefix'
outputs_file = gene_tree_file.replace(input_folder, output_folder)
smap_file = outputs_file.replace('.nhx', '.smap')
stree_file = outputs_file.replace('.nhx', '.stree')
rooted_file = repeat_tree_file + '.rooted'

# Root the repeat tree on the first child of its current root and save it.
with open(repeat_tree_file, 'r') as newick_handle:
    tree_string = newick_handle.readline()
repeat_tree = Tree(tree_string, format=1)
topnode = repeat_tree.get_children()[0]
repeat_tree.set_outgroup(topnode)
repeat_tree.write(format=1, outfile=rooted_file)

# Load the gene tree; its raw text is reused for the stree file below.
with open(gene_tree_file, 'r') as gene_handle:
    gene_tree_str = gene_handle.read()
gene_tree = Tree(gene_tree_str, format=1)

# smap: one "<leaf>*<TAB><leaf>" line per gene-tree leaf.
with open(smap_file, 'w') as smap:
    smap.writelines(
        leaf.name + '*\t' + leaf.name + '\n'
        for leaf in gene_tree.get_leaves()
    )

# stree: the gene tree text without its duplication tags.
with open(stree_file, 'w') as stree:
    stree.write(gene_tree_str.replace('[&&NHX:D=D]', ''))
def parse_biopp_history(history_path, base_tree_path):
    """Parse a history tree and label its nodes using the base tree.

    Node names in the history are expected to look like ``<name>{<digit>}``.
    The digit becomes the node's ``label`` feature (``"0"`` or ``"1"``) and
    the name is reduced to ``<name>``.  The history is first re-rooted on
    the node whose name contains the name of the base tree's first root
    child.  If the history root ends up with more than two children, the
    root child of the base tree that is missing from it is re-created and
    the extra children are moved beneath it.  Finally the root is named
    ``"root"`` and nodes whose name contains ``"base"`` are renamed after
    the matching parent in the base tree.

    :param history_path: history tree source, read with ``format=1``.
    :param base_tree_path: base tree source, read with ``format=1``.
    :return: the labelled history tree.
    :raises AttributeError: if a non-root node name lacks ``{<digit>}``.
    """
    base_tree = Tree(base_tree_path, format=1)
    # Raw string: the pattern contains regex escapes (\{ and \d) that are
    # invalid escape sequences in a normal string literal.
    node_data_regex = re.compile(r"([^(|)]*?)\{(\d)\}")

    # read the tree from the file
    history = Tree(history_path, format=1)

    # root the history in the same location as the base tree
    root_child = base_tree.get_children()[0]
    for node in history.traverse("postorder"):
        if root_child.name in node.name:
            history.set_outgroup(node)
            break

    for node in history.traverse("postorder"):
        if node != history.get_tree_root():
            match = node_data_regex.search(node.name)
            node_name = match.group(1)
            node_state = match.group(2)
            if "missing_node" in node_name:
                # Name the placeholder after the base-tree parent of its
                # first child.
                parent = base_tree.search_nodes(
                    name=node.get_children()[0].name)[0].up
                node_name = parent.name
            node.name = node_name
            node.add_feature("label", "0" if node_state == "0" else "1")
        else:
            # check if the root has more than 2 children, and if yes, reroot
            # the tree in accordance with the base tree
            if len(node.get_children()) > 2:
                original_children = base_tree.get_tree_root().get_children()
                current_children = history.get_children()

                # The missing node is the (last) base root child whose name
                # does not occur in any current child of the history root.
                missing_node = original_children[0]
                for orig_node in original_children:
                    if not any(orig_node.name in curr_node.name
                               for curr_node in current_children):
                        missing_node = orig_node

                apparent_node = [
                    cand for cand in original_children
                    if not cand == missing_node
                ][0]
                missing_children = [
                    cand for cand in current_children
                    if apparent_node.name not in cand.name
                ]
                new = history.get_tree_root().add_child(
                    child=None,
                    name=missing_node.name,
                    dist=missing_node.dist,
                    support=None)

                # now make the two extra children of the current root the
                # children of the new node
                for child in missing_children:
                    child.detach()
                    new.add_child(child=child)

                # set the label as per the mp solution
                if new.get_children()[0].label == new.get_children()[1].label:
                    new.add_feature("label", new.get_children()[0].label)
                else:
                    apparent_node_in_history = [
                        cand
                        for cand in history.get_tree_root().get_children()
                        if apparent_node.name in cand.name
                    ][0]
                    new.add_feature("label", apparent_node_in_history.label)
                node.add_feature("label", new.label)  # root is always BG
            else:
                node.add_feature("label", node.get_children()[0].label)

    history.get_tree_root().name = "root"

    # lastly, set base internal nodes names according to the base tree
    for node in history.traverse("postorder"):
        if "base" in node.name:
            # get the respective base node from the base tree based on the
            # already treated children in the history
            child = node.get_children()[0]
            while "mapping" in child.name:
                child = child.get_children()[0]
            base_parent = base_tree.search_nodes(
                name=child.name.rstrip())[0].up
            node.name = base_parent.name
    return history
class exponential_mixture:
    """ML search PTP, to use: __init__(), search() and count_species()

    The search methods (``H0``-``H3``, ``Brutal``) build ``species_setting``
    candidates over ``self.tree`` and keep the one with the highest
    log-likelihood in ``self.max_setting`` / ``self.max_logl``.
    """

    def __init__(self, tree, sp_rate=0, fix_sp_rate=False, max_iters=20000,
                 min_br=0.0001):
        """Load the tree, resolve polytomies and fit the null model.

        :param tree: newick source handed to ``Tree(tree, format=1)``.
        :param sp_rate: ``sp_rate`` forwarded to every ``species_setting``.
        :param fix_sp_rate: ``fix_sp_rate`` forwarded to every
            ``species_setting``.
        :param max_iters: ``Brutal`` falls back to ``H0`` when the number of
            combinations exceeds this value.
        :param min_br: branch-length threshold; the null model only uses
            branches longer than this, and it is forwarded as ``minbr``.
        """
        self.min_brl = min_br
        self.tree = Tree(tree, format=1)
        self.tree.resolve_polytomy(recursive=True)
        self.tree.dist = 0.0
        self.fix_spe_rate = fix_sp_rate
        self.fix_spe = sp_rate
        self.max_logl = float("-inf")
        self.max_setting = None
        self.null_logl = 0.0
        self.null_model()
        self.species_list = None
        self.counter = 0
        # frozensets of speciation nodes already visited by next()
        self.setting_set = set()
        self.max_num_search = max_iters

    def null_model(self):
        """Fit one exponential to all branches longer than ``min_brl``.

        Stores the summed log-likelihood in ``self.null_logl`` and returns
        the fitted rate.
        """
        coa_br = [node.dist for node in self.tree.get_descendants()
                  if node.dist > self.min_brl]
        e1 = exp_distribution(coa_br)
        self.null_logl = e1.sum_log_l()
        return e1.rate

    def __compare_node(self, node):
        """Sort key: the node's branch length."""
        return node.dist

    def _sorted_descendants(self):
        """Return all descendants ordered by decreasing branch length."""
        nodes = self.tree.get_descendants()
        # sort() followed by reverse() is kept on purpose: unlike
        # sort(reverse=True) it also reverses the relative order of nodes
        # with equal branch lengths, which the searches depend on.
        nodes.sort(key=self.__compare_node)
        nodes.reverse()
        return nodes

    def _new_setting(self, spe_nodes):
        """Build a species_setting on this tree with the configured rates."""
        return species_setting(spe_nodes=spe_nodes, root=self.tree,
                               sp_rate=self.fix_spe,
                               fix_sp_rate=self.fix_spe_rate,
                               minbr=self.min_brl)

    def _root_setting(self):
        """Build the starting setting: the root plus its direct children."""
        return self._new_setting([self.tree] + list(self.tree.get_children()))

    def _expand(self, setting, node):
        """Build the setting obtained by adding node's children to setting."""
        sp_nodes = list(node.get_children()) + list(setting.spe_nodes)
        return self._new_setting(sp_nodes)

    def _keep_if_better(self, logl, setting):
        """Record (logl, setting) as the overall best if it beats max_logl."""
        if logl > self.max_logl:
            self.max_logl = logl
            self.max_setting = setting

    def re_rooting(self):
        """Re-root the tree on its longest branch and zero the root length."""
        rootnode = self._sorted_descendants()[0]
        self.tree.set_outgroup(rootnode)
        self.tree.dist = 0.0

    def comp_num_comb(self):
        """Annotate every node with ``cnt`` and return the root's value.

        Leaves get 1.0; an internal node gets the product of its children's
        ``cnt`` plus 1.0.
        """
        for node in self.tree.traverse(strategy='postorder'):
            if node.is_leaf():
                node.add_feature("cnt", 1.0)
            else:
                acum = 1.0
                for child in node.get_children():
                    acum = acum * child.cnt
                node.add_feature("cnt", acum + 1.0)
        return self.tree.cnt

    def next(self, sp_setting):
        """Recursively evaluate every setting reachable from ``sp_setting``.

        Each visited set of speciation nodes is remembered in
        ``self.setting_set`` so it is not expanded twice.
        """
        self.setting_set.add(frozenset(sp_setting.spe_nodes))
        self._keep_if_better(sp_setting.get_log_l(), sp_setting)
        for node in sp_setting.active_nodes:
            if node.is_leaf():
                continue
            sp_nodes = list(node.get_children()) + list(sp_setting.spe_nodes)
            # Rates are inherited from the parent setting here, not from self.
            new_sp_setting = species_setting(
                spe_nodes=sp_nodes, root=sp_setting.root,
                sp_rate=sp_setting.spe_rate,
                fix_sp_rate=sp_setting.fix_spe_rate, minbr=self.min_brl)
            if frozenset(sp_nodes) not in self.setting_set:
                self.next(new_sp_setting)

    def H0(self, reroot=True):
        """Run H1, H2 and H3 in turn; only H1 may re-root the tree."""
        self.H1(reroot)
        self.H2(reroot=False)
        self.H3(reroot=False)

    def H1(self, reroot=True):
        """Heuristic 1: grow the speciation set along the longest branches.

        Descendants are visited from longest to shortest branch.  For each
        node that is not yet a speciation node, the children of its parent
        are added, and the walk continues towards the root (adding each
        ancestor's children) until an ancestor already in the speciation set
        or the root is reached.  The best-scoring setting seen is kept.
        """
        if reroot:
            self.re_rooting()
        sorted_node_list = self._sorted_descendants()

        last_setting = self._root_setting()
        max_logl = last_setting.get_log_l()
        max_setting = last_setting

        for node in sorted_node_list:
            if node in last_setting.spe_nodes:
                # node already is a speciation node, do nothing
                continue
            curr_sp_nodes = list(last_setting.spe_nodes)

            chosen_branching_node = node.up  # find the father of this new node
            for nod in chosen_branching_node.get_children():
                if nod not in curr_sp_nodes:
                    curr_sp_nodes.append(nod)
            if chosen_branching_node not in last_setting.spe_nodes:
                while not chosen_branching_node.is_root():
                    chosen_branching_node = chosen_branching_node.up
                    for nod in chosen_branching_node.get_children():
                        if nod not in curr_sp_nodes:
                            curr_sp_nodes.append(nod)
                    if chosen_branching_node in last_setting.spe_nodes:
                        break

            new_setting = self._new_setting(curr_sp_nodes)
            new_logl = new_setting.get_log_l()
            if new_logl > max_logl:
                max_logl = new_logl
                max_setting = new_setting
            last_setting = new_setting

        self._keep_if_better(max_logl, max_setting)

    def H2(self, reroot=True):
        """Greedy"""
        if reroot:
            self.re_rooting()

        last_setting = self._root_setting()
        max_logl = last_setting.get_log_l()
        max_setting = last_setting

        contin_flag = True
        # last_setting is None only if a round produced no candidate whose
        # log-likelihood beat -inf; stop instead of dereferencing None.
        while contin_flag and last_setting is not None:
            curr_max_logl = float("-inf")
            curr_max_setting = None
            contin_flag = False
            for node in last_setting.active_nodes:
                if node.is_leaf():
                    continue
                contin_flag = True
                new_sp_setting = self._expand(last_setting, node)
                logl = new_sp_setting.get_log_l()
                if logl > curr_max_logl:
                    curr_max_logl = logl
                    curr_max_setting = new_sp_setting
            if curr_max_logl > max_logl:
                max_setting = curr_max_setting
                max_logl = curr_max_logl
            last_setting = curr_max_setting

        self._keep_if_better(max_logl, max_setting)

    def H3(self, reroot=True):
        """Heuristic 3: greedy search guided by a branch-length split.

        The branch lengths, sorted in decreasing order, are split at the
        index that maximises the summed log-likelihood of two exponential
        fits; the nodes before the split are the targets.  Each round first
        tries only expansions that expose a target node; when none applies,
        every non-leaf active node is tried instead.
        """
        if reroot:
            self.re_rooting()
        sorted_node_list = self._sorted_descendants()
        sorted_br = [node.dist for node in sorted_node_list]

        maxlogl = float("-inf")
        maxidx = -1
        for i in range(1, len(sorted_node_list)):
            e1 = exp_distribution(sorted_br[0:i])
            e2 = exp_distribution(sorted_br[i:])
            logl = e1.sum_log_l() + e2.sum_log_l()
            if logl > maxlogl:
                maxidx = i
                maxlogl = logl
        target_nodes = sorted_node_list[0:maxidx]

        last_setting = self._root_setting()
        max_logl = last_setting.get_log_l()
        max_setting = last_setting

        contin_flag = True
        target_node_cnt = 0
        # See H2: a None last_setting means no candidate could be scored.
        while contin_flag and last_setting is not None:
            curr_max_logl = float("-inf")
            curr_max_setting = None
            contin_flag = False
            unchanged_flag = True
            for node in last_setting.active_nodes:
                if node.is_leaf():
                    continue
                contin_flag = True
                childs = node.get_children()
                if any(child in target_nodes for child in childs):
                    unchanged_flag = False
                    new_sp_setting = self._expand(last_setting, node)
                    logl = new_sp_setting.get_log_l()
                    if logl > curr_max_logl:
                        curr_max_logl = logl
                        curr_max_setting = new_sp_setting

            if not unchanged_flag:
                target_node_cnt = target_node_cnt + 1
                if curr_max_logl > max_logl:
                    max_setting = curr_max_setting
                    max_logl = curr_max_logl
                last_setting = curr_max_setting

            if len(target_nodes) == target_node_cnt:
                contin_flag = False

            if contin_flag and unchanged_flag and last_setting is not None:
                # No expansion touched a target node: fall back to a plain
                # greedy step over every non-leaf active node.
                for node in last_setting.active_nodes:
                    if node.is_leaf():
                        continue
                    new_sp_setting = self._expand(last_setting, node)
                    logl = new_sp_setting.get_log_l()
                    if logl > curr_max_logl:
                        curr_max_logl = logl
                        curr_max_setting = new_sp_setting
                if curr_max_logl > max_logl:
                    max_setting = curr_max_setting
                    max_logl = curr_max_logl
                last_setting = curr_max_setting

        self._keep_if_better(max_logl, max_setting)

    def Brutal(self, reroot=False):
        """Exhaustive search via next(); falls back to H0 when too large."""
        if reroot:
            self.re_rooting()
        num_s = self.comp_num_comb()
        if num_s > self.max_num_search:
            print("Too many search iterations: " + repr(num_s) + ", using H0 instead!!!")
            self.H0(reroot=False)
        else:
            self.next(self._root_setting())

    def search(self, strategy="H1", reroot=False):
        """Run the named strategy ("H1", "H2", "H3", "Brutal"); else H0."""
        if strategy == "H1":
            self.H1(reroot)
        elif strategy == "H2":
            self.H2(reroot)
        elif strategy == "H3":
            self.H3(reroot)
        elif strategy == "Brutal":
            self.Brutal(reroot)
        else:
            self.H0(reroot)

    def count_species(self, print_log=True, pv=0.001):
        """Return the number of species supported by the best setting.

        A likelihood-ratio test compares ``null_logl`` with ``max_logl``.
        When its p-value is below ``pv`` the best setting's species are
        stored in ``self.species_list`` and their count is returned;
        otherwise all leaves form a single species and 1 is returned.

        :param print_log: print rates, log-likelihoods, the p-value and the
            Kolmogorov-Smirnov results.
        :param pv: p-value threshold for accepting the delimitation.
        """
        lhr = lh_ratio_test(self.null_logl, self.max_logl, 1)
        pvalue = lhr.get_p_value()
        if print_log:
            print("Speciation rate: " + "{0:.3f}".format(self.max_setting.rate2))
            print("Coalesecnt rate: " + "{0:.3f}".format(self.max_setting.rate1))
            print("Null logl: " + "{0:.3f}".format(self.null_logl))
            print("MAX logl: " + "{0:.3f}".format(self.max_logl))
            print("P-value: " + "{0:.3f}".format(pvalue))
            spefit, speaw = self.max_setting.e2.ks_statistic()
            coafit, coaaw = self.max_setting.e1.ks_statistic()
            print("Kolmogorov-Smirnov test for model fitting:")
            print("Speciation: " + "Dtest = {0:.3f}".format(spefit) + " " + speaw)
            print("Coalescent: " + "Dtest = {0:.3f}".format(coafit) + " " + coaaw)
        if pvalue < pv:
            num_sp, self.species_list = self.max_setting.count_species()
            return num_sp
        self.species_list = [self.tree.get_leaf_names()]
        return 1

    def whitening_search(self, strategy="H1", reroot=False, pv=0.001):
        """Search, prune the tree to the whitened species, then search again.

        :param strategy: strategy name forwarded to ``search``.
        :param reroot: forwarded to ``search``.
        :param pv: accepted for backward compatibility; ``search`` takes no
            p-value, so it is not forwarded (forwarding it raised TypeError).
        """
        self.search(strategy, reroot)
        num_sp, self.species_list = self.max_setting.count_species()
        spekeep = self.max_setting.whiten_species()
        self.tree.prune(spekeep)
        # Reset the search state so the second pass starts from scratch on
        # the pruned tree.
        self.max_logl = float("-inf")
        self.max_setting = None
        self.null_logl = 0.0
        self.null_model()
        self.species_list = None
        self.counter = 0
        self.setting_set = set()
        self.search(strategy, reroot)

    def print_species(self):
        """Print every species in ``self.species_list`` with its leaves."""
        for cnt, sp in enumerate(self.species_list, 1):
            print("Species " + repr(cnt) + ":")
            for leaf in sp:
                print(" " + leaf)

    def output_species(self, taxa_order=None):
        """taxa_order is a list of taxa names, the paritions will be output as the same order

        When ``taxa_order`` is empty or ``None`` the tree's leaf names are
        used.  Returns ``(taxa_order, partition)`` where ``partition[i]`` is
        the 1-based species index of ``taxa_order[i]``, or ``(None, None)``
        when the number of taxa does not match ``len(taxa_order)``.
        """
        if not taxa_order:
            taxa_order = self.tree.get_leaf_names()

        num_taxa = sum(1 for sp in self.species_list for _ in sp)
        if len(taxa_order) != num_taxa:
            print("error error, taxa_order != num_taxa!")
            return None, None

        partion = [-1] * num_taxa
        for cnt, sp in enumerate(self.species_list, 1):
            for leaf in sp:
                partion[taxa_order.index(leaf)] = cnt
        return taxa_order, partion