def improve_tree_from_align(tree, align, if_tre=None, of_tre=None, a_file=None,
                            prefix="/tmp/", n_threads=4, params=None):
    """Run IQ-TREE with a constraint tree on an alignment and return the result.

    Parameters
    ----------
    tree : ete3 tree or None
        Constraint tree; when truthy it is written (newick format 1) to the
        input tree file. When None, the input tree file must already exist.
    align : alignment/records accepted by ``SeqIO.write`` or None
        When truthy it is written as FASTA to the alignment file. When None,
        the alignment file must already exist.
    if_tre, of_tre, a_file : str or None
        Paths of the input tree, output tree and alignment files. Each
        defaults to a fixed file name under ``prefix``.
    prefix : str or None
        Directory prefix for default file names (None means ``"./"``).
    n_threads : int
        Value passed to ``iqtree -nt``.
    params : str or None
        Extra IQ-TREE options as one string; split with shell-like quoting.

    Returns
    -------
    ete3.Tree
        Tree parsed from the first line of the output tree file.

    Raises
    ------
    subprocess.CalledProcessError
        If ``iqtree`` exits with a non-zero status.
    """
    # Local imports keep the block self-contained.
    import shlex
    import shutil

    # Best-effort warnings only: existing files under `prefix` may be reused.
    if (tree is None) and (if_tre is None):
        print("ERROR: You must give me a tree object or file")
    if (align is None) and (a_file is None):
        print("ERROR: You must give me an alignment or file")
    if params is None:
        params = "-m HKY+G -me 0.05 -blmin 0.000005 -blmax 4"
    if prefix is None:
        prefix = "./"

    ifl = prefix + "in_iqtre.tre" if if_tre is None else if_tre
    ofl = prefix + "out_iqtre.tre" if of_tre is None else of_tre
    aln = prefix + "seq.aln" if a_file is None else a_file

    if align:  # else it should already be present in the alignment file
        SeqIO.write(align, aln, "fasta")
    if tree:  # to recycle the files make sure tree and align are None
        tree.write(format=1, outfile=ifl)

    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed verbatim, and an iqtree failure is no longer
    # hidden behind the exit status of a trailing `mv`.
    command = ["iqtree", "-g", ifl, "-s", aln]
    command += shlex.split(params)
    command += ["-ninit", "1", "-nt", str(n_threads), "-redo"]
    subprocess.check_output(command, universal_newlines=True)
    shutil.move(aln + ".treefile", ofl)

    with open(ofl) as handle:
        treestring = (handle.readline().rstrip()
                      .replace("\'", "").replace("\"", "").replace("[&R]", ""))
    return ete3.Tree(treestring)
def check_supersets(tree):
    """Regroup the children of *tree* according to their ``mutations`` sets.

    For every ordered pair of distinct children (c1, c2): if one child's
    ``mutations`` is a subset of the other's, it is detached and re-attached
    below the other. Then, if the two sets intersect, both children are
    detached and placed under a new intermediate node whose ``mutations`` is
    the intersection; that node is added to *tree*. The function then
    recurses into ``c1``.

    Returns True if a subset move happened here or the recursion returned
    True; creating an intermediate node alone does not set the flag.

    NOTE(review): the nesting below was reconstructed from a whitespace-
    collapsed source. In particular, confirm whether the overlap handling and
    the recursive call belong at these indentation levels.
    NOTE(review): ``tree.children`` is modified (detach/add_child) while both
    loops iterate over it - confirm this is intended.
    """
    if tree.is_leaf():
        return False
    moved = False
    for c1 in tree.children:
        for c2 in tree.children:
            if c1 == c2:
                continue
            # One mutation set contained in the other: nest the subset child.
            if c1.mutations.issubset(c2.mutations):
                c1.detach()
                c2.add_child(c1)
                moved = True
            elif c2.mutations.issubset(c1.mutations):
                c2.detach()
                c1.add_child(c2)
                moved = True
            # Shared mutations: group both children under a new node that
            # carries only the shared part.
            overlap = c1.mutations.intersection(c2.mutations)
            if len(overlap) > 0:
                c1.detach()
                c2.detach()
                intermediate = instantiate_node(ete3.Tree(name='NoName'))
                intermediate.mutations = overlap
                intermediate.add_child(c1)
                intermediate.add_child(c2)
                tree.add_child(intermediate)
        # Short-circuits: once moved is True the recursion is skipped.
        moved = moved or check_supersets(c1)
    return moved
def grab_trees(self, db=None, collapse=True):
    """Load the stored trees and intervals for this scan from the HDF5 file.

    Parameters
    ----------
    db : open PyTables file or None
        Handle to read from. When None, ``self.h5name`` is opened read-only
        and closed again before returning (also when reading fails).
    collapse : bool
        When True, internal nodes whose ``dist`` is below
        ``self.branch_cutoff`` are deleted from every tree.

    Returns
    -------
    (list of ete3.Tree, list of tuple)
        Trees parsed from ``/<scanName>/trees`` and (first, second) pairs
        taken from ``/<scanName>/ivals``.
    """
    close = db is None
    if close:
        db = tables.open_file(self.h5name, mode="r")
    try:
        trees = [
            ete3.Tree(tree.decode('utf-8'))
            for tree in db.get_node("/" + self.scanName + "/trees", classname="Array")
        ]
        intvals = [
            (x[0], x[1])
            for x in db.get_node("/" + self.scanName + "/ivals", classname="Array")
        ]
    finally:
        # Only close handles opened here; a caller-supplied db stays open.
        if close:
            db.close()

    if collapse:
        for tree in trees:
            for node in tree.get_descendants():
                if not node.is_leaf() and node.dist < self.branch_cutoff:
                    node.delete()
    return trees, intvals
def load_tree(self, nhx_fn):
    """Parse *nhx_fn* as an ete3 tree (format 1) and name unnamed nodes.

    The tree is stored on ``self.tree``. Every node whose name is the empty
    string is renamed ``node_<i>``, where ``i`` is its position in the
    default traversal of the tree.
    """
    self.tree = ete3.Tree(nhx_fn, format=1)
    unnamed = (
        (position, node)
        for position, node in enumerate(self.tree.traverse())
        if node.name == ""
    )
    for position, node in unnamed:
        node.name = "node_{}".format(position)
def get_tree_with_famsizes(self):
    """Annotate a copy of the reference tree with family sizes and events.

    ``self.tree`` becomes a deep copy of ``self.c.tree``. Each node gets:

    * ``fam_size`` - from the node at the same traversal position in the tree
      parsed from ``self.nwk_famsize_str`` (leaf: integer after the first
      ``_`` in the name; internal node: the support value);
    * ``event`` - ``'+'``/``'-'`` when the branch p-value is at or below
      ``self.c.branch_p_cutoff`` and the size is larger/smaller than the
      parent's, otherwise None;
    * ``pvalue`` - set only on nodes listed in ``self.c.cafe_node_id_order``:
      the parsed branch p-value, or None when the entry is ``'-'`` or the
      family p-value exceeds ``self.c.family_p_cutoff``.

    All sizes are also collected, in traversal order, in ``self.fam_sizes``.
    """
    self.fam_sizes = []
    sizes_tree = ete3.Tree(self.nwk_famsize_str)
    self.tree = copy.deepcopy(self.c.tree)

    # Family sizes: both trees are walked in lockstep.
    for target, sized in zip(self.tree.traverse(), sizes_tree.traverse()):
        if sized.is_leaf():
            size = int(sized.name.split('_')[1])
        else:
            size = int(sized.support)
        target.fam_size = size
        self.fam_sizes.append(size)
        target.event = None

    # Branch p-values, listed in the order given by cafe_node_id_order.
    raw_pvalues = re.findall(r'[\d\.]+|-', self.branch_pvalue_str)
    for node_id, raw_pvalue in zip(self.c.cafe_node_id_order, raw_pvalues):
        matches = self.tree.search_nodes(id=node_id)
        assert len(matches) == 1
        target = matches[0]

        if raw_pvalue == '-' or self.pvalue > self.c.family_p_cutoff:
            target.pvalue = None
            continue

        target.pvalue = float(raw_pvalue)
        if target.pvalue > self.c.branch_p_cutoff:
            continue

        parent_size = target.up.fam_size
        if target.fam_size > parent_size:
            target.event = '+'
        elif target.fam_size < parent_size:
            target.event = '-'
def fit_clustering_model(args):
    """Estimate per-organism parameters and reduce them to global means.

    Per-organism lambda and phi values come from
    ``estimate_parameters.estimate_individual_parameters``. When
    ``args["weight"] is True`` the means are weighted by normalised GSC
    weights computed on the tree in ``args["tree_file"]``; otherwise a plain
    arithmetic mean is used. The result is stored in ``args["lambd"]`` and
    ``args["phi"]`` and saved to ``args["general_parameters_filename"]``.

    Returns
    -------
    (lambd, phi)
        The global mean parameter values.
    """
    logger.info("Started estimating clustering model parameters "
                "for each organism")
    lambdas, phis = estimate_parameters.estimate_individual_parameters(args)
    logger.info("Computing global mean estimated parameter values")
    if args["weight"] is True:
        logger.info("Importing phylogenetic tree %s", args["tree_file"])
        tree = ete3.Tree(args["tree_file"])
        logger.info("Phylogenetic tree %s imported", args["tree_file"])
        organism_weights = GSC.GSC_normalised(tree)
        lambd = estimate_parameters.compute_weighted_means(
            lambdas, organism_weights)
        phi = estimate_parameters.compute_weighted_means(
            phis, organism_weights)
    else:
        # On Python 3 `.values()` is a view, which numpy cannot average;
        # materialise it as a list first.
        lambd = np.mean(list(lambdas.values()))
        phi = np.mean(list(phis.values()))
    args["lambd"], args["phi"] = lambd, phi
    logger.info("Global mean estimated parameter "
                "values: lambda={:.3g}, phi={:.3g}".format(
                    args["lambd"], args["phi"]))
    import_export_parameters.save_general_parameters(
        lambd, phi, args["general_parameters_filename"])
    logger.info("Global mean estimated parameter values saved in file %s",
                args["general_parameters_filename"])
    return lambd, phi
def distances_between_roots(folder):
    """Measure how far apart the roots of alternative rootings are.

    Trees are read from the second line of
    ``<folder>/<folder>.optResolution<i>.ranger_input`` for i = 1, 2, ...
    until a file is missing. For every pair (tree1, tree2), the root position
    of tree2 is located in tree1 via the first child of tree2's root that is
    a leaf or is monophyletic in tree1. The topological distance from
    tree1's root to that node is recorded.

    Returns
    -------
    list
        One topology-only distance per pair of trees.

    Raises
    ------
    ValueError
        If no child of tree2's root can be located in tree1. Previously this
        raised an unbound-variable error or silently reused the node found
        for the preceding pair.
    """
    trees = []
    count = 1
    while True:
        path = '%s/%s.optResolution%i.ranger_input' % (folder, folder, count)
        if not os.path.isfile(path):
            break
        with open(path) as handle:
            trees.append(ete3.Tree(handle.readlines()[1]))
        count += 1

    root_distances = []
    for tree1, tree2 in combinations(trees, 2):
        new_root = None  # reset per pair so a stale node is never reused
        for child in tree2.children:
            if child.is_leaf():
                new_root = tree1.get_leaves_by_name(name=child.name)[0]
                break
            leaf_names = child.get_leaf_names()
            is_monophyletic, _clade_type, _breakers = tree1.check_monophyly(
                leaf_names, 'name', unrooted=False)
            if is_monophyletic:
                new_root = tree1.get_common_ancestor(leaf_names)
                break
        if new_root is None:
            raise ValueError(
                'Could not locate the root of one tree inside another '
                'in folder %s' % folder)
        root_distances.append(tree1.get_distance(new_root, topology_only=True))
    return root_distances
def find_omitted_seqs_in_al(genetreelist, ancestor, rootdir='.',
                            subtreesdir='subtreesGoodQualO2'):
    """Report name mismatches between each alignment and its subtree.

    For every ``*_genes.fa`` file yielded by ``iter_glob_subtree_files``, the
    record names of the FASTA alignment are compared with the leaf names of
    the sibling ``.nwk`` tree. One line is printed per mismatching subtree,
    followed by a final ``<ok> ok; <bad> bad`` summary.

    Returns None; all output goes to stdout.
    """
    count_ok = 0
    count_bad = 0
    for alfile, subtree, genetree in iter_glob_subtree_files(
            genetreelist, ancestor, '_genes.fa', rootdir, subtreesdir):
        al = AlignIO.read(alfile, format='fasta')
        subtreefile = alfile.replace('_genes.fa', '.nwk')
        tree = ete3.Tree(subtreefile, format=1)

        al_seqnames = {record.name for record in al}
        tree_leaves = set(tree.get_leaf_names())
        al_lacking = al_seqnames - tree_leaves
        tree_lacking = tree_leaves - al_seqnames

        if al_lacking:
            count_bad += 1
            print('%s:AL lacks: %s' % (subtree, ' '.join(al_lacking)))
        elif tree_lacking:
            count_bad += 1
            # Fixed: this branch used to join `al_lacking`, which is always
            # empty here, so the offending names were never shown.
            print('%s:TREE lacks: %s' % (subtree, ' '.join(tree_lacking)))
        else:
            count_ok += 1
        if al_lacking and tree_lacking:
            print('AL & TREE mismatched!!!')

    print('%s ok; %s bad' % (count_ok, count_bad))
def root_tree_in_position(tree_file, sp1, sp2):
    """Re-root a tree on the common ancestor of two named nodes and print it.

    The tree is parsed (format 1) from the first line of *tree_file*.

    * If the root does not have exactly two children, the tree is rooted on
      its first leaf and printed so it can be fed back to this script; no
      further re-rooting is attempted.
    * Otherwise the nodes named *sp1* and *sp2* are looked up, their common
      ancestor is made the outgroup and the tree is printed.

    Problems (missing names, impossible rooting) are reported on stdout.
    Always returns None.
    """
    with open(tree_file) as f:
        t = ete3.Tree(f.readline().strip(), format=1)

    if len(t.get_children()) != 2:
        print(
            "Tree is not rooted. Please, use this tree as input and run this script again"
        )
        t.set_outgroup(t.get_leaves()[0])
        print(t.write(format=1))
        return None

    # `except Exception` instead of a bare `except`, so KeyboardInterrupt and
    # SystemExit are no longer swallowed by the lookups below.
    try:
        nsp1 = t & sp1
    except Exception:
        print("Could not find %s in tree" % sp1)
        return None
    try:
        nsp2 = t & sp2
    except Exception:
        print("Could not find %s in tree" % sp2)
        return None

    ca = t.get_common_ancestor(nsp1, nsp2)
    try:
        t.set_outgroup(ca)
    except Exception:
        print("Cannot root there! Is it already rooted in that branch?")
        return None
    print(t.write(format=1))
def read_newick(newick_path):
    """Read a Newick string from *newick_path* and return it as an ete3 tree.

    The whole file is read and surrounding whitespace is stripped before
    parsing. The file is closed even if reading fails.
    """
    with open(newick_path) as newick_file:
        newick = newick_file.read().strip()
    return ete3.Tree(newick)
def recursive_generate_topologies_bifurcating_unique(neighbour_list=None,
                                                     tree_list=None,
                                                     tree_id_set_ete=None):
    """Generate all unique bifurcating trees for *non-repeating* taxa.

    Pairs of entries in ``neighbour_list`` are joined recursively until two
    remain; the resulting Newick string is kept only if its ete3 topology id
    (``get_topology_id``) has not been seen before. Only practical when
    ``len(neighbour_list)`` is <= 7, perhaps 8.

    Parameters
    ----------
    neighbour_list : list of str, optional
        Taxa (or already-joined subtrees). Defaults to ``['1', '2', '3']``.
    tree_list : list, optional
        Accumulator for the Newick strings; a new list when omitted.
    tree_id_set_ete : set, optional
        Accumulator for seen topology ids; a new set when omitted.

    Returns
    -------
    list of str
        ``tree_list`` with the newly found topologies appended.
    """
    # Fresh containers per call: the former `[]` / `set()` defaults were
    # shared between independent top-level calls.
    if neighbour_list is None:
        neighbour_list = ['1', '2', '3']
    if tree_list is None:
        tree_list = []
    if tree_id_set_ete is None:
        tree_id_set_ete = set()

    if len(neighbour_list) == 2:
        new_tree = "(%s,%s);" % (neighbour_list[0], neighbour_list[1])
        new_tree_ete_id = ete3.Tree(newick=new_tree).get_topology_id()
        if new_tree_ete_id not in tree_id_set_ete:
            tree_list.append(new_tree)
            tree_id_set_ete.add(new_tree_ete_id)
    else:
        for i in range(len(neighbour_list)):
            for j in range(i):
                # j < i always holds here, so the former `else` branch of
                # `if i > j` was unreachable and has been dropped.
                remaining = copy.copy(neighbour_list)
                left = remaining.pop(remaining.index(neighbour_list[i]))
                right = remaining.pop(remaining.index(neighbour_list[j]))
                # NOTE(review): the recursion targets a differently named
                # function that is not visible in this chunk; confirm it is
                # not meant to be this function.
                recursive_generate_uniquetopologies(
                    ["(%s,%s)" % (left, right)] + remaining,
                    tree_list=tree_list,
                    tree_id_set_ete=tree_id_set_ete)
    # NOTE(review): placement of this progress message was reconstructed from
    # a whitespace-collapsed source; confirm the intended nesting.
    if len(tree_list) % 1000 == 0:
        print("Generated %d trees" % (len(tree_list)))
    return tree_list
def parse_and_clean_tree(n_pairs, args):
    """Parse pending subclade probability files in parallel, one task per node.

    The work list is ``args["final_probabilities_filename"]`` followed by the
    files that ``file_utilities.get_file_list`` finds in
    ``args["subclade_clustering_dir"]`` for
    ``args["final_probabilities_pattern"]``. If at least one of them is not
    flagged as processed yet, the labelled tree is loaded and
    ``process_child_node`` is run, via ``MyMultiProcess``, over every node
    name of the tree in level order.

    Returns None.

    NOTE(review): the extent of the ``if`` body was reconstructed from a
    whitespace-collapsed source; confirm that the multiprocessing run is
    meant to be skipped when nothing is left to process.
    """
    subclades_final_probabilities_files = [args[
        "final_probabilities_filename"]] + \
        file_utilities.get_file_list(
            args["subclade_clustering_dir"], args[
                "final_probabilities_pattern"], verbose=True)
    # Per-file flags; their sum is used as the count of finished files.
    processed_files = load_processed_files_list(
        len(subclades_final_probabilities_files), args)
    already_processed = np.sum(processed_files)
    if (len(subclades_final_probabilities_files) - already_processed) > 0:
        logger.info(
            "%s subclade-specific conserved clustering "
            "probabilities files have to be parsed",
            len(subclades_final_probabilities_files) - already_processed)
        # presumably `manager` is a multiprocessing manager so that workers
        # share the flag list - verify against the module setup
        processed_files = manager.list(processed_files)
        tree = ete3.Tree(args["labelled_tree_file"], format=1)
        subclades_to_process = [
            node.name for node in tree.traverse('levelorder')
        ]
        multi = MyMultiProcess(threads=args["threads"],
                               target=process_child_node,
                               input=subclades_to_process,
                               args=[tree, processed_files, n_pairs, args],
                               destroy=True)
        multi.run()
def preorder(m):
    """Return the seconds one library needs for a full preorder traversal.

    *m* selects the library: ``'dendropy'``, ``'biophylo'``, ``'treeswift'``
    or ``'ete3'``. The tree is loaded before the clock starts; only creating
    and exhausting the preorder iterator is timed. Any other value fails the
    assertion.
    """
    # Each branch loads the tree and picks the call that yields the iterator.
    if m == 'dendropy':
        tree = dendropy.Tree.get(data=treestr, schema='newick')
        walk = tree.preorder_node_iter
    elif m == 'biophylo':
        tree = Phylo.read(treeio, 'newick')
        walk = lambda: tree.find_clades(order='preorder')
    elif m == 'treeswift':
        tree = read_tree_newick(treestr)
        walk = tree.traverse_preorder
    elif m == 'ete3':
        tree = ete3.Tree(treestr,format=1)
        walk = lambda: tree.traverse(strategy='preorder')
    else:
        assert False, "Invalid tool: %s"%m

    began = time()
    for _node in walk():
        pass
    finished = time()
    return finished-began
def mrca(m):
    """Return the seconds one library needs to find the MRCA of all leaves.

    *m* selects the library: ``'dendropy'``, ``'biophylo'``, ``'treeswift'``
    or ``'ete3'``. The tree is loaded before the clock starts; collecting the
    leaves and the common-ancestor query are both timed. Any other value
    fails the assertion.
    """
    # Each branch loads the tree and defines the query to be timed.
    if m == 'dendropy':
        tree = dendropy.Tree.get(data=treestr, schema='newick')

        def query():
            leaves = {l.taxon for l in tree.leaf_node_iter()}
            tree.mrca(taxa=leaves)
    elif m == 'biophylo':
        tree = Phylo.read(treeio, 'newick')

        def query():
            leaves = tree.get_terminals()
            tree.common_ancestor(leaves)
    elif m == 'treeswift':
        tree = read_tree_newick(treestr)

        def query():
            leaves = {str(l) for l in tree.traverse_leaves()}
            tree.mrca(leaves)
    elif m == 'ete3':
        tree = ete3.Tree(treestr,format=1)

        def query():
            leaves = tree.get_leaf_names()
            tree.get_common_ancestor(leaves)
    else:
        assert False, "Invalid tool: %s"%m

    began = time()
    query()
    finished = time()
    return finished-began
def test_mark_dups_below_v2(self):
    """Check ``HogWriter.mark_dups_below_v2`` on three gene trees.

    1. ``gene_tree()``: no internal node may report duplications below it.
    2. ``gene_tree_dup_at_root_N12()``: only the root reports ``{'N12'}``.
    3. An inline tree: the root is expected to report ``{'N3'}``.
    """
    seq_ids = defaultdict(str)
    sp_ids = defaultdict(str)
    sp_tree, sp_tree_node_names = species_tree()
    hogw = t2o.HogWriter(sp_tree, sp_tree_node_names, seq_ids, sp_ids)
    empty_set = set()

    # act-assert
    gt = gene_tree()
    gt = hogw.mark_dups_below_v2(gt)
    self.assertTrue(
        all(n.dups_below == empty_set for n in gt.traverse()
            if not n.is_leaf()))

    # act-assert
    gt = gene_tree_dup_at_root_N12()
    gt = hogw.mark_dups_below_v2(gt)
    # root has a duplication
    self.assertEqual({'N12'}, gt.dups_below)
    # none of the other nodes do
    self.assertTrue(
        all(n.dups_below == empty_set for n in gt.traverse()
            if not (n.is_leaf() or n.is_root())))

    # act-assert
    # NOTE(review): this Newick string has five '(' but only four ')'.
    # Confirm the intended topology, e.g. whether a ')' is missing after
    # "7_7)".
    gt = ete3.Tree("((1_0, (12_0, 7_7),(5_0,(12_1, 6_0)));")
    gt = hogw.mark_dups_below_v2(gt)
    self.assertEqual({'N3'}, gt.dups_below)
def from_ClusterNode(root):
    """
    Build an ETE tree mirroring a scipy ``ClusterNode`` hierarchy.

    Parameters
    ----------
    root : scipy.cluster.hierarchy.ClusterNode instance
        Root of the cluster hierarchy to convert.

    Returns
    -------
    tree : ete3.Tree instance
        New tree whose nodes carry the cluster id as ``name`` and the cluster
        distance as ``dist``; ``None`` when ``root`` is ``None``.
    """
    if root is None:
        return None

    # Copy this node, then attach converted copies of its subtrees, left
    # before right.
    converted = ete3.Tree(name=root.get_id(), dist=root.dist)
    for subtree in (root.get_left(), root.get_right()):
        if subtree:
            converted.add_child(from_ClusterNode(subtree))
    return converted
def main(argv):
    """Build and display the strategy tree of a PRISM adversary.

    Parameters
    ----------
    argv : list of str
        ``[program, adversary_file, state_file]``.

    Lines of the adversary file with exactly four whitespace-separated fields
    become ``AdversaryTransition`` objects, grouped by their first field. The
    state file is scanned for the line containing the hard-coded initial
    state; the text before its first ``:`` is the starting state id. A tree
    is then built depth-first from that state and shown with ``t.show()``.

    Usage problems are reported on stdout; the function always returns None.
    """
    if len(argv) != 3:
        print("invalid arguments")
        print("please specify the prism adversary file location and then the prism state file")
        return

    adversaryTransitionDict = {}
    with open(argv[1], 'r') as fin:
        for line in fin:
            lineItems = line.split()
            if len(lineItems) == 4:
                a = AdversaryTransition(lineItems[0], lineItems[1],
                                        lineItems[2], lineItems[3])
                adversaryTransitionDict.setdefault(lineItems[0], []).append(a)

    startingState = None
    with open(argv[2], 'r') as fin:
        for line in fin:
            if line.find('(1,1,1,0,0,0,4,4,4,false,0)') != -1:
                # As before, the last matching line wins.
                startingState = line.split(':')[0]

    # Previously a state file without the initial state ended in a NameError.
    if startingState is None:
        print("could not find the initial state in " + argv[2])
        return

    # NOTE(review): printed once after the scan; the original nesting of this
    # print was lost in the collapsed source.
    print(startingState)
    t = ete3.Tree()
    buildTreeDepthFirst(adversaryTransitionDict, t, startingState)
    t.show()
def validate_newick(newick):
    """Tell whether ete3 can build a tree from *newick*.

    Returns False when constructing the tree raises ``NewickError`` and True
    otherwise; any other exception propagates.
    """
    try:
        ete3.Tree(newick)
    except NewickError:
        is_valid = False
    else:
        is_valid = True
    return is_valid
def allTopos(branches, _topos=None, _topo_IDs=None):
    """Enumerate every distinct topology obtainable by joining *branches*.

    Two branches are merged at a time, recursively, until three remain. Each
    such three-branch structure is converted with ``listToNwk`` and parsed by
    ete3; it is kept only if its topology id has not been seen yet.

    Parameters
    ----------
    branches : list
        Between 4 and 8 taxon names (or already-merged nested lists).
    _topos, _topo_IDs : list, set
        Internal accumulators used by the recursion; leave as None.

    Returns
    -------
    list of ete3.Tree
        The unique topologies found.
    """
    if _topos is None or _topo_IDs is None:
        _topos = []
        _topo_IDs = set([])
    assert 4 <= len(
        branches) <= 8, "Please specify between 4 and 8 unique taxon names."

    n_branches = len(branches)
    for first in range(n_branches - 1):
        for second in range(first + 1, n_branches):
            # Fold branch `second` into branch `first`. `second` > `first`,
            # so removing it does not shift position `first`.
            joined = list(branches)
            absorbed = joined.pop(second)
            joined[first] = [joined[first], absorbed]

            if len(joined) != 3:
                # Still unresolved: keep joining.
                _topos = allTopos(joined, _topos, _topo_IDs)
                continue

            # Fully resolved: keep it only if the topology is new.
            candidate = ete3.Tree(listToNwk(joined))
            topo_id = candidate.get_topology_id()
            if topo_id not in _topo_IDs:
                _topos.append(candidate)
                _topo_IDs.add(topo_id)
    return (_topos)
def parse_newick(newick_file: str,
                 remove_zero_edges: bool = True,
                 epsilon: float = _EPSILON) -> NumpyRootedTree:
    """Parse a Newick tree into a ``NumpyRootedTree``.

    Nodes are ordered leaves first, then internal nodes, each group in
    postorder. Heights are the maximum root distance minus each node's own
    root distance. When *remove_zero_edges* is true the result is passed
    through ``_remove_zero_edges_func`` with *epsilon*.
    """
    t = ete3.Tree(newick_file)

    # Leaves first, then internal nodes; postorder is kept within each group.
    postorder = list(t.traverse("postorder"))
    leaf_nodes = [node for node in postorder if node.is_leaf()]
    inner_nodes = [node for node in postorder if not node.is_leaf()]
    ordered_nodes = leaf_nodes + inner_nodes

    position = {node: idx for idx, node in enumerate(ordered_nodes)}
    # The last entry is skipped: it has no parent to index.
    parent_indices = np.array(
        [position[node.up] for node in ordered_nodes[:-1]])

    root_distances = np.array(
        [t.get_distance(node) for node in ordered_nodes],
        dtype=DEFAULT_FLOAT_DTYPE_NP)  # TODO: Optimise
    heights = max(root_distances) - root_distances

    taxon_count = (len(ordered_nodes) + 1) // 2
    taxon_set = DictTaxonSet(
        [node.name for node in ordered_nodes[:taxon_count]])

    tree = NumpyRootedTree(
        heights=heights,
        parent_indices=parent_indices,
        taxon_set=taxon_set,
    )
    if not remove_zero_edges:
        return tree
    return _remove_zero_edges_func(tree, epsilon=epsilon)
def generate_history(self):
    """
    Replay the growth of this Tree as a list of snapshot Trees.

    Every node of ``self.T`` except the first in level order is copied,
    stripped of its children and attached, in a growing ete3 tree, below the
    first node whose name equals the name of the copy's parent. After each
    insertion a deep copy of the growing tree is wrapped in ``Tree`` and
    appended to the result.

    NOTE(review): parents are found by name, so this presumably relies on
    node names being unique and on the empty starting tree carrying the same
    name as the root of ``self.T`` - confirm.
    """
    snapshots = []  # list of trees which will become the Forest
    growing = ete3.Tree()  # starts as an empty tree

    non_root_nodes = itertools.islice(self.T.traverse("levelorder"), 1, None)
    for source_node in non_root_nodes:
        # Work on a copy so the original object is not changed.
        clone = copy.deepcopy(source_node)
        for kid in clone.get_children():
            clone.remove_child(kid)

        # Attach the childless copy below its ancestor in the growing tree.
        ancestor = next(growing.iter_search_nodes(name=clone.up.name))
        ancestor.add_child(clone)

        frozen = copy.deepcopy(growing)
        snapshots.append(Tree(frozen, name=None, params=None))
    return snapshots
def _phyla_of_leaves(leaf_names):
    """Return the phylum ids (as strings) for the genomes behind *leaf_names*."""
    # The genome key is the part of the leaf name before the first '_'.
    # A sorted list is used because recent pandas rejects sets as indexers.
    genomes = sorted({leaf.split('_')[0] for leaf in leaf_names})
    taxids = genbank_summary.loc[genomes, 'taxid']
    return lineages.loc[taxids, 'phylum'].values.astype('int').astype('str').tolist()


def assess_separation_ranger(folder):
    """Tell whether any rooting separates the two phyla at the root.

    Trees are read from the second line of
    ``<folder>/<folder>.optResolution<i>.ranger_input`` for i = 1, 2, ...
    until a file is missing. A tree qualifies when its leaves belong to
    exactly the phyla ``'1117'`` and ``'1224'`` and each of the first two
    children of the root contains leaves of a single phylum.

    Returns
    -------
    int
        1 if at least one tree qualifies, else 0 (also when no tree exists).
    """
    trees = []
    count = 1
    while True:
        path = '%s/%s.optResolution%i.ranger_input' % (folder, folder, count)
        if not os.path.isfile(path):
            break
        with open(path) as handle:
            trees.append(ete3.Tree(handle.readlines()[1]))
        count += 1

    if not trees:
        return 0

    for tree in trees:
        # Exactly two distinct phyla that include both targets means the set
        # of phyla equals the target pair.
        if set(_phyla_of_leaves(tree.get_leaf_names())) != {'1117', '1224'}:
            continue
        child1_phyla = set(_phyla_of_leaves(tree.children[0].get_leaf_names()))
        child2_phyla = set(_phyla_of_leaves(tree.children[1].get_leaf_names()))
        if len(child1_phyla) == len(child2_phyla) == 1:
            return 1
    return 0
def __init__(self, newick):
    """Build the node structure from a Newick tree.

    The tree is parsed with ete3 (format 0) and unrooted, its leaf count is
    printed, and ``self.rec_build`` is run on it with ``self.all_nodes``
    starting empty. The top-level ``rec_build`` call must return None.
    """
    etree = ete3.Tree(newick, format=0)
    etree.unroot()
    self.all_nodes = []
    print("Number of leaves in the ete3 tree: " + str(len(etree)))
    n = self.rec_build(etree, True)
    # Identity test rather than `== None`, which could be fooled by a custom
    # __eq__ on the returned object.
    assert n is None
def assess_tree_balance_ranger(folder):
    """Collect root balance and event counts over RANGER reconciliations.

    Trees are read from the second line of
    ``<folder>/<folder>.optResolution<i>.ranger_input`` for i = 1, 2, ...
    until a file is missing. Each tree is paired with the next 20
    ``<folder>/<folder>.reconciliation<j>`` files (j continues across trees).
    Per reconciliation the tree's balance ratio (smaller root child over
    larger, by leaf count) and the numbers of transfer and duplication lines
    are recorded. ``num_internal_nodes`` and ``dtl_cost`` are taken from the
    last reconciliation read.

    Returns
    -------
    dict
        Keys ``balance_ratios``, ``num_transfers``, ``num_duplications``
        (lists), ``num_internal_nodes`` and ``dtl_cost`` (ints).

    Raises
    ------
    ValueError
        If no tree or reconciliation file was found. Previously this
        surfaced as an unbound-variable error.
    """
    ratios = []
    num_transfers = []
    num_duplications = []
    reconciliation = None
    count = 1
    sub_count = 1
    while True:
        tree_path = '%s/%s.optResolution%i.ranger_input' % (folder, folder, count)
        if not os.path.isfile(tree_path):
            break
        with open(tree_path) as handle:
            tree = ete3.Tree(handle.readlines()[1])
        children = sorted(tree.children, key=len)
        ratio = len(children[0]) / float(len(children[1]))

        while sub_count <= count * 20:
            with open('%s/%s.reconciliation%i' % (folder, folder, sub_count)) as handle:
                reconciliation = handle.read()
            ratios.append(ratio)
            # Raw strings: same patterns, without invalid-escape warnings.
            num_transfers.append(len(re.findall(
                r'Transfer, Mapping --> \S+, Recipient --> \S+$',
                reconciliation, re.M)))
            num_duplications.append(len(re.findall(
                r'Duplication, Mapping --> \S+$', reconciliation, re.M)))
            sub_count += 1
        count += 1

    if reconciliation is None:
        raise ValueError('No ranger input/reconciliation files found for %s' % folder)

    num_internal_nodes = len(re.findall(r'^m\d+ = LCA', reconciliation, re.M))
    dtl_cost = re.search(r'The minimum reconciliation cost is: (\d+)',
                         reconciliation).group(1)
    # The former second `return` after this one was unreachable and is gone.
    return {'balance_ratios': ratios,
            'num_transfers': num_transfers,
            'num_duplications': num_duplications,
            'num_internal_nodes': num_internal_nodes,
            'dtl_cost': int(dtl_cost)}
class NewickTreeDatum(Data):
    """Datum whose native value is an ``ete3.Tree``, stored as text."""

    # Identifier of this datum type.
    type_name = "newicktree"
    # Python type of the in-memory value.
    native_type = ete3.Tree
    # Stored text -> tree; parsed with Newick format 1.
    cast_function = lambda x: ete3.Tree(x, format=1)  #TODO: support other formats
    # Tree -> text for storage.
    # NOTE(review): write() is called without a format while reading uses
    # format=1 - confirm a written value parses back as intended.
    db_cast_function = lambda x: x.write()
    # Text column holding the serialised tree.
    value = models.TextField()
def shuffle_main(args):
    """Shuffle topology, branch lengths and/or leaf labels of a tree.

    Parameters
    ----------
    args : namespace
        Reads ``infile``, ``format`` and the flags ``topology``,
        ``branch_length`` and ``label``.

    Steps, in order:

    * ``topology``: replace the tree with a random tree of the same number of
      leaves, drawing branch lengths from the original ones and reusing the
      original leaf names;
    * ``topology`` or ``branch_length``: reassign shuffled branch lengths to
      all non-root nodes;
    * ``label``: permute the leaf names.

    The distance between the input tree and the result is reported through
    ``print_rf_dist`` and the result is written with ``write_tree``.
    Returns None.
    """
    tree = read_tree(args.infile, args.format)
    # Keep an independent copy: the branch-length and label shuffles modify
    # `tree` in place, so a plain alias would compare the result with itself.
    # assumes read_tree returns an ete3 tree (which provides .copy()) -
    # TODO confirm
    tree_original = tree.copy()

    if args.topology:
        num_leaf = len(tree.get_leaves())
        new_tree = ete3.Tree()
        new_tree.populate(size=num_leaf)
        num_new_branch = len([n for n in new_tree.traverse()])
        branch_lengths = get_shuffled_branch_lengths(nodes=tree)
        # Random tree may have more branches, so sample from a doubled pool.
        population = branch_lengths + branch_lengths
        new_branch_lengths = random.sample(population=population, k=num_new_branch)
        new_branch_lengths[0] = 0  # Root node
        for n, new_branch_length in zip(new_tree.traverse(), new_branch_lengths):
            n.dist = new_branch_length
        leaf_names = tree.get_leaf_names()
        for leaf, leaf_name in zip(new_tree.get_leaves(), leaf_names):
            leaf.name = leaf_name
        tree = new_tree

    if args.topology or args.branch_length:
        nodes = [n for n in tree.traverse() if not n.is_root()]
        branch_lengths = get_shuffled_branch_lengths(nodes)
        for node, new_bl in zip(nodes, branch_lengths):
            node.dist = new_bl

    if args.label:
        leaf_names = tree.get_leaf_names()
        random.shuffle(leaf_names)
        for leaf, new_name in zip(tree.get_leaves(), leaf_names):
            leaf.name = new_name

    print_rf_dist(tree1=tree_original, tree2=tree)
    write_tree(tree, args, format=args.format)
def treeCheck(treePath, alnf, optionTree):
    """
    Check that the species tree is usable.

    If ``treePath`` does not exist, ``buildSpeciesTree`` is asked to produce
    one; if it exists, it must be parseable by ete3. On failure a warning is
    logged, the path is reset to "" and the option is switched off.

    @param1 treePath: tree's path ("" means no tree; nothing is checked)
    @param2 alnf: alignment's path
    @param3 optionTree: Boolean (treerecs option)
    @return1 treePath: tree's path, or "" if unusable
    @return2 optionTree: Boolean (treerecs option)
    """
    logger = logging.getLogger("main")
    if treePath != "":
        if not os.path.exists(treePath):
            try:
                treePath = buildSpeciesTree(treePath, alnf)
            except Exception:
                logger.warning("The path for the species tree doesn't exist.")
                logger.warning("Duplication option will be set to False.")
                optionTree = False
                treePath = ""
        else:
            # The old check `if not ete3.Tree(treePath)` could never fire: an
            # unparsable file raises instead of yielding a falsy tree.
            try:
                ete3.Tree(treePath)
            except Exception:
                logger.warning("The species tree is corrupted.")
                logger.warning("Duplication option will be set to False.")
                optionTree = False
                treePath = ""
    return treePath, optionTree
def gene_tree():
    """Return a three-leaf gene tree fixture.

    Every node gets ``dup=False``; the parent of leaf ``4_3`` additionally
    gets ``sp_node='N3'``.
    """
    # (C. elegans, Drosophila),Danio
    fixture = ete3.Tree("((1_0, 5_1),4_3);")
    for node in fixture.traverse():
        node.add_feature('dup', False)
    (fixture & '4_3').up.add_feature('sp_node', 'N3')
    return fixture
def gather_tree_family_genus(
        genus_dict,
        tree_path="/ebio/abt6_projects9/metagenomic_controlled/Programs/metagenomics_pipeline/data/megan_genus_tree_10_2_2018.tre"):
    '''Average genome sizes per family using the MEGAN genus tree.

    Every leaf of the tree is a genus; its family is the name of its direct
    parent node.

    Parameters
    ----------
    genus_dict : mapping
        genus name -> sequence whose first element is the genome size.
    tree_path : str
        Newick file (format 1) of the MEGAN genus tree. Defaults to the
        original cluster location; pass another path to run elsewhere.

    Returns
    -------
    family_final : dict
        family -> ``[mean size, 0]``; ``[50, 0]`` when no genus of the family
        has a size in ``genus_dict``.
    genus_family_map : dict
        genus -> family.
    '''
    # Placeholder marking a family that has a genus without a known size;
    # it is filtered out again before averaging.
    placeholder = 49.99999

    megan_tree = ete3.Tree(tree_path, format=1)
    # Map every leaf (genus) to its parent node (family).
    genus_family_map = {}
    for leaf in megan_tree.get_leaves():
        genus_family_map[leaf.name] = leaf.get_ancestors()[0].name

    family_average = {fam: [] for fam in set(genus_family_map.values())}
    for genus, fam in genus_family_map.items():
        try:
            size = genus_dict[genus][0]
        except KeyError:
            if len(family_average[fam]) > 0:
                print(family_average[fam])
            else:
                family_average[fam].append(placeholder)
        else:
            family_average[fam].append(size)

    family_final = {}
    for fam, sizes in family_average.items():
        known = [rec for rec in sizes if rec != placeholder]
        if known:
            family_final[fam] = [np.mean(known), 0]
        else:
            # No representative at all: assume a genome size of 50Mb.
            family_final[fam] = [50, 0]
    return family_final, genus_family_map
def test_scoring(GSC_function):
    """Test a function implementing the GSC scoring algorithm on the tree
    used as an example in the paper.

    Note: the weights are assessed as before the normalisation to average=1
    step described in the paper.

    Parameters
    ----------
    GSC_function : callable
        Takes an ``ete3.Tree`` and returns a mapping leaf name -> score.

    Returns
    -------
    bool
        True when every returned score equals the reference value; False
        otherwise, including when no scores are returned.

    Raises
    ------
    ValueError
        If ``GSC_function`` is not callable.
    """
    # Validate first, and tolerate objects without __name__ (non-callables,
    # functools.partial, callable instances).
    function_name = getattr(GSC_function, "__name__", repr(GSC_function))
    if not callable(GSC_function):
        raise ValueError(
            "The provided input function {} is not a function.".format(
                function_name))
    import ete3

    newick_tree = "(D:80,(C:50,(A:20,B:20)two:30)three:30);"
    tree = ete3.Tree(newick_tree, format=1)
    scores = GSC_function(tree)
    correct_scores = {'A': 43.75, 'C': 62.5, 'B': 43.75, 'D': 80.0}

    # An empty result used to end in a ZeroDivisionError below.
    if not scores:
        return False

    correct = 0
    for key in sorted(scores):
        # An unexpected leaf name counts as a wrong score, not a KeyError.
        expected = correct_scores.get(key)
        print("Leaf {}: Correct score: {} - Score obtained by function {}: {}".
              format(key, expected, function_name, scores[key]))
        if expected == scores[key]:
            correct += 1
    print("{}% scores were correctly predicted ({} out of {})".format(
        float(correct) / len(scores) * 100, correct, len(scores)))
    return correct == len(scores)