def _save_annotated_tree(tree_path, out_path, taxonomy):
    """Load a tree, suffix every leaf found in *taxonomy* with its label, and write it.

    Args:
        tree_path (str): path of the newick tree to load.
        out_path (str): path the annotated tree is written to (newick format 1).
        taxonomy (dict): leaf name -> label appended to the name as ``<name>_<label>``.
    """
    tree = Tree(tree_path)
    for leaf in tree.get_leaves():
        if leaf.name in taxonomy:
            leaf.name = leaf.name + "_" + taxonomy[leaf.name]
    tree.write(format=1, outfile=out_path)


def saveTrees(name1, name2, name3):
    """Annotate trees with the taxonomy of their contigs and save them.

    The input table is read row by row. Consecutive rows sharing the same
    prefix (the text before the first ``-``) belong to the same tree; whenever
    the prefix changes, the tree of the previous prefix is loaded from
    ``name2 + prefix + ".newick"``, annotated, and written to
    ``name3 + prefix + ".nw"``.

    Args:
        name1 (str): tsv file with the taxonomic description of the contigs.
        name2 (str): location (path prefix) of the trees obtained from PhyloMagnet.
        name3 (str): location (path prefix) where the new trees are saved.

    Returns:
        str: always ``"Done"``.

    Raises:
        IndexError: if the table contains no rows.
    """
    # Use a context manager so the handle is closed (the original leaked it).
    with open(name1) as handle:
        # The last element of the split is discarded; it is the empty string
        # when the file ends with a newline.
        rows = handle.read().split("\n")[:-1]

    current = rows[0].split("-")[0]
    taxonomy = {}
    for row in rows:
        prefix = row.split("-")[0]
        if prefix != current:
            # A new group starts: flush the tree of the previous group.
            _save_annotated_tree(name2 + current + ".newick",
                                 name3 + current + ".nw", taxonomy)
            current = prefix
            taxonomy = {}
        fields = row.split("\t")
        contig = "-".join(fields[0].split("-")[1:])
        # Rebuild the leaf name: drop the first character and the character at
        # offset 10 of the remainder, then prepend "Q_C".
        contig = "Q_C" + contig[1:10] + contig[11:]
        taxonomy[contig] = fields[1]

    # Flush the last group.
    _save_annotated_tree(name2 + current + ".newick",
                         name3 + current + ".nw", taxonomy)
    return "Done"
def write_ancgenes(clustered_genes, treedir, out_ancgenes, clusters_to_load=None):
    """
    Write a tab-separated, 3-column file: family name, space-separated sorted
    gene names found in the family tree, and family class.

    Args:
        clustered_genes (dict): family name -> family class.
        treedir (str): directory holding the gene trees.
        out_ancgenes (str): name of the output file.
        clusters_to_load (list, optional): if given, only families whose class
            is in this list are written, and the class column holds the index
            of the class in this list.

    Raises:
        AssertionError: if no tree file is found for a family to write.
    """
    restrict = clusters_to_load is not None

    with open(out_ancgenes, 'w') as outfile:
        for gene, cluster in clustered_genes.items():

            # Skip family classes that were not requested.
            if restrict and cluster not in clusters_to_load:
                continue

            # Try the known naming schemes in order; if none exists the last
            # candidate is kept so the assertion reports it.
            candidates = (
                treedir + '/' + gene + '.nhx',
                treedir + '/' + gene + '.nh',
                treedir + '/C_' + gene + '.nh',
                treedir + "/" + gene + "_final.nhx",
            )
            for treefile in candidates:
                if os.path.exists(treefile):
                    break
            assert os.path.exists(treefile), f"The file {treefile} does not exist"

            leaf_names = [leaf.name for leaf in Tree(treefile).get_leaves()]

            # Strip the last '_'-separated token of each leaf name; fall back
            # to the raw names when that leaves nothing but empty strings.
            genes = {'_'.join(name.split('_')[:-1]) for name in leaf_names}
            if genes == {''}:
                genes = set(leaf_names)

            if restrict:
                cluster = str(clusters_to_load.index(cluster))

            outfile.write('\t'.join([gene, ' '.join(sorted(genes)), cluster]) + '\n')
def is_proper_newick(newick_data, dont_raise=False, names_with_only_digits_ok=False):
    """Check that `newick_data` can be loaded as a tree with unique leaf names.

    Returns True when the tree is fine. When loading fails or leaf names are
    duplicated, returns False if `dont_raise` is set and raises
    FilesNPathsError otherwise. Leaf names made only of digits raise
    FilesNPathsError unless `names_with_only_digits_ok` is set (this check
    ignores `dont_raise`).
    """
    try:
        tree = Tree(newick_data, format=1)

        seen = set()
        duplicates = set()
        for leaf in tree.get_leaves():
            # first sighting goes to `seen`, any later one to `duplicates`
            bucket = duplicates if leaf.name in seen else seen
            bucket.add(leaf.name)

        if duplicates:
            raise Exception("Your newick tree contains duplicate leaves, here is a list of them: %s" % ", ".join(duplicates))
    except Exception as e:
        if dont_raise:
            return False

        raise FilesNPathsError("Your tree doesn't seem to be properly formatted. Here is what ETE had "
                               "to say about this: '%s'. Pity :/" % e)

    digit_only_names = [leaf.name for leaf in tree.get_leaves() if leaf.name.isdigit()]
    if digit_only_names and not names_with_only_digits_ok:
        raise FilesNPathsError("Your tree contains names that are composed of only digits (like this one: '%s'). Sadly, anvi'o "
                               "is not happy with such names in newick trees or clustering dendrograms :( Anvi'o developers "
                               "apologize for the inconvenience." % (digit_only_names[0]))

    return True
def phylogenetic_tree_to_cluster_format(tree, pairwise_estimates):
    """
    Convert a phylogenetic tree to a 'cluster' data structure as in
    ``fastcluster``. Each row describes one internal node: the first two
    columns are the ids of the first two children, the third is the distance
    between those children (from branch lengths) and the fourth the number of
    leaves below the node. The tree is midpoint-rooted first.

    Example of the data structure (output from ``fastcluster``)::

        [[   3.            7.            4.26269776    2.        ]
         [   0.            5.           26.75703595    2.        ]
         [   2.            8.           56.16007598    2.        ]
         [   9.           12.           78.91813609    3.        ]
         [   1.           11.           87.91756528    3.        ]
         [   4.            6.           93.04790855    2.        ]
         [  14.           15.          114.71302639    5.        ]
         [  13.           16.          137.94616373    8.        ]
         [  10.           17.          157.29055403   10.        ]]

    :param tree: newick tree file
    :param pairwise_estimates: pairwise Ks estimates data frame (pandas)
        (only the index is used)
    :return: clustering data structure, pairwise distances dictionary
    """
    index = pairwise_estimates.index
    id_map = {index[i]: i for i in range(len(pairwise_estimates))}

    t = Tree(tree)

    # midpoint rooting; no midpoint outgroup is returned when there are only
    # two leaves, in which case the first leaf is used
    outgroup = t.get_midpoint_outgroup()
    if not outgroup:
        outgroup = list(t.get_leaves())[0]
    t.set_outgroup(outgroup)
    logging.debug('Tree after rooting:\n{}'.format(t.get_ascii()))

    # leaves keep their index position as id, internal nodes are numbered
    # from len(id_map) upwards in postorder
    next_internal_id = len(id_map)
    linkage = []
    pairwise_distances = {}
    for node in t.traverse('postorder'):
        if not node.is_leaf():
            node.name = next_internal_id
            next_internal_id += 1
            children = node.get_children()
            first, second = children[0], children[1]
            linkage.append([
                first.name,
                second.name,
                first.get_distance(second),
                len(node.get_leaves()),
            ])
            continue

        leaf_id = id_map[node.name]
        node.name = leaf_id
        # identity entry so already-renamed leaves still resolve below
        id_map[leaf_id] = leaf_id
        pairwise_distances[leaf_id] = {
            id_map[other.name]: node.get_distance(other)
            for other in t.get_leaves()
        }

    return np.array(linkage), pairwise_distances
def initialise(rate):
    """Create the starting tree: one birth event followed by a waiting time.

    A single-node tree (flagged ``extinct=False``, branch length 0.0) goes
    through ``birth`` once, then an exponentially distributed waiting time
    with parameter `rate` is added to the branch of every non-extinct leaf.

    Returns the tree produced by ``birth`` with the updated branch lengths.
    """
    root = Tree()
    root.add_features(extinct=False)
    root.dist = 0.0

    parent = random.choice(root.get_leaves())
    tree = birth(root, parent)

    tips = tree.get_leaves()
    waiting_time = random.expovariate(rate)
    for tip in tips:
        if tip.extinct:
            continue
        tip.dist += waiting_time
    return tree
def tree_distances(file):
    """Write the patristic (leaf-to-leaf) distance matrix of a tree as TSV.

    The matrix is written to ``file + ".patristic-dist.tsv"``: a header row
    with the leaf names (preceded by an empty cell), then one row per leaf
    holding its name and its distance to every leaf, formatted with ``%f``.
    Distances are clipped to be non-negative.

    :param file: path of the tree file to load
    :return: None
    """
    t = Tree(file)
    all_leaves = t.get_leaves()

    # The context manager closes the output even if a distance lookup fails.
    with open(file + ".patristic-dist.tsv", "w") as branch_len_out:
        branch_len_out.write("".join("\t" + leaf.name for leaf in all_leaves) + "\n")
        for leaf1 in all_leaves:
            cells = [str(leaf1.name)]
            for leaf2 in all_leaves:
                distance = np.clip(leaf1.get_distance(leaf2), 0.0, 99999999999999999999999999)
                cells.append("%f" % distance)
            branch_len_out.write("\t".join(cells) + "\n")
def merge_trees_and_write(trees, outgr, outfile, keep_br=False):
    """
    Groups the given subtrees under one node, adds the outgroup gene as sister
    of that node and writes the resulting tree to file.

    Args:
        trees (list of ete3.Tree): Tree(s) to merge
        outgr (str): Outgroup gene name
        outfile (str): Output filename
        keep_br (bool, optional): if True the tree is written with the default
            newick format of `write`, otherwise with format 9.
    """
    ingroup = Tree()
    for subtree in trees:
        ingroup.add_child(subtree)

    # root with two children: the merged subtrees and the outgroup leaf
    merged_final = Tree()
    merged_final.add_child(ingroup)
    merged_final.add_child(name=outgr)
    merged_final.prune(merged_final.get_leaves())

    write_options = {} if keep_br else {'format': 9}
    merged_final.write(outfile=outfile, **write_options)
def get_orthogroups_genes(ctree, outgr_gene_name):
    """
    Finds the (at most two) polytomy nodes in the constrained tree topology
    and the full name of the outgroup gene.

    A leaf is the outgroup when its name, minus the last '_'-separated token,
    equals `outgr_gene_name`. The parent of every other leaf is recorded as a
    polytomy node until two distinct parents are known.

    Args:
        ctree (str): input tree file in newick format.
        outgr_gene_name (str): gene name of the outgroup gene.

    Returns:
        dict: the 1 or 2 polytomy node(s) and their corresponding size
              (number of leaves below the node).
        str: full outgroup gene name (with species tag), '' if not found.
    """
    ctree = Tree(ctree)
    orthogroups = {}
    outgr = ''
    for leaf in ctree.get_leaves():
        if outgr_gene_name == '_'.join(leaf.name.split('_')[:-1]):
            outgr = leaf.name
        elif len(orthogroups) < 2:
            parent_node = leaf.up
            if parent_node not in orthogroups:
                orthogroups[parent_node] = len(parent_node.get_leaves())

        # Stop only once both polytomies AND the outgroup are known: stopping
        # as soon as two polytomies were seen missed an outgroup leaf listed
        # after them and returned ''.
        if outgr and len(orthogroups) == 2:
            break
    return orthogroups, outgr
def get_anc_order(tree_file, ancestors, tips_to_root=False):
    """
    Orders input ancestors by their distance to the root of the species tree,
    closest first (default) or farthest first.

    Args:
        tree_file (str): Path to the input newick formatted tree.
        ancestors (list of str): list of ancestor names
        tips_to_root (bool, optional): reverse the order (farthest from the
            root first).

    Returns:
        OrderedDict: ancestor names in the requested order (keys) and list of
        ancestors in the input list that are below it (values).
    """
    tree = Tree(tree_file, format=1)
    tree.prune(tree.get_leaves())

    root_distance = {}
    for name in ancestors:
        root_distance[name] = tree.get_distance(name)

    ordered = sorted(root_distance, key=root_distance.get)
    if tips_to_root:
        ordered.reverse()

    anc_order_dict = OrderedDict()
    for anc in ordered:
        anc_node = search_one_node(tree, anc)
        anc_order_dict[anc] = [
            other for other in ancestors
            if anc != other and is_below(anc_node, other)
        ]
    return anc_order_dict
def smart_reroot(treefile, outgroupfile, outfile, format=0):
    """
    Reroot a Newick format tree on the first outgroup that matches a leaf.

    Each non-empty line of `outgroupfile` is a leaf-name prefix. The first
    line with at least one matching leaf defines the outgroup: the tree is
    rooted on the common ancestor of the matching leaves, or on the first
    matching leaf if that raises ValueError. Blank lines are skipped, since
    an empty prefix would match every leaf.

    Tree reading format options see here:
    http://packages.python.org/ete2/tutorial/tutorial_trees.html#reading-newick-trees

    Returns `outfile` after writing the rerooted tree, or `treefile` (nothing
    written) when no outgroup is found.
    """
    tree = Tree(treefile, format=format)
    leaves = [t.name for t in tree.get_leaves()][::-1]

    outgroup = []
    for line in must_open(outgroupfile):
        prefix = line.strip()
        if not prefix:
            continue
        outgroup = [leaf for leaf in leaves if leaf.startswith(prefix)]
        if outgroup:
            break

    if not outgroup:
        print("Outgroup not found. Tree {0} cannot be rerooted.".format(treefile), file=sys.stderr)
        return treefile

    try:
        tree.set_outgroup(tree.get_common_ancestor(*outgroup))
    except ValueError:
        # fall back to rooting on the first matching leaf
        tree.set_outgroup(outgroup[0])

    tree.write(outfile=outfile, format=format)
    logging.debug("Rerooted tree printed to {0}".format(outfile))
    return outfile
def tree_distances(file):
    """Write the patristic (leaf-to-leaf) distance matrix of a tree as TSV.

    Output goes to ``file + ".patristic-dist.tsv"``. The first row lists the
    leaf names after an empty first cell; each following row starts with a
    leaf name and holds its distance to every leaf (``%f`` formatting).
    Distances are clipped to be non-negative.

    :param file: path of the tree file to load
    :return: None
    """
    t = Tree(file)
    all_leaves = t.get_leaves()

    # `with` guarantees the handle is closed if a distance lookup raises.
    with open(file + ".patristic-dist.tsv", "w") as branch_len_out:
        header = "".join("\t" + leaf.name for leaf in all_leaves)
        branch_len_out.write(header + "\n")
        for leaf1 in all_leaves:
            row = [str(leaf1.name)]
            for leaf2 in all_leaves:
                distance = np.clip(leaf1.get_distance(leaf2), 0.0, 99999999999999999999999999)
                row.append("%f" % distance)
            branch_len_out.write("\t".join(row) + "\n")
def RapidNJ(names, profiles, embeded, handle_missing='pair_delete', **params):
    """Build a neighbour-joining tree with the external RapidNJ program.

    A symmetric distance matrix is computed from `profiles`, written to a
    temporary file, and passed to the RapidNJ executable configured in
    `params`. The resulting newick file is loaded, midpoint-rooted then
    unrooted (best effort) and its leaves are renamed through `names`.

    Args:
        names: sequence mapping the row index of `profiles` to a leaf name.
        profiles: profile matrix handed to ``distance_matrix.get_distance``.
        embeded: not used by this function.
        handle_missing (str): missing-data mode forwarded to
            ``distance_matrix.get_distance``.
        **params: must contain ``tempfix`` (prefix for temporary files) and
            ``RapidNJ_<platform.system()>`` (path of the executable).

    Returns:
        The loaded tree with leaves renamed.
    """
    dist = distance_matrix.get_distance('symmetric', profiles, handle_missing)
    dist_file = params['tempfix'] + 'dist.list'
    try:
        with open(dist_file, 'w') as fout:
            fout.write(' {0}\n'.format(dist.shape[0]))
            for n, row in enumerate(dist):
                fout.write('{0!s:10} {1}\n'.format(
                    n, ' '.join(['{:.6f}'.format(value) for value in row])))
        # Drop the matrix before launching the subprocess. `row` is reset
        # rather than deleted so an empty matrix does not raise NameError.
        row = None
        del dist

        Popen([
            params['RapidNJ_{0}'.format(platform.system())], '-n', '-x',
            dist_file + '_rapidnj.nwk', '-i', 'pd', dist_file
        ], stdout=PIPE, stderr=PIPE).communicate()
        tree = Tree(dist_file + '_rapidnj.nwk')
    finally:
        # Remove temporary files even when RapidNJ or the tree loading failed.
        for fname in glob(dist_file + '*'):
            os.unlink(fname)

    try:
        tree.set_outgroup(tree.get_midpoint_outgroup())
        tree.unroot()
    except Exception:
        # Rooting is best effort: keep the tree as loaded when it fails, but
        # let KeyboardInterrupt/SystemExit propagate.
        pass

    for leaf in tree.get_leaves():
        leaf.name = names[int(leaf.name.strip("'"))]
    return tree
def closest_dna_dist(treefile):
    """
    For every leaf of the tree, record the leaf returned by ete's
    ``get_closest_leaf`` together with its distance. The relation is not
    necessarily symmetric. Progress is written to stderr when the
    module-level ``verbose`` flag is set.

    :param treefile: The tree file to read
    :return: a dict mapping each leaf name to {closest leaf name: distance}
    """
    global verbose

    if verbose:
        sys.stderr.write("Getting closest distances\n")

    tree = Tree(treefile)
    leaves = tree.get_leaves()

    # every leaf gets an (initially empty) entry before any lookup is made
    dist = {leaf.name: {} for leaf in leaves}

    for leaf in leaves:
        closest, distance = leaf.get_closest_leaf()
        dist[leaf.name][closest.name] = distance
        if verbose:
            sys.stderr.write("{} -> {} : {}\n".format(leaf.name, closest.name, distance))

    if verbose:
        sys.stderr.write("\tDone\n")
    return dist
def is_proper_newick(newick_data, dont_raise=False):
    """Check that `newick_data` can be loaded as a tree with unique leaf names.

    Returns True when the tree loads and no leaf name occurs twice. Otherwise
    returns False if `dont_raise` is set, or raises FilesNPathsError carrying
    the underlying error message.
    """
    try:
        tree = Tree(newick_data, format=1)

        seen = set()
        duplicates = set()
        for leaf in tree.get_leaves():
            name = leaf.name
            if name in seen:
                duplicates.add(name)
            seen.add(name)

        if duplicates:
            raise Exception(
                "Your newick tree contains duplicate leaves, here is a list of them: %s"
                % ", ".join(duplicates))
    except Exception as e:
        if dont_raise:
            return False

        # Adjacent literals instead of a backslash continuation inside the
        # string, which leaked the continuation and its whitespace into the
        # message.
        raise FilesNPathsError(
            "Your tree doesn't seem to be properly formatted. Here is what ETE had "
            "to say about this: '%s'. Pity :/" % e)

    return True
def time_tree(newick):
    """Time one call of ``polynomial_sum3_performance`` on a tree.

    The tree is loaded from `newick` (not timed), then the function is called
    with the tree and its number of leaves plus one.

    Returns:
        tuple: (elapsed seconds measured around the call, second element of
        the value returned by ``polynomial_sum3_performance``).
    """
    tree = Tree(newick)
    # perf_counter is monotonic and high resolution; time.time() can jump
    # with system clock adjustments and is too coarse for short runs.
    t0 = time.perf_counter()
    sum3_dt = polynomial_sum3_performance(tree, len(tree.get_leaves()) + 1)[1]
    tf = time.perf_counter()
    return tf - t0, sum3_dt
def convert_tree(treefile, output, d_conv=None, text=''):
    """
    Renames the leaves of a tree and writes the topology (newick format 9).

    When no conversion dictionary is given one is generated: each new ID is
    the first 3 characters of the text after the last '_' of the leaf name
    (the species name for treebest-type IDs) followed by the leaf's position.

    Args:
        treefile (file): input tree in newick format.
        output (str): name for the output file.
        d_conv (dict, optional): Conversion from old to new IDs.
        text (str, optional): Debug information appended to the error message.

    Returns:
        dict: Conversion old to new IDs.

    Raises:
        AssertionError: if the number of leaves differs from the number of
            entries in the conversion, or a leaf is missing from it.
    """
    tree = Tree(treefile)
    leaves = tree.get_leaves()

    if not d_conv:
        d_conv = {}
        for nb, leaf in enumerate(leaves):
            d_conv[leaf.name] = leaf.name.split('_')[-1][0:3] + str(nb)

    assert len(leaves) == len(d_conv), "Trees have different number of leaves {}".format(text)

    for leaf in leaves:
        assert leaf.name in d_conv, "{} present in {} but not in all trees".format(leaf.name, treefile)
        leaf.name = d_conv[leaf.name]

    tree.prune(tree.get_leaves())
    tree.write(outfile=output, format=9)
    return d_conv
def simplify_names(
    input_path: str,
    output_path: str,
    names_translator: t.Optional[t.Dict[str, str]] = None,
) -> t.Optional[t.Dict[str, str]]:
    """
    Rename the sequences of a fasta file, or the leaves of a newick tree, and
    write the result. The input is treated as a tree when its path contains
    ".nwk".

    :param input_path: path with the original sequence names
    :param output_path: path to which the sequences with the new names will be written
    :param names_translator: translator of new to old names. if not provided,
        simple names (S1, S2, ...) will be generated and returned
    :return: the generated new-to-original names mapping, or None when a
        translator was provided
    """
    is_tree = ".nwk" in str(input_path)

    if is_tree:
        with open(input_path, "r") as infile:
            newick = infile.read()
        tree = Tree(newick, format=1)
        leaves = tree.get_leaves()

        if names_translator:
            current_to_target = {
                names_translator[key]: key for key in names_translator
            }
            for leaf in leaves:
                leaf.name = current_to_target[leaf.name]
            tree.write(outfile=output_path, format=5)
            return None

        new_to_orig_name = {}
        for number, leaf in enumerate(leaves, start=1):
            simple_name = f"S{number}"
            new_to_orig_name[simple_name] = leaf.name
            leaf.name = simple_name
        tree.write(outfile=output_path, format=5)
        return new_to_orig_name

    seq_records = list(SeqIO.parse(input_path, "fasta"))

    if names_translator:
        current_to_target = {
            names_translator[key]: key for key in names_translator
        }
        for record in seq_records:
            target = current_to_target[record.description]
            record.description = target
            record.name = target
            record.id = target
        SeqIO.write(seq_records, output_path, "fasta")
        return None

    new_to_orig_name = {}
    for number, record in enumerate(seq_records, start=1):
        simple_name = f"S{number}"
        new_to_orig_name[simple_name] = record.description
        record.description = simple_name
        record.id = simple_name
        record.name = simple_name
    SeqIO.write(seq_records, output_path, "fasta")
    return new_to_orig_name
def make_tree_from_groups(subtree_leaves, species_groups, groups_are_genes=False):
    """
    Builds a gene tree from groups of species or groups of genes.

    The outgroup gene(s) are placed on one side of the root (under an
    'outgr_node' node when there are several). The remaining genes go under
    an 'anc_3r' node: split into 'gr1' and 'gr2' when both groups are
    non-empty, or directly below it when only one group has genes.

    Args:
        subtree_leaves (list of ete3.nodes): all genes to place in the tree
        species_groups (list of str): species to group together (first group
                                      is outgroup)
        groups_are_genes (bool, optional): set to True if species_groups are
                                           groups of genes

    Returns:
        ete3.Tree : resulting gene tree
        str : one outgroup gene name, to identify the tree

    Raises:
        IndexError: if the outgroup group is empty.
    """
    tree = Tree()
    outgr, group1, group2 = species_groups

    if not groups_are_genes:
        outgr = {i.name for i in subtree_leaves if i.S in outgr}
        group1 = {i.name for i in subtree_leaves if i.S in group1}
        group2 = {i.name for i in subtree_leaves if i.S in group2}

    outgr_gene = list(outgr)[0]

    if len(outgr) >= 2:
        outgr_node = tree.add_child(name='outgr_node')
        for gene in outgr:
            outgr_node.add_child(name=gene)
    else:
        # Single outgroup gene. Use the name already extracted instead of
        # popping it, which emptied the caller's group when
        # groups_are_genes=True.
        tree.add_child(name=outgr_gene)

    if group1 or group2:
        next_node = tree.add_child(name="anc_3r")
        if group1 and group2:
            for label, genes in (("gr1", group1), ("gr2", group2)):
                group_node = next_node.add_child(name=label)
                for gene in genes:
                    group_node.add_child(name=gene)
        else:
            # only one group has genes: attach them directly
            for gene in (group1 or group2):
                next_node.add_child(name=gene)

    tree.prune(tree.get_leaves())
    return tree, outgr_gene
def _add_observed_isotypes(
    tree: ete3.Tree,
    newidmap: Dict[str, str],
    isotype_order: Sequence[str],
    weight_matrix: Optional[Sequence[Sequence[float]]] = None,
):
    """Annotate every node of ``tree`` with an ``isotype`` feature, in place.

    Three passes are made:

    1. Each non-root internal node with positive abundance gets a new leaf
       child carrying its name, sequence and abundance; the node's own
       abundance is set to 0.
    2. Each leaf with abundance 0 gets the ambiguous isotype ``"?"``. Other
       leaves look up ``newidmap[node.name]`` (as used here, a mapping from
       isotype to a collection of cell ids). A leaf with a single isotype is
       assigned it; a leaf with several gets one child per isotype, with
       abundance equal to the number of cell ids, and its own abundance is
       set to 0. Missing names and ``"?"`` entries trigger a warning.
    3. Every non-leaf node gets the ambiguous isotype ``"?"``.
    """
    newisotype = IsotypeTemplate(isotype_order, weight_matrix=weight_matrix).new

    # Pass 1: descend observed internal nodes as leaves
    for node in list(tree.iter_descendants()):
        observed_internal = node.abundance > 0 and not node.is_leaf()
        if not observed_internal:
            continue
        leaf_copy = ete3.TreeNode(name=node.name)
        leaf_copy.add_feature("sequence", node.sequence)
        leaf_copy.add_feature("abundance", node.abundance)
        node.abundance = 0
        node.add_child(child=leaf_copy)

    # Pass 2: duplicate leaves which represent multiple isotypes
    for node in list(tree.get_leaves()):
        if node.abundance == 0:
            node.add_feature("isotype", newisotype("?"))
            continue

        try:
            thisnode_isotypemap = newidmap[node.name]
        except KeyError as e:
            warnings.warn(
                f"The sequence name {e} labels an observed node, but no mapping to an original sequence ID was found."
                " Isotype will be assumed ambiguous.")
            thisnode_isotypemap = {
                "?": {f"Unknown_id_{n+1}" for n in range(node.abundance)}
            }

        if "?" in thisnode_isotypemap:
            warnings.warn(
                f"The sequence name {node.name} labels an observed node, and corresponds to sequence IDs for "
                "which no observed isotype was provided. "
                f" Isotype will be assumed ambiguous for: {', '.join(thisnode_isotypemap['?'])}"
            )

        if len(thisnode_isotypemap) <= 1:
            node.isotype = newisotype(list(thisnode_isotypemap.keys())[0])
            continue

        for isotype, cell_ids in thisnode_isotypemap.items():
            # The new node must hang below this leaf, not below its parent,
            # to preserve max parsimony in case node.up has a different
            # sequence from node.
            isotype_leaf = ete3.TreeNode(name=node.name)
            isotype_leaf.add_feature("abundance", len(cell_ids))
            isotype_leaf.add_feature("sequence", node.sequence)
            isotype_leaf.add_feature("isotype", newisotype(isotype))
            node.add_child(child=isotype_leaf)
        node.abundance = 0

    # Pass 3: ancestral nodes get the ambiguous isotype
    for node in tree.traverse():
        if node.is_leaf():
            continue
        node.add_feature("isotype", newisotype("?"))
def rerootTree(treefile, output, ogterm="OG--", fmat=3):
    """Reroot a tree on the first leaf whose name starts with `ogterm`.

    The rerooted tree is ladderized (direction=1) and written to `output`
    using newick format `fmat`.

    Returns True when the tree was rerooted and written, False (nothing
    written) when no leaf name starts with `ogterm`.
    """
    intree = Tree(treefile)

    outgroup_leaf = next(
        (leaf for leaf in intree.get_leaves() if leaf.name.startswith(ogterm)),
        None,
    )
    if outgroup_leaf is None:
        return False

    intree.set_outgroup(outgroup_leaf.name)
    intree.ladderize(direction=1)
    intree.write(outfile=output, format=fmat)
    return True
def main():
    """Append TMRCA statistics of pruned trees to a gzipped table.

    Reads ``args.input_file_name`` (gzip, whitespace-separated) and writes
    ``args.output_file_name`` (gzip, tab-separated). The first line is the
    header and gets three extra column names. For every other line, the tree
    in column 32 (0-based) is pruned to the leaves whose name contains none of
    the excluded population or individual tags (``args.exclpops``,
    ``args.exclindivs``, comma-separated), and the three values returned by
    ``tmrca_stats`` are appended to the line.
    """
    excluded_populations = [] if args.exclpops is None else args.exclpops.split(',')
    excluded_individuals = [] if args.exclindivs is None else args.exclindivs.split(',')
    # a leaf is dropped when its name contains any population or individual tag
    excluded_tags = excluded_populations + excluded_individuals

    with gzip.open(args.input_file_name, 'rb') as input_file, \
            gzip.open(args.output_file_name, 'wb') as output_file:
        for line_number, line in enumerate(input_file):
            fields = [x.decode() for x in line.split()]

            if line_number == 0:
                fields += ['pruned_tmrca', 'pruned_tmrca_half', 'pruned_coal_half']
            else:
                tree = Tree(fields[32])
                included_leaves = [
                    leaf.name for leaf in tree.get_leaves()
                    if not any(tag in leaf.name for tag in excluded_tags)
                ]
                prune(tree, included_leaves)

                # hack to ensure there is no nondicotomic node under the root:
                if len(tree.children) == 1 and not tree.children[0].is_leaf():
                    tree.children[0].delete(preserve_branch_length=True)
                assert set(tree.get_leaf_names()) == set(included_leaves)

                tmrca, tmrca_half, coal_half = tmrca_stats(tree)
                fields += [str(tmrca), str(tmrca_half), str(coal_half)]

            output_file.write(('\t'.join(fields) + '\n').encode())
def remove_anc(tree_file, out_file):
    """
    Writes the input tree without internal node names (such as ancestor
    names) to a new file: the tree is pruned to its own leaves and saved in
    newick format 9.

    Args:
        tree_file (str): Path to the input newick formatted tree.
        out_file (str): Path for the output file.
    """
    tree = Tree(tree_file, format=1)
    all_leaves = tree.get_leaves()
    tree.prune(all_leaves)
    tree.write(outfile=out_file, format=9)
def get_scorpios_aore_tree(gene_list, treefile, outgroups, outgr_gene):
    """
    Loads the AORe gene tree built by SCORPiOs and restricts it to the
    requested genes.

    The last '_'-separated token of each leaf name (the species tag) is
    removed and the tree is pruned to the genes in `gene_list`. Requested
    genes absent from the tree are added next to the outgroup gene, provided
    they all belong to outgroup species.

    Args:
        gene_list (dict): dict of gene_names (key) : species_names (value) to
                          keep in the tree
        treefile (str): name of the input tree file
        outgroups (list of str): list of outgroup species to keep/add in tree
        outgr_gene (str): name of the outgroup gene

    Returns:
        ete3.Tree : the loaded tree, or None if a missing gene does not
        belong to an outgroup species.
    """
    tree = Tree(treefile)
    tleaves = tree.get_leaves()

    # remove sp name
    for leaf in tleaves:
        leaf.name = '_'.join(leaf.name.split('_')[:-1])

    tree.prune([leaf for leaf in tleaves if leaf.name in gene_list])

    expected = set(gene_list.keys())
    present = {leaf.name for leaf in tree.get_leaves()}
    if present == expected:
        return tree

    missing = expected.difference(present)
    outgr_node = tree.get_leaves_by_name(outgr_gene)[0]

    # small subtree holding the missing outgroup genes plus the outgroup gene,
    # grafted below the current outgroup leaf
    outgr_t = Tree()
    for gene in missing:
        if gene_list[gene] not in outgroups:
            return None
        outgr_t.add_child(name=gene)
    outgr_t.add_child(name=outgr_gene)
    outgr_node.add_child(outgr_t)

    tree.prune(tree.get_leaves())
    return tree
def read_clustertree_fromnewick(treefpath: str):
    """Load a cluster tree and attach the accession list to every leaf.

    Arguments:
        treefpath: newick tree whose leaf names are |-delimited accs

    Returns:
        ete3.Tree with unchanged names; each leaf has an added feature
        ``accs``: the leaf name stripped of leading/trailing '|' and split
        on '|'.
    """
    ctree = Tree(treefpath)
    for leaf in ctree.get_leaves():
        leaf.add_feature('accs', leaf.name.strip('|').split('|'))
    return ctree
def ninja(names, profiles, embeded, handle_missing='pair_delete', **params):
    """Build a neighbour-joining tree with the external NINJA program.

    A symmetric distance matrix is computed from `profiles`, divided by the
    number of profile columns and written to a temporary file. NINJA is run
    through ``java`` with ``-d64`` and a heap of 90% of total memory; if its
    stderr mentions '64-bit JVM' it is run again without ``-d64`` and with a
    1200M heap. Branch lengths of the resulting tree are multiplied back by
    the number of columns, the tree is midpoint-rooted then unrooted (best
    effort) and its leaves are renamed through `names`.

    Args:
        names: sequence mapping the row index of `profiles` to a leaf name.
        profiles: profile matrix handed to ``distance_matrix.get_distance``.
        embeded: not used by this function.
        handle_missing (str): missing-data mode forwarded to
            ``distance_matrix.get_distance``.
        **params: must contain ``tempfix`` (prefix for temporary files) and
            ``ninja_<platform.system()>`` (path of the NINJA jar).

    Returns:
        The loaded tree with leaves renamed.
    """
    dist = distance_matrix.get_distance('symmetric', profiles, handle_missing)
    dist = dist / profiles.shape[1]
    dist_file = params['tempfix'] + 'dist.list'
    try:
        with open(dist_file, 'w') as fout:
            fout.write(' {0}\n'.format(dist.shape[0]))
            for n, row in enumerate(dist):
                fout.write('{0!s:10} {1}\n'.format(
                    n, ' '.join(['{:.6f}'.format(value) for value in row])))
        # Drop the matrix before launching the JVM. `row` is reset rather
        # than deleted so an empty matrix does not raise NameError.
        row = None
        del dist

        jar = params['ninja_{0}'.format(platform.system())]
        free_memory = int(0.9 * psutil.virtual_memory().total / (1024.**2))
        ninja_out = Popen([
            'java', '-d64', '-Xmx' + str(free_memory) + 'M', '-jar', jar,
            '--in_type', 'd', dist_file
        ], stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()
        if '64-bit JVM' in ninja_out[1]:
            # retry without the 64-bit flag and with a small fixed heap
            ninja_out = Popen([
                'java', '-Xmx1200M', '-jar', jar, '--in_type', 'd', dist_file
            ], stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

        with open(dist_file + '.nwk', 'wt') as fout:
            fout.write(ninja_out[0])
        tree = Tree(dist_file + '.nwk')
    finally:
        # Remove temporary files even when NINJA or the tree loading failed.
        for fname in glob(dist_file + '*'):
            os.unlink(fname)

    # undo the normalisation applied to the distances
    for node in tree.traverse():
        node.dist *= profiles.shape[1]

    try:
        tree.set_outgroup(tree.get_midpoint_outgroup())
        tree.unroot()
    except Exception:
        # Rooting is best effort: keep the tree as loaded when it fails, but
        # let KeyboardInterrupt/SystemExit propagate.
        pass

    for leaf in tree.get_leaves():
        leaf.name = names[int(leaf.name.strip("'"))]
    return tree
def tree_distances_info(file, scale, seq_len):
    """Write the patristic distance matrix of a tree and summary statistics.

    Two files are created next to `file`:

    * ``<file>.<seq_len>.patristic-dist.tsv``: header row with leaf names,
      then one row per leaf with its distance to every leaf (``%f``).
    * ``<file>.<seq_len>.tree-info.txt``: scale factor, sequence length,
      number of leaves, smallest strictly positive distance, largest distance
      and average distance (sum of all distances, self-pairs included,
      divided by ``number of pairs * scale``).

    The summary lines are also printed to stdout.

    :param file: path of the tree file to load
    :param scale: scale factor reported and used to rescale the average
    :param seq_len: sequence length, used in file names and in the report
    :return: None
    """
    t = Tree(file)

    # Both handles are managed so they are closed even if a lookup fails.
    with open(file + ".%d.patristic-dist.tsv" % seq_len, "w") as branch_len_out, \
            open(file + ".%d.tree-info.txt" % seq_len, "w") as tree_info:
        all_leaves = t.get_leaves()
        branch_len_out.write("".join("\t" + leaf.name for leaf in all_leaves) + "\n")

        total_distance = 0
        nb_of_distances = 0
        max_len = 0
        min_len = 99999999999999  # sentinel, reported as is if no distance is > 0
        for leaf1 in all_leaves:
            cells = [str(leaf1.name)]
            for leaf2 in all_leaves:
                # computed once per pair (it used to be computed twice)
                distance = leaf1.get_distance(leaf2)
                total_distance += distance
                nb_of_distances += 1
                cells.append("%f" % distance)
                if distance > max_len:
                    max_len = distance
                if 0 < distance < min_len:
                    min_len = distance
            branch_len_out.write("\t".join(cells) + "\n")

        report = [
            "Scale_factor(1=original-tree)\t%f" % scale,
            "Seq_Length\t%d" % seq_len,
            "Number_of_leaves_(taxa)\t%d" % len(all_leaves),
            "Minimal_patristic_distance\t%f" % min_len,
            "Maximal_patristic_distance\t%f" % max_len,
            "Average_patristic_distance\t%f" % (total_distance / (nb_of_distances * scale)),
        ]
        for entry in report:
            tree_info.write(entry + "\n")
        for entry in report:
            print(entry)
def make_tree_fig(tree_file, out_name, tax_level=None):
    """Render the tree stored on the first line of `tree_file` as a circular figure.

    Before parsing, ";IM" is replaced by "-IM" and ";CP" by "-" in the newick
    text. When `tax_level` is truthy, each leaf found in the module-level
    ``taxas`` mapping is renamed to its id followed by its taxonomy
    (';'-joined), and every leaf gets a fresh NodeStyle with a background
    colour taken from the module-level ``cols`` mapping: the colour of its
    taxon at `tax_level` if there is one, else ``cols['acI']`` when "acI" is
    in the leaf name.

    Args:
        tree_file (str): path of the newick file (only the first line is used).
        out_name (str): output path handed to ``render``.
        tax_level (int, optional): index into the taxonomy list of a leaf.
            NOTE(review): a value of 0 is treated like None by the truthiness
            test below - confirm whether level 0 should be supported.

    Returns:
        None. Nothing is rendered when the file is empty.
    """
    with open(tree_file) as handle:
        lines = handle.readlines()
    if not lines:
        return None

    # Strip only the line terminator: slicing off the last character dropped
    # the closing ';' when the file had no trailing newline.
    newick = lines[0].rstrip("\n").replace(";IM", "-IM").replace(";CP", "-")
    ete_tree = Tree(newick)

    if tax_level:
        for leaf in ete_tree.get_leaves():
            leaf_id = leaf.name
            taxon = taxas.get(leaf_id)
            if taxon:
                leaf.name = ";".join([leaf_id] + taxon)
            level_name = taxon[tax_level] if taxon and len(taxon) > tax_level else None

            leaf.set_style(NodeStyle())
            if level_name and cols.get(level_name):
                leaf.img_style["bgcolor"] = cols[level_name]
            elif "acI" in leaf.name:
                leaf.img_style["bgcolor"] = cols['acI']

    styl = TreeStyle()
    styl.mode = 'c'
    ete_tree.render(out_name, w=len(ete_tree.get_leaves()) * 5, tree_style=styl)
def __init__(self, tree_paths, suffix=".aa.tre.renamed"):
    """Load one tree per path and annotate its tips.

    Trees are stored in ``self.trees`` under the file's base name with
    `suffix` removed. Tip names are expected to look like
    ``<x>&<isolate>|<gene>...``; each tip gets the features ``isolate``,
    ``gene`` and ``genus`` (the text before the first '_' of the tip name).

    Raises IndexError when a tip name lacks the '&' or '|' separator.
    """
    self.trees = {}
    for path in tree_paths:
        marker = os.path.basename(path).replace(suffix, "")
        tree = Tree(path)
        for tip in list(tree.get_leaves()):
            after_ampersand = tip.name.split("&")[1]
            parts = after_ampersand.split("|")
            isolate = parts[0]
            gene = parts[1]
            tip.add_feature("isolate", isolate)
            tip.add_feature("gene", gene)
            tip.add_feature("genus", tip.name.split("_")[0])
        self.trees[marker] = tree
def select_rep_genomes(genomedb, treefile, threshold=0.01, output="rep_strains.txt"):
    """Pick representative strains from a tree and write them to a file.

    The tree is walked in preorder. For a clade with 2 to 100 leaves whose
    mean pairwise leaf distance is below `threshold`, the leaf with the
    lowest value in column 5 of ``genome_metadata.txt`` is kept as
    representative and all descendants of the clade are skipped. Every leaf
    that is reached without being skipped is kept as well.

    Args:
        genomedb (str): directory containing ``genome_metadata.txt`` (tab
            separated; column 3 is matched against leaf names, column 5 is
            read as an integer; lines starting with "assembly_id" are
            skipped).
        treefile (str): path of the tree file.
        threshold (float): maximum mean pairwise distance for collapsing a
            clade.
        output (str): path of the file receiving one strain name per line.

    Returns:
        None. The number of selected strains is printed.
    """
    t = Tree(os.path.abspath(treefile))
    # sets give constant-time membership tests (these were list scans)
    strains = {leaf.name for leaf in t.get_leaves()}

    contigs = {}
    with open(os.path.join(genomedb, "genome_metadata.txt"), 'r') as metadata:
        for line in metadata:
            if line.startswith("assembly_id"):
                continue
            vals = line.rstrip().split("\t")
            if vals[2] in strains:
                contigs[vals[2]] = int(vals[4])

    good_strains = []
    to_skip = set()
    for node in t.iter_descendants("preorder"):
        if node in to_skip:
            continue
        leafnodes = node.get_leaves()
        if len(leafnodes) > 100:
            continue

        if len(leafnodes) > 1:
            pairs = list(combinations(leafnodes, 2))
            mean_dist = sum(node.get_distance(a, b) for a, b in pairs) / len(pairs)
            if mean_dist < threshold:
                # first leaf with the smallest metadata value represents the clade
                representative = min(leafnodes, key=lambda leaf: contigs[leaf.name])
                good_strains.append(representative.name)
                to_skip.update(node.iter_descendants("preorder"))

        if node.is_leaf():
            good_strains.append(node.name)

    print(len(good_strains), "at threshold", threshold)
    # opened only once the result is known, and closed on every path
    with open(os.path.abspath(output), 'w') as out:
        out.write("\n".join(good_strains) + "\n")
    return
def remove_outgroup(tree, outgr):
    """
    Loads a subtree, roots it on the outgroup gene and removes that gene.

    The outgroup leaf is the first leaf whose name, minus its last
    '_'-separated token, equals `outgr`.

    Args:
        tree: Input tree, handed to the ete3 ``Tree`` constructor
        outgr (str): Outgroup gene name

    Returns:
        ete3.Tree: the tree without the outgroup leaf
        str: full name of the removed outgroup leaf

    Raises:
        IndexError: if no leaf matches `outgr`.
    """
    tree = Tree(tree)
    leaf_names = [leaf.name for leaf in tree.get_leaves()]

    matches = [
        name for name in leaf_names
        if outgr == '_'.join(name.split('_')[:-1])
    ]
    outgr_gene = matches[0]

    tree.set_outgroup(tree & outgr_gene)
    ingroup = [name for name in leaf_names if name != outgr_gene]
    tree.prune(ingroup)
    return tree, outgr_gene
def make_matrix(treefile):
    """
    Print the leaf-to-leaf distance matrix of a tree to stdout (tab separated,
    leaves sorted by name, "0" on the diagonal). Progress messages go to
    stderr.

    :param treefile: the tree file to read
    :return: None
    """
    tree = Tree(treefile)
    leaves = tree.get_leaves()

    # For each leaf, collect the nodes between it and the root (root excluded).
    sys.stderr.write("Precalculating distances\n")
    paths = {}
    for leaf in leaves:
        ancestry = set()
        current = leaf
        while not current.is_root():
            ancestry.add(current)
            current = current.up
        paths[leaf] = ancestry

    # Each unordered pair is visited once; both directions are stored.
    leaf_distances = {leaf.name: {} for leaf in leaves}
    sys.stderr.write("Iterating over the leaves\n")
    for leaf1, leaf2 in combinations(leaves, 2):
        # nodes on exactly one of the two paths form the path between the leaves
        distance = sum(node.dist for node in paths[leaf1] ^ paths[leaf2])
        leaf_distances[leaf1.name][leaf2.name] = distance
        leaf_distances[leaf2.name][leaf1.name] = distance

    allleaves = sorted(leaf_distances)
    sys.stdout.write("\t".join([""] + allleaves) + "\n")
    for row_name in allleaves:
        cells = [
            "0" if col_name == row_name else "{}".format(leaf_distances[row_name][col_name])
            for col_name in allleaves
        ]
        sys.stdout.write(row_name + "\t" + "".join(cell + "\t" for cell in cells) + "\n")
def make_dists(treefile, printone, verbose):
    """
    Print the pairwise leaf distances of a tree, one tab-separated line per
    pair: first name, second name, distance.

    :param treefile: the tree file to parse
    :param printone: if true each pair is printed once, with the two names in
        ascending order. If false both A->B and B->A are printed
    :param verbose: write progress messages to stderr
    :return: None
    """
    tree = Tree(treefile)
    leaves = tree.get_leaves()

    # For each leaf, collect the nodes between it and the root (root excluded).
    if verbose:
        sys.stderr.write("Precalculating distances\n")
    paths = {}
    for leaf in leaves:
        ancestry = set()
        current = leaf
        while not current.is_root():
            ancestry.add(current)
            current = current.up
        paths[leaf] = ancestry

    if verbose:
        sys.stderr.write("Iterating over the leaves\n")
    row = "{}\t{}\t{}"
    for leaf1, leaf2 in combinations(leaves, 2):
        # nodes on exactly one of the two paths form the path between the leaves
        distance = sum(node.dist for node in paths[leaf1] ^ paths[leaf2])
        if not printone:
            print(row.format(leaf1.name, leaf2.name, distance))
            print(row.format(leaf2.name, leaf1.name, distance))
        elif leaf1.name < leaf2.name:
            print(row.format(leaf1.name, leaf2.name, distance))
        else:
            print(row.format(leaf2.name, leaf1.name, distance))
def make_matrix(treefile):
    """
    Write the leaf-to-leaf distance matrix of a tree to stdout as a
    tab-separated table. Rows and columns are the leaf names in sorted order
    and the diagonal is "0". Progress messages are written to stderr.

    :param treefile: the tree file to read
    :return: None
    """
    tree = Tree(treefile)
    leaves = tree.get_leaves()

    sys.stderr.write("Precalculating distances\n")
    # path of every leaf: all nodes from the leaf up to, but excluding, the root
    paths = {}
    for leaf in leaves:
        path_to_root = set()
        node = leaf
        while not node.is_root():
            path_to_root.add(node)
            node = node.up
        paths[leaf] = path_to_root

    leaf_distances = {leaf.name: {} for leaf in leaves}
    sys.stderr.write("Iterating over the leaves\n")
    # AB, AC, ... are enough: the distance is stored for BA, CA, ... as well
    for first, second in combinations(leaves, 2):
        unshared = paths[first] ^ paths[second]
        distance = sum(node.dist for node in unshared)
        leaf_distances[first.name][second.name] = distance
        leaf_distances[second.name][first.name] = distance

    names = sorted(leaf_distances)
    emit = sys.stdout.write
    emit("\t".join([""] + names) + "\n")
    for row_name in names:
        row = [row_name + "\t"]
        for col_name in names:
            if col_name == row_name:
                row.append("0\t")
            else:
                row.append("{}\t".format(leaf_distances[row_name][col_name]))
        row.append("\n")
        emit("".join(row))
def is_proper_newick(newick_data, dont_raise=False):
    """Check that `newick_data` can be parsed as a tree whose leaf names are unique.

    Args:
        newick_data: the newick input handed to ``Tree(..., format=1)``.
        dont_raise: when True, report a problem by returning False instead of raising.

    Returns:
        True when the tree parses and no leaf name occurs more than once; False
        when a problem is found and `dont_raise` is set.

    Raises:
        FilesNPathsError: when the tree cannot be parsed or contains duplicate
            leaf names, and `dont_raise` is False.
    """
    try:
        tree = Tree(newick_data, format=1)

        seen = set()
        duplicates = set()
        for leaf in tree.get_leaves():
            name = leaf.name
            if name in seen:
                duplicates.add(name)
            seen.add(name)

        if duplicates:
            # Sorted so the message is the same from one run to the next.
            raise Exception("Your newick tree contains duplicate leaves, here is a list of them: %s"
                            % ", ".join(sorted(duplicates)))
    except Exception as e:
        # Parsing failures and the duplicate-leaf error above are reported the same way.
        if dont_raise:
            return False
        raise FilesNPathsError("Your tree doesn't seem to be properly formatted. Here is what ETE had "
                               "to say about this: '%s'. Pity :/" % e)

    return True
def make_matrix(treefile, outputf):
    """
    Write the pairwise leaf distance matrix of a tree to a tab-separated file.

    The first line of the output is a header of the sorted leaf names (with an empty
    first cell); every following line starts with a leaf name followed by its
    distance to each leaf, using 0 on the diagonal. Every data row ends with a
    trailing tab. Progress messages go to stderr and progress dots to stdout.

    :param treefile: the tree file to read
    :param outputf: the file to write the matrix to
    :return: None
    """
    tree = Tree(treefile, quoted_node_names=True, format=1)
    leaves = tree.get_leaves()

    # For every leaf, collect the nodes on the way up to (but excluding) the root.
    sys.stderr.write("Precalculating distances\n")
    paths = {}
    for leaf in leaves:
        ancestors = set()
        movingnode = leaf
        while not movingnode.is_root():
            ancestors.add(movingnode)
            movingnode = movingnode.up
        paths[leaf] = ancestors

    leaf_distances = {x.name: {} for x in leaves}
    sys.stderr.write("Iterating over the leaves\n")
    sys.stderr.write("THere are {} leaves\n".format(len(leaves)))

    # Count the pairs arithmetically: combinations() returns a one-shot iterator, so
    # materialising it just to measure its length would leave nothing to loop over.
    n_leaves = len(leaves)
    n_pairs = n_leaves * (n_leaves - 1) // 2
    # At least 1, so the modulo below is defined for trees with fewer than 500 pairs.
    combidef = max(1, n_pairs // 500)
    sys.stderr.write("There are {} combinations. Each dot is {} combinations\n".format(n_pairs, combidef))

    dots_in_group = 0
    for c, (leaf1, leaf2) in enumerate(combinations(leaves, 2)):
        if c % combidef == 0:
            # Dots are printed in groups of five separated by a space.
            if dots_in_group == 5:
                sys.stdout.write(" ")
                dots_in_group = 0
            sys.stdout.write(".")
            dots_in_group += 1
        # Nodes shared by both leaf-to-root paths cancel out; the rest form the path
        # between the two leaves.
        uniquenodes = paths[leaf1] ^ paths[leaf2]
        distance = sum(x.dist for x in uniquenodes)
        leaf_distances[leaf1.name][leaf2.name] = leaf_distances[leaf2.name][leaf1.name] = distance
    sys.stdout.write("\n")

    allleaves = sorted(leaf_distances.keys())
    with open(outputf, 'w') as out:
        out.write("\t".join([""] + allleaves) + "\n")
        for n in allleaves:
            out.write(n + "\t")
            for m in allleaves:
                if m == n:
                    out.write("0\t")
                else:
                    out.write("{}\t".format(leaf_distances[n][m]))
            out.write("\n")
# Display the tree stored on the first line of the file named by sys.argv[1],
# highlighting every leaf whose name contains 'Eukaryota'.

# Only the first line of the file is read; the handle is closed even on error.
with open(sys.argv[1]) as inh:
    treestring = inh.readline()

# Drop every semicolon plus surrounding whitespace (including the newline kept by
# readline) so the emptiness check sees the actual tree text.
treestr = treestring.replace(';', '').strip()
if not treestr:
    # Must be tested before the terminating ';' is added, otherwise it never fires.
    print(sys.argv[1] + "\tEmpty tree")
    sys.exit()
treestr = treestr + ";"

t = Tree(treestr)

# define basic tree style
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_branch_support = True

# Here, we set up the annotations we want on the tree. For example, let's make the
# leaves with eukaryote sequences large red balls.
for leaf in t.get_leaves():
    if 'Eukaryota' in leaf.name:
        leaf_style = NodeStyle()
        leaf_style["fgcolor"] = "red"
        leaf_style["size"] = 15
        leaf.set_style(leaf_style)

t.show(tree_style=ts)
# Small timing comparison (Python 2 syntax) of two ways of scanning the leaves of
# an ete3 tree for the first leaf whose name contains "aw".
import time
from ete3 import Tree

# Creates a random tree with 10,000 leaf nodes
tree = Tree()
tree.populate(10000)

# First approach: loop over tree.iter_leaves() and stop at the first match.
# This one is expected to be the faster of the two.
t1 = time.time()
for leaf in tree.iter_leaves():
    if "aw" in leaf.name:
        # The trailing comma keeps the timing message below on the same output line.
        print "found a match:", leaf.name,
        break
print "Iterating: ellapsed time:", time.time() - t1

# Second approach: loop over tree.get_leaves() and stop at the first match.
# This one is expected to be slower.
t1 = time.time()
for leaf in tree.get_leaves():
    if "aw" in leaf.name:
        print "found a match:", leaf.name,
        break
print "Getting: ellapsed time:", time.time() - t1

# Results in something like (the elapsed time is printed as a bare number of seconds):
# found a match: guoaw Iterating: ellapsed time: 0.00436091423035
# found a match: guoaw Getting: ellapsed time: 0.124316930771
# NOTE(review): this fragment starts mid-script. `strain`, `rec_dict`, `concat_dict`,
# `tree_old`, `strains` and `collapsed` are bound by code that is not visible here;
# confirm the nesting of the first loop against the enclosing code.

# Append the sequence of every record of the current strain to its concatenation.
for fasta in rec_dict[strain]:
    concat_dict[strain] += str(fasta.seq)

# write out concatenated fasta
with open("all_concat.fasta", "w") as handle:
    for rec in concat_dict:
        handle.write(">" + rec + "\n" + concat_dict[rec] + "\n")
#SeqIO.write(list(SeqIO.parse(open("all_concat.fasta"), "fasta")), "all_concat.phy", "phylip")

# now write out tree: rename the leaves, keep only the wanted strains, save as newick
for node in tree_old.traverse():
    if node.is_leaf():
        node.name = node.name.replace('p', 'plate')
tree_old.prune(strains, preserve_branch_length=True)
tree_old.write(outfile="all_concat.newick", format=1)

# Same renaming for the collapsed tree, additionally keeping only the part of the
# name before the first underscore.
for leaf in collapsed.get_leaves():
    leaf.name = leaf.name.replace('p', 'plate').split("_")[0]
collapsed.write(outfile="concat_107.newick", format=1)

# Run ClonalFrameML on the collapsed tree and the concatenated alignment, capturing
# whatever it writes to stdout.
test = subprocess.Popen(
    [
        "/ebio/abt6_projects9/Pseudomonas_diversity/Programs/bin/ClonalFrameML",
        "/ebio/abt6_projects9/Pseudomonas_diversity/data/post_assembly_analysis/pan_genome/data/vis/clonalframe/concat_107.newick",
        "/ebio/abt6_projects9/Pseudomonas_diversity/data/post_assembly_analysis/pan_genome/data/vis/clonalframe/all_concat.fasta",
    ],
    stdout=subprocess.PIPE,
)
output = test.communicate()[0]