def parse_tree_tips(tree_dir):
    """Collect tip names from the ``.tree`` and ``.txt`` files in *tree_dir*.

    ``.tree`` files are loaded with ``bt.loadNewick``; every leaf whose name
    does not contain ``"inserted"`` is recorded.  ``.txt`` files are read line
    by line; the second tab-separated column of each line is split on commas
    and every resulting entry is recorded as a tip.

    In both cases a tip is mapped to the file name up to its first ``"."``.

    Returns:
        A ``(tips, tip_to_tree)`` tuple: the list of tip names in the order
        encountered, and a dict mapping each tip name to its tree name.
    """
    tips = []
    tip_to_tree = {}

    for filename in os.listdir(tree_dir):
        is_tree_file = filename.endswith(".tree")
        if not is_tree_file and not filename.endswith(".txt"):
            continue

        tree_name = filename.split(".")[0]
        path = tree_dir + "/" + filename

        if is_tree_file:
            tree = bt.loadNewick(path, absoluteTime=False)
            leaf_names = [
                node.name
                for node in tree.Objects
                if node.branchType == 'leaf' and "inserted" not in node.name
            ]
            tips.extend(leaf_names)
            tip_to_tree.update(dict.fromkeys(leaf_names, tree_name))
        else:
            with open(path) as handle:
                for row in handle:
                    # Column 2 holds a comma-separated list of tip names.
                    members = row.strip("\n").split("\t")[1].split(",")
                    tips.extend(members)
                    tip_to_tree.update(dict.fromkeys(members, tree_name))

    return tips, tip_to_tree
def find_tallest_tree(input_dir):
    """Return the largest ``treeHeight`` among the ``.tree`` files under *input_dir*.

    The directory is walked recursively.  A file is only loaded when it
    contains a line starting with ``" Dimensions NTax="`` whose value is
    greater than 1; if several such lines exist the last one wins.

    Args:
        input_dir: directory to search for ``.tree`` files.

    Returns:
        The maximum ``treeHeight`` reported by ``bt.loadNewick`` for the
        qualifying files.

    Raises:
        IndexError: if no qualifying tree file is found.  The exception type
            is the one the previous ``sorted(...)[0]`` implementation raised,
            so existing callers are unaffected.
    """
    tree_heights = []
    for root, _dirs, filenames in os.walk(input_dir):
        for fn in filenames:
            if not fn.endswith(".tree"):
                continue

            tree_file = os.path.join(root, fn)

            # Use a dedicated handle name so the file list from os.walk is
            # not rebound while it is being iterated.
            num_taxa = 0
            with open(tree_file, "r") as handle:
                for line in handle:
                    line = line.rstrip("\n")
                    if line.startswith(" Dimensions NTax="):
                        num_taxa = int(line.rstrip(";").split("=")[1])

            if num_taxa > 1:
                tree = bt.loadNewick(tree_file, absoluteTime=False)
                tree_heights.append(tree.treeHeight)

    if not tree_heights:
        raise IndexError(
            "no .tree file with more than one taxon found under {!r}".format(input_dir)
        )
    return max(tree_heights)
def relabel_tips(treeFile, outTree):
    """Append each tip's ``time`` and ``type`` traits to its label and write the tree.

    The tree is loaded from *treeFile* with ``bt.loadNewick``.  For every leaf,
    ``numName`` becomes ``<numName>_<time>_<type>``, where a missing trait is
    replaced by the string ``'Unknown'``.  Each new label is printed.  The tree
    is then serialised with ``toString(traits=[], numName=True, nexus=False)``,
    single quotes are removed from the result, and it is written to *outTree*.

    Args:
        treeFile: path of the tree file to read.
        outTree: path of the file to write (overwritten if it exists).
    """
    myTree = bt.loadNewick(treeFile, absoluteTime=False)
    # myTree.setAbsoluteTime(2019.00)  # need to set this to time of last sampled tip
    myTree.traverse_tree()  # required to set heights
    myTree.treeStats()  # report stats about tree

    for k in myTree.Objects:  # iterate over a flat list of branches
        if k.branchType != 'leaf':
            continue
        # Tips without a given type/time are labelled 'Unknown'.
        ttype = k.traits.get('type', 'Unknown')
        tip_time = k.traits.get('time', 'Unknown')
        k.numName = k.numName + '_' + str(tip_time) + '_' + ttype
        print(k.numName)

    newick = myTree.toString(traits=[], numName=True, nexus=False)
    newick = newick.replace('\'', '')

    # Context manager guarantees the handle is closed even if write() fails.
    with open(outTree, "w") as tfile:
        tfile.write(newick)
def test_newick(self):
    """Load the Zika Newick fixture and check its node count and maximum height."""
    expected_num_nodes = 564
    expected_height = 0.0058

    tree = bt.loadNewick('./tests/data/zika.nwk')

    observed_num_nodes = len(tree.Objects)
    assert observed_num_nodes == expected_num_nodes, (
        'Newick tree does not contain correct number of nodes. Expected: {}. Observed: {}'.format(
            expected_num_nodes, observed_num_nodes
        )
    )

    # Heights are compared after rounding to four decimal places.
    max_height = round(max(node.height for node in tree.Objects), 4)
    assert max_height == expected_height, (
        'Newick tree height is not correct. Expected: {}. Observed: {}'.format(
            expected_height, max_height
        )
    )
def find_tallest_tree(input_dir):
    """Return the largest ``treeHeight`` among all ``.tree`` files under *input_dir*.

    The directory is walked recursively and every file whose name ends in
    ``.tree`` is loaded with ``bt.loadNewick``.

    Args:
        input_dir: directory to search for ``.tree`` files.

    Returns:
        The maximum ``treeHeight`` over the loaded trees.

    Raises:
        IndexError: if no ``.tree`` file is found.  The exception type is the
            one the previous ``sorted(...)[0]`` implementation raised, so
            existing callers are unaffected.
    """
    tree_heights = []
    for root, _dirs, filenames in os.walk(input_dir):
        for fn in filenames:
            if fn.endswith(".tree"):
                tree = bt.loadNewick(os.path.join(root, fn), absoluteTime=False)
                tree_heights.append(tree.treeHeight)

    if not tree_heights:
        raise IndexError("no .tree file found under {!r}".format(input_dir))
    return max(tree_heights)
def relabel_tips(treeFile, outTree):
    """Rewrite tip labels of the tree in *treeFile* and write the result to *outTree*.

    For every leaf, ``numName`` is transformed in this order:

    1. every ``'|'`` is replaced by ``'_'``;
    2. the suffix ``'_Il'`` is appended;
    3. every occurrence of ``'2020_EPI_ISL_'`` is removed.

    Each new label is printed.  The tree is then serialised with
    ``toString(traits=[], numName=True, nexus=False)``, single quotes are
    removed from the result, and it is written to *outTree*.

    Args:
        treeFile: path of the tree file to read.
        outTree: path of the file to write (overwritten if it exists).
    """
    myTree = bt.loadNewick(treeFile, absoluteTime=False)
    # myTree.setAbsoluteTime(2019.00)  # need to set this to time of last sampled tip
    myTree.traverse_tree()  # required to set heights
    myTree.treeStats()  # report stats about tree

    for k in myTree.Objects:  # iterate over a flat list of branches
        if k.branchType != 'leaf':
            continue
        # Order matters: the '|' -> '_' step can create the
        # '2020_EPI_ISL_' substring that the last step removes.
        new_name = k.numName.replace('|', '_') + '_Il'
        k.numName = new_name.replace('2020_EPI_ISL_', '')
        print(k.numName)

    newick = myTree.toString(traits=[], numName=True, nexus=False)
    newick = newick.replace('\'', '')

    # Context manager guarantees the handle is closed even if write() fails.
    with open(outTree, "w") as tfile:
        tfile.write(newick)
def make_all_of_the_trees(input_dir, tree_name_stem, taxon_dict, query_dict, desired_fields, custom_tip_labels, graphic_dict, min_uk_taxa=3):
    """Draw every eligible tree in *input_dir* and collect per-tree summaries.

    For each index returned by ``sort_trees_index(input_dir)`` the file
    ``<input_dir>/<tree_name_stem>_<index>.tree`` is inspected.  Files whose
    ``" Dimensions NTax="`` line gives a value of 1 or less (or that have no
    such line) are skipped.  The remaining trees are loaded, given an extra
    root node, and then either drawn with ``make_scaled_tree`` (fewer than
    1000 leaves) or recorded in the returned skip list.

    Args:
        input_dir: directory holding the tree files.
        tree_name_stem: prefix of the tree file names.
        taxon_dict, query_dict, desired_fields, custom_tip_labels:
            passed through to the helpers called here.
        graphic_dict: maps each trait to a colour scheme; one colour dict is
            built per entry with ``find_colour_dict``.
        min_uk_taxa: accepted but not used by this function.

    Returns:
        ``(too_tall_trees, overall_tree_count, colour_dict_dict,
        overall_df_dict)`` where ``too_tall_trees`` lists the indices of trees
        with 1000 or more leaves and ``overall_tree_count`` is the number of
        trees drawn.
    """
    tallest_height = find_tallest_tree(input_dir)

    too_tall_trees = []
    colour_dict_dict = defaultdict(dict)
    overall_df_dict = defaultdict(dict)
    overall_tree_count = 0

    lst = sort_trees_index(input_dir)

    for trait, colour_scheme in graphic_dict.items():
        colour_dict_dict[trait] = find_colour_dict(query_dict, trait, colour_scheme)

    for fn in lst:
        treename = f"{tree_name_stem}_{fn}"
        tree_path = input_dir + "/" + treename + ".tree"

        num_taxa = 0
        with open(tree_path, "r") as f:
            for l in f:
                l = l.rstrip("\n")
                if l.startswith(" Dimensions NTax="):
                    num_taxa = int(l.rstrip(";").split("=")[1])

        if num_taxa <= 1:
            continue

        tree = bt.loadNewick(tree_path, absoluteTime=False)

        # make root line: hang the original root below a new node at height 0
        old_node = tree.root
        new_node = bt.node()
        new_node.children.append(old_node)
        old_node.parent = new_node
        old_node.length = 0.000015
        new_node.height = 0
        new_node.y = old_node.y
        tree.root = new_node
        tree.Objects.append(new_node)

        num_tips = sum(1 for k in tree.Objects if k.branchType == 'leaf')

        if num_tips < 1000:
            overall_df_dict[treename] = summarise_node_table(input_dir, treename, taxon_dict)
            overall_tree_count += 1
            make_scaled_tree(tree, treename, input_dir, num_tips, colour_dict_dict, desired_fields, tallest_height, taxon_dict, query_dict, custom_tip_labels, graphic_dict)
        else:
            too_tall_trees.append(fn)

    return too_tall_trees, overall_tree_count, colour_dict_dict, overall_df_dict
def make_all_of_the_trees(input_dir, taxon_dict, query_id_dict, query_dict, desired_fields, min_uk_taxa=3):
    """Draw every eligible ``tree_<index>.tree`` file in *input_dir*, once per trait.

    For each index returned by ``sort_trees_index(input_dir)`` the file
    ``<input_dir>/tree_<index>.tree`` is inspected.  Files whose
    ``" Dimensions NTax="`` line gives a value of 1 or less (or that have no
    such line) are skipped.  The remaining trees are loaded, given an extra
    root node, and then either drawn with ``make_scaled_tree_without_legend``
    for each colouring trait (fewer than 1000 leaves) or recorded in the
    returned skip list.

    Args:
        input_dir: directory holding the tree files.
        taxon_dict, query_id_dict, query_dict: passed through to the helpers
            called here.
        desired_fields: traits to colour by; when equal to ``[]`` the single
            trait ``"adm1"`` is used instead.
        min_uk_taxa: accepted but not used by this function.

    Returns:
        ``(too_tall_trees, overall_tree_count, colour_dict_dict)`` where
        ``too_tall_trees`` lists the indices of trees with 1000 or more leaves
        and ``overall_tree_count`` is the number of trees drawn.
    """
    tallest_height = find_tallest_tree(input_dir)

    too_tall_trees = []
    colour_dict_dict = defaultdict(dict)
    overall_tree_count = 0

    lst = sort_trees_index(input_dir)

    # The trait list does not depend on the tree, so choose it once.
    if desired_fields == []:
        colour_by = ["adm1"]
    else:
        colour_by = desired_fields

    for fn in lst:
        treename = "tree_" + str(fn)
        tree_path = input_dir + "/" + treename + ".tree"

        num_taxa = 0
        with open(tree_path, "r") as f:
            for l in f:
                l = l.rstrip("\n")
                if l.startswith(" Dimensions NTax="):
                    num_taxa = int(l.rstrip(";").split("=")[1])

        if num_taxa <= 1:
            continue

        tree = bt.loadNewick(tree_path, absoluteTime=False)

        # Hang the original root below a new node at height 0.
        old_node = tree.root
        new_node = bt.node()
        new_node.children.append(old_node)
        old_node.parent = new_node
        old_node.length = 2.0
        new_node.height = 0
        new_node.y = old_node.y
        tree.root = new_node
        tree.Objects.append(new_node)

        num_tips = sum(1 for k in tree.Objects if k.branchType == 'leaf')

        if num_tips < 1000:
            overall_tree_count += 1
            # Colour dicts are (re)built per drawn tree, so colour_dict_dict
            # stays empty when no tree qualifies.
            for trait in colour_by:
                colour_dict = find_colour_dict(query_dict, trait)
                colour_dict_dict[trait] = colour_dict
                make_scaled_tree_without_legend(tree, treename, input_dir, num_tips, colour_dict, trait, tallest_height, fn, taxon_dict, query_id_dict, query_dict)
        else:
            too_tall_trees.append(fn)

    return too_tall_trees, overall_tree_count, colour_dict_dict
def make_all_of_the_trees(input_dir, outdir, tree_name_stem, taxon_dict, query_dict, colour_fields, label_fields, min_uk_taxa=3):
    """Draw every tree listed by ``sort_trees_index`` and collect per-tree summaries.

    For each index returned by ``sort_trees_index(input_dir)`` the file
    ``<input_dir>/<tree_name_stem>_<index>.tree`` is loaded and given an extra
    root node.  Trees with fewer than 1000 leaves are summarised with
    ``summarise_node_table`` and drawn with ``make_scaled_tree``; the others
    are recorded in the returned skip list.

    Args:
        input_dir: directory holding the tree files.
        outdir: passed through to ``make_scaled_tree``.
        tree_name_stem: prefix of the tree file names.
        taxon_dict, query_dict, label_fields: passed through to the helpers
            called here.
        colour_fields: traits for which a colour dict is built with
            ``find_colour_dict``.
        min_uk_taxa: accepted but not used by this function.

    Returns:
        ``(too_tall_trees, overall_tree_count, overall_df_dict,
        colour_dict_dict)`` where ``too_tall_trees`` lists the indices of trees
        with 1000 or more leaves, ``overall_tree_count`` is the number of
        trees drawn, and ``overall_df_dict`` is keyed by ``tree_<index>``.
    """
    tallest_height = find_tallest_tree(input_dir)

    too_tall_trees = []
    colour_dict_dict = defaultdict(dict)
    overall_df_dict = defaultdict(dict)
    overall_tree_count = 0

    lst = sort_trees_index(input_dir)

    for trait in colour_fields:
        colour_dict_dict[trait] = find_colour_dict(query_dict, trait)

    for tree_number in lst:
        treename = f"tree_{tree_number}"
        nodefile = f"{tree_name_stem}_{tree_number}"
        treefile = f"{tree_name_stem}_{tree_number}.tree"

        tree = bt.loadNewick(input_dir + "/" + treefile, absoluteTime=False)

        # Hang the original root below a new node at height 0.
        old_node = tree.root
        new_node = bt.node()
        new_node.children.append(old_node)
        old_node.parent = new_node
        old_node.length = 0.000015
        new_node.height = 0
        new_node.y = old_node.y
        tree.root = new_node
        tree.Objects.append(new_node)

        num_tips = sum(1 for k in tree.Objects if k.branchType == 'leaf')

        if num_tips < 1000:
            # Summaries are keyed by "tree_<n>", while the helpers receive
            # the stem-based name.
            overall_df_dict[treename] = summarise_node_table(input_dir, nodefile, taxon_dict)
            overall_tree_count += 1
            make_scaled_tree(tree, nodefile, input_dir, outdir, num_tips, colour_dict_dict, colour_fields, label_fields, tallest_height, tree_number, taxon_dict, query_dict)
        else:
            too_tall_trees.append(tree_number)

    return too_tall_trees, overall_tree_count, overall_df_dict, colour_dict_dict
@author: david """ from Bio import Phylo import baltic as bt import re # '1-2936','2937-4936', '4937-6870','6871-8473','8474-19706','19707-20428' in_tree = 'sars-like-CoVs-sub_19707-20428.tre' out_tree = 'sars-like-CoVs-sub_19707-20428.nexus' # Basic conversion with Phylo module doesn't work here because we need BEAST-style NEXUS files #Phylo.convert(in_tree, 'newick', out_tree, 'nexus') myTree = bt.loadNewick(in_tree, absoluteTime=False) #myTree.setAbsoluteTime(2019.00) # need to set this to time of last sampled tip myTree.traverse_tree() ## required to set heights myTree.treeStats() ## report stats about tree names = [] for idx, k in enumerate( myTree.Objects): ## iterate over a flat list of branches if k.branchType == 'leaf': curr_name = k.numName names.append(curr_name) #print(names) date_str = '_2020.00' # Write taxa names
help="Pruned tree including only taxa in list") args = parser.parse_args() timetree = args.timetree tmrcas = args.tmrcas output = args.output # path = "/Users/anderson/GLab Dropbox/Anderson Brito/past&future/PhD/works/phylog/species_trees/viral_sppTrees/rhv04_virevol1/trees/ba/run0_host/" # timetree = path + "host_tree.tree" # tmrcas = path + 'tmrcas.txt' # output = path + 'new_host_tree.nexus' all_traits = ['node_name', 'height_95%_HPD'] # load tree tree = bt.loadNewick(timetree) #, tip_regex='_([0-9\-]+)$') # print(tree) # tmrca dataframe df = pd.read_csv(tmrcas, encoding='utf-8', sep='\t', dtype='str') # print(df) # df['members'] = df['members'].apply(lambda x: ', '.join(sorted(x.split(',')))) # print(df['members'].to_list()) print('Starting tree file processing...') # transfer supporting value from a newick tree for k in sorted(tree.Objects, key=lambda q: q.height ): ## iterate over branches from most recent to oldest if k.branchType == 'node': ## can only sort nodes terminals = ", ".join(sorted([leaf for leaf in k.leaves]))