class NTBDB(object):
    """Index of picture metadata plus the tag hierarchy that classifies it.

    On construction the metadata pickle and the ``tags.nw`` newick file are
    read from ``metadata_dir``; entries listed in ``EXCLUDE_PICS`` are dropped
    and an inverted index ``by_tag`` (tag name -> list of picture records) is
    built.
    """

    def __init__(self, imgs_dir='/storage/imgs/low-res', metadata_dir='/storage/metadata'):
        """Load metadata and tags.

        Args:
            imgs_dir: root directory holding the image folders.
            metadata_dir: directory holding ``metadata.pickle`` and ``tags.nw``.

        Raises:
            KeyError: if an entry of ``EXCLUDE_PICS`` is absent from the metadata.
        """
        self.metadata_dir = metadata_dir
        self.imgs_dir = imgs_dir
        self.metadata = self._load_pickle(
            os.path.join(self.metadata_dir, 'metadata.pickle'))
        for img in EXCLUDE_PICS:
            del self.metadata[img]

        # Inverted index: tag name -> every picture record carrying that tag.
        self.by_tag = dict()
        # .values() (instead of the Python-2-only itervalues) iterates the same
        # records on both Python 2 and Python 3.
        for p in self.metadata.values():
            for tag in p['tags']:
                self.by_tag.setdefault(tag, []).append(p)

        self.tags = Tree(os.path.join(self.metadata_dir, 'tags.nw'), format=8)
        self.tag_by_name = {tag.name: tag for tag in self.tags.get_descendants()}

    @staticmethod
    def _load_pickle(path):
        """Return the object stored in the pickle file at ``path``.

        The file is opened in binary mode: pickle data is bytes, and a
        text-mode handle fails on Python 3 and can corrupt data on Windows.
        NOTE: unpickling executes arbitrary code; only load trusted files.
        """
        with open(path, 'rb') as md:
            return pickle.load(md)

    def by_tag_with_children(self, tag_name):
        """Return the records tagged ``tag_name`` or any tag below it in the tree.

        Raises:
            IndexError: if no node of the tag tree is named ``tag_name``.
        """
        tag_node = self.tags.search_nodes(name=tag_name)[0]
        all_tags = [tag_node]
        all_tags.extend(tag_node.get_descendants())
        return list(itertools.chain.from_iterable(
            self.by_tag.get(tag.name, []) for tag in all_tags))

    def tag_score(self, tag):
        """Return how many records carry ``tag`` plus, recursively, its children."""
        return len(self.by_tag.get(tag.name, [])) + sum(map(self.tag_score, tag.children))

    def top_tags(self, max_children=5, max_depth=2):
        """Return a copy of the tag tree trimmed to its highest-scoring branches.

        Every node keeps at most ``max_children`` children (best ``tag_score``
        first); nodes whose distance to the root exceeds ``max_depth - 1`` lose
        all children.
        """
        top_tags = self.tags.copy()
        for n in top_tags.traverse():
            n.children = sorted(n.children, key=self.tag_score, reverse=True)[:max_children]
            if n.get_distance(n.get_tree_root()) > max_depth - 1:
                n.children = []
        return top_tags

    def image_path(self, image_index):
        """Return ``<imgs_dir>/<folder>/<filename>.jpg`` for the given metadata key."""
        return os.path.join(self.imgs_dir,
                            self.metadata[image_index]['folder'],
                            self.metadata[image_index]['filename'] + '.jpg')
def __init__(self, tree: Tree, cluster_feature='accs'):
    """Build an EteMplTree around a private copy of ``tree``.

    All drawing options are initialised to defaults and stay configurable as
    plain public attributes afterwards. Once ``cluster_feature`` is stored,
    ``self.set_cluster_size()`` is invoked (the original notes say it attaches
    a ``cluster_relsize`` feature to each leaf -- defined elsewhere, confirm
    there).

    Arguments:
        tree: an ete3 tree instance; it is copied, never modified.

    Keyword Arguments:
        cluster_feature: feature used as indication of cluster size:
            None or 'accs' (default='accs')
    """
    self.tree = tree.copy()

    # General layout defaults.
    self.orientation = 'left'
    self.scale = 1.0
    self.dashed_leaves = True

    # Cluster glyph, plus per-orientation marker symbol and text alignment.
    self.cluster_viz = 'triangle'
    self.cviz_symboldict = dict(left='<', right='>', top='^', bottom='v')
    self.cviz_hadict = dict(left='left', right='right', top='center', bottom='center')
    self.cviz_vadict = dict(left='center', right='center', top='top', bottom='bottom')

    # Branch styling and leaf handling.
    self.tree_lw = 3.0
    self.tree_color = 'black'
    self.initial_leafspacing = 0.1
    self.create_leaf_names = False
    self.draw_leaf_names = False

    # The cluster feature must be in place before cluster sizes are computed.
    self.cluster_feature = cluster_feature
    self.set_cluster_size()

    # Plot extents start "inverted" (min=+inf, max=-inf) for both axes.
    self.plot_coords = [[np.inf, -np.inf] for _ in range(2)]
    self.ordered_leaves = None
    self.decorated_plot_coords = None
def collapse_unifurcations(tree: ete3.Tree) -> ete3.Tree:
    """Return a copy of ``tree`` with its single-child nodes deleted.

    Every node that has exactly one child is removed with ete3's
    ``delete()``, which reconnects the node's children to its parent. The
    input tree is left untouched.

    Args:
        tree: tree to be collapsed

    Returns:
        A collapsed copy of the tree.
    """
    collapsed_tree = tree.copy()
    # Collect the offenders first so nothing is deleted while traversing.
    single_child_nodes = [
        node for node in collapsed_tree.traverse() if len(node.children) == 1
    ]
    for node in single_child_nodes:
        node.delete()
    return collapsed_tree
def write_newick_tree_with_uncoded_names(infile, outfile, tablefile, quoted_names=False):
    """Rewrite a newick tree with coded leaf names mapped back to original names.

    The conversion table is read with ``get_conversion_dict_from_table`` and
    maps code -> original name. Codes and leaf names are compared after
    stripping single quotes and replacing spaces with underscores. Each leaf is
    renamed at most once, so a freshly assigned original name is never matched
    against the remaining codes again. If several codes normalise to the same
    key, the last one in the table wins.

    Args:
        infile: path of the newick tree to read (ete3 ``format=1``).
        outfile: path the converted tree is written to (ete3 ``format=1``).
        tablefile: path of the conversion table.
        quoted_names: accepted for backward compatibility; the current
            implementation does not add quotation marks to the output.
    """
    def _normalise(label):
        # Same canonical form the comparison has always used.
        return label.strip('\'').replace(' ', '_')

    # Generate a dictionary for converting names, keyed by normalised code so
    # every leaf needs a single lookup instead of a scan over all codes.
    conv_dict = get_conversion_dict_from_table(tablefile)
    original_by_code = {_normalise(code): conv_dict[code] for code in conv_dict}

    tree = Tree(infile, format=1)
    for leaf in tree.iter_leaves():
        key = _normalise(leaf.name)
        if key in original_by_code:
            leaf.name = original_by_code[key]

    # Write uncoded tree to output file.
    tree.write(outfile=outfile, format=1)
def write_clustertree_tonewick(ct: Tree, otfpath: str = 'clustertree.nw'):
    """Write a cluster tree to newick with leaf names listing member accessions.

    Works on a copy of ``ct``. A leaf carrying the 'subtrees' feature is
    renamed to the '|'-terminated concatenation of every leaf name found under
    its subtrees; any other leaf simply gets '|' appended to its name. The
    tree is then written with the 'name' feature and a confirmation is
    printed. Nothing is returned.

    Arguments:
        ct: ete tree whose clustered leaves carry a 'subtrees' feature

    Keyword Arguments:
        otfpath: path of output tree (default='clustertree.nw')
    """
    ct = ct.copy()
    for leaf in ct.get_leaves():
        if 'subtrees' not in leaf.features:
            leaf.name += '|'
            continue
        leaf.name = ''.join(
            f'{acc}|'
            for subtree in leaf.subtrees
            for acc in subtree.get_leaf_names())
    ct.write(outfile=otfpath, features=['name'])
    print(f'clusterified newick tree written to {otfpath}')
def expand_eteclustertree(ct: Tree,
                          delete_cluster_names=True,
                          delete_cluster_features=True):
    """Expand a cluster tree back to one leaf per original tip.

    Works on a copy of ``ct``. Every leaf carrying a 'subtrees' feature is
    expanded: a cluster leaf with ``dist == 0`` and a parent is treated as a
    collapsed-sisters placeholder (its subtrees are re-attached to the parent
    and the placeholder is detached); any other cluster leaf gets its subtrees
    re-attached as its own children. Node names may differ from the tree the
    clusters were built from.

    Arguments:
        ct: the cluster tree

    Keyword Arguments:
        delete_cluster_names: whether to blank the names of cluster nodes
            (default=True)
        delete_cluster_features: whether to delete the cluster features
            'subtrees', 'cluster_numleaves' and 'accs' (default=True)

    Returns:
        expanded copy of the tree
    """
    ct = ct.copy()
    for lnode in ct.get_leaves():
        if 'subtrees' not in lnode.features:
            continue

        # Special handling for collapsed sisters. A parentless node (a fully
        # clustered root) cannot be a sister placeholder even when its dist is
        # 0, so it is expanded in place instead of dereferencing a None parent.
        if lnode.dist == 0 and lnode.up is not None:
            print(f'sister node detected for {lnode.name}')
            for st in lnode.subtrees:
                lnode.up.add_child(st)
            lnode.detach()
        else:
            for st in lnode.subtrees:
                lnode.add_child(st)

        if delete_cluster_names:
            lnode.name = ''
        if delete_cluster_features:
            lnode.del_feature('subtrees')
            lnode.del_feature('cluster_numleaves')
            lnode.del_feature('accs')
    # NOTE(review): the original author left an open question here on whether
    # collapsed sisters are fully handled by the branch above.
    return ct
def partitionTreeSet(N):
    """Return a tuple of labelled trees built recursively for the integer N.

    For N == 1 the result is a single one-node tree. Otherwise, for every
    k in range(lam(N)), each tree produced for N-(k+1) is paired with each tree
    produced for k+1 under a fresh parent node. Every node gets ``value=N``,
    ``name=str(N)`` and a face from ``styleFace`` placed at "branch-top".
    """
    if N == 1:
        single = Tree(";", format=100)
        single.add_features(value=N, name=str(N))
        single.add_face(styleFace(single.name), column=0, position="branch-top")
        return (single,)

    template = Tree(";", format=100)
    template.dist = 1
    collected = []
    for k in range(lam(N)):
        left = partitionTreeSet(N - (k + 1))
        right = partitionTreeSet(k + 1)
        for left_tree in left:
            for right_tree in right:
                # Branch lengths are normalised before the subtrees are copied.
                left_tree.dist = 1
                right_tree.dist = 1
                joined = template.copy()
                joined.dist = 1
                joined.add_features(value=N, name=str(N))
                joined.add_child(left_tree.copy())
                joined.add_child(right_tree.copy())
                joined.add_face(styleFace(joined.name), column=0, position="branch-top")
                collected.append(joined)
    return tuple(collected)
# Record which random representative was chosen for each family.
with open("representative_tree/bacteria_random_fam_reps.txt", "w") as handle:
    handle.writelines([k + "\t" + v + "\n" for k, v in fam2rnd.items()])

rnds = set(fam2rnd.values())
nodes = list(tree.iter_leaves())

# ete3's prune() edits the tree in place and returns None, so its result must
# not be used as the pruned tree: prune first, then alias the pruned object.
tree.prune(
    [n for n in tree.iter_leaves() if "mOTU" in n.name or n.name in rnds],
    preserve_branch_length=True)
sub_tree = tree
sub_tree.write(outfile="representative_tree/bacteria_pruned.tree")

motu_md = pandas.read_csv("metadata/mOTU_stats.csv", index_col=0)
gtdbid2fam.update(motu_md.consensus_tax.to_dict())

# Same (pruned) topology, with leaves renamed through gtdbid2fam.
ptree = tree.copy()
for l in ptree.iter_leaves():
    l.name = gtdbid2fam[l.name]
ptree.write(outfile="representative_tree/bacteria_pretty.tree")

# Leaves of the tree that have no row in the mOTU table.
tt = set(tree.iter_leaf_names()).difference(motu_md.index)
gtdb_md = pandas.read_csv("/home/moritz/data/gtdb/bac120_metadata_r89.tsv",
                          index_col=0, sep="\t")
mag_md = pandas.read_csv("metadata/master_table.csv", index_col=0)
tt2 = mag_md.loc[motu_md.representative_MAGs]
tt2.index = tt2.mOTU
output_file_path = cmdln[2] # Define query ID present in filename. query_id = os.path.basename(tf)[2:].rsplit('_', 1)[0] # Initiate a tree style. ts = TreeStyle() ts.show_leaf_name = False # Parse tree. #t1 = Tree(tf, format=3) t1 = Tree(tf, format=0) #print(t1) # Make a copy of the TreeNode object. t2 = t1.copy() # Root on midpoint. t2.set_outgroup(t2.get_midpoint_outgroup()) # Add node support values as branch labels. add_support_to_nodes_as_faces(t2) # Customize the node styles generally. customize_node_styles_for_visualization(t2) ##################################################### # Write tree to pdf. if platform == "linux" or platform == "linux2":
def plot_trees_from_traces(input_trace, output_plot, simu_dict, simu_tree):
    """Plot per-feature trees found next to each trace and their correlations.

    For every path in ``input_trace`` the files ``<path>.*.nhx`` are read; the
    part of the file name between the path and ``.nhx`` (after
    ``remove_units``) is the feature name. Trees whose text contains "-nan"
    are skipped. When ``simu_tree`` is given, each inferred node must cover
    the same leaves as the matching simulated node. A tree is plotted to
    ``<output_plot>/<filename>.<feature>.pdf`` when every node carries the
    feature. Finally, for each feature with more than one data series
    (including ``simu_dict[feature]`` when present), a correlation plot is
    written to ``<output_plot>/correlation.<feature>.pdf``.
    """
    def node_values(tree, attr):
        # Values of ``attr`` over the nodes that define it, in traversal order.
        return np.array([
            to_float(getattr(node, attr))
            for node in tree.traverse() if attr in node.features
        ])

    axis_trees, axis_filenames = dict(), dict()
    for filepath in input_trace:
        for tree_path in sorted(glob("{0}.*.nhx".format(filepath))):
            stem = tree_path.replace(filepath + ".", "").replace(".nhx", "")
            feature = remove_units(stem)
            filename = os.path.basename(filepath)
            with open(tree_path, 'r') as tree_file:
                tree_str = remove_units(tree_file.readline())
            if "-nan" in tree_str:
                continue

            tree = Tree(tree_str, format=1)
            if simu_tree:
                for n_inf, n_simu in zip(tree.traverse(), simu_tree.traverse()):
                    assert (sorted(n_simu.get_leaf_names()) == sorted(
                        n_inf.get_leaf_names()))

            axis_filenames.setdefault(feature, []).append(filename)
            axis_trees.setdefault(feature, []).append(tree)

            # Only draw the tree when the feature is defined on every node.
            if all(feature in node.features for node in tree.traverse()):
                target = "{0}/{1}.{2}.pdf".format(output_plot, filename, feature)
                plot_tree(tree.copy(), feature, target)

    for feature in axis_trees:
        axis_dict, err_dict, std_dict = dict(), dict(), dict()
        if feature in simu_dict:
            axis_dict["Simulation"] = simu_dict[feature]

        for filename, tree in zip(axis_filenames[feature], axis_trees[feature]):
            values = node_values(tree, feature)
            min_values = node_values(tree, feature + "_min")
            max_values = node_values(tree, feature + "_max")
            axis_dict[filename] = values
            # Asymmetric error bars: distance down to min, distance up to max.
            lower = np.abs(values - min_values)
            upper = np.abs(max_values - values)
            err_dict[filename] = np.vstack((lower, upper))
            std_dict[filename] = node_values(tree, feature + "_std")

        if len(axis_dict) > 1:
            path = '{0}/correlation.{1}.pdf'.format(output_plot, feature)
            plot_correlation(path, axis_dict, err_dict, std_dict=std_dict,
                             global_min_max=False)
children, tree_string = cvtNewick(parent)
orig = Tree(tree_string, format=8)
outbase = splitext(args.out)[0]

# Validate the arguments before the output file is created, so a bad
# invocation no longer leaves an empty output file behind.
if args.format == "plink" and len(args.sample) != 1:
    sys.exit(
        "When using PLINK format there can only be one sample -- the base of a plink data set"
    )

# The context manager closes the output even if processing a sample fails.
with open(join(args.out_dir, args.out), "w") as out:
    if args.format == "plink":
        (bim, fam, bed) = read_plink(args.sample[0], verbose=False)
        # args.num == 0 means "use every individual".
        N = len(fam) if args.num == 0 else args.num
        for p in range(N):
            tr = orig.copy(method='deepcopy')
            if fam.fid[p] == fam.iid[p]:
                the_id = fam.fid[p]
            else:
                the_id = fam.fid[p] + "_" + fam.iid[p]
            print(p, the_id)
            data = getInd(p, bed, bim)
            processSample(out, the_id, data)
    else:
        # args.num == 0 means "use every sample file".
        N = len(args.sample) if args.num == 0 else args.num
        for sample in args.sample[:N]:
            tr = orig.copy(method='deepcopy')
            # Close each sample file once it has been processed.
            with open(sample) as sample_handle:
                processSample(out, sample, sample_handle)

group_names = [g for g in overall.keys() if overall[g][count] > 0]
def keep_subsequent_wgd_species(stree, ensembl_tree, missing_leaves_keep, sp_current_wgd,
                                authorized_sp):
    """
    When re-grafting a subtree corrected for species descending from 'WGD1' only, keep positions
    of species with subsequent WGDs consistent in the tree. To do so, find the closest 'WGD1-only'
    species gene in ensembl tree and keep subsequently duplicated species genes at the same
    position (relative to it).

    Modifies `stree` in-place. `ensembl_tree` is only read: all pruning happens on copies of it.

    Args:
        stree (ete3.Tree): Tree object for the synteny corrected tree of WGD1
        ensembl_tree (ete3.Tree): Tree object for the full original gene tree
        missing_leaves_keep (list of ete3.TreeNode): Genes of subsequently duplicated species
        sp_current_wgd (list of str): List of WGD1 duplicated species
        authorized_sp (dict): Dict used to keep the tree consistent with the species tree.
                              For a 'WGD1' species, a list of WGD1 species that are closer to it
                              than are 4R species.
    """
    #genes in the WGD1 corrected tree
    sleaves = [i.name for i in stree.get_leaves()]
    #pruning to the full set of leaves keeps every gene (presumably to clean up the topology,
    #as with the final prune below)
    stree.prune([i for i in stree.get_leaves()])

    #genes in subsequent WGDs
    missing = [i.name for i in missing_leaves_keep]

    #original tree
    enstree = ensembl_tree.copy()

    #find all clades of subsequent WGD genes to replace them at a correct position together
    #(leaves are tagged with a temporary `missing` attribute so that clades made only of
    #such genes can be retrieved)
    for leaf in enstree.get_leaves():
        if leaf.name in missing:
            leaf.missing = "Y"
    clades = enstree.get_monophyletic(values=["Y"], target_attr="missing")

    #maps each clade to re-insert -> its closest WGD1 gene, when that gene is in `stree`
    closest_gene = {}

    #for each clade to replace
    for clade in clades:

        #find closest neighbour in the original tree which is in WGD1 duplicated species
        outgr_gene = closest_gene_in_tree(enstree, clade, sp_current_wgd, attr='S')

        if outgr_gene.name in sleaves:
            closest_gene[clade] = outgr_gene

    #if the closest WGD1 gene is in the WGD1 stree
    #we'll keep 4R genes in the family, at a similar position
    for outgr_gene in set(closest_gene.values()):

        #all clades anchored on this outgroup gene, flattened by iterating each clade node
        #(assumes iterating an ete3 node yields its leaves -- TODO confirm)
        clades = [i for i in closest_gene if closest_gene[i] == outgr_gene]
        clades = list(chain.from_iterable(clades))

        #subtree of the original tree restricted to these genes plus the anchor gene
        subtree_4r = enstree.copy()
        subtree_4r.prune([i.name for i in clades] + [outgr_gene.name])

        outgroup_subtree = stree.copy()
        outgr = stree.get_leaves_by_name(name=outgr_gene.name)[0]
        sister_outgroup_genes = [outgr_gene.name]

        #helper defined elsewhere; the length check below reads this list afterwards,
        #so the helper presumably extends it in place
        find_sister_of_outgroup(outgr, authorized_sp[outgr_gene.S], sister_outgroup_genes)

        #We keep all sister outgroup genes together in the corrected tree
        if len(sister_outgroup_genes) > 1:

            #stree is modified in-place
            subtree_4r = keep_sis_genes_together(subtree_4r, outgr_gene.name,
                                                 sister_outgroup_genes, outgroup_subtree,
                                                 node_max='')

            lca = stree.get_common_ancestor(sister_outgroup_genes)
            # cop = stree.copy()
            #keep `lca` itself and every leaf that is not a sister outgroup gene
            stree.prune([lca] + [i for i in stree.get_leaves()\
                        if i.name not in sister_outgroup_genes])

        else:
            lca = outgr

        #in case we do not paste the subtree at a terminal node (cleared above)
        if len(lca.children) == 2:
            lca_cop = lca.copy()
            tmp = Tree()
            lca_cop.prune([i for i in lca_cop.get_leaves()])
            tmp.add_child(lca_cop.copy())
            tmp.add_child(subtree_4r.copy())
            #a placeholder child named 'here' marks where `lca` was attached, so the new
            #node can be found again after `lca` is detached
            lca.up.add_child(name='here')
            lca.detach()
            lca_new = stree.search_nodes(name="here")[0]
            lca_new.add_child(tmp.copy())
            lca_new.name = ''

        else:
            #terminal insertion point: blank its name and hang the subtree below it
            lca.name = ''
            lca.add_child(subtree_4r)

    #remove potential single child internal nodes artefact
    stree.prune([i for i in stree.get_leaves()])

    #clean up attributes
    for leaf in stree.get_leaves():
        if hasattr(leaf, 'missing'):
            delattr(leaf, 'missing')
def create_experiment(prefix, name, sample, replicate, tree_name, cds_list, lht, calibs, intersection, screen,
                      sbatch, nbr_cpu, random_state):
    """Create, populate and launch one experiment directory per replicate.

    Inputs are read from ``<cwd>/<name>``; each replicate gets its own
    ``<cwd>/Experiments/<experiment>`` directory containing the config, a
    Snakefile symlink, optional trait/calibration/population-size files, the
    selected CDS list, the concatenated alignment, the pruned rooted tree and
    the launcher scripts. The launcher is finally run, inside a detached
    ``screen`` session when ``screen`` is true.

    Args:
        prefix: prefix of the experiment and screen-session names.
        name: data set directory name (relative to the current directory).
        sample: number of CDS drawn per replicate; -1 uses the CDS at row
            ``rep`` of the gene table.
        replicate: number of replicates; -1 means one per CDS.
        tree_name: rooted tree file name inside the data set directory.
        cds_list: optional file listing the CDS; when absent, every ``.ali``
            file in ``singlegene_alignments`` is used.
        lht: optional life-history-traits file name.
        calibs: optional fossil calibrations file name.
        intersection: if true keep only taxa present in every alignment,
            otherwise the union; taxa are always restricted to tree leaves.
        screen: run the launcher inside a detached screen session.
        sbatch: generate a SLURM cluster invocation of snakemake.
        nbr_cpu: number of local jobs when ``sbatch`` is false.
        random_state: seed base; incremented by 654 for each replicate.

    Raises:
        AssertionError: if sequences of one alignment differ in length, or if
            the written tree and the merged alignment disagree on taxa.
    """
    import shutil  # local import: only this function copies files

    root_path = os.getcwd() + "/" + name
    tree = Tree("{0}/{1}".format(root_path, tree_name), format=1)
    print("{0} extant species found for the rooted tree/".format(len(tree)))
    # The input tree is never modified below, so its leaf set is computed once.
    tree_taxa = set(tree.get_leaf_names())

    if os.path.isfile('{0}/{1}'.format(root_path, cds_list)):
        print("Found list of CDS : " + cds_list)
        genes = pd.read_csv("{0}/{1}".format(root_path, cds_list), header=None)
    else:
        genes = pd.DataFrame(
            [i.replace(".ali", "") for i in os.listdir("{0}/singlegene_alignments".format(root_path)) if ".ali" in i])

    print("{0} CDS provided.".format(len(genes)))
    if replicate == -1:
        replicate = len(genes)

    for rep in range(replicate):
        random_state += 654
        experiment = prefix + "_{0}_{1}_{2}_Sample{3}_Replicates{4}_Id{5}".format(name, tree_name, cds_list, sample,
                                                                                  replicate, rep)
        exp_path = os.getcwd() + '/Experiments/' + experiment
        os.makedirs(exp_path, exist_ok=True)

        # Files are copied with shutil rather than a shell `cp`, so paths
        # containing spaces or shell metacharacters are handled correctly.
        # A missing config.yaml stays non-fatal, as it was with `cp`.
        if os.path.isfile('config.yaml'):
            shutil.copy('config.yaml', exp_path)
        else:
            print("config.yaml not found in {0}; nothing copied".format(os.getcwd()))

        # lexists (not exists) also detects a dangling symlink left by a
        # previous run, which would otherwise make os.symlink fail.
        snakefile_link = exp_path + "/Snakefile"
        if os.path.lexists(snakefile_link):
            os.remove(snakefile_link)
        os.symlink(os.getcwd() + "/Snakefile", snakefile_link)

        if os.path.isfile('{0}/{1}'.format(root_path, lht)):
            print("Life-History-Traits file provided (" + lht + ")")
            shutil.copy('{0}/{1}'.format(root_path, lht), '{0}/life_history_traits.tsv'.format(exp_path))
        if os.path.isfile('{0}/{1}'.format(root_path, calibs)):
            print("Fossil Calibrations file provided (" + calibs + ")")
            shutil.copy('{0}/{1}'.format(root_path, calibs), '{0}/calibs.tsv'.format(exp_path))
        if os.path.isfile('{0}/known_population_size.tsv'.format(root_path)):
            print("Known population size file provided (known_population_size.tsv)")
            shutil.copy('{0}/known_population_size.tsv'.format(root_path), exp_path)

        # Select the CDS of this replicate and load their alignments.
        alignments = []
        taxa = set(tree_taxa) if intersection else set()
        if sample == -1:
            vals = [genes.loc[rep, :]]
        else:
            vals = genes.sample(sample, random_state=random_state).values
        pd.DataFrame(vals).to_csv(exp_path + "/CDS.list", index=False, header=None)
        for selected in vals:
            alignments.append(import_ali("{0}/singlegene_alignments/{1}.ali".format(root_path, selected[0])))
            if intersection:
                taxa = taxa.intersection(alignments[-1].keys())
            else:
                taxa = taxa.union(alignments[-1].keys())
        taxa = taxa.intersection(tree_taxa)

        # Concatenate the alignments; a taxon missing from one alignment is
        # padded with gaps of that alignment's length.
        merged_parts = {k: [] for k in taxa}
        for alignment in alignments:
            seq_len_set = set([len(s) for s in alignment.values()])
            assert (len(seq_len_set) == 1)
            size = seq_len_set.pop()
            for taxon in taxa:
                merged_parts[taxon].append(alignment[taxon] if taxon in alignment else "-" * size)
        merge_alignment = {k: "".join(parts) for k, parts in merged_parts.items()}
        export_ali(exp_path + "/CDS.ali", merge_alignment)

        trimmed_tree = tree.copy()
        trimmed_tree.prune(taxa, preserve_branch_length=True)
        print("{0} taxa for replicate {1}".format(len(taxa), rep + 1))
        trimmed_tree.write(outfile="{0}/rootedtree.nhx".format(exp_path), format=1)
        # Sanity check: the tree read back must hold exactly the aligned taxa.
        assert (set(merge_alignment.keys()) == set(
            Tree("{0}/rootedtree.nhx".format(exp_path), format=1).get_leaf_names()))

        run_file = exp_path + "/snakeslurm.sh"
        with open(run_file, 'w') as w:
            w.write("#!/usr/bin/env bash\n")
            run_str = 'snakemake '
            if sbatch:
                run_str += '-j 99 --cluster "sbatch -J {0} -p long -N 1 ' \
                           '-o {1}/slurm.%x.%j.out -e {1}/slurm.%x.%j.err '.format(experiment, exp_path)
                # Left unformatted on purpose: these braces are written verbatim.
                run_str += '--cpus-per-task={params.threads} --mem={params.mem} -t {params.time}"\n'
            else:
                run_str += "-j {0} --printshellcmds".format(nbr_cpu)
            w.write(run_str)
        os.chmod(run_file, 0o755)

        cmd = 'cd ' + exp_path + ' && ./snakeslurm.sh'
        screen_cmd = 'screen -dmS ' + "{0}_{1}_{2}".format(prefix, name, rep) + ' bash -c "' + cmd + '"'
        with open(exp_path + "/screen.sh", 'w') as w:
            w.write("#!/usr/bin/env bash\n")
            w.write(screen_cmd)
        if screen:
            print(screen_cmd)
            run(screen_cmd, shell=True)
        else:
            print(cmd)
            run(cmd, shell=True)
# Gain/loss rate table and the species tree it refers to.
gl = pd.read_csv(inputGLF, sep="\t")
t = Tree(spTreeF)
t.sort_descendants(attr='O')
ts = TreeStyle()
ts.complete_branch_lines_when_necessary = False

# calculate branch colors
gainL = []  # list with all rates of gain
lossL = []  # list with all rates of loss
gm, gM = gl.rgain.min(), gl.rgain.max()
lm, lM = gl.rloss.min(), gl.rloss.max()
#bcrg = scaleCol(gl.pgain.tolist()) # Branch Colors for Rates of Gain
#bcrl = scaleCol(gl.ploss.tolist()) # Branch Colors for Rates of Loss

# make a "gain" and a "loss" copy of the tree
tg = t.copy()
tl = t.copy()

# One colour scale per rate type, spanning the observed min..max.
gcm = cm.ScalarMappable(norm=colors.Normalize(vmin=gm, vmax=gM), cmap="coolwarm")
lcm = cm.ScalarMappable(norm=colors.Normalize(vmin=lm, vmax=lM), cmap="coolwarm")

for node in tg.iter_descendants():  # do not include root
    # Branches leaving the root are looked up with parent id 0 in the table.
    parent_id = 0 if node.up.is_root() else int(node.up.ND)
    on_branch = (gl.fromNode == parent_id) & (gl.toNode == int(node.ND))
    rgain = gl.loc[on_branch, 'rgain']
    if rgain.empty:
        continue
    rgain = rgain.item()
    style = NodeStyle()
    gainString = "%.2f" % (rgain)
    #pick colors
    ci = colors.rgb2hex(gcm.to_rgba(rgain)[:3])
def ete_cluster_bysize(t: Tree, cluster_maxsize: int = 50, cluster_minsize: int = 5,
                       collapse_sisters: bool = True, cleanup_merge=True, outgroup: str = None):
    """Group leaves of an ete3 tree into clusters and truncate the tree there.

    Works on a copy of ``t``. The tree is walked from the root (depth-first,
    since nodes are popped from the end of the work list); a node with fewer
    than ``cluster_maxsize`` leaves becomes a cluster if it has more than
    ``cluster_minsize`` leaves, otherwise it is kept aside as an orphan.
    Orphans that are sisters can then be merged under a new zero-length
    branch, and a final cleanup pass clusters whatever subtrees remain.
    Collapsing sisters is mostly useful for visualising trees with many
    polytomies.

    Arguments:
        t: ete3 tree object

    Keyword Arguments:
        cluster_maxsize: maximum size for each cluster (default=50)
        cluster_minsize: minimum size for each cluster (default=5)
        collapse_sisters: whether to merge sister orphans into a single
            branch (default True)
        cleanup_merge: whether to perform the final step that clusters all
            remaining subtrees even if size<cluster_minsize
        outgroup: name of outgroup node (default=None)

    Returns:
        ete tree with added features 'subtrees' and 'cluster_numleaves' on
        collapsed nodes (the detached child nodes and the number of leaves)
        and 'accs' on every leaf (all expanded leaf names under it).
        Collapsed node names: 'm_<numsubtrees>_<numleaves>',
        's_<numsubtrees>_<numleaves>', 'c_<numsubtrees>_<numleaves>'
    """
    t = t.copy()
    if outgroup is not None:
        t.set_outgroup(t & outgroup)

    # Walk down from the root, stopping where no.leaves < cluster_maxsize.
    orphans = []
    tovisit_ = [t]
    print(f'--starting number of leaves: {len(t.get_leaf_names())}--')
    cluster_merges = []
    while tovisit_:
        node = tovisit_.pop()
        numleaves = len(node.get_leaf_names())
        if numleaves >= cluster_maxsize:
            tovisit_.extend(node.children)
        elif numleaves > cluster_minsize:
            node.add_feature('cluster_numleaves', numleaves)
            node.add_feature('subtrees', [nc.detach() for nc in node.get_children()])
            node.name = f'm_{len(node.subtrees)}_{node.cluster_numleaves}'
            cluster_merges.append(numleaves)
        else:
            orphans.append(node)
    print(f'cluster collapse sizes: {cluster_merges}')

    # merge sister orphans
    sister_merges = []
    if collapse_sisters:
        while orphans:
            cur_orphan = orphans.pop()
            pnode = cur_orphan.up
            sisters = cur_orphan.get_sisters()
            sis_orphs = []
            # Walk backwards so popping does not shift the indices still to visit.
            for orphos in range(len(orphans) - 1, -1, -1):
                if orphans[orphos] in sisters:
                    sis_orphs.append(orphans.pop(orphos))
            # NOTE(review): requires at least two sister orphans besides the
            # current one; a single sister orphan is dropped without merging.
            if len(sis_orphs) > 1:
                size_of_merged = len(cur_orphan.get_leaf_names()) + np.sum(
                    [len(x.get_leaf_names()) for x in sis_orphs])
                if size_of_merged > cluster_minsize:
                    # dist=0 is what marks this node as a sister cluster.
                    newnode = pnode.add_child(dist=0)
                    newnode.add_feature('cluster_numleaves', size_of_merged)
                    newnode.add_feature('subtrees', [cur_orphan.detach()])
                    for so in sis_orphs:
                        newnode.subtrees.append(so.detach())
                    newnode.name = f's_{len(newnode.subtrees)}_{newnode.cluster_numleaves}'
                    sister_merges.append(size_of_merged)
    print(f'sisters collapse sizes: {sister_merges}')

    # now cleanup_merge
    if cleanup_merge:
        visited = set()
        while len(visited) < len(t.get_leaves()):
            for n in set(t.get_leaves()).difference(visited):
                visited.add(n)
                nosubtrees = 'subtrees' not in n.features
                addn = None
                # if no subtrees, see how far we can climb
                while nosubtrees:
                    # if we've climbed once, this is a cluster
                    if len(n.get_descendants()) > 0:
                        addn = n
                    if n.up is None:
                        # Reached the root without meeting any cluster (e.g. a
                        # tree too small to be clustered above): stop here
                        # instead of dereferencing a missing parent.
                        break
                    n = n.up
                    # continue only if no descendants have subtree
                    subtree_status = ['subtrees' in x.features for x in n.traverse()]
                    nosubtrees = True not in subtree_status
                # clusterify the node, then add it and all sub-leaves to the visited set
                if addn is not None:
                    addn.add_feature('cluster_numleaves', len(addn.get_leaf_names()))
                    addn.add_feature('subtrees', [nc.detach() for nc in addn.get_children()])
                    addn.name = f'c_{len(addn.subtrees)}_{addn.cluster_numleaves}'
                    visited = visited.union([x for x in addn.get_leaves()])
                    break  # reset leaf candidates with the updated visited set as filter

    # now at end add accs feature (a list of acc under each)
    for lnode in t.get_leaves():
        lnode.add_feature('accs', [])
        if 'subtrees' in lnode.features:
            for st in lnode.subtrees:
                lnode.accs.extend(st.get_leaf_names())
        else:
            lnode.accs.append(lnode.name)

    # final consistency check and readout
    num_leaves = 0
    for lnode in t.get_leaves():
        if 'subtrees' in lnode.features:
            num_leaves += np.sum([len(x.get_leaf_names()) for x in lnode.subtrees])
        else:
            num_leaves += 1
    print(f'--total leaves at end: {num_leaves}--')
    return t
os.waitpid(p.pid, 0) r1 = list(map(float, p1.split())) r2 = list(map(float, p2.split())) nTriplets = r1[1] nUnresolved = r2[-2] nResolved = nTriplets - nUnresolved if nResolved == 0: return np.nan nAgree = r1[4] return 1. - nAgree/nResolved results = [] for fam in families: print(fam) fmGlot = glot.copy() tree = Tree('madRooted/' + fam + '.madRooted.tre') fmGlot.prune(tree.get_leaf_names()) fmResults = [len(tree)] fmResults.append(gtd(Tree('madRooted/' + fam + '.madRooted.tre'), fmGlot)) fmResults.append(gtd(Tree('midpointRooted/' + fam + '.midpointRooted.tre'), fmGlot)) fmResults.append(gtd(Tree('outgroupRooted/' + fam + '.outgroupRooted.tre'), fmGlot)) fmResults.append(gtd(Tree('yuleRooted/' + fam + '.yuleRooted.tre'), fmGlot)) results.append(fmResults) results = pd.DataFrame(results, index=families,
count = 0 for i in range(1, 5): S[i] = dict() for j in range(1, 5): S[i][j] = dict() for k in range(5, 9): sum = 0 for s in P[k].keys(): t = Tree(s, format=9) num = 0 for leaf in t: leaf.name = str(num) num += 1 count = 0 for Q in combinations(range(k), 4): Q = list(map(str, Q)) TQ = t.copy() TQ.prune(Q) for Qp in combinations(range(k), 4): Qp = list(map(str, Qp)) TQp = t.copy() TQp.prune(Qp) if len(set(Q).intersection(set(Qp))) == 8 - k and shape(TQ) == T4[i] and shape(TQp) == T4[j]: count += 1 sum += simplify(q[i]*q[j]*count*P[k][s]) S[i][j][k] = simplify(sum) count += 1 print "round", count, "of", 64 print >>out, i, j, k, S[i][j][k]
class TreeHolder:
    """Wrap an ete3 tree for colouring, innovation counting and rendering."""

    def __init__(self, tree, logger, scale=None, labels_dict=None, node_colors=None):
        """Parse ``tree`` and attach a name face to every leaf.

        Args:
            tree: anything accepted by the ``Tree`` constructor.
            logger: logger used for informational and error messages.
            scale: value later assigned to the tree style's ``scale``.
            labels_dict: optional mapping leaf name -> label to display.
            node_colors: optional mapping leaf name -> label colour; when
                omitted every label is black.

        Raises:
            KeyError: if ``labels_dict`` is given and lacks a leaf name.
        """
        # A fresh mapping per instance: a defaultdict used as the default
        # argument would be shared, and filled, across all instances.
        if node_colors is None:
            node_colors = defaultdict(lambda: 'black')

        self.tree = Tree(tree)
        self.scale = scale

        # Only the first traversed node (the root) is inspected: a root with
        # three children is re-rooted on its first child.
        for node in self.tree.traverse():
            if len(node.children) == 3:
                logger.info("Trying to root tree by first child of root")
                logger.info(f'Children of root: {node.children}')
                self.tree.set_outgroup(node.children[0])
            break

        for node in self.tree.traverse():
            # Hide node circles
            node.img_style['size'] = 0
            if node.is_leaf():
                try:
                    name_face = TextFace(
                        labels_dict[node.name] if labels_dict else node.name,
                        fgcolor=node_colors[node.name])
                except KeyError:
                    msg = f'There is not label for leaf {node.name} in labels file'
                    logger.error(msg)
                    raise KeyError(msg)
                node.add_face(name_face, column=0)

    def draw_neighbours(self, neighbours, block,
                        colors=('Crimson', 'Teal', 'DarkGreen', 'Purple', 'DarkKhaki', 'MediumVioletRed',
                                'DarkOrange', 'Navy', 'RosyBrown', 'DarkGoldenrod', 'Sienna', 'Indigo',
                                'DarkRed', 'Olive', 'SlateGray', 'SeaGreen', 'IndianRed', 'BurlyWood')):
        """Add an aligned neighbourhood face next to every leaf.

        Colours are assigned cyclically to the sorted neighbour identifiers
        (taken without their last character); ``block`` itself is drawn grey.
        """
        posible_ns = sorted(
            set(n[:-1] for nss in neighbours.values() for ns in nss for n in ns[:2]))
        ns_colors = {ns: colors[i % len(colors)] for i, ns in enumerate(posible_ns)}
        ns_colors[str(block)] = 'grey'

        all_genomes = [node.name for node in self.tree.traverse()]
        aligned_neighbours = align_neighbours(neighbours, all_genomes)
        offsets = get_offsets(aligned_neighbours)

        for node in self.tree.traverse():
            if not node.is_leaf():
                continue
            face = generate_neighbour_face(aligned_neighbours[node.name], ns_colors, block, offsets)
            node.add_face(face, 1, "aligned")

    def draw(self, file, colors, color_internal_nodes=True, legend_labels=(), show_branch_support=True,
             show_scale=True, legend_scale=1, mode="c", neighbours=None, neighbours_block=None):
        """Render the coloured tree to ``file``.

        Nodes must already carry a ``color`` attribute (set by
        ``count_innovations_fitch``). Colour indices beyond ``colors`` are
        clamped to the last entry. When ``neighbours`` is given, neighbour
        faces are drawn for this rendering only and the tree is restored
        afterwards, even if rendering fails.
        """
        max_color = len(colors)
        used_colors = set()
        for node in self.tree.traverse():
            if not (color_internal_nodes or node.is_leaf()):
                continue
            color = colors[min(node.color, max_color - 1)]
            node.img_style['bgcolor'] = color
            used_colors.add(color)

        ts = TreeStyle()
        ts.mode = mode
        ts.scale = self.scale
        # Disable the default tip names config
        ts.show_leaf_name = False
        ts.show_branch_support = show_branch_support
        # ts.branch_vertical_margin = 20
        ts.show_scale = show_scale

        cur_max_color = max(v.color for v in self.tree.traverse())
        current_colors = colors[0:cur_max_color + 1]

        # Legend: one swatch plus label for each colour actually used.
        for label, color_ in zip(legend_labels, current_colors):
            if color_ not in used_colors:
                continue
            rf = RectFace(20 * legend_scale, 16 * legend_scale, color_, color_)
            rf.inner_border.width = 1
            rf.margin_right = 14
            rf.margin_left = 14

            tf = TextFace(label, fsize=26 * legend_scale)
            tf.margin_right = 14

            ts.legend.add_face(rf, column=0)
            ts.legend.add_face(tf, column=1)

        if not neighbours:
            self.tree.render(file, w=1000, tree_style=ts)
            return

        # Neighbour faces are temporary: keep a pristine copy and always put
        # it back, so a failed render cannot leave the extra faces attached.
        old_tree = self.tree.copy()
        try:
            self.draw_neighbours(neighbours, neighbours_block)
            self.tree.render(file, w=1000, tree_style=ts)
        finally:
            self.tree = old_tree

    def get_all_leafs(self):
        """Return the set of leaf names."""
        return {node.name for node in self.tree.get_leaves()}

    def count_innovations_fitch(self, leaf_colors, count_second_color=True):
        """Colour internal nodes Fitch-style and record colour changes.

        Leaves take their colour from ``leaf_colors``. Going up, a node's
        colour set is the intersection of its two children's sets when that
        is non-empty, otherwise their union. Going down, a node keeps its
        parent's colour when allowed, else the allowed colour most frequent
        among the leaves. ``self.innovations`` then maps colour -> child
        nodes whose colour differs from their parent's.

        Raises:
            ValueError: if an internal node does not have exactly two children.
        """
        def assign_colorset_feature(v):
            if v.is_leaf():
                v.add_features(colorset={leaf_colors[v.name]}, color=leaf_colors[v.name])
            else:
                try:
                    child1, child2 = v.children
                except ValueError:
                    print(v.children)
                    raise ValueError('Tree must me binary')
                cs1 = assign_colorset_feature(child1)
                cs2 = assign_colorset_feature(child2)
                v.add_features(colorset=(cs1 & cs2) if len(cs1 & cs2) > 0 else cs1 | cs2)

            return v.colorset

        def chose_color(colorset):
            return sorted(colorset, key=lambda c: color_counter[c], reverse=True)[0]

        def down_to_leaves(v, color):
            if v.is_leaf():
                return
            v.add_features(color=color if color in v.colorset else chose_color(v.colorset))
            for child in v.children:
                down_to_leaves(child, v.color)

        def count_innovations(v, innovations):
            for child in v.children:
                # NOTE(review): `and` binds tighter than `or`, so a child of
                # colour 2 is skipped regardless of count_second_color;
                # confirm this is intended before changing it.
                if v.color != child.color and not (
                        not count_second_color and (v.color == 2) or (child.color == 2)):
                    innovations[child.color].append(child)
                count_innovations(child, innovations)

        color_counter = Counter(leaf_colors.values())

        # get colorsets for internal nodes
        root = self.tree.get_tree_root()
        assign_colorset_feature(root)

        # get color for internal nodes
        root_color = chose_color(root.colorset)
        down_to_leaves(root, root_color)

        # get inconsistent colors
        self.innovations = defaultdict(list)
        count_innovations(root, self.innovations)

    def count_parallel_rearrangements(self, skip_grey):
        """Score colours that arose more than once.

        Returns:
            (score, count, count_all): summed pairwise distance between
            same-colour innovation nodes, number of colours with more than one
            innovation, and total innovations over those colours. Colour 1 is
            ignored when ``skip_grey`` is true.
        """
        score, count, count_all = 0, 0, 0
        for color, nodes in self.innovations.items():
            if len(nodes) <= 1 or (skip_grey and color == 1):
                continue
            count += 1
            count_all += len(nodes)
            for n1, n2 in combinations(nodes, 2):
                score += n1.get_distance(n2)
        return score, count, count_all

    def count_parallel_breakpoints(self):
        """Return (summed pairwise distance over all innovation nodes, their count)."""
        count = sum(map(len, self.innovations.values()))
        score = sum(n1.get_distance(n2) for n1, n2 in
                    combinations((n for ns in self.innovations.values() for n in ns), 2))
        return score, count

    def draw_coloring(self, file):
        """Render the tree with each node's background set from ``self.colors``.

        NOTE(review): ``self.colors`` is not assigned anywhere in this class;
        the caller must set it beforehand.
        """
        for node in self.tree.traverse():
            node.img_style['bgcolor'] = self.colors[node.color]
        ts = TreeStyle()
        ts.show_leaf_name = False
        self.tree.render(file, w=1000, tree_style=ts)

    def prune(self, ls):
        """Prune the tree in place down to the leaves listed in ``ls``."""
        self.tree.prune(list(ls))
spID += 1 node.sp = spID for leaf in node: try: leaf.sp = spID except AttributeError: leaf.add_features(sp=1) if (ms_input == True) and (ms_islands[-1] > largest_id): sys.exit("2. The MS island structure you provided through -I does not fit the number of tips in the demography. The option might have been mispecified. Check it out!") sys.stdout.write('S') # SpreadSpeciation # if requested, print speciational tree if plot_trees: tmut = t.copy() for leaf in tmut: leaf.name = "["+str(leaf.sp)+"]"+leaf.name tmut.render(ophylo+"_2MUT.png", w=183, units="mm") #======================================================# # CONVERT demography to phylogeny using a traversing method # __/!\__ to be modified for non dichotomic trees (eg Lambda coalescent) traversedNodes = set() for node in t.traverse("preorder"): if node not in traversedNodes:
def generax2mcmctree(xml_file,
                     stree,
                     gene,
                     dating_o,
                     calbration_file,
                     genome2cog25=None):
    """
    Generate the input files for mcmctree from a GeneRax reconciliation.

    Files written below ``dating_o`` (the sub-directories must already exist):
        1. ``id_list/<phylum>_<gene>.list``: list of used genomes
        2. ``id_list/<phylum>_<gene>.txt``: used genomes (itol annotation)
        3. ``species_trees/<phylum>_<gene>.newick``: constructed species tree
           topology for dating
        4. ``target_nodes/<phylum>_<gene>.txt``: target nodes (itol annotation)
        5. ``calibrations/<phylum>_<gene>_set14.txt``: calibration file

    :param xml_file: path of the reconciliation XML, e.g.
        ``join(r_odir, f'reconciliations/{gene}_reconciliated.xml')``; the
        phylum name is taken from its fourth-from-last path component
    :param stree: path of the species tree (newick), e.g.
        ``f"./trees/iqtree/{phylum_name}.reroot.newick"``; the matching
        ``.clusterd.list`` file is looked up next to it
    :param gene: gene name, used in the output file names
    :param dating_o: output directory
    :param calbration_file: calibration template in which the placeholder
        genome ``GCA_002239005.1`` is replaced
    :param genome2cog25: optional mapping forwarded to ``sampling``; a fresh
        empty dict is used when omitted
    :return: None
    """
    # A literal {} default would be one dict shared by every call.
    if genome2cog25 is None:
        genome2cog25 = {}

    phylum_name = xml_file.split('/')[-4]
    st = Tree(stree, format=3)
    tmp_name = phylum_name + '_' + gene
    _p2node, _p2node_transfer_receptor = get_p2node(xml_file,
                                                    stree,
                                                    key=tmp_name)
    # Only the first entry of each mapping is used.
    target_nodes = (list(_p2node.values())[0] +
                    list(_p2node_transfer_receptor.values())[0])

    with open("/mnt/home-backup/thliao/cyano_basal/rawdata/assembly_ids.list"
              ) as f1:
        must_in_genomes = f1.read().strip('\n').split('\n')
    # new calibrations are /mnt/home-backup/thliao/cyano/ref_genomes_list.txt

    cluster2genomes = get_cluster(
        stree.replace('.reroot.newick', '.clusterd.list'))
    g2cluster = {v: c for c, d in cluster2genomes.items() for v in d}
    retained_ids = sampling(st,
                            target_nodes,
                            must_in=must_in_genomes,
                            node2cluster=g2cluster,
                            genome2cog25=genome2cog25)

    text = to_color_range({g: 'keep' for g in retained_ids},
                          {"keep": "#88b719"})
    with open(join(dating_o, f'id_list/{phylum_name}_{gene}.txt'), 'w') as f1:
        f1.write(text)
    with open(join(dating_o, f'id_list/{phylum_name}_{gene}.list'), 'w') as f1:
        f1.write('\n'.join(retained_ids))

    print(phylum_name, len(st.get_leaf_names()), len(retained_ids))
    # From here on `st` is the pruned tree.
    st.prune(retained_ids)
    with open(join(dating_o, f'species_trees/{phylum_name}_{gene}.newick'),
              'w') as f1:
        f1.write(st.write(format=9))

    # draw target nodes
    # Each target node is written as "<leaf under child 0>|<leaf under child 1>".
    LCA_nodes = []
    for name in target_nodes:
        node = [_ for _ in st.traverse() if _.name == name][0]
        l1 = node.children[0].get_leaf_names()[0]
        l2 = node.children[1].get_leaf_names()[0]
        LCA_nodes.append(f"{l1}|{l2}")
    text = pie_chart({n: {'speciation': 1} for n in LCA_nodes},
                     {"speciation": "#ff0000"},
                     dataset_label='GeneRax results')
    with open(join(dating_o, f'target_nodes/{phylum_name}_{gene}.txt'),
              'w') as f1:
        f1.write(text)

    # new set file set14
    # set14_f = './dating/calibration_sets/scheme1/cal_set14.txt'
    # Take the first child of the root that does not contain genome `c` and
    # substitute its first leaf for the placeholder genome in the template.
    c = 'GCA_000011385.1'
    n = [_ for _ in st.children if c not in _.get_leaf_names()][0]
    with open(calbration_file) as f1:
        final_text = f1.read().replace('GCA_002239005.1',
                                       n.get_leaf_names()[0])
    with open(join(dating_o, f'calibrations/{phylum_name}_{gene}_set14.txt'),
              'w') as f1:
        f1.write(final_text)
class ASRTree:
    """Ancestral state reconstruction of two binary characters on a tree.

    The two characters are stored on every tree node as the features
    ``anadromy`` and ``aqp3``.  Typical use: ``import_lookup`` ->
    ``build_tree`` -> ``run_max_parsimony`` -> ``monte_carlo_sim`` ->
    ``get_p_value`` / ``plot_histogram``.

    All working state is per instance (it is created in ``__init__``).
    """

    #Positions of each field in the lists stored in the look-up dictionary
    SCIENTIFIC_INDEX = 0
    COMMON_INDEX = 1
    ANAD_INDEX = 2
    AQP3_INDEX = 3
    #Number being added to anadromy/aqp3 variables to avoid division by 0 in effect size
    EPSILON = 0.00000000000000000001

    #Public Methods

    def __init__(self):
        """Create an empty ASRTree with zeroed 2x2 transition matrices."""
        self.__tree = None  #Actual tree
        self.__sim_tree = None  #Simulation tree
        #Indexed as [ancestor state][descendant state]
        self.__transition_prob_anad = [[0.0 for x in range(2)]
                                       for y in range(2)]
        self.__transition_prob_aqp3 = [[0.0 for x in range(2)]
                                       for y in range(2)]
        self.__sim_effect_sizes = []  #List containing simulation effect sizes
        self.__p_value_count = 0  #Number of times an effect size is simulated => actual
        self.__effect_size = 0  #Actual effect size of model
        self.__num_of_branches = 0
        self.__num_anad = 0
        self.__num_aqp3 = 0
        self.__num_anad_and_aqp3 = 0
        self.__num_taxa = 0
        self.__p_value = 0
        #Dictionary matching FASTA file names (key) to a list of taxa names
        #and character states
        self.__anadromy_lookup = {}

    def build_tree(self, path):
        """Build the phylogenetic tree from a newick tree file (RAxML result).

        Any previously created simulation tree is discarded so that later
        simulations use the newly imported tree.

        Raises:
            OSError: if the file cannot be read (a message is printed first).
        """
        try:
            with open(path, "r") as rax_file:
                contents = rax_file.read()
        except OSError:
            print(
                "\nRAxML tree failed to import successfully. Please check the file path and try again."
            )
            raise
        self.__tree = Tree(contents)
        self.__sim_tree = None
        print("\nRAxML tree imported successully.")

    def run_max_parsimony(self):
        """Run Fitch's algorithm of maximum parsimony on the imported tree.

        Prints an error and does nothing when no tree has been imported.
        Otherwise assigns a state to every node, counts the states, fills
        the transition matrices and stores the actual effect size.
        """
        if self.__tree is None:
            print(
                "\n****************Error****************\nTree has not been imported. Please run build_tree method first."
            )
            return
        #Transform tree to bifurcating - does nothing if already bifurcating
        self.__tree.resolve_polytomy()
        self.__down_pass()
        self.__up_pass()
        self.__clean_tree()
        self.__find_char_states()
        self.__find_transition_prob()
        self.__effect_size = self.calc_effect_size(
            self.__num_anad + self.EPSILON,
            self.__num_aqp3 + self.EPSILON,
            self.__num_anad_and_aqp3 + self.EPSILON)

    def get_num_taxa(self):
        """Return the number of taxa read by ``import_lookup``."""
        return self.__num_taxa

    def get_p_value(self):
        """Return the P-Value computed by the last ``monte_carlo_sim``."""
        return self.__p_value

    def import_lookup(self, path):
        """Import the look-up spreadsheet of taxa names and character states.

        The first sheet is read, skipping its first row.  Column 0 is the
        key (file name); columns 1 and 2 are kept as read (scientific and
        common name); column 3 and any later column are converted to int
        (anadromy and AQP3 states).
        """
        sheet = xlrd.open_workbook(path).sheet_by_index(0)
        for row in range(1, sheet.nrows):
            values = []
            for col in range(sheet.ncols):
                cell = sheet.cell_value(row, col)
                values.append(cell if col < 3 else int(cell))
            self.__anadromy_lookup[values[0]] = values[1:]
        self.__num_taxa = len(self.__anadromy_lookup)

    def show_tree(self):
        """Print the tree to the console and open the interactive viewer."""
        print(
            self.__tree.get_ascii(attributes=["name", "anadromy", "aqp3"],
                                  show_internal=True))
        self.__tree.show()

    def to_string(self):
        """Return the taxa list and the character state counts as text.

        Returns an error text instead when no tree was built or the effect
        size is still 0 (maximum parsimony not run).
        """
        if self.__tree is None or self.__effect_size == 0:
            return ("\n****************Error****************\n"
                    "Tree not constructed, or maximum parsimony not yet run. "
                    "Please run methods and try again.")
        parts = ["\n\t\tTaxa\n"]
        for count, names in enumerate(self.__anadromy_lookup.values(), 1):
            parts.append(
                str(count) + ": " + names[self.SCIENTIFIC_INDEX] + " (" +
                names[self.COMMON_INDEX] + ")\n")
        parts.append("\nAnadromy Character State Changes: " +
                     str(self.__num_anad))
        parts.append("\nAQP3 Character State Changes: " + str(self.__num_aqp3))
        return "".join(parts)

    def calc_effect_size(self, numOfAnad, numOfAqp3, numAnadAndAqp3):
        """Return the effect size for the given counts.

        Computed as (both / branches) / ((anad / branches) * (aqp3 /
        branches)), where branches is the branch count of the actual tree.
        """
        branches = self.__num_of_branches
        return ((numAnadAndAqp3 / branches) / ((numOfAnad / branches) *
                                               (numOfAqp3 / branches)))

    def monte_carlo_sim(self, num_sims):
        """Run ``num_sims`` Monte Carlo simulations and store the p-value.

        Each simulation walks a copy of the tree from the root down and
        draws every non-root node's two states from the transition matrices
        given its ancestor's states.  The p-value is the fraction of
        simulations whose effect size is >= the actual effect size.
        """
        self.__p_value_count = 0  #Initialize back to 0
        self.__sim_effect_sizes.clear()  #Initialize back to empty
        #Checks if there already is a simulation tree to avoid unncessary copies
        if self.__sim_tree is None:
            self.__sim_tree = self.__tree.copy()
        for _ in range(num_sims):
            #Set values of each count back to the EPSILON value to avoid
            #division by 0 in the effect size
            aqp3_count = self.EPSILON
            anad_count = self.EPSILON
            anad_aqp3_count = self.EPSILON
            for node in self.__sim_tree.traverse("preorder"):
                #Drawn for every node, the root included
                #NOTE(review): randint(0, 1001) is inclusive on both ends but
                #is compared with probability * 1000 - confirm intended range
                rand_num_1 = random.randint(0, 1001)
                rand_num_2 = random.randint(0, 1001)
                if node.is_root():
                    continue
                anad_state = self.__roll_state(node.up.anadromy,
                                               self.__transition_prob_anad,
                                               rand_num_1)
                node.add_feature("anadromy", anad_state)
                if anad_state == 1:
                    anad_count += 1
                aqp3_state = self.__roll_state(node.up.aqp3,
                                               self.__transition_prob_aqp3,
                                               rand_num_2)
                node.add_feature("aqp3", aqp3_state)
                if aqp3_state == 1:
                    aqp3_count += 1
                #NOTE(review): assumed to count non-root nodes only, like the
                #two counters above - confirm against the original layout
                if node.anadromy == 1 and node.aqp3 == 1:
                    anad_aqp3_count += 1
            #Calculate the effect size and store the results.
            eff_size = self.calc_effect_size(anad_count, aqp3_count,
                                             anad_aqp3_count)
            self.__sim_effect_sizes.append(eff_size)
            if eff_size >= self.__effect_size:
                self.__p_value_count += 1
        #Calculate and store p-value
        self.__p_value = self.__p_value_count / num_sims

    def plot_histogram(self):
        """Plot the simulated effect sizes with the actual one marked."""
        plt.style.use('seaborn')
        _ = plt.hist(self.__sim_effect_sizes, bins=100)
        plt.axvline(self.__effect_size,
                    color='k',
                    linestyle='dashed',
                    linewidth=1)
        plt.text(self.__effect_size + .05, 200,
                 ' Actual Effect Size:{:.3f}'.format(self.__effect_size))
        plt.xlabel('Effect Size')
        plt.ylabel('Effect Frequency')
        plt.title('Monte Carlo Simulation Distribution')
        plt.show()

    #Private Methods

    @staticmethod
    def __roll_state(parent_state, prob, roll):
        """Return the simulated state (0 or 1) for one character."""
        if parent_state == 1:
            return 0 if (prob[1][0] * 1000) > roll else 1
        return 0 if (prob[0][1] * 1000) < roll else 1

    @staticmethod
    def __tally_transition(counts, parent_state, state):
        """Add one ancestor -> descendant transition to a 2x2 count table.

        Any pair that is not 0->0, 0->1 or 1->0 is counted as 1->1.
        """
        if parent_state == 0 and state == 0:
            counts[0][0] += 1
        elif parent_state == 0 and state == 1:
            counts[0][1] += 1
        elif parent_state == 1 and state == 0:
            counts[1][0] += 1
        else:
            counts[1][1] += 1

    def __find_transition_prob(self):
        """Fill both transition matrices from the reconstructed tree.

        Every entry is the count of that ancestor -> descendant transition
        divided by the total number of branches.
        NOTE(review): rows are therefore not normalised per ancestor state -
        confirm this is the intended definition of the probabilities.
        """
        anad_counts = [[0.0, 0.0], [0.0, 0.0]]
        aqp3_counts = [[0.0, 0.0], [0.0, 0.0]]
        for node in self.__tree.traverse("postorder"):
            if node.is_root():
                continue
            self.__tally_transition(anad_counts, node.up.anadromy,
                                    node.anadromy)
            self.__tally_transition(aqp3_counts, node.up.aqp3, node.aqp3)
        #Insert the probability into the appropriate matrix
        for src in range(2):
            for dst in range(2):
                self.__transition_prob_anad[src][dst] = (
                    anad_counts[src][dst] / self.__num_of_branches)
                self.__transition_prob_aqp3[src][dst] = (
                    aqp3_counts[src][dst] / self.__num_of_branches)

    def __down_pass(self):
        """Down-pass: assign state sets to tips and internal nodes.

        Tips take their states from the look-up and are renamed to their
        common name.  Internal nodes are tagged "Ancestor" the first time a
        child reaches them and accumulate intersections/unions of their
        children's state sets.
        """
        for node in self.__tree.traverse("postorder"):
            #Check for internal nodes that have been visted - marked as "Ancestor"
            if node.name == "Ancestor":
                if node.is_root():
                    continue
                parent = node.up
                #If the parent node of the current ancestor node is unvisited,
                #attach the character state of this node to its ancestor
                if parent.name == "":
                    parent.add_feature("anadromy", node.anadromy)
                    parent.add_feature("aqp3", node.aqp3)
                    parent.name = "Ancestor"
                for trait in ("aqp3", "anadromy"):
                    child_states = getattr(node, trait)
                    parent_states = getattr(parent, trait)
                    #If the node has an intersection with its ancestor, set it
                    if (child_states.issubset(parent_states)
                            or child_states.issuperset(parent_states)):
                        merged = parent_states.intersection(child_states)
                    else:  #Otherwise, it's a union of two states
                        merged = parent_states.union(child_states)
                    parent.add_feature(trait, merged)
            #Otherwise, it could be an unnamed internal node, or a terminal node
            #If it's a terminal node, grab its states from the lookup
            elif node.name in self.__anadromy_lookup:
                record = self.__anadromy_lookup[node.name]
                is_anadromous = {record[self.ANAD_INDEX]}
                is_aqp3 = {record[self.AQP3_INDEX]}
                node.add_feature("anadromy", is_anadromous)
                node.add_feature("aqp3", is_aqp3)
                parent = node.up
                #If the internal node is not yet named, it is unvisited
                if parent.name == "":
                    parent.add_feature("anadromy", is_anadromous)
                    parent.add_feature("aqp3", is_aqp3)
                    #Tag internal nodes as Ancestor to easily identify visited nodes
                    parent.name = "Ancestor"
                for trait, index in (("aqp3", self.AQP3_INDEX),
                                     ("anadromy", self.ANAD_INDEX)):
                    child_states = getattr(node, trait)
                    parent_states = getattr(parent, trait)
                    if record[index] in parent_states:
                        merged = child_states.intersection(parent_states)
                    else:
                        merged = parent_states.union(child_states)
                    parent.add_feature(trait, merged)
                node.name = record[self.COMMON_INDEX]

    def __up_pass(self):
        """Up-pass: clear unions in ancestor nodes.

        Every non-root "Ancestor" node holding more than one state is
        replaced by the intersection with its parent node's states.
        """
        for node in self.__tree.traverse("preorder"):
            if node.name != "Ancestor" or node.is_root():
                continue
            if len(node.anadromy) > 1:
                node.add_feature("anadromy",
                                 node.anadromy.intersection(node.up.anadromy))
            if len(node.aqp3) > 1:
                node.add_feature("aqp3", node.aqp3.intersection(node.up.aqp3))

    def __clean_tree(self):
        """Turn each node's state sets into one element taken from the set."""
        for node in self.__tree.traverse("preorder"):
            character_state_anad = next(iter(node.anadromy))
            character_state_aqp3 = next(iter(node.aqp3))
            node.add_feature("anadromy", character_state_anad)
            node.add_feature("aqp3", character_state_aqp3)

    def __find_char_states(self):
        """Count the branches and the nodes in state 1.

        Counts nodes with anadromy == 1, with aqp3 == 1 and with both.  The
        counters are reset first so repeated runs do not accumulate.
        """
        self.__num_of_branches = 0
        self.__num_anad = 0
        self.__num_aqp3 = 0
        self.__num_anad_and_aqp3 = 0
        for node in self.__tree.traverse("preorder"):
            self.__num_of_branches += 1
            if node.anadromy == 1 and node.aqp3 == 1:
                self.__num_anad_and_aqp3 += 1
            if node.anadromy == 1:
                self.__num_anad += 1
            if node.aqp3 == 1:
                self.__num_aqp3 += 1
        self.__num_of_branches -= 1  #Not counting the root as a separate branch
all_dists = [] for othersp_seq in seqids_of_other_species: dist = (t & seqid).get_distance(othersp_seq, topology_only=True) all_dists.append(dist) # find indexes of the three shortest distances try: idxes_of_3_smallest = np.argpartition(np.array(all_dists), 3)[:3] except ValueError: idxes_of_3_smallest = np.argpartition( np.array(all_dists), 2) # for the case that list is only 3 items long closest_seq_ids = [seqid] for d in idxes_of_3_smallest: closest_seq_ids.append(seqids_of_other_species[d]) # ete3 has codeml handling implemented!! No need for own functions. subtree = t.copy() subtree.prune(closest_seq_ids, preserve_branch_length=True) subtree.unroot() evotree = EvolTree(subtree.write()) subfasta = make_clean_fasta(closest_seq_ids, seqdatadict) if not subfasta: omega_list.append("NA") continue else: evotree.link_to_alignment(subfasta) workdirname = './codeml_' + "__".join(closest_seq_ids) evotree.workdir = workdirname list_of_tempdirs.append(workdirname) # mark the foreground branch foreground_leafnode = evotree & seqid # print (seqid)
# GET 4 RANDOM INDICES TO PRUNE indices = sample(range(0, len(leaves)), 4) print "\nRANDOM 4 INDICES: " + ', '.join(str(x) for x in indices) # USE THOSE INDICES TO GET 4 RANDOM NODES to_prune = [] for index in indices: to_prune.append(leaves[index]) print "\nTO PRUNE " print to_prune print "\n" # COPY THE TREE TO NOT LOSE DATA c = t.copy(); # PRUNE THE TREE c.prune(to_prune) print c # END RESULT # Old tree still stored in "t" # Pruned tree stored in "c" # Reads in a file with a tree structure and returns a ete tree object
true_tree = Tree(mstree.newick(node_labels=labels)) RAX_MIN_BL = 1e-6 #### convert branch lengths to # expected substitutions for node in true_tree.traverse("postorder"): node.dist = node.dist * mutation_rate # clip the min branch length to make it workable with raxml-ng node.dist = max(RAX_MIN_BL, node.dist) #### print the true_tree as newick with open(os.path.join(out_dir, "true_tree.newick"), "w+") as f: f.write(true_tree.write(format=5, dist_formatter="%.12f")) # prep the copy to work on ref_tree = true_tree.copy() #### randomly select one individual per population for the reference set, # add the rest for the query set ref_map = defaultdict(list) qry_map = defaultdict(list) for k, v in pop_species_map.items(): ref = rd.randint(len(v), size=1)[0] for i in range(len(v)): if i == ref: ref_map[k].append(v[i]) else: qry_map[k].append(v[i]) # already prune out the query taxa ref_list = [v[0] for k, v in ref_map.items()]
def main(S,G,number_of_leaves,path,k,running_time,number_of_planted_vertices):
    # Builds a species tree S and a gene tree G, writes and reloads them,
    # then plants up to `number_of_planted_vertices` vertices and, when every
    # one of them was marked, launches the noise-tree generation in a pool.
    #
    # Parameters (as used below):
    #   S, G                       - ignored: both are rebuilt here before use
    #   number_of_leaves           - number of leaves S is populated with
    #   path                       - directory used for all files read/written
    #   k                          - forwarded to weight_G_based_on_same_color_HT
    #                                and choose_planted_vertex
    #   running_time               - when truthy the planting phase is skipped
    #                                and only the initial data is saved
    #   number_of_planted_vertices - number of planting iterations
    #
    # Relies on module-level names that are not declared global here as well
    # (sym, on_lab, compare, noise_level).
    # NOTE(review): the source of this function had lost its indentation; the
    # nesting below was inferred from the control flow - confirm.
    global random_for_precentage,all_edges,TH_edges_in_subtree,compare_subtrees,TH_pattern_in_subtree,TH_compare_subtrees,both,TH_both,accur
    starting_time = datetime.now()
    new_G = nx.DiGraph()
    noise = 0
    number_of_HT_under_planted = 10
    S = Tree()
    sigma = {}
    nCr_lookup_table = {}
    fact_lookup_table = {}
    colors = {}
    S_dis_matrix = {}
    names = []
    S_colors = {}
    G_internal_colors = {}
    sol = {}

    # Leaf names are the global prefix `sym` followed by a running index.
    for i in range(0, number_of_leaves):
        names.append(sym + str(i))
    S.populate(number_of_leaves, names_library=names)
    count_nodes_and_update_internal_names(S)
    #S = random_again(S, number_of_leaves / 4)
    colors = random_colors(S, colors)

    # G starts as a copy of S with its leaves renamed to gene names.
    G = S.copy("newick")
    for leaf in G.iter_leaves():
        if leaf.name[:6] == 'Specie':
            leaf.name = "Gene" + leaf.name[6:]
        else:
            leaf.name = "GeneI" + leaf.name[8:]

    print_tree(G,'G',path)
    print_tree(S,'S',path)
    sigma = create_sigme(number_of_leaves, sigma)
    utils.newick2edgelist.main(path)
    save_edgelist(S_dis_matrix,path)

    # Reload both trees from files under `path` through `tr.Tree`.
    S = tr.Tree.get_from_path(path + "/phyliptree(binary,all).phy", schema="newick")
    G = tr.Tree.get_from_path(path + "/GeneTree(binary)_local.txt", schema="newick")
    S = utiles.init_internal_labels(S, 'x', sigma, path)
    G = utiles.init_internal_labels(G, 'u', sigma, path)
    G = tree_operations.collapse_edges(G)
    S = tree_operations.collapse_edges(S)
    S_labels_table, G_labels_table,sigma = inits.init_taxon_to_label_table(S, G, sigma)
    sigma, old_sigma = inits.update_sigma(S, G, 0, sigma, False, path, True, S_labels_table, G_labels_table)
    colors, old_colors = inits.update_colors(S, colors, True)
    max_dis = tree_operations.max_dis(S_dis_matrix)

    flag = True  # becomes False as soon as one planting attempt is not marked
    j = 0
    all_random_sources_red_to_red = []
    all_random_sources_black_to_black = []
    all_random_nutral = []
    all_random_sources = (all_random_sources_red_to_red, all_random_sources_black_to_black, all_random_nutral)

    new_G = tree_operations.weight_G_based_on_same_color_HT(G, new_G, [], [],[],[], 0, False, 'HT', False, k)
    new_G = tree_operations.number_of_edges_in_subtree(new_G)
    S_colors = tree_operations.color_tree(S, 'S', S_colors, colors, sigma)
    G_internal_colors = tree_operations.color_tree(G, 'G', G_internal_colors, colors, sigma)
    if not on_lab:
        draw.draw_S_and_G(S, G, old_sigma, colors, sigma, path, None, '_rand_before')

    if not running_time:
        while j < number_of_planted_vertices:
            print( ' ***** %sth vertex ******' % str( j))
            sol[j] = {}
            nCr_lookup_table, fact_lookup_table, ( sol[j]['Marked'], sol[j]['list_of_couples']), colors = choose_planted_vertex(S_dis_matrix,new_G, S, G, G_internal_colors, TH_edges_in_subtree, compare_subtrees, TH_compare_subtrees, sigma, k, both, TH_both, j, sol, accur, nCr_lookup_table, fact_lookup_table, all_random_sources, colors, S_colors, max_dis)
            if sol[j]['Marked'] == False:
                flag = flag and sol[j]['Marked']
            else:
                # A marked vertex changes sigma, so both trees are re-coloured.
                sigma, old_sigma, y = change_sigma(sigma, old_sigma, S, G, sol[j]['list_of_couples'], number_of_HT_under_planted,S_labels_table,G_labels_table)
                S_colors = tree_operations.color_tree(S, 'S', S_colors, colors, sigma)
                G_internal_colors = tree_operations.color_tree(G, 'G', G_internal_colors, colors, sigma)
            # NOTE(review): assumed to advance after every attempt, marked or
            # not - confirm.
            j += 1

        if not flag:
            # At least one attempt failed: save what there is and stop.
            if not on_lab:
                draw.draw_S_and_G(S, G, old_sigma, colors, sigma, path, None, '_rand')
            old_colors = return_color_to_taxon(S, colors)
            save_data(old_sigma, old_colors, sol, noise, 0, compare,path)
            if not running_time:
                quit()

        print('Planted vertices:%s' % str(sol))
        if not on_lab:
            draw.draw_S_and_G(S, G, old_sigma, colors, sigma, path, sol, '_rand' + str(noise) + '.' + str(0))
        old_colors = return_color_to_taxon(S, colors)
        save_data(old_sigma, old_colors, sol, noise, 0,compare,path)
        return_planted_nodes_new_name(sol,G,path)

        # One parameter tuple per noise level, mapped over three generators.
        # NOTE(review): the pool is never closed or joined here.
        p = Pool(15)
        parameters = [(noise_level[i],number_of_HT_under_planted,G_internal_colors,S_colors,nCr_lookup_table,fact_lookup_table,number_of_leaves) for i in range(0,len(noise_level))]
        p.map(create_tree_for_HT_and_colors_noise, parameters)
        p.map(create_tree_for_color_noise, parameters)
        p.map(create_tree_for_HT_noise, parameters)
    else:
        save_data(old_sigma, old_colors, sol, noise, 0, compare, path)
    print('Running time: %s' % str(datetime.now() - starting_time))
def _leaf_names(tree):
    """Return the names of all leaves of *tree* in iteration order."""
    return [leaf.name for leaf in tree]


def _upgma_merge_order(content, pair_score, upgma):
    """Return the indices of the trees in *content* in guide-tree order.

    A lower-triangular matrix of ``pair_score(leaves_x, leaves_y)`` is handed
    to *upgma*; the order is the postorder leaf sequence of the tree it
    returns, whose leaf names are the tree indices.
    """
    leaf_lists = [_leaf_names(Tree(newick)) for newick in content]
    distance_mat = [[pair_score(leaf_lists[x], leaf_lists[y])
                     for y in range(x)]
                    for x in range(len(content))]
    M_labels = number_labels(0, len(content))
    guide_newick, _order = upgma(distance_mat, M_labels)
    guide_tree = Tree(guide_newick + ';')
    return [int(node.name)
            for node in guide_tree.traverse("postorder")
            if node.is_leaf()]


def _merge_trees(content, order, compare_overlap=False):
    """Fold the trees of *content* into one tree with ``scm``, following *order*.

    With *compare_overlap*, after every merge the incoming tree (copied before
    the merge) and the merged tree are pruned to their shared leaves and
    passed to ``rf_dist_list.main``; its return value is not used.
    """
    t2 = Tree(content[order[0]])
    for index in order[1:]:
        t1 = Tree(content[index])
        if not compare_overlap:
            t2 = Tree(scm(t1, t2))
            continue
        # Copy before merging so the comparison uses the tree as it was read.
        tree1_copy = t1.copy()
        t2 = Tree(scm(t1, t2))
        # NOTE(review): the comparison is assumed to run once per merge (the
        # original layout was lost) - confirm.
        overlap = intersection(_leaf_names(t1), _leaf_names(t2))
        tree2_copy = t2.copy()
        tree1_copy.prune(overlap)
        tree2_copy.prune(overlap)
        rf_dist_list.main(tree2_copy.copy(), tree1_copy.copy())
    return t2


def main(arg1, arg2):
    """Merge the newick trees listed in a file into a single tree.

    :param arg1: path of a text file with one newick tree per line
    :param arg2: merge-order mode:
        ``"common"``   - order from ``UPGMA_inc.UPGMA`` on the number of
                         leaves each pair of trees shares;
        ``"uncommon"`` - order from ``UPGMA.UPGMA`` on ``get_unique`` of each
                         pair's leaf lists, with an overlap comparison after
                         every merge;
        anything else  - file order.
    :return: the merged tree
    """
    with open(arg1) as f:
        # Strip whitespace characters like `\n` at the end of each line.
        content = [line.strip() for line in f]

    if arg2 == "common":
        order = _upgma_merge_order(
            content,
            lambda leaves_x, leaves_y: len(intersection(leaves_x, leaves_y)),
            UPGMA_inc.UPGMA)
        return _merge_trees(content, order)
    if arg2 == "uncommon":
        order = _upgma_merge_order(content, get_unique, UPGMA.UPGMA)
        return _merge_trees(content, order, compare_overlap=True)
    return _merge_trees(content, range(len(content)))
def train_placement_distances(rank_training_seqs: dict,
                              taxonomic_ranks: dict,
                              ref_fasta_dict: dict,
                              test_fasta: FASTA,
                              ref_pkg: ReferencePackage,
                              leaf_taxa_map: dict,
                              molecule: str,
                              executables: dict,
                              raxml_threads=4):
    """
    Function for iteratively performing leave-one-out analysis for every taxonomic lineage represented in the tree,
    yielding an estimate of placement distances corresponding to taxonomic ranks.

    For each taxon of each rank the reference sequences of that taxon are removed from the reference tree and
    alignment, the taxon's training sequences are aligned and placed on the pruned tree, and the distance of the
    best placement of every query is recorded. Temporary files are written with relative names (i.e. into the
    current working directory) and removed once each taxon has been processed.

    :param rank_training_seqs: A dictionary storing the sequence names being used to test each taxon within each rank
    :param taxonomic_ranks: A dictionary mapping rank names (e.g. Phylum) to rank depth values where Kingdom is 0,
     Phylum is 1, etc.
    :param ref_fasta_dict: A dictionary with headers as keys and sequences as values containing only reference sequences
    :param test_fasta: FASTA instance whose `fasta_dict` maps headers to sequences for deduplicated training sequences
    :param ref_pkg: A ReferencePackage instance
    :param leaf_taxa_map: A dictionary mapping TreeSAPP numeric sequence identifiers to taxonomic lineages
    :param executables: A dictionary mapping software to a path of their respective executable
    :param molecule: Molecule type [prot | dna | rrna]
    :param raxml_threads: Number of threads to be used by RAxML for parallel computation
    :return: A tuple of (dictionary mapping each trained rank to the list of placement distances recorded for it,
     list of the PQuery instances those distances were taken from). Both are returned empty if any rank has no
     taxa or if fewer than 30 training sequences are available in total.
    :raises FileNotFoundError: If the path stored under executables["BMGE.jar"] does not exist
    """

    logging.info("\nEstimating branch-length placement distances for taxonomic ranks. Progress:\n")
    taxonomic_placement_distances = dict()
    taxonomy_filtered_query_seqs = dict()
    pruned_ref_fasta_dict = dict()
    query_seq_name_map = dict()
    seq_dict = dict()
    pqueries = list()
    intermediate_files = list()
    aligner = "hmmalign"

    temp_tree_file = "tmp_tree.txt"
    temp_ref_aln_prefix = "taxonomy_filtered_ref_seqs"
    temp_query_fasta_file = "queries.fasta"
    query_multiple_alignment = aligner + "_queries_aligned.phy"

    # Read the tree as ete3 Tree instance
    ref_tree = Tree(ref_pkg.tree)

    bmge_file = executables["BMGE.jar"]
    if not os.path.exists(bmge_file):
        raise FileNotFoundError("Cannot find " + bmge_file)

    # Count the training sequences up front so too-small data sets are rejected before any work is done
    num_training_queries = 0
    for rank in rank_training_seqs:
        num_rank_training_seqs = 0
        for taxonomy in rank_training_seqs[rank]:
            num_rank_training_seqs += len(rank_training_seqs[rank][taxonomy])
        if len(rank_training_seqs[rank]) == 0:
            logging.error("No sequences available for estimating " + rank + "-level placement distances.\n")
            return taxonomic_placement_distances, pqueries
        else:
            logging.debug(str(num_rank_training_seqs) + " sequences to train " + rank + "-level placement distances\n")
        num_training_queries += num_rank_training_seqs

    if num_training_queries < 30:
        logging.error("Too few (" + str(num_training_queries) + ") sequences for training placement distance model.\n")
        return taxonomic_placement_distances, pqueries
    if num_training_queries < 50:
        logging.warning("Only " + str(num_training_queries) + " sequences for training placement distance model.\n")

    step_proportion = setup_progress_bar(num_training_queries)
    acc = 0.0

    # For each rank from Class to Species (Kingdom & Phylum-level classifications to be inferred by LCA):
    for rank in sorted(rank_training_seqs, reverse=True):
        if rank not in taxonomic_ranks:
            logging.error("Rank '" + rank + "' not found in ranks being used for training.\n")
            sys.exit(33)
        taxonomic_placement_distances[rank] = list()
        leaf_trimmed_taxa_map = trim_lineages_to_rank(leaf_taxa_map, rank)

        # Add the lineages to the Tree instance
        for leaf in ref_tree:
            leaf.add_features(lineage=leaf_trimmed_taxa_map.get(leaf.name, "none"))

        # Remove all sequences belonging to a taxonomic rank from tree and reference alignment
        for taxonomy in sorted(rank_training_seqs[rank]):
            # Start every taxon from a clean slate. A taxon skipped after failing MSA validation leaves
            # the loop early, and its queries and references must not leak into the next taxon's analysis.
            taxonomy_filtered_query_seqs.clear()
            pruned_ref_fasta_dict.clear()
            query_seq_name_map.clear()
            seq_dict.clear()

            logging.debug("Testing placements for " + taxonomy + ":\n")
            query_name = re.sub(r"([ /])", '_', taxonomy.split("; ")[-1])
            leaves_excluded = 0

            # Write query FASTA containing sequences belonging to `taxonomy`.
            # Queries are renamed to negative integers (-1, -2, ...); the map restores the original names later.
            query_seq_decrementor = -1
            for seq_name in rank_training_seqs[rank][taxonomy]:
                query_seq_name_map[query_seq_decrementor] = seq_name
                taxonomy_filtered_query_seqs[str(query_seq_decrementor)] = test_fasta.fasta_dict[seq_name]
                query_seq_decrementor -= 1
            logging.debug("\t" + str(len(taxonomy_filtered_query_seqs)) + " query sequences.\n")
            acc += len(taxonomy_filtered_query_seqs)
            write_new_fasta(taxonomy_filtered_query_seqs, fasta_name=temp_query_fasta_file)
            intermediate_files.append(temp_query_fasta_file)

            for key, ref_seq in ref_fasta_dict.items():
                node = key.split('_')[0]
                # Node with truncated and/or unclassified lineages are not in `leaf_trimmed_taxa_map`.
                # The lineage is compared as a literal prefix: taxon names may contain regular-expression
                # metacharacters (brackets, parentheses, dots) and so must not be used as a pattern.
                if node in leaf_trimmed_taxa_map and not leaf_trimmed_taxa_map[node].startswith(taxonomy):
                    pruned_ref_fasta_dict[node] = ref_seq
                else:
                    leaves_excluded += 1

            unique_ref_headers = {re.sub('_' + re.escape(ref_pkg.prefix), '', x) for x in pruned_ref_fasta_dict}

            logging.debug("\t" + str(leaves_excluded) + " sequences pruned from tree.\n")

            # Copy the tree since we are removing leaves of `taxonomy` and don't want this to be permanent
            tmp_tree = ref_tree.copy(method="deepcopy")
            # iteratively detaching the monophyletic clades generates a bad tree, so do it all at once
            tmp_tree.prune(pruned_ref_fasta_dict.keys(), preserve_branch_length=True)
            # Resolve any multifurcations
            tmp_tree.resolve_polytomy()
            logging.debug("\t" + str(len(tmp_tree.get_leaves())) + " leaves in pruned tree.\n")

            # Write the new reference tree with sequences from `taxonomy` removed
            tmp_tree.write(outfile=temp_tree_file, format=5)
            intermediate_files.append(temp_tree_file)

            ##
            # Run hmmalign, BMGE and RAxML to map sequences from the taxonomic rank onto the tree
            ##
            if aligner == "papara":
                temp_ref_phylip_file = temp_ref_aln_prefix + ".phy"
                # Write the reference MSA with sequences of `taxonomy` removed
                phy_dict = utilities.reformat_fasta_to_phy(pruned_ref_fasta_dict)
                utilities.write_phy_file(temp_ref_phylip_file, phy_dict)
                aln_stdout = wrapper.run_papara(executables["papara"],
                                                temp_tree_file, temp_ref_phylip_file, temp_query_fasta_file,
                                                "prot")
                intermediate_files.append(temp_ref_phylip_file)
                os.rename("papara_alignment.default", query_multiple_alignment)
            elif aligner == "hmmalign":
                temp_ref_fasta_file = temp_ref_aln_prefix + ".fasta"
                temp_ref_profile = temp_ref_aln_prefix + ".hmm"
                sto_file = re.sub(r"\.phy$", ".sto", query_multiple_alignment)
                # Write the pruned reference FASTA file
                write_new_fasta(pruned_ref_fasta_dict, temp_ref_fasta_file)
                # Build the HMM profile that doesn't include pruned reference sequences
                wrapper.build_hmm_profile(executables["hmmbuild"], temp_ref_fasta_file, temp_ref_profile)
                # Currently not supporting rRNA references (phylogenetic_rRNA)
                aln_stdout = wrapper.profile_aligner(executables, temp_ref_fasta_file, temp_ref_profile,
                                                     temp_query_fasta_file, sto_file)
                # Reformat the Stockholm format created by cmalign or hmmalign to Phylip
                sto_dict = file_parsers.read_stockholm_to_dict(sto_file)
                for seq_name in sto_dict:
                    # Names with an integer prefix are cut back to that prefix; all others are kept whole
                    try:
                        int(seq_name.split('_')[0])
                        seq_dict[seq_name.split('_')[0]] = sto_dict[seq_name]
                    except ValueError:
                        seq_dict[seq_name] = sto_dict[seq_name]
                write_new_fasta(seq_dict, query_multiple_alignment)
                intermediate_files += [temp_ref_fasta_file, temp_ref_profile, sto_file, query_multiple_alignment]
            else:
                logging.error("Unrecognised alignment tool '" + aligner + "'. Exiting now.\n")
                sys.exit(33)
            logging.debug(str(aln_stdout) + "\n")

            trim_command, query_filtered_multiple_alignment = wrapper.get_msa_trim_command(executables,
                                                                                           query_multiple_alignment,
                                                                                           molecule)
            launch_write_command(trim_command)
            intermediate_files += glob(query_filtered_multiple_alignment + "*")

            # Ensure reference sequences haven't been removed
            msa_dict, _failed_msa_files, summary_str = file_parsers.validate_alignment_trimming(
                [query_filtered_multiple_alignment], unique_ref_headers, True)
            _nrow, ncolumn = file_parsers.multiple_alignment_dimensions(
                seq_dict=read_fasta_to_dict(query_filtered_multiple_alignment),
                mfa_file=query_filtered_multiple_alignment)
            logging.debug("Columns = " + str(ncolumn) + "\n")
            if query_filtered_multiple_alignment not in msa_dict.keys():
                logging.debug("Placements for '" + taxonomy + "' are being skipped after failing MSA validation.\n")
                for old_file in intermediate_files:
                    os.remove(old_file)
                intermediate_files.clear()
                continue
            logging.debug("Number of sequences discarded: " + summary_str + "\n")

            # Run RAxML with the parameters specified
            raxml_files = wrapper.raxml_evolutionary_placement(executables["raxmlHPC"], temp_tree_file,
                                                               query_filtered_multiple_alignment, ref_pkg.sub_model,
                                                               "./", query_name, raxml_threads)

            # Parse the JPlace file to pull distal_length+pendant_length for each placement
            jplace_data = jplace_parser(raxml_files["jplace"])
            placement_tree = jplace_data.tree
            node_map = map_internal_nodes_leaves(placement_tree)
            for pquery in jplace_data.placements:
                # Placements with a likelihood weight ratio of 0.1 or less are never recorded
                top_lwr = 0.1
                top_placement = PQuery(taxonomy, rank)
                for name, info in pquery.items():
                    if name == 'p':
                        for placement in info:
                            # Only record the best placement's distance
                            lwr = float(placement[2])
                            if lwr > top_lwr:
                                top_lwr = lwr
                                top_placement.inode = placement[0]
                                top_placement.likelihood = placement[1]
                                top_placement.lwr = lwr
                                top_placement.distal = round(float(placement[3]), 6)
                                top_placement.pendant = round(float(placement[4]), 6)
                                leaf_children = node_map[int(top_placement.inode)]
                                if len(leaf_children) > 1:
                                    # Reference tree with clade excluded
                                    parent = tmp_tree.get_common_ancestor(leaf_children)
                                    tip_distances = parent_to_tip_distances(parent, leaf_children)
                                    top_placement.mean_tip = round(float(sum(tip_distances) / len(tip_distances)), 6)
                    elif name == 'n':
                        top_placement.name = query_seq_name_map[int(info.pop())]
                    else:
                        logging.error("Unexpected variable in pquery keys: '" + name + "'\n")
                        sys.exit(33)

                if top_placement.lwr >= 0.5:  # The minimum likelihood weight ratio a placement requires to be included
                    pqueries.append(top_placement)
                    taxonomic_placement_distances[rank].append(top_placement.total_distance())

            # Remove intermediate files from the analysis of this taxon
            intermediate_files += list(raxml_files.values())
            for old_file in intermediate_files:
                os.remove(old_file)
            intermediate_files.clear()

            # Advance the progress bar by one tick per `step_proportion` queries processed
            while acc > step_proportion:
                acc -= step_proportion
                sys.stdout.write('-')
                sys.stdout.flush()

        if len(taxonomic_placement_distances[rank]) == 0:
            logging.debug("No samples available for " + rank + ".\n")
        else:
            stats_string = "RANK: " + rank + "\n"
            stats_string += "\tSamples = " + str(len(taxonomic_placement_distances[rank])) + "\n"
            stats_string += "\tMedian = " + str(round(utilities.median(taxonomic_placement_distances[rank]), 4)) + "\n"
            stats_string += "\tMean = " + str(round(float(sum(taxonomic_placement_distances[rank])) /
                                                    len(taxonomic_placement_distances[rank]), 4)) + "\n"
            logging.debug(stats_string)
    sys.stdout.write("-]\n")
    return taxonomic_placement_distances, pqueries