def newickToMatrix(newickFile, type):
    '''Build a square matrix of pairwise distances between the leaves of a tree.

    newickFile -- path of the newick file to read
    type -- 'topology' selects the topological distance (returned as int);
            any other value selects the summed branch length

    Rows and columns follow the order returned by `getLeafNodeNames`; the
    diagonal is always 0.
    '''
    with open(newickFile, 'r') as handle:
        rawNewick = handle.read()

    # Every ';' in the file is blanked out and a single terminating ';' is
    # appended before the string is handed to ete3.
    tree = Tree(rawNewick.replace(';', ' ') + ';', format=1)
    names = getLeafNodeNames(tree)

    def pairDistance(first, second):
        if first == second:
            return 0
        if type == 'topology':
            return int(tree.get_distance(first, second, topology_only=True))
        return tree.get_distance(first, second)

    return [[pairDistance(rowName, colName) for colName in names]
            for rowName in names]
def parent_to_tip_distances(parent: Tree, children: Tree, estimate=False):
    """
    Function utilizing ete3's tree object for calculating distances between
    a reference node (parent) and query nodes (children).
    The `estimate` flag will cause the parent's edge length to be included
    in the distance calculation.

    :param parent: A reference node Tree instance
    :param children: An iterable of query nodes; each item may be a Tree
      instance (looked up by its name), a str node name, or an int (converted
      to its str form)
    :param estimate: Boolean indicating whether these distances are to be used
      for estimating the edge length ranges
    :return: list() of all branch distances between the parent node and the tips
    :raises AssertionError: if a child is neither a Tree, a str nor an int
    """
    branch_distances = list()

    # Calculate distance between parent and all descendants
    for child_node in children:
        if isinstance(child_node, Tree):
            distal_length = parent.get_distance(child_node.name)
        elif isinstance(child_node, str):
            distal_length = parent.get_distance(child_node)
        elif isinstance(child_node, int):
            distal_length = parent.get_distance(str(child_node))
        else:
            # The type object itself cannot be concatenated to a str; format
            # its name so that the message is logged and the AssertionError
            # below is the exception that actually reaches the caller.
            message = "Cannot handle type '%s' for child." % type(child_node).__name__
            logging.error(message)
            raise AssertionError(message)

        if estimate:
            distal_length += parent.dist
        branch_distances.append(distal_length)

    return branch_distances
def calc_distances(treePath):
    '''
    This function calculates the distance between all pairs of tips on a
    given tree.

    input:
        treePath: `str`, file path to tree
    output:
        `pandas dataframe` with one row per unordered pair of leaves and the
        columns `sequence1`, `sequence2` (the two leaf names, sorted),
        `seq_id` ("<sequence1>_<sequence2>") and `distance` (value returned
        by the tree's `get_distance` for the pair).
    '''
    with open(treePath) as f:  # workaround for file I/O deprecation in `ete3`
        treeString = f.read()
    t = Tree(treeString)

    leaves = [leaf.name for leaf in t.iter_leaves()]
    records = {"sequence1": [], "sequence2": [], "seq_id": [], "distance": []}
    for pair in itertools.combinations(leaves, 2):
        first, second = sorted(pair)
        records["sequence1"].append(first)
        records["sequence2"].append(second)
        records["seq_id"].append("{0}_{1}".format(first, second))
        records["distance"].append(t.get_distance(first, second))

    return pd.DataFrame(records)
def get_anc_order(tree_file, ancestors, tips_to_root=False):
    """
    Sort the given ancestors by their distance to the root of the species tree.

    Args:
        tree_file (str): Path to the input newick formatted tree.
        ancestors (list of str): list of ancestor names
        tips_to_root (bool): if True the order is reversed, i.e. the ancestors
            farthest from the root come first.

    Returns:
        OrderedDict: ancestor names in the requested order (keys), each mapped
        to the other input ancestors for which `is_below` is true.
    """
    tree = Tree(tree_file, format=1)
    tree.prune(list(tree.get_leaves()))

    root_distance = {}
    for name in ancestors:
        root_distance[name] = tree.get_distance(name)

    ordered = sorted(root_distance, key=lambda name: root_distance[name])
    if tips_to_root:
        ordered.reverse()

    result = OrderedDict()
    for name in ordered:
        node = search_one_node(tree, name)
        result[name] = [
            other for other in ancestors
            if name != other and is_below(node, other)
        ]
    return result
def calc_distance_mat(target):
    """Return the symmetric matrix of tree distances between the strains of `target`.

    The tree is read from "<direc>/<target>/cluster.phb" and the strain order
    comes from `get_strain_lst(target, full=True)`. Only the upper triangle is
    computed and then mirrored; diagonal entries keep the initial value -1.
    """
    tree = Tree("{}/{}/cluster.phb".format(direc, target))
    strains = get_strain_lst(target, full=True)
    size = len(strains)

    matrix = -np.ones((size, size))
    for row in range(size):
        for col in range(row + 1, size):
            distance = tree.get_distance(strains[row], strains[col])
            matrix[row, col] = distance
            matrix[col, row] = distance
    return matrix
def draw_ete3_tree(organism, snplist, tree_file_name, config, c):
    '''Draws a phylogenetic tree using ETE3

    Keyword arguments:
    organism -- the organism of which to make a tree
    snplist -- mapping of SNP (node) name to state: 0 draws the node grey
               with the alternative line type, 1 draws it as a larger green
               node; any other node keeps the default red style
    tree_file_name -- path handed to `tree.render` for the output image
    config -- configuration mapping; a truthy config["dev"] prints the path
    c -- passed through unchanged to `tree_to_newick`

    '''
    tree = Tree(tree_to_newick(organism, config, c), format=1)
    farthest_leaf = tree.get_farthest_leaf()[0]
    tree_depth = int(tree.get_distance(farthest_leaf))

    # Look applied to every node unless its SNP state overrides it.
    default_look = {
        "fgcolor": "#BE0508",
        "size": 10,
        "vt_line_color": "#000000",
        "hz_line_color": "#000000",
        "vt_line_type": 0,
        "hz_line_type": 0,
        "vt_line_width": 2,
        "hz_line_width": 2,
    }
    # State 0: the SNP is missing due to a gap, so the node is greyed out.
    gap_look = {
        "fgcolor": "#DDDDDD",
        "size": 10,
        "vt_line_color": "#DDDDDD",
        "hz_line_color": "#DDDDDD",
        "vt_line_type": 1,
        "hz_line_type": 1,
    }
    # State 1: highlighted node.
    called_look = {
        "fgcolor": "#99FF66",
        "size": 15,
        "vt_line_color": "#000000",
        "hz_line_color": "#000000",
        "vt_line_type": 0,
        "hz_line_type": 0,
    }

    for node in tree.traverse():
        look = dict(default_look)
        if node.name in snplist.keys():
            state = snplist[node.name]
            if state == 0:
                look.update(gap_look)
            elif state == 1:
                look.update(called_look)

        node_style = NodeStyle()
        for key, value in look.items():
            node_style[key] = value
        node.set_style(node_style)

    ts = TreeStyle()
    ts.show_leaf_name = False  # leaf names are added by the layout function
    ts.show_scale = False  # no scale bar
    # NOTE(review): `self` is not a parameter of this function; unless a
    # module-level `self` exists this line raises NameError - confirm where
    # CanSNPer_tree_layout is supposed to come from.
    ts.layout_fn = self.CanSNPer_tree_layout
    ts.optimal_scale_level = 'full'  # fully expand the branches of the tree

    if config["dev"]:
        print("#[DEV] Tree file: %s" % tree_file_name)
    tree.render(tree_file_name, tree_style=ts, width=tree_depth * 500)
def get_closest_leave(leaves_to_index: dict, prepostorder_leaves: list,
                      tree: Tree,
                      leave: str) -> Tuple[str, float, float]:
    """Pick the nearer of the two list neighbours of `leave`.

    The position of `leave` is looked up in `leaves_to_index`. The first and
    last entries of `prepostorder_leaves` have a single neighbour, which is
    taken directly; otherwise the previous neighbour wins only when it is
    strictly closer than the next one.

    Returns a tuple (neighbour name, branch distance, distance obtained by
    passing True as third argument to `tree.get_distance`). When the chosen
    neighbour equals `leave` a warning is logged and both distances are 0.
    """
    position = leaves_to_index[leave]
    last_position = len(prepostorder_leaves) - 1

    if position == 0:
        closest = prepostorder_leaves[position + 1]
    elif position == last_position:
        closest = prepostorder_leaves[position - 1]
    else:
        previous_leave = prepostorder_leaves[position - 1]
        next_leave = prepostorder_leaves[position + 1]
        previous_dist = tree.get_distance(previous_leave, leave)
        next_dist = tree.get_distance(next_leave, leave)
        closest = previous_leave if previous_dist < next_dist else next_leave

    if closest == leave:
        logger.warning(f"Nearest neighbor to node {leave} is itself!")
        return closest, 0, 0

    dist = tree.get_distance(leave, closest)
    top_distance = tree.get_distance(leave, closest, True)
    return closest, dist, top_distance
def newick_to_pairwise_nodes(newick_string):
    """Convert a newick string into a JSON node/link description.

    Example input: ((((H,K)D,(F,I)G)B,E)A,((L,(N,Q)O)J,(P,S)M)C);

    The tree is walked in preorder. Every node receives a running integer id;
    nodes with an empty name are renamed 'i_root' and nodes named 'NoName'
    are renamed 'i_NoName_<id>'. One link is emitted for every node that has
    a parent, with the parent-to-node distance formatted as a fixed-point
    string.

    Ids are tracked per node object rather than per name, so links stay
    correct when several nodes share a name (for instance several unnamed
    internal nodes).

    Returns:
        str: JSON text of the form
        {"nodes": [{"id", "name"}, ...],
         "links": [{"source", "target", "edgeWidth"}, ...]}
    """
    # Local import: the module header is not visible from this block.
    import json

    t = Tree(newick_string, format=1)

    nodes = []
    node_ids = {}
    for cont, node in enumerate(t.traverse("preorder")):
        if node.name == '':
            node.name = 'i_root'
        if node.name == 'NoName':
            node.name = "i_" + node.name + '_' + str(cont)
        nodes.append({"id": cont, "name": node.name})
        node_ids[id(node)] = cont

    edges = []
    for node in t.traverse("preorder"):
        parent = node.up  # the root has no parent and contributes no link
        if parent is not None:
            edges.append({
                "source": node_ids[id(parent)],
                "target": node_ids[id(node)],
                "edgeWidth": format(t.get_distance(parent, node), "f")
            })

    # json.dumps escapes quotes and backslashes inside names; the default
    # separators give the same layout as the former str()/replace() approach.
    return json.dumps({"nodes": nodes, "links": edges}, ensure_ascii=False)
def saturation(fafile, trfile):
    """Return the slope of alignment distance regressed on tree distance.

    fafile -- path of a FASTA alignment
    trfile -- path of a tree file; only its first line is parsed

    For every pair of sequences the alignment distance (DistanceCalculator
    with the 'blosum62' model) is paired with the distance between the two
    leaves of the same names in the tree, and a degree-1 polynomial is
    fitted with the tree distances as x and the alignment distances as y.
    """
    # Pairwise distances from the alignment
    with open(fafile) as alignment_handle:
        aln = AlignIO.read(alignment_handle, 'fasta')
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(aln)

    pwdists, lfpairs = [], []
    for i, j in combinations(range(len(dm.names)), 2):
        lfpairs.append((dm.names[i], dm.names[j]))
        pwdists.append(dm[i][j])

    # Patristic distances from the tree
    with open(trfile) as tree_handle:
        t = Tree(tree_handle.readline())
    padists = [t.get_distance(lf1, lf2) for lf1, lf2 in lfpairs]

    slope, _intercept = np.polyfit(padists, pwdists, 1)
    return slope
def main(clusterFilepath, strainFilepath, phbFilepath, outFilepath):
    """Write, for every missing (family, strain) cell, the ORF ids of the nearest strain.

    clusterFilepath -- CSV with a "family" column and one column per strain
    strainFilepath -- text file with one strain name per line
    phbFilepath -- tree whose leaves are named after the strains
    outFilepath -- destination CSV

    For each row and each strain whose cell is null, the strain with the
    smallest tree distance among those whose cell is not null is selected and
    its ORF ids, stripped of a trailing "(<number>)", are written to the
    output cell. Cells that were not null in the input are left empty in the
    output.
    """
    cluster_df = pd.read_csv(clusterFilepath, dtype="object")
    with open(strainFilepath, 'r') as strain_file:
        strain_lst = [line.strip() for line in strain_file]
    t = Tree(phbFilepath)

    print("Calc distance matrix")
    size = len(strain_lst)
    # -1 marks "not computed"; only the diagonal keeps that value.
    distance_mat = -np.ones((size, size))
    for i, node1 in enumerate(strain_lst):
        for j, node2 in enumerate(strain_lst):
            if i != j:
                distance_mat[i, j] = t.get_distance(node1, node2)

    # Group 1 captures the ORF id without the optional "(<number>)" suffix.
    orf_pattern = re.compile(r"([^()]+)(\([0-9]+\))?")

    dct_lst = []
    for row_idx, row in cluster_df.iterrows():
        if row_idx % 100 == 0:
            print(row_idx)
        dct = {"family": row["family"]}
        msk = row[strain_lst].isnull()
        for sidx, strain in enumerate(strain_lst):
            # `msk` is indexed by strain name, so select by position explicitly.
            if msk.iloc[sidx]:
                # Mask the strains that are missing too, so that the minimum
                # is taken over strains that do have an entry in this row.
                x = np.ma.array(distance_mat[sidx], mask=msk)
                qidx = x.argmin()
                assert distance_mat[sidx, qidx] >= 0

                # drop (num) notation from orfId
                orfId_lst = row[strain_lst[qidx]].split(' ')
                dct[strain] = ' '.join(
                    orf_pattern.findall(orfId)[0][0] for orfId in orfId_lst)
        dct_lst.append(dct)

    out_df = pd.DataFrame(dct_lst)
    # reindex keeps the column order and also creates (empty) columns for
    # strains that were never filled in, where plain selection would raise
    # KeyError.
    out_df = out_df.reindex(columns=["family"] + strain_lst)
    out_df.to_csv(outFilepath, index=False)
    print("OUTPUT to {}".format(outFilepath))
def get_branch_lens():
    """
    Parse input tree and retrieve branch lengths

    The tree is read from 'renamed.tre' and the module-level `taxa` provides
    the names to look up. For every taxon the distances to all taxa are
    computed and the mean of the `(len(taxa) - 1) / 2` (truncated) largest
    ones is stored.

    @return: tuple `(series, size)`: a pandas Series mapping each taxon to
        that mean, and `int(len(taxa) / 3)`
    @raise ZeroDivisionError: when `taxa` holds one or two entries, because
        the number of distances to average is then zero
    """
    tree = Tree('renamed.tre')

    # Resolve every taxon to its node once instead of searching the tree
    # again for each cell of the matrix.
    tip_nodes = [tree & taxon for taxon in taxa]
    dist_mat = [[tree.get_distance(node1, node2) for node2 in tip_nodes]
                for node1 in tip_nodes]

    length = int((len(taxa) - 1) / 2)
    taxa_dict = {}
    for taxon, distances in zip(taxa, dist_mat):
        bran_lens = sorted(distances, reverse=True)[0:length]
        taxa_dict[taxon] = sum(bran_lens) / length

    series = pd.Series(taxa_dict)
    size = int(len(taxa) / 3)
    return series, size
import numpy as np
import pandas as pd
from ete3 import Tree
import ete3

if __name__ == "__main__":
    # use ete3's get_distance function to compute pairwise additive distances
    # between leaves in tree
    tree = Tree("../../data/tree/tree.nw")
    list_nodes = list(tree.get_leaves())
    print(len(list_nodes))

    list_names = []
    dmat = np.zeros((len(list_nodes), len(list_nodes)))
    for i, first in enumerate(list_nodes):
        list_names.append(first.name)
        # The matrix is symmetric: fill both triangles from one computation.
        for j in range(i, len(list_nodes)):
            d = tree.get_distance(first, list_nodes[j], topology_only=False)
            dmat[i, j] = dmat[j, i] = round(d, 5)

    # pandas was used here without being imported; the import is added above.
    dist_df = pd.DataFrame(data=dmat, index=list_names, columns=list_names)
    dist_df.to_csv("../../data/tree/tree_distancematrix.txt", sep='\t',
                   header=True)
# | | | # | /--------| \-F # | | | # | /--------| \-G # | | | # \--------| \-H # | # \-E # # Locate some nodes A = t & "A" C = t & "C" # Calculate distance from current node print "The distance between A and C is", A.get_distance("C") # Calculate distance between two descendants of current node print "The distance between A and C is", t.get_distance("A", "C") # Calculate the toplogical distance (number of nodes in between) print "The number of nodes between A and D is ", t.get_distance("A", "D", topology_only=True) # Calculate the farthest node from E within the whole structure farthest, dist = (t & "E").get_farthest_node() print "The farthest node from E is", farthest.name, "with dist=", dist # Calculate the farthest node from E within the whole structure, # regarding the number of nodes in between as distance value # Note that the result is differnt. farthest, dist = (t & "E").get_farthest_node(topology_only=True) print "The farthest (topologically) node from E is", farthest.name, "with", dist, "nodes in between" # Calculate farthest node from an internal node farthest, dist = t.get_farthest_node() print "The farthest node from root is is", farthest.name, "with dist=", dist # # The program results in the following information:
# Build one header label and one row of tree distances per entry of spe_l.
# NOTE(review): `header`, `spe_d_l`, `spe_l`, `t` and `borna_species` are
# defined outside this fragment.
k = 0  # running counter appended to the labels of "Carbovirus" entries
for s1 in spe_l:
    h = s1.name
    if "Cultervirus" in s1.name:
        # keep only the part after the last underscore
        h = h.split("_")[-1]
    if "Carbovirus" in s1.name:
        k += 1
        h = h.split("_")[-1] + str(k)
    header.append(h)
    l_d = []
    for s2 in spe_l:
        if s1 == s2:
            d = 0
            l_d.append(d)
        else:
            d_d = t.get_distance(s1.name, s2.name)
            l_d.append(d_d)
    # extract minimum distance: index 1 of the sorted row, i.e. the smallest
    # value after the 0 stored for the entry itself
    min_d = sorted(l_d)[1]
    spe_d_l.append(l_d)
# Square distance table labelled with the headers on both axes
df_d = pd.DataFrame(spe_d_l, index=[str(i) for i in header], columns=[str(i) for i in header])
## 5. Define genetic distance to separate different viral species
# Tab-separated table without header row; columns are named node/name/species
df_species = pd.read_table(borna_species, sep='\t', names=("node", "name", "species"))
# Distinct species names with surrounding whitespace removed
borna_species_l = set([i.strip() for i in df_species.species.to_list()])
def create_original_tree(meta_information_list, avg_list, root, name, sci, save_labels):
    """Build a tree from lineage lists, colour the listed nodes by value and render it.

    meta_information_list -- list of lineages; each lineage is a list of
        names and is cleaned in place (';' and ':' are removed)
    avg_list -- list of [node name, value] pairs; when empty it is replaced
        by [[root, 1.0], ["V", 0.0]]
    root -- name of the root node, passed to `newickify` and used as the
        reference of the distance handed to `change_tree_branch`
    name -- prefix of the output files (<name>.pdf, <name>_colorbar.pdf)
    sci -- when truthy the colour range and the values are log-transformed
        and values are labelled in scientific notation
    save_labels -- passed through to `colorbar`

    Returns the boolean produced by `newickify`; the tree is only built and
    rendered when it is truthy.
    """
    save_string = name + ".pdf"
    save_string_colorbar = name + "_colorbar.pdf"
    # NOTE(review): final_save is never used in this function.
    final_save = name + "_final" + ".pdf"
    if sci:
        nc_val = [x[1] for x in avg_list]
    else:
        nc_val = [round(x[1], 3) for x in avg_list]
    # NOTE(review): nc_val was computed above from the original avg_list, so
    # when the default below kicks in nc_val stays empty and the colouring
    # loop further down has nothing to iterate over - confirm this is intended.
    if not avg_list:
        avg_list = [[root, 1.0], ["V", 0.0]]
    mx = max([x[1] for x in avg_list])
    mn = min([x[1] for x in avg_list])
    # Widen a degenerate range so that minimum and maximum differ
    if mn == mx:
        mn = mn - 0.001
        mx = mx + 0.001
    if sci:
        mx = math.log(mx)
        # A minimum of exactly 0 is replaced by 0.000001 before taking the log
        if mn == 0:
            mn = 0.000001
        mn = math.log(mn)
    colorbar(mn, mx, save_labels, save_string_colorbar)
    leaf_val = [x[0] for x in avg_list]
    # Nested dict {parent name: {child name: 1, ...}} built from the lineages
    tree = dict()
    for virus_info in meta_information_list:
        for index in range(0, len(virus_info)):
            virus_info[index] = virus_info[index].replace(";", "").replace(":", "")
            if virus_info[index] not in tree.keys() and index + 1 < len(virus_info):
                # NOTE(review): virus_info[index + 1] has not been cleaned yet
                # at this point, so the key stored here may still contain ';'
                # or ':' - verify.
                tree[virus_info[index]] = {virus_info[index + 1]: 1}
                if index < len(virus_info) and index > 0:
                    d1 = {virus_info[index]: 1}
                    tree[virus_info[index - 1]].update(d1)
            else:
                if index != 0:
                    d1 = {virus_info[index]: 1}
                    tree[virus_info[index - 1]].update(d1)
    newick_tree, const_bool = newickify(tree, root_node=root)
    if const_bool:
        t = Tree(newick_tree, quoted_node_names=True, format=1)
        ts, t = set_default_TreeStyle(t, False)
        t.set_style(ts)
        # Position in the (name, value) pairs, used for the error message only
        count = 0
        # NOTE(review): the loop variable `name` rebinds the `name` parameter.
        for name, value in zip(leaf_val, nc_val):
            matching_nodes = t.search_nodes(name=name)
            if matching_nodes:
                dst = t.get_distance(root, matching_nodes[0])
                if sci:
                    if value == 0:
                        # log(0) is undefined; use the same 0.000001 floor as above
                        rgb_color = rgb2(mn, mx, math.log(0.000001))
                        rgb_color = ('#%02x%02x%02x' % rgb_color)
                        complexity = TextFace(value, fgcolor=rgb_color, fsize=200, bold=True)
                        change_tree_branch(matching_nodes, rgb_color, dst)
                    else:
                        rgb_color = rgb2(mn, mx, math.log(value))
                        rgb_color = ('#%02x%02x%02x' % rgb_color)
                        complexity = TextFace("{:.2e}".format(value), fgcolor=rgb_color, fsize=200, bold=True)
                        change_tree_branch(matching_nodes, rgb_color, dst)
                else:
                    rgb_color = rgb2(mn, mx, value)
                    rgb_color = ('#%02x%02x%02x' % rgb_color)
                    complexity = TextFace(value, fgcolor=rgb_color, fsize=200, bold=True)
                    change_tree_branch(matching_nodes, rgb_color, dst)
                virus_name = TextFace(matching_nodes[0].name, fgcolor=rgb_color, fsize=200, bold=True)
                # The `complexity` face is built above, but the call that
                # would attach it is disabled:
                # matching_nodes[0].add_face(face=complexity, column=1, position="branch-bottom")
                matching_nodes[0].add_face(face=virus_name, column=1, position="branch-top")
            else:
                print("ERROR->", avg_list[count])
            count += 1
        t.render(save_string, tree_style=ts, dpi=1000, h=120000, w=120000, units="px")
    return const_bool
#!/usr/bin/env python3
"""Write, for each protein tree, one CSV row per node with its distances to the root."""
import sys
import glob
from ete3 import Tree

proteins = ["CARD1", "DSRM1", "DSRM2", "DSRM3", "RD1"]

for protein in proteins:
    fname = "../%s/brlens_and_labels.tre" % protein
    tre = Tree(fname, format=1)

    # The context manager closes the CSV even if a write or a distance
    # computation fails part-way through.
    with open("../%s/dists_from_root.csv" % protein, "w") as outf:
        outf.write("Node,branchLength,DistFromRoot,NodesFromRoot\n")
        for node in tre.traverse("preorder"):
            node_name = node.name
            blen = node.dist
            dist_from_root = tre.get_distance(node)
            nodes_from_root = tre.get_distance(node, topology_only=True)
            outf.write("%s,%f,%f,%d\n" %
                       (node_name, blen, dist_from_root, nodes_from_root))
class TreeDataset(GeneExpressionDataset):
    """Forms a ``GeneExpressionDataset`` with a corresponding Tree structure relating
    every cell.

    This is the dataset class that will be used to interact with the TreeVAE model. It's
    important to observe here that this function does not take in expression data from a CSV
    or sparse matrix, for example, but rather assumes that an scVI GeneExpressionDataset has
    already been created. The resulting API of the dataset remains very similar to that of a
    typical GeneExpressionDataset but with the addition of a tree (of class `ete3.Tree`) that
    will be used as a prior during model fitting.

    :param expr: ``scvi.dataset.GeneExpressionDataset`` instance.
    :param tree: file path of a tree to read with ``ete3.Tree``, or an already built tree
        object, which is stored as is.
    :param filtering: when true, ``filter_cells_by_count`` is called at the end of
        construction.
    """

    def __init__(
        self, expr: GeneExpressionDataset, tree=None, filtering=True
    ):
        # A str is treated as a file path and parsed (second positional
        # argument 1); anything else, including None, is stored unchanged.
        if tree is not None and type(tree) == str:
            self.tree = Tree(tree, 1)
            # polytomy is not a problem anymore: message passing deals with general trees
            # self.tree.resolve_polytomy(recursive=True)
        else:
            self.tree = tree

        # NOTE(review): a missing tree is only logged; construction continues
        # and populate_treedataset() below will then fail on
        # `self.tree.traverse()`.
        if self.tree is None:
            logger.error(
                "Must provide a tree file path or a tree if you're using TreeDataset."
            )

        # assert we have barcode labels for cells
        # (only logged, not enforced)
        if "barcodes" not in expr.cell_attribute_names:
            logger.error("Must provide cell barcode, or names, as a cell attribute.")

        super().__init__()

        # set some tree attributes
        self.populate_treedataset(expr)

        # keeping the cells in the tree and Gene expression dataset (not needed for simulations)
        # self.filter_cells_by_tree()

        if filtering:
            self.filter_cells_by_count()

    def populate_treedataset(self, expr):
        """
        Populate the TreeDataset with respect to a GeneExpressionDataset that is
        passed in.

        Every node of the tree gets a ``distance`` attribute holding the value of
        ``self.tree.get_distance(node)``; the expression data is then loaded through
        ``populate_from_datasets``.

        :param expr: A ``scvi.dataset.GeneExpressionDataset`` instance.
        """

        # set distance
        for n in self.tree.traverse():
            n.distance = self.tree.get_distance(n)

        self.populate_from_datasets([expr])

    def populate(self):
        # NOTE(review): this method looks broken as written and should be
        # confirmed against its callers before use:
        #  * `tree` is `self.tree`, so the condition below can never be true
        #    and the error is logged on every call;
        #  * `expr` is not defined in this scope, so the
        #    populate_from_datasets call raises NameError unless a
        #    module-level `expr` exists.
        tree = self.tree
        if tree is None and self.tree is not None:
            self.tree = Tree(tree, 1)
        else:
            logger.error(
                "Must provide a tree file path or a tree if you're using TreeDataset."
            )

        # set distance
        for n in self.tree.traverse():
            n.distance = self.tree.get_distance(n)

        self.populate_from_datasets([expr])

        self.populate_treedataset(expr=self)

        self.filter_cells_by_tree()

        self.filter_cells_by_count()

    def filter_cells_by_tree(self):
        """
        Prunes away cells that don't appear consistently between the tree object and the
        RNA expression dataset.

        The tree is pruned in place to the leaf names that are also present in
        ``self.barcodes``; the result of ``filter_cells_by_attribute`` for those
        barcodes is returned.
        """
        leaves = self.tree.get_leaf_names()
        keep_barcodes = np.intersect1d(leaves, self.barcodes)
        self.tree.prune(keep_barcodes)

        return self.filter_cells_by_attribute(keep_barcodes, on="barcodes")
#print "root phylum not monophyletic!" #print t.get_ascii(attributes=["name", "phylum"], show_internal=False) for pnd in t.search_nodes(phylum='Porifera'): try: t.set_outgroup(t & pnd) except: #print 'trying another root...' continue #Check phylum mononphyly Monophyletic = phylum_mono(t, TaxPhyl) mct = '/'.join( map(str, [ Counter(Monophyletic.values())[cat] for cat in [True, False, None] ])) #Calculate p-dists and reject taxa deviation from median by an order of magnitude pdists = dict((leaf.name, round(t.get_distance(leaf), 5)) for leaf in t) rejected, kept = reject_outliers(pdists) #print len(rejected),rejected,len(pdists) for tax in rejected['tax']: nbTaxReject[tax] += 1 #generate new alignments with rejected... filtered = [] fafile = '{0}/{1}.al.hc.tr.fa'.format(path, gid) for rec in SeqIO.parse(fafile, 'fasta'): if rec.id in set(kept['tax']): filtered.append(rec) SeqIO.write(filtered, '{0}/{1}.al.hc.tr.ft.fa'.format(path, gid), 'fasta') #calculate saturation satSlp = saturation(fafile, trfile)
# Serialise the current tree and save it to `new_tree`.
text = t.write()
with open(new_tree, 'w') as f1:
    f1.write(text)

import pandas as pd
from tqdm import tqdm
from collections import defaultdict

t = Tree(intree)
# all_g = set([convert_genome_ID_rev(_.split('_')[0]) for _ in t.get_leaf_names()])
all_ids = t.get_leaf_names()
# Full pairwise distance table between leaves, keyed by leaf name on both
# levels (every ordered pair is computed, including each leaf with itself).
id_dict = defaultdict(dict)
for g1 in tqdm(t.get_leaves()):
    for g2 in t.get_leaves():
        id_dict[g1.name][g2.name] = t.get_distance(g1, g2)
dis = pd.DataFrame.from_dict(id_dict)
from sklearn.cluster import KMeans
# Two-cluster k-means on the rows of the distance table.
# NOTE(review): `precompute_distances` is, as far as I know, no longer
# accepted by recent scikit-learn releases - confirm against the installed
# version.
kmeans = KMeans(n_clusters=2, random_state=0, precompute_distances=True, tol=1e-10).fit(dis.values)
# Bare expression: has no effect when run as a script.
kmeans.labels_
# Map "<converted genome id>_<leaf id>" to a one-element list holding the
# cluster label as a string.
id2info = defaultdict(list)
for idx, id in enumerate(dis.index):
    new_name = convert_genome_ID_rev(id.split('_')[0]) + '_' + id
    id2info[new_name] = [str(kmeans.labels_[idx])]
from api_tools.itol_func import *
return 0.0 try: return matrix[(a, b)] except KeyError: return matrix[(b, a)] for tip_a, tip_b in itertools.permutations(lineages.keys(), 2): d = sum([n.dist for n in lineages[tip_a] ^ lineages[tip_b]]) matrix[(tip_a, tip_b)] = d #if len(matrix) % 10000 == 0: # print >>sys.stderr, len(matrix) leaves = t.get_leaf_names() print '\t'.join(['#names'] + leaves) for tip_a in leaves: row = [tip_a] for tip_b in leaves: row.append(get_dist(tip_a, tip_b)) print '\t'.join(map(str, row)) # test import random s = random.sample(matrix.keys(), 1000) for a, b in s: d0 = get_dist(a, b) d1 = t.get_distance(a, b) if round(d0, 8) != round(d1, 8): print >> sys.stderr, a, b, d0, d1
#!/usr/bin/env python3
"""Print the tree distance between CREMA and a fixed list of other taxa.

The newick file named by the first command-line argument is read with its
line breaks removed, and the tree is rooted on the common ancestor of CELEG
and CINOP before the distances are printed.
"""
from ete3 import Tree
import sys

with open(sys.argv[1], 'r') as treefile:
    nwk_string = ''.join(line.rstrip("\n") for line in treefile)

tree = Tree(nwk_string)
tree.set_outgroup(tree.get_common_ancestor("CELEG", "CINOP"))

for taxon in ("CBRIG", "CNIGO", "CTROP", "CWALL", "CELEG", "CINOP"):
    print(taxon, "CREMA", tree.get_distance(taxon, "CREMA"))
def _clade_names(clade):
    '''Return (leaf names, non-leaf names) of a clade in traversal order.'''
    leaf_names = []
    inner_names = []
    for node in clade.traverse():
        if node.is_leaf():
            leaf_names.append(node.name)
        else:
            inner_names.append(node.name)
    return leaf_names, inner_names


def random_tree(trees):
    '''
    Randomly choose a tree and find two nodes for inheritance

    trees -- path of a file holding one newick tree per line

    A random line is parsed, its non-root internal nodes are renamed
    'n1', 'n2', ... and pairs of them are sampled until a pair is found in
    which neither node is a descendant of the other and the second is not a
    sister of the first. During the first 60 seconds both nodes must have
    more than two leaves; between 60 and 90 seconds at least two leaves are
    enough; after 90 seconds another tree is drawn and the search restarts.

    Returns the list [tree, nodes, tips, r_nodes, r_tips, dist] where `tree`
    is the result of `topology_dist`, `nodes`/`tips` are the internal and leaf
    names of the chosen tree, `r_nodes`/`r_tips` are the internal and leaf
    names below the two sampled nodes (first node first) and `dist` is the
    topological distance between the two clade roots.
    '''
    # Read the candidate trees once; the file used to be reopened, and never
    # closed, on every retry.
    with open(trees) as handle:
        candidates = handle.readlines()

    while True:
        #Randomly choose a tree
        tree = choice(candidates)
        t = Tree(tree, format=1)
        tips = []
        nodes = []
        k = 1
        for node in t.traverse():
            if node.is_leaf():
                tips.append(node.name)
            elif not node.is_root():
                node.add_features(name='n' + str(k))
                nodes.append(node.name)
                k += 1
        nodes = list(filter(None, nodes))

        #Randomly choose two nodes for inheritance
        timeout1 = time.time() + 60
        timeout2 = time.time() + 90
        while True:
            rn = sample(nodes, 2)
            rn1 = t.search_nodes(name=rn[0])[0]
            rn2 = t.search_nodes(name=rn[1])[0]

            # The two time windows differ only in the minimum clade size.
            now = time.time()
            if now <= timeout1:
                min_leaves = 3
            elif now <= timeout2:
                min_leaves = 2
            else:
                break  # give up on this tree and draw another one

            if (len(rn1.get_leaves()) < min_leaves
                    or len(rn2.get_leaves()) < min_leaves):
                continue
            if rn2 in rn1.get_descendants():
                continue
            if rn1 in rn2.get_descendants():
                continue
            if rn2 in rn1.get_sisters():
                continue

            tips1, inner1 = _clade_names(rn1)
            tips2, inner2 = _clade_names(rn2)
            root1 = t.get_common_ancestor(tips1)
            root2 = t.get_common_ancestor(tips2)
            dist = t.get_distance(root1, root2, topology_only=True)

            r_tips = tips1 + tips2
            r_nodes = inner1 + inner2
            tree = topology_dist(t, nodes, r_nodes, r_tips,
                                 branchProbabilityDist)
            return [tree, nodes, tips, r_nodes, r_tips, dist]
def draw_tree(the_tree, colour, back_color, label, out_file, the_scale, extend,
              bootstrap, group_file, grid_options, the_table, pres_abs,
              circular):
    """Style an ete3 tree, decorate it with data columns and show or render it.

    :param the_tree: Newick input handed to ``Tree(..., quoted_node_names=True)``.
    :param colour: if true (and no group file is given) colour clades
        automatically by grouping neighbouring leaves.
    :param back_color: with a group file, also colour the ancestors joining
        the groups using the average of the two joined colours.
    :param label: value assigned to ``TreeStyle.show_branch_length``.
    :param out_file: output path for ``render``; ``None`` opens the viewer.
        Also used as the prefix of the ``.new_desc`` column description file.
    :param the_scale: value assigned to ``TreeStyle.scale``.
    :param extend: if true, draw guiding lines to the aligned faces.
    :param bootstrap: value assigned to ``TreeStyle.show_branch_support``.
    :param group_file: whitespace-separated file; column one is a substring
        matched against leaf names, column two is the line colour of the group.
    :param grid_options: ``None`` (no data grid), ``'auto'`` (every table
        column is a categorical colour column) or the path of a tab-separated
        description file made of ``H`` (header) and ``C`` (colour) lines.
    :param the_table: tab-separated table with a header row; the first column
        identifies the leaf.
    :param pres_abs: ``None`` or a pair ``(protein_fasta, genome_folder)`` used
        to add a BLAST-based gene presence/absence grid.
    :param circular: anything other than ``None`` selects circular mode.
    """
    t = Tree(the_tree, quoted_node_names=True)
    font_size = 8
    font_type = 'Helvetica'  # was misspelled 'Heveltica'
    font_gap = 3
    font_buffer = 10

    def text_face(text):
        # Padded label used for headers, legend entries and text cells.
        return TextFace(font_gap * ' ' + text + ' ' * font_buffer,
                        fsize=font_size, ftype=font_type, tight_text=True)

    def line_style(line_colour, line_width):
        # Node style that hides the node dot and colours both branch lines.
        style = NodeStyle()
        style['size'] = 0
        style["vt_line_color"] = line_colour
        style["hz_line_color"] = line_colour
        style["vt_line_width"] = line_width
        style["hz_line_width"] = line_width
        return style

    def add_legend_entry(text, width, fill, column):
        # The label goes in column + 1 and its colour swatch in column.
        ts.legend.add_face(text_face(text), column=column + 1)
        ts.legend.add_face(RectFace(width, 20, fill, fill), column=column)

    o = t.get_midpoint_outgroup()
    t.set_outgroup(o)
    the_leaves = list(t.iter_leaves())
    groups = {}
    num = 0
    last_node = None
    ca_list = []  # (clade root, colour) pairs still to be merged

    if group_file is not None:
        # Start from an all-black tree, then colour each group's clade.
        style = line_style('#000000', 1)
        for n in t.traverse():
            n.set_style(style)
        group_dict = {}
        with open(group_file) as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    continue  # blank or malformed line
                group_dict[fields[0]] = fields[1]
        for node in the_leaves:
            i = node.name
            for j in group_dict:
                if j in i:
                    groups.setdefault(group_dict[j], []).append(i)
        coloured_nodes = []
        for i in groups:
            the_col = i  # the group key is itself the colour
            style = line_style(the_col, 2)
            if len(groups[i]) == 1:
                ca = t.search_nodes(name=groups[i][0])[0]
            else:
                ca = t.get_common_ancestor(groups[i])
            ca.set_style(style)
            coloured_nodes.append(ca)
            # Colour every descendant of the clade root (none for a leaf).
            tocolor = list(ca.children)
            while len(tocolor) > 0:
                x = tocolor.pop(0)
                coloured_nodes.append(x)
                x.set_style(style)
                for j in x.children:
                    tocolor.append(j)
            ca_list.append((ca, the_col))
        if back_color:
            # Repeatedly join the two closest clade roots, colour their common
            # ancestor with the average colour and treat it as a new clade
            # root, until a single root is left.
            while len(ca_list) > 1:
                distance = float('inf')
                for i, col1 in ca_list:
                    for j, col2 in ca_list:
                        if i is j:
                            continue
                        the_dist = t.get_distance(i, j)
                        if the_dist <= distance:
                            distance = the_dist
                            the_i = i
                            the_j = j
                            the_i_col = col1
                            the_j_col = col2
                ca_list.remove((the_i, the_i_col))
                ca_list.remove((the_j, the_j_col))
                rgb1 = strtorgb(the_i_col)
                rgb2 = strtorgb(the_j_col)
                # Floor division keeps the channels integral on Python 3.
                rgb3 = ((rgb1[0] + rgb2[0]) // 2,
                        (rgb1[1] + rgb2[1]) // 2,
                        (rgb1[2] + rgb2[2]) // 2)
                new_col = colorstr(rgb3)
                new_node = t.get_common_ancestor(the_i, the_j)
                style = line_style(new_col, 2)
                new_node.set_style(style)
                coloured_nodes.append(new_node)
                ca_list.append((new_node, new_col))
                # Fresh queue: the original reused a variable that is unbound
                # when no group clade had been walked before.
                tocolor = list(new_node.children)
                while len(tocolor) > 0:
                    x = tocolor.pop(0)
                    if x not in coloured_nodes:
                        coloured_nodes.append(x)
                        x.set_style(style)
                        for j in x.children:
                            tocolor.append(j)
    elif colour:
        # Use the lower quartile of all pairwise leaf distances as the clade
        # cutoff; consecutive leaves closer than the cutoff share a group.
        distances = []
        for node1 in the_leaves:
            for node2 in the_leaves:
                if node1 != node2:
                    distances.append(t.get_distance(node1, node2))
        distances.sort()
        clade_cutoff = distances[len(distances) // 4]
        for node in the_leaves:
            i = node.name
            if (last_node is not None
                    and t.get_distance(node, last_node) <= clade_cutoff):
                groups[group_num].append(i)
            else:
                groups[num] = [num, i]
                group_num = num
                num += 1
            last_node = node
        for i in groups:
            num = groups[i][0]
            h = num * 360 / len(groups)
            style = line_style(hsl_to_str(h, 0.5, 0.5), 2)
            # groups[i] is [group number, leaf name, leaf name, ...]
            if len(groups[i]) == 2:
                ca = t.search_nodes(name=groups[i][1])[0]
            else:
                ca = t.get_common_ancestor(groups[i][1:])
            ca.set_style(style)
            tocolor = list(ca.children)
            while len(tocolor) > 0:
                x = tocolor.pop(0)
                x.set_style(style)
                for j in x.children:
                    tocolor.append(j)
            ca_list.append((ca, h))
        # Join sister clade roots (pairs whose common ancestor has no other
        # child), closest first, colouring the ancestor with the mean hue.
        while len(ca_list) > 1:
            distance = float('inf')
            got_one = False
            for i, col1 in ca_list:
                for j, col2 in ca_list:
                    if i is j:
                        continue
                    parent = t.get_common_ancestor(i, j)
                    getit = not any(child != i and child != j
                                    for child in parent.children)
                    if getit:
                        the_dist = t.get_distance(i, j)
                        if the_dist <= distance:
                            distance = the_dist
                            the_i = i
                            the_j = j
                            the_i_col = col1
                            the_j_col = col2
                            got_one = True
            if not got_one:
                break
            ca_list.remove((the_i, the_i_col))
            ca_list.remove((the_j, the_j_col))
            new_col = (the_i_col + the_j_col) / 2
            new_node = t.get_common_ancestor(the_i, the_j)
            new_node.set_style(line_style(hsl_to_str(new_col, 0.5, 0.3), 2))
            ca_list.append((new_node, new_col))
    # if you just want a black tree
    else:
        style = line_style('#000000', 1)
        for n in t.traverse():
            n.set_style(style)

    # Palette cycled through for categorical values without a given colour.
    color_list = [(240, 163, 255), (0, 117, 220), (153, 63, 0), (76, 0, 92),
                  (25, 25, 25), (0, 92, 49), (43, 206, 72), (255, 204, 153),
                  (128, 128, 128), (148, 255, 181), (143, 124, 0),
                  (157, 204, 0), (194, 0, 136), (0, 51, 128), (255, 164, 5),
                  (255, 168, 187), (66, 102, 0), (255, 0, 16), (94, 241, 242),
                  (0, 153, 143), (224, 255, 102), (116, 10, 255), (153, 0, 0),
                  (255, 255, 128), (255, 255, 0), (255, 80, 5), (0, 0, 0),
                  (50, 50, 50)]
    up_to_colour = {}
    ts = TreeStyle()
    column_list = []
    width_dict = {}

    if grid_options is not None:
        colour_dict = {}
        type_dict = {}
        min_val_dict = {}
        max_val_dict = {}
        leaf_name_dict = {}  # leaf name -> table row name
        header_count = 0
        the_columns = {}  # table header -> list of grid column ids
        if grid_options == 'auto':
            with open(the_table) as f:
                headers = f.readline().rstrip().split('\t')[1:]
            for i in headers:
                the_columns[i] = [i]
                type_dict[i] = 'colour'
                colour_dict[i] = {'empty': '#FFFFFF'}
                width_dict[i] = 20
                up_to_colour[i] = 0
                column_list.append(i)
        else:
            with open(grid_options) as g:
                for line in g:
                    if line.startswith('H'):
                        name, col_type, width = line.rstrip().split('\t')[1:]
                        # A header may be listed several times; the running
                        # counter keeps the grid column ids unique.
                        column_id = name + '_' + str(header_count)
                        the_columns.setdefault(name, []).append(column_id)
                        header_count += 1
                        name = column_id
                        colour_dict[name] = {'empty': '#FFFFFF'}
                        type_dict[name] = col_type
                        width_dict[name] = int(width)
                        column_list.append(name)
                        up_to_colour[name] = 0
                        min_val_dict[name] = float('inf')
                        max_val_dict[name] = 0
                    elif line.startswith('C'):
                        # Colour lines apply to the most recent header.
                        c_name, c_col = line.rstrip().split('\t')[1:]
                        if not c_col.startswith('#'):
                            c_col = colorstr(
                                tuple(map(int, c_col.split(','))))
                        colour_dict[name][c_name] = c_col
        val_dict = {}
        with open(the_table) as f:
            headers = f.readline().rstrip().split('\t')[1:]
            column_no = {}
            for num, i in enumerate(headers):
                if i in the_columns:
                    column_no[num] = i
            for line in f:
                row_name = line.split('\t')[0]
                leaf_name = None
                # The row belongs to the last leaf whose name contains the
                # part of the row name before the first '.'.
                for n in t.traverse():
                    if n.is_leaf():
                        if row_name.split('.')[0] in n.name:
                            leaf_name = n.name
                if leaf_name is None:
                    continue
                leaf_name_dict[leaf_name] = row_name
                vals = line.rstrip().split('\t')[1:]
                if row_name in val_dict:
                    sys.exit('Duplicate entry found in table.')
                val_dict[row_name] = {}
                for num, val in enumerate(vals):
                    if num not in column_no or val == '':
                        continue
                    for column_name in the_columns[column_no[num]]:
                        col_type = type_dict[column_name]
                        if col_type == 'colour':
                            val_dict[row_name][column_name] = val
                            if val not in colour_dict[column_name]:
                                colour_dict[column_name][val] = colorstr(
                                    color_list[up_to_colour[column_name]
                                               % len(color_list)])
                                up_to_colour[column_name] += 1
                        elif col_type == 'text':
                            val_dict[row_name][column_name] = val
                        elif col_type == 'colour_scale_date':
                            # Dates (YYYY-MM-DD) are stored as seconds since
                            # 1970-01-01.
                            year, month, day = val.split('-')
                            the_val = (datetime.datetime(
                                int(year), int(month), int(day), 0, 0, 0)
                                - datetime.datetime(1970, 1, 1, 0, 0, 0))
                            seconds = the_val.total_seconds()
                            val_dict[row_name][column_name] = seconds
                            if seconds < min_val_dict[column_name]:
                                min_val_dict[column_name] = seconds
                            if seconds > max_val_dict[column_name]:
                                max_val_dict[column_name] = seconds
                        elif col_type == 'colour_scale':
                            the_val = float(val)
                            val_dict[row_name][column_name] = the_val
                            if the_val < min_val_dict[column_name]:
                                min_val_dict[column_name] = the_val
                            if the_val > max_val_dict[column_name]:
                                max_val_dict[column_name] = the_val
                        else:
                            sys.exit('Unknown column type')
        if out_file is not None:
            desc_path = out_file + '.new_desc'
        else:
            desc_path = 'viridis.new_desc'
        ts.legend_position = 3
        leg_column = 0
        # The description file records the columns and colours actually used;
        # the with-block guarantees it is flushed and closed.
        with open(desc_path, 'w') as new_desc:
            for num, i in enumerate(column_list):
                title = i.rsplit('_', 1)[0]
                col_type = type_dict[i]
                nameF = text_face(title)
                nameF.rotation = -90
                ts.aligned_header.add_face(nameF, column=num + 1)
                new_desc.write('H\t' + title + '\t' + col_type + '\t'
                               + str(width_dict[i]) + '\n')
                if col_type == 'colour':
                    add_legend_entry(title, width_dict[i], '#FFFFFF',
                                     leg_column)
                    for j in colour_dict[i]:
                        new_desc.write('C\t' + j + '\t' + colour_dict[i][j]
                                       + '\n')
                        add_legend_entry(j, width_dict[i], colour_dict[i][j],
                                         leg_column)
                    leg_column += 2
                elif col_type == 'colour_scale':
                    add_legend_entry(title, width_dict[i], '#FFFFFF',
                                     leg_column)
                    # NOTE(review): the legend spans hues 0-270 while the leaf
                    # cells below span 0-360 -- confirm which is intended.
                    for num2 in range(11):
                        val = ((max_val_dict[i] - min_val_dict[i])
                               * num2 / 10.0)
                        h = val / (max_val_dict[i] - min_val_dict[i]) * 270
                        add_legend_entry(str(val), width_dict[i],
                                         hsl_to_str(h, 0.5, 0.5), leg_column)
                    leg_column += 2
                elif col_type == 'colour_scale_date':
                    add_legend_entry(title, width_dict[i], '#FFFFFF',
                                     leg_column)
                    for num2 in range(11):
                        val = ((max_val_dict[i] - min_val_dict[i])
                               * num2 / 10.0)
                        h = val / (max_val_dict[i] - min_val_dict[i]) * 360
                        days = str(int(val / 60 / 60 / 24)) + ' days'
                        add_legend_entry(days, width_dict[i],
                                         hsl_to_str(h, 0.5, 0.5), leg_column)
                    leg_column += 2
                for n in t.traverse():
                    if not n.is_leaf():
                        continue
                    # Leaves without a table row are drawn as empty cells
                    # instead of raising KeyError.
                    row = val_dict.get(leaf_name_dict.get(n.name), {})
                    val = row.get(i, 'empty')
                    if col_type == 'colour':
                        fill = colour_dict[i][val]
                        n.add_face(RectFace(width_dict[i], 20, fill, fill),
                                   column=num + 1, position="aligned")
                    elif col_type in ('colour_scale', 'colour_scale_date'):
                        if val == 'empty':
                            fill = '#FFFFFF'
                        else:
                            h = ((val - min_val_dict[i])
                                 / (max_val_dict[i] - min_val_dict[i]) * 360)
                            fill = hsl_to_str(h, 0.5, 0.5)
                        n.add_face(RectFace(width_dict[i], 20, fill, fill),
                                   column=num + 1, position="aligned")
                    elif col_type == 'text':
                        n.add_face(text_face(val), column=num + 1,
                                   position="aligned")

    if pres_abs is not None:
        starting_col = len(column_list) + 1
        # Argument lists avoid shell quoting problems with unusual paths.
        subprocess.Popen(['makeblastdb', '-out', 'tempdb', '-dbtype', 'prot',
                          '-in', pres_abs[0]]).wait()
        folder = pres_abs[1]
        len_dict = {}  # gene name -> protein length
        gene_list = []
        # Legend: title, then the two cell colours (all three entries used to
        # repeat the title text).
        add_legend_entry('Gene present/absent', 20, '#FFFFFF', starting_col)
        add_legend_entry('present', 20, "#5ba965", starting_col)
        add_legend_entry('absent', 20, "#cb5b4c", starting_col)
        with open(pres_abs[0]) as f:
            for line in f:
                if line.startswith('>'):
                    name = line.split()[0][1:]
                    gene_list.append(name)
                    len_dict[name] = 0
                    nameF = text_face(name)
                    nameF.rotation = -90
                    ts.aligned_header.add_face(
                        nameF, column=starting_col + len(gene_list) - 1)
                else:
                    len_dict[name] += len(line.rstrip())
        min_length = 0.9  # minimum alignment length as a fraction of the gene
        min_ident = 90  # minimum percent identity
        for n in t.iter_leaves():
            the_name = n.name
            if the_name[0] == '"' and the_name[-1] == '"':
                the_name = the_name[1:-1]
            if the_name.endswith('.ref'):
                the_name = the_name[:-4]
            if not os.path.exists(folder + '/' + the_name):
                # Fall back to a file in the folder starting with the name.
                for q in os.listdir(folder):
                    if q.startswith(the_name):
                        the_name = q
            if not os.path.exists(the_name + '.blast'):
                subprocess.Popen(['blastx', '-query', folder + '/' + the_name,
                                  '-db', 'tempdb', '-outfmt', '6',
                                  '-num_threads', '24',
                                  '-out', the_name + '.blast']).wait()
            gotit = set()
            with open(the_name + '.blast') as b:
                for line in b:
                    query, subject, ident, length = line.split()[:4]
                    ident = float(ident)
                    length = int(length)
                    if (ident >= min_ident
                            and length >= min_length * len_dict[subject]):
                        gotit.add(subject)
            for num, i in enumerate(gene_list):
                if i in gotit:
                    fill = "#5ba965"
                else:
                    fill = "#cb5b4c"
                n.add_face(RectFace(20, 20, fill, fill),
                           column=num + starting_col, position="aligned")

    # Set these to False if you don't want bootstrap/distance values
    ts.show_branch_length = label
    ts.show_branch_support = bootstrap
    ts.show_leaf_name = False
    for node in t.traverse():
        if node.is_leaf():
            node.add_face(AttrFace("name", fsize=font_size, ftype=font_type,
                                   tight_text=True, fgcolor='black'),
                          column=0, position="aligned")
    ts.margin_left = 20
    ts.margin_right = 100
    ts.margin_top = 20
    ts.margin_bottom = 20
    if extend:
        ts.draw_guiding_lines = True
    ts.scale = the_scale
    if circular is not None:
        ts.mode = "c"
        ts.arc_start = 0
        ts.arc_span = 360
    if out_file is None:
        t.show(tree_style=ts)
    else:
        t.render(out_file, w=210, units='mm', tree_style=ts)
no_n=re.search(r'_N\d+', line) if no_n: no_n_str= no_n.group() no_n_str=re.sub('_N','',no_n_str) no_N_dict[acc_str]=no_n_str log_file.write("The number of clusters are:" + str(cluster_cnt)) log_file.close() cdhit_file.close() #print "Tree from FastTree program is being used to calculate root to leaf distances..." #Passing in the tree generated by FastTree FastTree=Tree(args.input2) #Getting the root of the tree root=FastTree.get_tree_root() #Loop through each leaf of the tree for leaf in FastTree: #Convert 'leaf' to string to allow manipulation leaf_str=str(leaf) acc_nu=re.search(r'\w{2}\d+.\d{1}_\d{4}|\w{2}_\d+.\d{1}_\d{4}',leaf_str) acc_nu=str(acc_nu.group()) acc_nu=re.sub('_\d{4}$','',acc_nu) rt_lf=FastTree.get_distance(root,leaf) #Make a dictionary using acc_nu as key branlength_dict[acc_nu]=rt_lf #Using the generated dictionaries to print the relevant information to a tab delimited file tsv_file.write(acc_nu + "\t" + year_dict[acc_nu] + "\t" + str(rt_lf) + "\t" + clust_dict[acc_nu] + "\t" + no_N_dict[acc_nu] + "\n") tsv_file.close
# | | | # | /--------| \-F # | | | # | /--------| \-G # | | | # \--------| \-H # | # \-E # # Locate some nodes A = t&"A" C = t&"C" # Calculate distance from current node print "The distance between A and C is", A.get_distance("C") # Calculate distance between two descendants of current node print "The distance between A and C is", t.get_distance("A","C") # Calculate the toplogical distance (number of nodes in between) print "The number of nodes between A and D is ", \ t.get_distance("A","D", topology_only=True) # Calculate the farthest node from E within the whole structure farthest, dist = (t&"E").get_farthest_node() print "The farthest node from E is", farthest.name, "with dist=", dist # Calculate the farthest node from E within the whole structure, # regarding the number of nodes in between as distance value # Note that the result is differnt. farthest, dist = (t&"E").get_farthest_node(topology_only=True) print "The farthest (topologically) node from E is", \ farthest.name, "with", dist, "nodes in between" # Calculate farthest node from an internal node farthest, dist = t.get_farthest_node() print "The farthest node from root is", farthest.name, "with dist=", dist
for line in tt:
    target_taxa.append(line.rstrip())
tt.close()

# Now read in a collection of trees (one Newick string per line), calculate
# the branch lengths over the sample, summarise and print them out.
branch_lengths = defaultdict(list)  # key = taxon, value = list of brlens
# The with-block closes the tree file, which the original left open.
with open(sys.argv[3]) as treefile:
    for line in treefile:
        if not line.strip():
            continue  # a blank line is not a tree
        curr_tree = Tree(line.rstrip())
        # Re-root on the outgroup clade unless it already is the root.
        root_node = curr_tree.get_common_ancestor(outgroups)
        if curr_tree != root_node:
            curr_tree.set_outgroup(root_node)
        print(curr_tree)
        # Find the common ancestor of the target taxa and use it as the
        # reference node for calculating branch lengths. This might not
        # always be the measure you want!
        reference_node = curr_tree.get_common_ancestor(target_taxa)
        # Distance from the reference node to each taxon of interest.
        for taxon in target_taxa:
            dist = curr_tree.get_distance(taxon, reference_node)
            branch_lengths[taxon].append(dist)

# Now compute the credible interval of the branch length for each target
# taxon: the mean and the lower/upper bounds of its 95% interval.
for taxon in branch_lengths:
    mean, var, std = stats.bayes_mvs(branch_lengths[taxon], alpha=0.95)
    print(taxon + "\t" + str(mean[0]) + "\t" + str(mean[1][0]) + "\t"
          + str(mean[1][1]))
if a == b: return 0.0 try: return matrix[(a, b)] except KeyError: return matrix[(b, a)] for tip_a, tip_b in itertools.permutations(lineages.keys(), 2): d = sum([n.dist for n in lineages[tip_a] ^ lineages[tip_b]]) matrix[(tip_a, tip_b)] = d #if len(matrix) % 10000 == 0: # print >>sys.stderr, len(matrix) leaves = t.get_leaf_names() print '\t'.join(['#names'] + leaves) for tip_a in leaves: row = [tip_a] for tip_b in leaves: row.append(get_dist(tip_a, tip_b)) print '\t'.join(map(str, row)) # test import random s = random.sample(matrix.keys(), 1000) for a,b in s: d0 = get_dist(a, b) d1 = t.get_distance(a, b) if round(d0, 8) != round(d1, 8): print >>sys.stderr, a, b, d0, d1
# Input tree / block files and the templates for the reduced outputs.
tree_file = 'real_data/Yersinia_pestis/tree.nwk'
tree_file_out = 'real_data/Yersinia_pestis/tree_%s_left.nwk'
res = 1000
infercars_file = f'real_data/Yersinia_pestis/{res}/blocks_unique_coords.infercars'
infercars_file_out = f'real_data/Yersinia_pestis/{res}/blocks_unique_coords_%s_left.infercars'

t = Tree(tree_file)
# Collect the leaves once: get_leaves() was re-run for every row of the
# matrix and again for the final loop.
tree_leaves = t.get_leaves()
n = len(tree_leaves)

# Full pairwise leaf-to-leaf distance matrix.
m = np.zeros((n, n))
for i, leaf1 in enumerate(tree_leaves):
    for j, leaf2 in enumerate(tree_leaves):
        m[i, j] = t.get_distance(leaf1, leaf2)

# Average-linkage clustering on the precomputed distances; the number of
# clusters is determined by the distance threshold.
cls = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
                              linkage='average',
                              distance_threshold=0.0004).fit_predict(m)
print(cls)
print(np.unique(cls))

used = defaultdict(bool)  # cluster label -> already seen
survivors = []
for cl, leaf in zip(cls, tree_leaves):
    if not used[cl]:
        used[cl] = True
# | | | # | /--------| \-F # | | | # | /--------| \-G # | | | # \--------| \-H # | # \-E # # Locate some nodes A = t&"A" C = t&"C" # Calculate distance from current node print "The distance between A and C is", A.get_distance("C") # Calculate distance between two descendants of current node print "The distance between A and C is", t.get_distance("A","C") # Calculate the toplogical distance (number of nodes in between) print "The number of nodes between A and D is ", \ t.get_distance("A","D", topology_only=True) # Calculate the farthest node from E within the whole structure farthest, dist = (t&"E").get_farthest_node() print "The farthest node from E is", farthest.name, "with dist=", dist # Calculate the farthest node from E within the whole structure, # regarding the number of nodes in between as distance value # Note that the result is differnt. farthest, dist = (t&"E").get_farthest_node(topology_only=True) print "The farthest (topologically) node from E is", \ farthest.name, "with", dist, "nodes in between" # Calculate farthest node from an internal node farthest, dist = t.get_farthest_node() print "The farthest node from root is is", farthest.name, "with dist=", dist
sfs[pop-1] += 1 nIndsSFS += 1 fn.write(leaf.name+"\t"+"\t".join(str(x) for x in sfs)+"\n") f.close() fn.close() sys.stdout.write('S') #======================================================# # FORCE ULTRAMETRIC in the output tree, defined in reference to the furthest leaf if maxNumberOfSpecies == -1: maxNumberOfSpecies = nTrueSpecies + 1 if force_ultrametric: if nTrueSpecies <= maxNumberOfSpecies: tree_dist = t.get_farthest_leaf()[1] for l in t: dst = t.get_distance(l) if dst != tree_dist: l.dist += tree_dist - dst sys.stdout.write('u') else: sys.stdout.write('\nERROR. Too many final species: will not force ultrametricity.\n') #======================================================# # EXPORT phylo t.write(format=5, outfile=ophylo, dist_formatter='%0.20f') ## write the count of resultant species f = open(ophylo, 'a') f.write("\n"+str(nTrueSpecies)+"\n") f.close() if plot_trees:
ca = tree.get_common_ancestor(eukaryote_seqs) print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) elif answer[0] == False: mono_groups = [] target_group = '' for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"): if target_leaf in node: target_group = node else: mono_groups.append(node) size_target_group = len(target_group) #get distance shortest_distance = 999999999999999.0 closest_other_group = '' for subtree in mono_groups: curr_distance = tree.get_distance(target_group, subtree, topology_only=True) if curr_distance < shortest_distance: shortest_distance = curr_distance closest_other_group = subtree #attempt to calculate distance on a version of the tree in which branches below some support threshold have been deleted # closest_leaves = [] # for leaf in closest_other_group: # closest_leaves.append(leaf.name) # target_leaves = [] # for leaf in target_group: # target_leaves.append(leaf.name) # collapsed_tree = tree # for node in collapsed_tree: # if node.support < 0.5: # node.delete() # target_ca = collapsed_tree.get_common_ancestor(target_leaves)
def concept_similarity_measure_ex1(C1, C2):
    """Return ``(similarity, l1, l2)`` for two concept names.

    The taxonomy is loaded from ``skills_taxonomy_tree_level_score.nw``.
    When both concepts are found, the similarity is ``2N / (N1 + N2 + 2N)``
    where ``N`` is the branch-length distance from the root to the least
    common subsumer and ``N1`` / ``N2`` are the distances from each concept to
    that subsumer; ``l1`` / ``l2`` are the ``level_score`` attributes of the
    two concept nodes.

    When either concept is missing from the taxonomy, the similarity falls
    back to the TF-IDF cosine similarity of the two names and both level
    scores are ``1.0``.
    """
    taxonomy = Tree("skills_taxonomy_tree_level_score.nw")

    matches1 = taxonomy.search_nodes(name=C1)
    matches2 = taxonomy.search_nodes(name=C2)

    # If a skill is not found in the taxonomy, compare the raw strings.
    if not matches1 or not matches2:
        data = [C1, C2]
        vec = TfidfVectorizer()
        # Row k of X is the TF-IDF representation of data[k].
        X = vec.fit_transform(data)
        S = cosine_similarity(X)
        return S[0, 1], 1.0, 1.0

    node1 = matches1[0]
    node2 = matches2[0]
    common = node1.get_common_ancestor(node2)

    # N: distance from the root node to the least common subsumer.
    root = taxonomy.get_tree_root()
    N = taxonomy.get_distance(common, root, topology_only=False)
    # N1 / N2: distance from each concept to the least common subsumer.
    # The resolved nodes are passed instead of the names so the distances
    # refer to exactly the nodes the common ancestor was computed from.
    N1 = taxonomy.get_distance(node1, common, topology_only=False)
    N2 = taxonomy.get_distance(node2, common, topology_only=False)

    denominator = N1 + N2 + (2 * N)
    if denominator == 0:
        # Both concepts sit on the root (or at zero distance from it): they
        # coincide, which the formula scores as 1 everywhere else.
        similarity = 1.0
    else:
        similarity = (2 * N) / denominator

    l1 = node1.level_score
    l2 = node2.level_score
    return similarity, l1, l2
# Join the (possibly multi-line) Newick input into a single string.
nwk_string = ''.join(line.rstrip("\n") for line in nwk)
tree = Tree(nwk_string)

# Label unnamed nodes with consecutive numbers in postorder.
node_num = 0
for node in tree.traverse("postorder"):
    if len(node.name) == 0:
        node.add_features(name=str(node_num))
        node_num += 1

# Parse the list of leaves of interest (one name per line).
with open(sys.argv[2], 'r') as leaf_file:
    leaf_list = [line.rstrip("\n") for line in leaf_file]

# Node of interest: the last node in postorder carrying the requested name.
interest_node_name = sys.argv[3]
interest_node = None
for node in tree.traverse("postorder"):
    if node.name == interest_node_name:
        interest_node = node
if interest_node is None:
    # Previously this surfaced as a NameError in the loop below.
    sys.exit("Node '" + interest_node_name + "' not found in tree")

# Print the branch length from each leaf to the node of interest.
for leaf in leaf_list:
    print(leaf + "\t" + str(tree.get_distance(leaf, interest_node)))
class PhyloTreeDistanceMatrix(object):
    """Leaf-to-leaf distance matrix of a phylogenetic tree, plus pruning helpers.

    The matrix is a mapping ``{row leaf name: {column leaf name: distance}}``.
    It is written by :meth:`create_distance_matrix_file` and must be loaded
    with :meth:`load_distance_matrix` before the ``delete_*`` methods are used.
    """

    _matrix_object_filename = 'phylo_matrix.txt'
    _tree_matrix_filename = 'tree_dist_matrix.txt'
    # Class-level fallbacks; __init__ gives every instance its own objects.
    _matrix = {}
    _all_leaves = []

    def __init__(self, newick_file):
        self._t = Tree(newick_file)
        self._matrix = {}
        self._all_leaves = []

    def create_distance_matrix_file(self, tree_matrix=_tree_matrix_filename,
                                    matrix_object=_matrix_object_filename):
        """Root the tree at its midpoint and write the leaf distance matrix.

        ``tree_matrix`` receives one text line per leaf
        (``name: d1 d2 ... ``); ``matrix_object`` receives the pickled matrix.
        """
        R = self._t.get_midpoint_outgroup()
        self._t.set_outgroup(R)
        # Ordered dicts keep the leaf order on Python versions before 3.7.
        dist_matrix = OrderedDict()
        leaves = self._t.get_leaves()
        for leaf0 in leaves:
            row = OrderedDict()
            for leaf1 in leaves:
                row[leaf1.name] = self._t.get_distance(leaf0, leaf1)
            dist_matrix[leaf0.name] = row
        # Save the matrix as a text file.
        with open(tree_matrix, 'w') as f:
            for key, row in dist_matrix.items():
                cells = ''.join(str(d) + ' ' for d in row.values())
                f.write(str(key) + ": " + cells + '\n')
        # Save the matrix object; pickle needs a binary-mode file.
        with open(matrix_object, 'wb') as phylo:
            pickle.dump(dist_matrix, phylo)

    def load_distance_matrix(self, matrix_file):
        """Load the pickled matrix from an open binary file or from a path."""
        # NOTE: unpickling can execute arbitrary code; only load files this
        # class wrote itself.
        if hasattr(matrix_file, 'read'):
            matrix = pickle.load(matrix_file)
        else:
            with open(matrix_file, 'rb') as handle:
                matrix = pickle.load(handle)
        self._matrix = matrix

    def get_matrix_item(self, rowname, colname):
        """Return the distance stored at the given row and column names."""
        return self._matrix[rowname][colname]

    def delete_nodes(self, default_seq_name="1A2P_defal", num_of_leaves=100,
                     tree_file="new_tree.newick"):
        """Delete the leaves farthest from ``default_seq_name``.

        Leaves are removed, farthest first, until at most ``num_of_leaves``
        remain.  The pruned tree is written to ``tree_file`` and the distance
        matrix files are regenerated under their default names.
        """
        self._all_leaves = self._t.get_leaves()
        num_leaves = len(self._all_leaves)
        leaves_dict = {}
        for item in self._all_leaves:
            leaves_dict[item.name] = self.get_matrix_item(default_seq_name,
                                                          item.name)
        farthest_first = sorted(leaves_dict.items(), key=lambda x: x[1],
                                reverse=True)
        for leaf_name, _distance in farthest_first:
            if num_leaves <= num_of_leaves:
                break
            self._t.search_nodes(name=leaf_name)[0].delete()
            num_leaves -= 1
        self._t.write(format=1, outfile=tree_file)
        self.create_distance_matrix_file()

    def delete_clusters(self, strategy='mean', tree_file="new_tree.newick",
                        default_seq_name="1A2P_defal"):
        """Delete every leaf farther from the reference than the mean distance.

        ``strategy`` must be ``'mean'`` (anything else raises ``ValueError``).
        ``default_seq_name`` is the reference sequence; its default keeps the
        name that used to be hard-coded.  The pruned tree is written to
        ``tree_file``.
        """
        if strategy != 'mean':
            raise ValueError("Unknown strategy: %r" % (strategy,))
        nodes = self._t.get_leaves()
        mean = 0.0
        for node in nodes:
            mean += self.get_matrix_item(default_seq_name, node.name)
        mean /= len(nodes)
        print(mean)
        distances = {}
        for item in nodes:
            distances[item.name] = self.get_matrix_item(default_seq_name,
                                                        item.name)
        farthest_first = sorted(distances.items(), key=lambda x: x[1],
                                reverse=True)
        for leaf_name, distance in farthest_first:
            if distance > mean:
                self._t.search_nodes(name=leaf_name)[0].delete()
        self._t.write(format=1, outfile=tree_file)
target_taxa.append(line.rstrip()) tt.close() #now read in a collection of trees, calc branch lengths over sample, summarise and print out branch_lengths = defaultdict(list) #key = taxa, value = list of brlens treefile = open(sys.argv[3]) for line in treefile: curr_tree = Tree(line.rstrip()) root_node = curr_tree.get_common_ancestor(outgroups) if curr_tree != root_node: curr_tree.set_outgroup(root_node) print curr_tree #bundle = curr_tree.check_monophyly(values=outgroups,target_attr='name') #print bundle #if bundle[0] == False: # continue #find common ancestor of the target taxa, and use this as the reference node for calculating branch lengths. This might not always be the measure you want! reference_node = curr_tree.get_common_ancestor(target_taxa) #if reference_node != curr_tree: # curr_tree.set_outgroup(reference_node) #calc distance from root to each branch of interest for taxon in target_taxa: dist = curr_tree.get_distance(taxon, reference_node) branch_lengths[taxon].append(dist) #now compute the credible intervals of the branch length for each of the target taxa for taxon in branch_lengths: mean, var, std = stats.bayes_mvs(branch_lengths[taxon], alpha=0.95) print taxon + "\t" + str(mean[0]) + "\t" + str(mean[1][0]) + "\t" + str( mean[1][1])
import scipy.spatial.distance
from itertools import combinations
import xml.etree.ElementTree as ET

# Read the XML and list its top-level elements.
tree = ET.parse('total_fs/stage4/total_fs_linked.tree.xml')
root = tree.getroot()
for child in root:
    print(child.tag, child.attrib)

# The Newick string is the text of the (last) <Tree> element.
poptree = None
for i in root.iter('Tree'):
    poptree = i.text
if poptree is None:
    # Previously this surfaced as a NameError on the next statement.
    raise ValueError("No <Tree> element found in the XML file")

dendtree = Tree(poptree)  # from ete3
leaves = dendtree.get_leaf_names()
n = len(leaves)
dmat = np.zeros((n, n))

# Generate the symmetric matrix. Enumerating the pairs carries the indices
# along, so no leaves.index() search is needed for every pair.
for (row, l1), (col, l2) in combinations(enumerate(leaves), 2):
    d = dendtree.get_distance(l1, l2)
    dmat[row, col] = dmat[col, row] = d

# Average-linkage clustering on the condensed form of the matrix.
schlink = sch.linkage(scipy.spatial.distance.squareform(dmat),
                      method='average', metric='euclidean',
                      optimal_ordering=True)
np.savetxt('DistMat_fromFS.txt', schlink, fmt='%f')
# To load
# b = np.loadtxt('DistMat_fromFS.txt', dtype=float)