def get_tree_object_in_newick(tree, id_to_sample_dict=None):
    """Mirror a binary cluster tree into a ``Tree`` and return it as newick.

    The input is walked from its root through each node's ``left`` and
    ``right`` attributes. The mirrored root is named "root" with branch
    length 0; every mirrored child receives half of its *parent's* ``dist``
    as branch length. Leaves are named by their ``id`` (or by
    ``id_to_sample_dict[id]`` when a non-empty mapping is given); internal
    nodes are named ``'Int' + str(id)``.

    Returns whatever ``write(format=1)`` produces for the mirrored tree.
    """
    mirror_root = Tree()
    mirror_root.dist = 0
    mirror_root.name = "root"

    # Depth-first walk over (source node, mirrored node) pairs.
    pending = [(tree, mirror_root)]
    while pending:
        source, mirrored = pending.pop()
        half_dist = source.dist / 2.0

        for source_child in (source.left, source.right):
            if not source_child:
                continue

            mirrored_child = Tree()
            mirrored_child.dist = half_dist

            if not source_child.is_leaf():
                mirrored_child.name = 'Int' + str(source_child.id)
            elif id_to_sample_dict:
                mirrored_child.name = id_to_sample_dict[source_child.id]
            else:
                mirrored_child.name = source_child.id

            mirrored.add_child(mirrored_child)
            pending.append((source_child, mirrored_child))

    for node in mirror_root.traverse("preorder"):
        if node.is_leaf():
            continue

        # Only nodes whose children are all zero-length leaves get reordered.
        if any(not child.is_leaf() or child.dist > 0
               for child in node.get_children()):
            continue

        # order the children by name, descending
        node.children = sorted(node.get_children(), key=lambda x: x.name, reverse=True)

    return mirror_root.write(format=1)
def createPseudonodes(node):
    """Group same-taxon children of multifurcating nodes under new nodes.

    The tree is processed bottom-up. For every node with more than two
    children, each child is classified as 'eukaryota' when its ``taxonomy``
    mapping holds an 'eukaryota' value >= 0.5, and as 'prokaryota'
    otherwise. When both classes occur, every class with more than one
    member is moved under a freshly created ``Tree`` node whose branch
    length is half of the shortest child branch of ``node`` at that moment;
    the moved children's branch lengths are shortened by the same amount.

    Args:
        node: tree node exposing ``is_leaf``, ``get_children``,
            ``remove_child``, ``add_child``; children need ``taxonomy`` and
            ``dist``.

    Returns:
        The same ``node`` object, modified in place.
    """
    if node.is_leaf():
        return node

    # Resolve the subtrees first so grouping proceeds bottom-up.
    for child in node.get_children():
        createPseudonodes(child)

    if len(node.get_children()) > 2:
        dDominantTaxon2Children = {}
        for child in node.get_children():
            sDominantTaxon = 'prokaryota'
            if 'eukaryota' in child.taxonomy and child.taxonomy['eukaryota'] >= 0.5:
                sDominantTaxon = 'eukaryota'
            dDominantTaxon2Children.setdefault(sDominantTaxon, []).append(child)

        if len(dDominantTaxon2Children) > 1:
            for lDominantTaxonChildren in dDominantTaxon2Children.values():
                if len(lDominantTaxonChildren) > 1:
                    newChild = Tree()
                    newChild.dist = min(
                        map(lambda x: x.dist, node.get_children())) / 2.
                    for child in lDominantTaxonChildren:
                        child.dist -= newChild.dist
                        # Detach before re-attaching: remove_child() resets
                        # child.up to None, so calling it after add_child()
                        # would leave the moved child without a parent link.
                        node.remove_child(child)
                        newChild.add_child(child)
                    node.add_child(newChild)
    return node
def cluster(items, cache_clustering_file = None, dist_fn = euc_dist,
            prefix_output = None):
    """Cluster *items* hierarchically and return the result as an ETE tree.

    Without a cache file, the distance matrix is computed with
    ``dist_matrix(items, dist_fn)``, clustered with single linkage, and both
    results are pickled to ``<prefix_output>clustering_dump.pkl``. With a
    cache file, the pickled ``[Y, Z]`` pair is loaded from it instead.

    Args:
        items: objects to cluster (passed to ``dist_matrix``).
        cache_clustering_file: open file object holding a previous dump, or
            a falsy value to compute from scratch.
        dist_fn: distance function handed to ``dist_matrix``.
        prefix_output: string prepended to the dump file name; ``None`` is
            treated as an empty prefix.

    Returns:
        The root ``Tree`` (named "root", branch length 0). Each child's
        branch length is half of its parent cluster's ``dist`` and its name
        is the string form of the cluster node id.
    """
    if not cache_clustering_file:
        print("Generating distance matrix...")
        sys.stdout.flush()
        Y = dist_matrix(items, dist_fn)
        print("Linkage clustering...")
        sys.stdout.flush()
        Z = linkage(Y, "single") # average, complete = max, single = min ?
        print("Dumping clustering information into cache file")
        sys.stdout.flush()
        # Close the dump file deterministically instead of leaking the handle,
        # and write bytes so the pickle is not subject to newline translation.
        dump_path = (prefix_output or "") + "clustering_dump.pkl"
        with open(dump_path, "wb") as dump_file:
            cPickle.dump([Y, Z], dump_file)
    else:
        # NOTE(review): unpickling executes arbitrary code; only load cache
        # files that come from a trusted source.
        print("Loading clustering cache from '%s'" % cache_clustering_file.name)
        Y, Z = cPickle.load(cache_clustering_file)

    print("Converting into ETE tree...")
    sys.stdout.flush()
    T = to_tree(Z)

    root = Tree()
    root.dist = 0
    root.name = "root"

    # Walk (cluster node, ETE node) pairs rather than keying a dict on the
    # cluster nodes: they are not reliably hashable across scipy versions.
    to_visit = [(T, root)]
    while to_visit:
        node, ete_node = to_visit.pop()
        cl_dist = node.dist / 2.0
        for ch_node in [node.left, node.right]:
            if ch_node:
                ch = Tree()
                ch.dist = cl_dist
                ch.name = str(ch_node.id)
                ete_node.add_child(ch)
                to_visit.append((ch_node, ch))
    return root
def createNode():
    """Return a fresh zero-length node named 'placeholder'.

    The node carries the features ``pos`` (0) and ``event`` ('SPECIATION').
    """
    placeholder = Tree()
    placeholder.dist = 0
    placeholder.name = 'placeholder'
    placeholder.add_features(pos=0, event='SPECIATION')
    return placeholder
def ASR_parser(args):
    """Turn an IQ-TREE ancestral reconstruction into a collapsed forest.

    Reads the newick tree at ``args.tree`` and the ``name,count`` lines in
    ``args.counts``, maps ancestral and leaf sequences onto the tree,
    reroots it on the naive sequence and replaces all branch lengths with
    hamming distances. The collapsed tree is rendered to
    ``<outbase>.svg``, a tree is written to ``<outbase>.tree`` and the
    forest is pickled to ``<outbase>.p``.

    Args:
        args: namespace providing ``tree``, ``counts``, ``asr_seq``,
            ``leaf_seq``, ``naive``, ``name``, ``colormap``, ``idmap`` and
            ``outbase``.

    Raises:
        TreeFileParsingError: if the tree file cannot be parsed.
    """
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    from GCutils import CollapsedForest, CollapsedTree, hamming_distance

    try:
        tree = Tree(args.tree, format=1)
    except Exception as e:
        print(e)
        raise TreeFileParsingError('Could not read the input tree. Is this really newick format?')

    # Parse "name,count" lines; the file is closed as soon as it is read.
    counts = {}
    with open(args.counts) as counts_file:
        for line in counts_file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            fields = line.split(',')
            counts[fields[0]] = int(fields[1])

    tree.add_feature('frequency', 0)       # Placeholder will be deleted when rerooting
    tree.add_feature('sequence', 'DUMMY')  # Placeholder will be deleted when rerooting
    tree = map_asr_to_tree(args.asr_seq, args.leaf_seq, tree, args.naive, counts)

    # Reroot to make the naive sequence the real root instead of just an outgroup:
    tree = reroot_tree(tree, pattern=args.naive)

    # Recompute branch lengths as hamming distances:
    tree.dist = 0  # No branch above root
    for node in tree.iter_descendants():
        node.dist = hamming_distance(node.sequence, node.up.sequence)

    iqtree_tree = CollapsedTree(tree=tree, name=args.name)

    # Add colors:
    if args.colormap is not None:
        # NOTE(review): both files are unpickled; only use trusted inputs.
        with open(args.colormap, 'rb') as fh:
            colormap = pickle.load(fh)
        with open(args.idmap, 'rb') as fh:
            id_map = pickle.load(fh)
        # Reverse the id_map:
        id_map = {cs: seq_id for seq_id, cell_ids in id_map.items() for cs in cell_ids}
        # Expand the colormap and map to sequence ids:
        colormap_seqid = dict()
        for key, color in colormap.items():
            # A string key names a single cell; anything else is treated as
            # a collection of cell ids. Deciding on the type first keeps an
            # unmapped string from being iterated character by character.
            cell_ids = [key] if isinstance(key, str) else key
            for cell_id in cell_ids:
                if cell_id in id_map:
                    colormap_seqid[id_map[cell_id]] = color
        colormap = colormap_seqid
    else:
        colormap = None

    iqtree_tree.render(args.outbase + '.svg', colormap=colormap)
    iqtree_forest = CollapsedForest(forest=[iqtree_tree], name=args.name)
    # Dump tree as newick:
    iqtree_forest.write_random_tree(args.outbase + '.tree')
    print('number of trees with integer branch lengths:', iqtree_forest.n_trees)

    with open(args.outbase + '.p', 'wb') as f:
        pickle.dump(iqtree_forest, f)

    print('Done parsing IQ-TREE tree')
def build_conv_topo(annotated_tree, vnodes):
    """Build a tree in which the clade spanning all ``T=True`` nodes is duplicated.

    Works on a deep copy of *annotated_tree*. The common ancestor of every
    node flagged ``T=True`` is detached and deep-copied; both versions are
    then hung under a "duplication point" node:

    * the original subtree is pruned to its leaves with ``C=False``;
    * the copy (tagged ``COPY=1``) is pruned to its leaves with ``C=True``,
      and every non-root node of the copy whose ``ND`` is not in *vnodes*
      gets a near-zero branch length.

    Finally all nodes are renumbered (``ND``) in postorder.

    Args:
        annotated_tree: tree whose nodes carry ``T``, ``C`` and ``ND``.
        vnodes: container of ``ND`` values whose branch lengths are kept in
            the duplicated subtree.

    Returns:
        The root of the tree containing the duplication point.
    """
    tiny_dist = 0.000001

    work_tree = annotated_tree.copy(method="deepcopy")
    for leaf in work_tree.iter_leaves():
        leaf.add_features(L=1)
    for any_node in work_tree.traverse():
        any_node.add_features(COPY=0)

    # most recent common ancestor of all the convergent clades
    convergent_nodes = work_tree.search_nodes(T=True)
    ancestor = work_tree.get_common_ancestor(convergent_nodes)

    # Create the duplication point at the ancestor's location: as a sister
    # when the ancestor has a parent, as a brand-new root otherwise.
    if ancestor.is_root():
        dup_point = Tree()
        dup_point.dist = tiny_dist
    else:
        dup_point = ancestor.add_sister(name="dup_point", dist=tiny_dist)
    dup_point.add_features(ND=0, T=False, C=False, Cz=False)

    ancestor.detach()
    ancestor_copy = ancestor.copy(method="deepcopy")

    # tag duplicated nodes and flatten the branches not listed in vnodes
    for dup_node in ancestor_copy.traverse():
        dup_node.COPY = 1
        if not (dup_node.ND in vnodes or dup_node.is_root()):
            dup_node.dist = tiny_dist

    # subtree A: keep only the leaves flagged C=False
    keep_in_original = ancestor.search_nodes(COPY=0, C=False, L=1)
    ancestor.prune(keep_in_original, preserve_branch_length=True)

    # subtree B: keep only the leaves flagged C=True
    keep_in_copy = ancestor_copy.search_nodes(COPY=1, C=True, L=1)
    ancestor_copy.prune(keep_in_copy, preserve_branch_length=True)

    dup_point.add_child(ancestor_copy)
    dup_point.add_child(ancestor)

    result = dup_point.get_tree_root()
    for new_id, numbered in enumerate(result.traverse("postorder")):
        numbered.ND = new_id
    return result
def birth(tree, node):
    """Give *node* two new children and return *tree* unchanged otherwise.

    Each child is a fresh ``Tree`` with branch length 0 and the feature
    ``extinct=False``.
    """
    for _ in range(2):
        newborn = Tree()
        newborn.dist = 0
        newborn.add_features(extinct=False)
        # attach the newborn lineage to the node that speciates
        node.add_child(newborn)
    return tree
def initialise(rate):
    """Create a starting tree: a root with two extant children.

    A single-node tree is created, ``birth`` is applied to a randomly chosen
    leaf of it, and every non-extinct leaf is then lengthened by one
    waiting time drawn from ``random.expovariate(rate)``.

    Returns:
        The tree returned by ``birth``.
    """
    root = Tree()
    root.dist = 0.0
    root.add_features(extinct=False)

    chosen = random.choice(root.get_leaves())
    grown = birth(root, chosen)

    leaves = grown.get_leaves()
    waiting_time = random.expovariate(rate)
    for leaf in leaves:
        if leaf.extinct:
            continue
        leaf.dist += waiting_time
    return grown
def createTree(n):
    """Return a zero-length root with *n* children built by ``createNode``.

    Child number ``i`` is named ``"g0_<i>"`` and has both its ``position``
    feature and its ``pos`` attribute set to ``i``.
    """
    root = Tree()
    root.dist = 0
    for index in range(n):
        leaf = createNode()
        leaf.add_feature('position', index)
        leaf.pos = index
        leaf.name = "g0_" + str(index)
        # add_child appends to root.children and points leaf.up at root
        root.add_child(leaf)
    return root
def ASR_parser(args):
    """Turn an IgPhyML ancestral reconstruction into a collapsed forest.

    Reads the newick tree at ``args.tree`` and the ``name,count`` lines in
    ``args.counts``, maps the ancestral sequences onto the tree, reroots
    it and replaces all branch lengths with hamming distances. The
    collapsed tree is rendered to ``<outbase>.svg`` and the forest is
    pickled to ``<outbase>.p``. A warning is printed when a tree's root has
    frequency 0 and exactly one child.

    Args:
        args: namespace providing ``tree``, ``counts``, ``asr_seq``,
            ``naive`` and ``outbase``.

    Raises:
        TreeFileParsingError: if the tree file cannot be parsed.
    """
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    from gctree import CollapsedForest, CollapsedTree, hamming_distance

    try:
        tree = Tree(args.tree)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        raise TreeFileParsingError(
            'Could not read the input tree. Is this really newick format?')

    # Parse "name,count" lines; the file is closed as soon as it is read.
    counts = {}
    with open(args.counts) as counts_file:
        for line in counts_file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            fields = line.split(',')
            counts[fields[0]] = int(fields[1])

    tree.add_feature('frequency', 0)  # Placeholder will be deleted when rerooting
    tree.add_feature('sequence', 'DUMMY')  # Placeholder will be deleted when rerooting
    tree = map_asr_to_tree(args.asr_seq, tree, args.naive, counts)

    # Reroot to make the naive sequence the real root instead of just an outgroup:
    tree = reroot_tree(tree)

    # Recompute branch lengths as hamming distances:
    tree.dist = 0  # No branch above root
    for node in tree.iter_descendants():
        node.dist = hamming_distance(node.sequence, node.up.sequence)

    igphyml_tree = CollapsedTree(tree=tree)
    igphyml_tree.render(args.outbase + '.svg')
    igphyml_forest = CollapsedForest(forest=[igphyml_tree])
    print('number of trees with integer branch lengths:', igphyml_forest.n_trees)

    # check for unifurcations at root
    unifurcations = sum(
        collapsed.tree.frequency == 0 and len(collapsed.tree.children) == 1
        for collapsed in igphyml_forest.forest)
    if unifurcations:
        print(
            'WARNING: {} trees exhibit unifurcation from root, which is not possible under current model. Such nodes will be ommitted from likelihood calculation'
            .format(unifurcations))

    with open(args.outbase + '.p', 'wb') as f:
        pickle.dump(igphyml_forest, f)

    print('Done parsing IgPhyML tree')
def build_tree(sequences, parents, counts=None, naive='naive'):
    """Assemble an ete tree from sequences and child -> parent links.

    One node is created per entry of *sequences*, carrying a ``sequence``
    feature and a ``frequency`` feature (taken from *counts* when the name
    is listed there, 0 otherwise). Nodes are linked through *parents*; a
    name absent from *parents* becomes the provisional root (the last such
    name wins when there are several).

    Unless *naive* is None, the first node whose name contains *naive* must
    be a childless child of the provisional root; it is detached and the
    old root is hung beneath it, making it the new root.

    Ambiguous bases are then resolved by ``disambiguate`` and every branch
    length is set to the hamming distance between a node's and its parent's
    sequence (0 for the root).

    Args:
        sequences: mapping of node name -> sequence.
        parents: mapping of node name -> parent node name.
        counts: optional mapping of node name -> abundance.
        naive: substring identifying the naive node, or None to skip
            rerooting.

    Returns:
        The root node of the assembled tree.

    Raises:
        ValueError: if every sequence has a parent, so no root exists.
        IndexError: if no node name contains *naive*.
    """
    # first a dictionary of disconnected nodes
    nodes = {}
    for name in sequences:
        node = Tree()
        node.name = name
        node.add_feature('sequence', sequences[name])
        # Always attach a frequency so later code can rely on the feature
        # being present, whether or not a count was supplied for this node.
        if counts is not None and name in counts:
            node.add_feature('frequency', counts[name])
        else:
            node.add_feature('frequency', 0)
        nodes[name] = node

    tree = None
    for name in sequences:
        if name in parents:
            nodes[parents[name]].add_child(nodes[name])
        else:
            tree = nodes[name]
    if tree is None:
        raise ValueError('no root found: every sequence has a parent entry')

    # reroot on naive
    if naive is not None:
        matches = [name for name in nodes if naive in name]
        if not matches:
            raise IndexError(
                'no node name contains the naive pattern {!r}'.format(naive))
        naive_node = nodes[matches[0]]
        assert len(naive_node.children) == 0, 'naive node must be a leaf'
        assert naive_node in tree.children, 'naive node must be a child of the root'
        tree.remove_child(naive_node)
        naive_node.add_child(tree)
        tree = naive_node

    # make random choices for ambiguous bases
    tree = disambiguate(tree)

    # compute branch lengths
    tree.dist = 0  # no branch above root
    for node in tree.iter_descendants():
        node.dist = gctree.hamming_distance(node.sequence, node.up.sequence)

    return tree
def build_tree(sequences, parents, counts=None, naive='naive'):
    """Assemble an ete tree from sequences and child -> parent links.

    One node is created per entry of *sequences*, carrying ``nuc_seq``,
    ``aa_seq`` (via ``local_translate``) and ``frequency`` (from *counts*
    when the name is listed there, otherwise 0). Nodes are linked through
    *parents*; a name absent from *parents* becomes the provisional root.

    Unless *naive* is None, the first node whose name contains *naive* is
    made the root: it is detached from its parent and that parent is hung
    beneath it. If the former parent is left with a single child, the
    parent is deleted and that child's branch length is recomputed against
    the naive sequence.

    Ambiguous bases are resolved by ``disambiguate`` and every branch
    length is finally set to the hamming distance between a node's and its
    parent's ``nuc_seq`` (0 for the root).
    """
    # one disconnected node per sequence
    nodes = {}
    for seq_name in sequences:
        fresh = Tree()
        fresh.name = seq_name
        fresh.add_feature('nuc_seq', sequences[seq_name])
        fresh.add_feature('aa_seq', local_translate(sequences[seq_name]))
        has_count = counts is not None and seq_name in counts
        fresh.add_feature('frequency', counts[seq_name] if has_count else 0)
        nodes[seq_name] = fresh

    # wire children to parents; parentless names become the root
    for seq_name in sequences:
        if seq_name not in parents:
            tree = nodes[seq_name]
            continue
        nodes[parents[seq_name]].add_child(nodes[seq_name])

    # Reroot on naive:
    if naive is not None:
        naive_id = [n for n in nodes if naive in n][0]
        naive_node = nodes[naive_id]
        assert len(naive_node.children) == 0
        former_parent = naive_node.up
        former_parent.remove_child(naive_node)
        naive_node.add_child(former_parent)
        # drop the unifurcation that rerooting may have produced
        if len(former_parent.children) == 1:
            former_parent.delete(prevent_nondicotomic=False)
            only_child = former_parent.children[0]
            only_child.dist = hamming_distance(only_child.nuc_seq,
                                               naive_node.nuc_seq)
        tree = naive_node

    # make random choices for ambiguous bases
    tree = disambiguate(tree)

    # branch lengths are hamming distances to the parent
    tree.dist = 0  # no branch above root
    for descendant in tree.iter_descendants():
        descendant.dist = hamming_distance(descendant.nuc_seq,
                                           descendant.up.nuc_seq)

    return tree
def partitionTreeSet(N):
    """Return a tuple of trees rooted at a node labelled *N*.

    For ``N == 1`` the tuple holds a single one-node tree. Otherwise, for
    each ``k`` in ``range(lam(N))``, every tree returned for ``N - (k + 1)``
    is paired with every tree returned for ``k + 1``; copies of the pair
    become the two children of a new root. All roots carry the features
    ``value=N`` and ``name=str(N)`` plus a face from ``styleFace`` placed
    at "branch-top"; combined trees and their subtrees have ``dist`` 1.
    """
    label = str(N)

    if N == 1:
        single = Tree(";", format=100)
        single.add_features(value=N, name=label)
        single.add_face(styleFace(single.name), column=0, position="branch-top")
        return (single,)

    template = Tree(";", format=100)
    template.dist = 1

    combined_trees = []
    for k in range(lam(N)):
        left_trees = partitionTreeSet(N - (k + 1))
        right_trees = partitionTreeSet(k + 1)
        for left_tree in left_trees:
            for right_tree in right_trees:
                left_tree.dist = 1
                right_tree.dist = 1
                combined = template.copy()
                combined.dist = 1
                combined.add_features(value=N, name=label)
                combined.add_child(left_tree.copy())
                combined.add_child(right_tree.copy())
                combined.add_face(styleFace(combined.name), column=0,
                                  position="branch-top")
                combined_trees.append(combined)
    return tuple(combined_trees)
def parse_union_tree(history_1, history_2, base_tree_path, debug=False):
    """Merge two mapping histories of one base tree into a single tree.

    The base tree is read from *base_tree_path* and walked in preorder. For
    every branch (parent -> node) of the base tree, the extra nodes that
    each history places along that branch are interleaved by their distance
    from the parent and inserted into the united tree as nodes named
    ``internal_<n>``. Every node of the united tree carries the features
    ``history_1_label`` and ``history_2_label``.

    Args:
        history_1: first history tree; nodes need ``name`` and ``label``.
        history_2: second history tree; nodes need ``name`` and ``label``.
        base_tree_path: newick source passed to ``Tree(..., format=1)``.
        debug: when true, progress is printed.

    Returns:
        The root of the united tree.

    NOTE(review): node names are assumed to be consistent across the base
    tree and both histories; every ``search_nodes(...)[0]`` raises
    IndexError otherwise.
    """
    base_tree = Tree(base_tree_path, format=1)
    # add for debugging
    # NOTE(review): the base root is renamed to a hard-coded debugging name;
    # confirm this is still intended.
    base_tree.get_tree_root().name = "_baseInternal_30"
    united_tree = Tree()
    united_tree.dist = 0  # initialize distance to 0
    united_tree.get_tree_root().name = history_1.get_tree_root(
    ).name  # set the name of the root
    united_tree.add_feature("history_1_label", history_1.get_tree_root().label)
    united_tree.add_feature("history_2_label", history_2.get_tree_root().label)
    union_nodes_number = 0  # counter used to name the inserted internal nodes
    # traverse the tree in pre-order to assure that for any visited node,
    # its parent from the base branch is already in the united tree
    for original_node in base_tree.traverse("preorder"):
        original_parent = original_node.up
        if original_parent != None:  # will be none only in the case the original node is the root
            if debug:
                print("handled branch: (", original_node.name, ",",
                      original_parent.name, ")")
            curr_union_parent = united_tree.search_nodes(
                name=original_parent.name)[0]

            # --- locate the first node of history 1 on this branch ---
            hist_1_done = True
            hist_1_curr_child = None
            hist_1_parent = history_1.search_nodes(name=original_parent.name)[
                0]  # need to check names consistency across the 3 trees
            for child in hist_1_parent.children:
                # if the child is a root in a tree that holds the original
                # child node, then this child must be on the branch of interest
                if len(base_tree.search_nodes(name=child.name)) == 0 and len(
                        child.search_nodes(name=original_node.name)) > 0:
                    hist_1_curr_child = child
                    hist_1_done = False
                    break
            if hist_1_done:
                # no extra node on this branch: go straight to the base node
                hist_1_curr_child = history_1.search_nodes(
                    name=original_node.name)[0]
            hist_1_current_label = hist_1_curr_child.label

            # --- locate the first node of history 2 on this branch ---
            hist_2_done = True
            hist_2_curr_child = None
            hist_2_parent = history_2.search_nodes(name=original_parent.name)[
                0]  # need to check names consistency across the 3 trees
            for child in hist_2_parent.children:
                # if the child is a root in a tree that holds the original
                # child node, then this child must be on the branch of interest
                if len(base_tree.search_nodes(name=child.name)) == 0 and len(
                        child.search_nodes(name=original_node.name)) > 0:
                    hist_2_curr_child = child
                    hist_2_done = False
                    break
            if hist_2_done:
                hist_2_curr_child = history_2.search_nodes(
                    name=original_node.name)[0]
            hist_2_current_label = hist_2_curr_child.label

            # --- interleave the extra nodes of both histories ---
            while not hist_1_done or not hist_2_done:
                # a finished history keeps an infinite distance so the other
                # history is always the one that advances
                hist_1_dist = float("inf")
                hist_2_dist = float("inf")
                if not hist_1_done:  # if there is a node closer to the original node in history 1 -> add it to the united tree first
                    hist_1_dist = hist_1_curr_child.get_distance(
                        original_parent.name) - curr_union_parent.get_distance(
                            original_parent.name)
                if not hist_2_done:
                    hist_2_dist = hist_2_curr_child.get_distance(
                        original_parent.name) - curr_union_parent.get_distance(
                            original_parent.name)
                if debug:
                    if not hist_1_done:
                        print("history 1 has current child of ",
                              original_parent.name, ": ",
                              hist_1_curr_child.name, " with label: ",
                              hist_1_current_label,
                              " and distance from parent is: ", hist_1_dist)
                    if not hist_2_done:
                        print("history 2 has current child of ",
                              original_parent.name, ": ",
                              hist_2_curr_child.name, " with label: ",
                              hist_2_current_label,
                              " and distance from parent is: ", hist_2_dist)
                # first, check if now the two current children have the same name, and if this name is in the base tree - exit
                if hist_1_curr_child.name == hist_2_curr_child.name and len(
                        base_tree.search_nodes(
                            name=hist_1_curr_child.name)) > 0:
                    break
                # else, at least one of the histories has more than one step to go before reaching the bottom of the branch
                if hist_1_dist < hist_2_dist:  # add the node from history 1 and travel down to the next node in history 1
                    if debug:
                        print(
                            "adding child from history 1 which precedes to the one from history 2"
                        )
                        print("the label of the added node in history 1 is: ",
                              hist_1_curr_child.label)
                        print(
                            "the label of the added node in histroy 2 remains like papa: ",
                            hist_2_current_label)
                    curr_union_parent = curr_union_parent.add_child(
                        child=None,
                        name="internal_" + str(union_nodes_number),
                        dist=hist_1_dist,
                        support=None)
                    curr_union_parent.add_feature("history_1_label",
                                                  hist_1_curr_child.label)
                    curr_union_parent.add_feature("history_2_label",
                                                  hist_2_current_label)
                    hist_1_parent = hist_1_curr_child
                    if len(hist_1_parent.children) == 1:
                        hist_1_curr_child = hist_1_parent.children[0]
                    else:
                        hist_1_done = True
                    if debug:
                        print("united tree is now: \n", united_tree)
                        if hist_1_done:
                            print(
                                "history 1 on the handled branch is complete")
                        else:
                            print(
                                "history 1 on the handled branch isn't complete yet"
                            )
                else:  # add the node from history 2 and travel down to the next node in history 2
                    if debug:
                        print(
                            "adding child from history 2 which precedes to the one from history 1"
                        )
                        print("the label of the added node in history 2 is: ",
                              hist_2_curr_child.label)
                        print(
                            "the label of the added node in history 1 remains like papa: ",
                            hist_1_current_label)
                    curr_union_parent = curr_union_parent.add_child(
                        child=None,
                        name="internal_" + str(union_nodes_number),
                        dist=hist_2_dist)  # added as a new branch
                    curr_union_parent.add_feature("history_1_label",
                                                  hist_1_current_label)
                    curr_union_parent.add_feature("history_2_label",
                                                  hist_2_curr_child.label)
                    hist_2_parent = hist_2_curr_child
                    if len(hist_2_parent.children) == 1:
                        hist_2_curr_child = hist_2_parent.children[0]
                    else:
                        hist_2_done = True
                    if debug:
                        print("united tree is now: \n", united_tree)
                        if hist_2_done:
                            print(
                                "history 2 on the handled branch is complete")
                        else:
                            print(
                                "history 2 on the handled branch isn't complete yet"
                            )
                # NOTE(review): assumed to run once per loop iteration so that
                # every inserted node gets a distinct name; confirm nesting.
                union_nodes_number += 1
            # now add the original node as the child of the current parent
            original_dist = original_node.dist
            # whatever part of the base branch is not covered by inserted nodes
            residual = original_dist - curr_union_parent.get_distance(
                united_tree.search_nodes(name=original_parent.name)[0])
            curr_union_parent = curr_union_parent.add_child(
                child=None, name=original_node.name, dist=residual)
            curr_union_parent.add_feature(
                "history_1_label",
                history_1.search_nodes(name=original_node.name)[0].label)
            curr_union_parent.add_feature(
                "history_2_label",
                history_2.search_nodes(name=original_node.name)[0].label)
    return united_tree
# Build a random tree with N_LEAVES leaves and branch lengths drawn from
# the range (0.5, 1.5), rescale it, and write it to disk.
# NOTE(review): `i` is not defined in this fragment; presumably these
# statements run inside an enclosing loop over replicates - confirm.
t = Tree()
t.populate(N_LEAVES, random_branches=True, branch_range=(0.5, 1.5))
print(get_tree_length(t))
# The factor passed to rescale_tree is
# EXPECTED_N_MUTATIONS_PER_BRANCH * N_BRANCHES / GENOME_LENGTH.
rescale_tree(
    t,
    float(EXPECTED_N_MUTATIONS_PER_BRANCH) * float(N_BRANCHES) /
    float(GENOME_LENGTH))
print(get_tree_length(t))
with open(f"{OUTPUT_FOLDER}/tree_{i}.tree", "w") as tree_out:
    tree_out.write(t.write())
# create a root genome with given root frequencies, using an empty tree
# this also does a phastSim simulation, but ignore it
null_tree = Tree()
null_tree.populate(0, names_library=["ref"])
null_tree.dist = 0.0
with open(f"{OUTPUT_FOLDER}/null_tree.tree", "w") as null_tree_file:
    null_tree_file.write(null_tree.write())
# Run phastSim on the empty tree; "+" characters in the configured option
# strings stand for spaces and are expanded before the command is run.
# NOTE(review): the command is built by string interpolation and executed
# through a shell; the interpolated values must come from trusted config.
os.system(f"""phastSim \
    --rootGenomeLength {GENOME_LENGTH} \
    --rootGenomeFrequencies {ROOT_GENOME_FREQUENCIES_STRING.replace("+", " ")} \
    --treeFile {OUTPUT_FOLDER}/null_tree.tree \
    --outpath {OUTPUT_FOLDER}/ \
    --outputFile my_ref \
    --createFasta {PHASTSIM_OPTIONS.replace("+", " ")} \
    --seed {np.random.randint(1000000000)}
    """)
# Read the reference genome that phastSim wrote as my_ref.fasta.
reference = SeqIO.read(f"{OUTPUT_FOLDER}/my_ref.fasta", format="fasta")
def evolveAlongTree(host, guest, reverseMap, rootSequence, hmmfile, emissionProbs, transmat):
    """
    Evolves a root sequence along an entire host tree, taking into account
    the domain level events present in the guest tree (duplication, loss,
    speciation).

    Every host node receives a ``sequence`` feature. Host nodes that do not
    appear in reverseMap are evolved with ``evolveSequence``; for the
    others, the guest nodes mapped to that host node are temporarily cut
    out of the guest tree, handed to ``domainOrder`` as one small tree, and
    reconnected afterwards.

    Args:
        host (Tree)       : The host tree (ete3 format) inside which the guest tree evolved
        guest (Tree)      : The guest tree over which to evolve a sequence
        reverseMap (dict) : mapping from nodes in the host node -> guest nodes
        rootSequence (str): Initial sequence to evolve. Should contain sequence with ONE domain
        hmmfile (str)     : path to hmmfile used to identify domains
        emissionProbs     : matrix with dimensions (n x 20) where n is the length of the domain.
                            Each row contains the probability of each aa appearing at that
                            position (in pfam hmm order)
        transmat          : passed through unchanged to ``evolveSequence`` and ``domainOrder``
    """
    for node in host.traverse():
        node.add_feature('sequence', "")

    for hostNode in host.traverse():
        tempSequence = rootSequence if hostNode == host else hostNode.up.sequence

        #No events occured at this node
        if hostNode not in reverseMap:
            hostNode.sequence = evolveSequence(tempSequence, 0.05, hostNode.dist,
                                               emissionProbs, hmmfile, transmat)
            continue

        allGuestNodes = reverseMap[hostNode]
        allGuestNodesSet = set(allGuestNodes)
        upAncestors, leafChildren = {}, {}

        # Cut the mapped guest nodes out of the guest tree, remembering the
        # severed links so they can be restored below.
        for guestNode in allGuestNodes:
            if guestNode.up not in allGuestNodesSet:
                upAncestors[guestNode] = guestNode.up
                #pass positional information on from the previous species
                if guestNode.up is not None:
                    guestNode.add_feature('position', guestNode.up.position)
                guestNode.up = None
            if guestNode.children != [] and guestNode.children[0] not in allGuestNodesSet:
                leafChildren[guestNode] = guestNode.children
                guestNode.children = []

        if hostNode != host:
            # Gather the severed subtrees under a temporary zero-length root.
            t = Tree()
            t.dist = 0
            # The children attribute must be a real list: on Python 3
            # dict.keys() is a view, which the ete3 setter rejects.
            t.children = list(upAncestors)
            for guestNode in upAncestors:
                guestNode.up = t
        else:
            t = guest

        #Actually do the work
        tempSequence = domainOrder(tempSequence, .75, hmmfile, emissionProbs, t,
                                   hostNode.name, transmat)
        hostNode.sequence = tempSequence

        #Reconnect all root and leaf nodes to the rest of the guest tree
        for node in upAncestors:
            node.up = upAncestors[node]
        for node in leafChildren:
            node.children = leafChildren[node]
def birthDeathTree(birthRate, deathRate, treeHeight):
    """
    Generates a tree topology by simulating births and deaths of lineages.

    Each lineage starts with a remaining height. An event time is drawn
    with ``stats.exp``; if it fits within the remaining height the lineage
    either splits into two children (probability b / (b + d)) or is marked
    as lost, otherwise the remaining height is added to its branch. Lost
    lineages are removed afterwards, nodes left with a single child are
    collapsed into that child, and the surviving nodes are named
    "h0", "h1", ... in traversal order.

    Args:
        birthRate (float): birth rate
        deathRate (float): death rate
        treeHeight (float): height budget given to the root lineage

    Returns:
        Tree: root of the generated tree (a single node named "h0" when the
        root never split)
    """
    birthRate = float(birthRate)
    deathRate = float(deathRate)

    host = Tree()
    host.dist = 0

    # first-in, first-out queue of (lineage, height still available)
    pending = [(host, treeHeight)]
    while pending:
        lineage, remaining = pending.pop(0)
        eventTime = stats.exp(1. / (1. / birthRate + 1. / deathRate))

        if not (eventTime <= remaining):
            # no event: credit the remaining branch length to this lineage
            lineage.dist += remaining
            continue

        if np.random.random() < birthRate / (birthRate + deathRate):
            # duplication: two children, each continuing with less height
            for _ in range(2):
                offspring = lineage.add_child(dist=eventTime)
                pending.append((offspring, remaining - eventTime))
        else:
            # loss: flag with a negative branch length, removed further down
            lineage.dist *= -1

    if not host.children:
        host.name = "h0"
        return host

    # drop the lineages flagged as lost
    for lost_candidate in host.traverse():
        if lost_candidate.dist < 0:
            lost_candidate.up.remove_child(lost_candidate)

    # collapse nodes with exactly one child; iterate over a snapshot because
    # the tree is modified along the way
    for unary_candidate in list(host.traverse()):
        if len(unary_candidate.children) != 1:
            continue
        if unary_candidate.up is None:
            # the root itself: promote its only child to be the new root
            host.children[0].dist += host.dist
            host = host.children[0]
            host.up = None
        else:
            above = unary_candidate.up
            below = unary_candidate.children[0]
            below.dist += unary_candidate.dist
            below.up = above
            above.remove_child(unary_candidate)
            above.children.append(below)

    for counter, survivor in enumerate(host.traverse()):
        survivor.name = "h" + str(counter)

    return host
def parse_union_tree(history_1, history_2, base_tree_path, debug=False):
    """Merge two mapping histories of one base tree into a single tree.

    The base tree is read from *base_tree_path* (its root is renamed
    "root") and walked in preorder. For every branch (parent -> node) of
    the base tree, the extra single-child nodes that each history places
    along that branch are interleaved by their distance from the parent
    and inserted into the united tree as nodes named ``internal_<n>``.
    Every node of the united tree carries the features ``history_1_label``
    and ``history_2_label``.

    Args:
        history_1: first history tree; nodes need ``name`` and ``label``.
        history_2: second history tree; nodes need ``name`` and ``label``.
        base_tree_path: newick source passed to ``Tree(..., format=1)``.
        debug: when true, progress and the final node labels are printed.

    Returns:
        The root of the united tree.

    NOTE(review): on inconsistent input the function terminates the process
    with ``exit(1)`` instead of raising; callers cannot recover from that.
    """
    base_tree = Tree(base_tree_path, format=1)
    base_tree.get_tree_root().name = "root"
    united_tree = Tree()
    united_tree.dist = 0  # initialize distance to 0
    united_tree.get_tree_root().name = history_1.get_tree_root(
    ).name  # set the name of the root
    united_tree.add_feature("history_1_label", history_1.get_tree_root().label)
    united_tree.add_feature("history_2_label", history_2.get_tree_root().label)
    union_nodes_number = 0  # counter used to name the inserted internal nodes
    # traverse the tree in pre-order to assure that for any visited node,
    # its parent from the base branch is already in the united tree
    for original_node in base_tree.traverse("preorder"):
        original_parent = original_node.up
        if original_parent != None:  # will be none only in the case the original node is the root
            if debug:
                print("handled branch: (", original_node.name, ",",
                      original_parent.name, ")")
            # names are stripped of trailing whitespace before every lookup
            curr_union_parent = united_tree.search_nodes(
                name=original_parent.name.rstrip())[0]

            # --- locate the first node of history 1 on this branch ---
            hist_1_done = True
            hist_1_curr_child = None
            hist_1_parent = history_1.search_nodes(
                name=original_parent.name.rstrip())[
                    0]  # need to check names consistency across the 3 trees
            for child in hist_1_parent.children:
                # if the child does not exist in the base tree, it represents
                # a mapping node that was created out of breaking a branch in
                # the original tree
                if len(
                        base_tree.search_nodes(name=child.name)
                ) == 0 and len(child.get_children()) == 1 and len(
                        child.search_nodes(name=original_node.name)) > 0:
                    hist_1_curr_child = child
                    hist_1_done = False
                    break
            if hist_1_done:
                # no extra node on this branch: go straight to the base node
                hist_1_curr_child = history_1.search_nodes(
                    name=original_node.name.rstrip())[0]
            hist_1_current_label = hist_1_curr_child.label

            # --- locate the first node of history 2 on this branch ---
            hist_2_done = True
            hist_2_curr_child = None
            hist_2_parent = history_2.search_nodes(
                name=original_parent.name.rstrip())[
                    0]  # need to check names consistency across the 3 trees
            for child in hist_2_parent.children:
                # if the child is a root in a tree that holds the original
                # child node, then this child must be on the branch of interest
                if len(
                        base_tree.search_nodes(name=child.name)
                ) == 0 and len(child.get_children()) == 1 and len(
                        child.search_nodes(name=original_node.name)) > 0:
                    hist_2_curr_child = child
                    hist_2_done = False  # should be false for _baseInternal_52
                    break
            if hist_2_done:
                # NOTE(review): the bare except hides the actual error and the
                # two locals assigned in it are never used (debugger aids?).
                try:
                    hist_2_curr_child = history_2.search_nodes(
                        name=original_node.name.rstrip())[0]
                except:
                    name = original_node.name.rstrip()
                    original_children = original_node.get_children()
                    exit(1)
            hist_2_current_label = hist_2_curr_child.label
            original_dist = original_node.dist

            # --- interleave the extra nodes of both histories ---
            while not hist_1_done or not hist_2_done:
                if hist_1_curr_child.name == hist_2_curr_child.name and hist_1_curr_child.name == original_node.name:  # both have reached the original child
                    print(
                        "error! original child wasn't recognized in the end of the loop"
                    )
                    exit(1)
                # a finished history falls back to the full branch length of
                # the base node in that history
                hist_1_dist = history_1.search_nodes(
                    name=original_node.name.rstrip())[0].dist
                hist_2_dist = history_2.search_nodes(
                    name=original_node.name.rstrip())[0].dist
                if not hist_1_done:  # if there is a node closer to the original node in history 1 -> add it to the united tree first
                    hist_1_dist = hist_1_curr_child.get_distance(
                        original_parent.name) - curr_union_parent.get_distance(
                            original_parent.name)
                if not hist_2_done:
                    hist_2_dist = hist_2_curr_child.get_distance(
                        original_parent.name) - curr_union_parent.get_distance(
                            original_parent.name)
                if debug:
                    if not hist_1_done:
                        print("history 1 has current child of ",
                              original_parent.name, ": ",
                              hist_1_curr_child.name, " with label: ",
                              hist_1_current_label,
                              " and distance from parent is: ", hist_1_dist)
                    if not hist_2_done:
                        print("history 2 has current child of ",
                              original_parent.name, ": ",
                              hist_2_curr_child.name, " with label: ",
                              hist_2_current_label,
                              " and distance from parent is: ", hist_2_dist)
                # first, check if now the two current children have the same name, and if this name is in the base tree - exit
                if hist_1_curr_child.name == hist_2_curr_child.name and len(
                        base_tree.search_nodes(
                            name=hist_1_curr_child.name)) > 0:
                    break
                # else, at least one of the histories has more than one step to go before reaching the bottom of the branch
                if hist_1_dist < hist_2_dist:  # add the node from history 1 and travel down to the next node in history 1
                    if debug:
                        print(
                            "adding child from history 1 which precedes to the one from history 2"
                        )
                        print("the label of the added node in history 1 is: ",
                              hist_1_curr_child.label)
                        print(
                            "the label of the added node in history 2 remains like papa: ",
                            hist_2_current_label)
                    curr_union_parent = curr_union_parent.add_child(
                        child=None,
                        name="internal_" + str(union_nodes_number),
                        dist=hist_1_dist,
                        support=None)
                    curr_union_parent.add_feature("history_1_label",
                                                  hist_1_curr_child.label)
                    curr_union_parent.add_feature("history_2_label",
                                                  hist_2_current_label)
                    hist_1_parent = hist_1_curr_child
                    if len(hist_1_parent.children) == 1:
                        hist_1_curr_child = hist_1_parent.children[0]
                        if hist_1_curr_child.name == original_node.name:
                            hist_1_done = True
                    else:  # two children only occur when reaching a junction from the base tree
                        hist_1_done = True
                    # NOTE(review): the `continue` below only runs in debug
                    # mode and skips the counter increment at the end of the
                    # loop body, so debug runs can reuse an "internal_<n>"
                    # name - confirm whether that is intended.
                    if debug:
                        if hist_1_done:
                            print(
                                "history 1 on the handled branch is complete")
                            continue
                        else:
                            print(
                                "history 1 on the handled branch isn't complete yet"
                            )
                else:  # add the node from history 2 and travel down to the next node in history 2
                    if debug:
                        print(
                            "adding child from history 2 which precedes to the one from history 1"
                        )
                        print("the label of the added node in history 2 is: ",
                              hist_2_curr_child.label)
                        print(
                            "the label of the added node in history 1 remains like papa: ",
                            hist_1_current_label)
                    curr_union_parent = curr_union_parent.add_child(
                        child=None,
                        name="internal_" + str(union_nodes_number),
                        dist=hist_2_dist)  # added as a new branch
                    curr_union_parent.add_feature("history_1_label",
                                                  hist_1_current_label)
                    curr_union_parent.add_feature("history_2_label",
                                                  hist_2_curr_child.label)
                    hist_2_parent = hist_2_curr_child
                    if len(hist_2_parent.children) == 1:
                        hist_2_curr_child = hist_2_parent.children[0]
                        if hist_2_curr_child.name == original_node.name:
                            hist_2_done = True
                    else:
                        hist_2_done = True
                    # NOTE(review): same debug-only `continue` as for history 1.
                    if debug:
                        if hist_2_done:
                            print(
                                "history 2 on the handled branch is complete")
                            continue
                        else:
                            print(
                                "history 2 on the handled branch isn't complete yet"
                            )
                # NOTE(review): assumed to run once per loop iteration so that
                # every inserted node gets a distinct name; confirm nesting.
                union_nodes_number += 1
            # now add the original node as the child of the current parent
            # whatever part of the base branch is not covered by inserted nodes
            residual = original_dist - curr_union_parent.get_distance(
                united_tree.search_nodes(
                    name=original_parent.name.rstrip())[0])
            if residual < 0:
                # inserted nodes overshoot the base branch: report and abort
                print("error on residual computation for branch leading to ",
                      original_node.name)
                print("residual: ", residual)
                print("original_dist: ", original_dist)
                print(
                    "curr_union_parent.get_distance(united_tree.search_nodes(name=original_parent.name.rstrip())[0]): ",
                    curr_union_parent.get_distance(
                        united_tree.search_nodes(
                            name=original_parent.name.rstrip())[0]))
                exit(1)
            curr_union_parent = curr_union_parent.add_child(
                child=None, name=original_node.name, dist=residual)
            # NOTE(review): the labels stored on the new node are read from the
            # *parent's* entry in each history, not the node's own - confirm.
            curr_union_parent.add_feature(
                "history_1_label",
                history_1.search_nodes(
                    name=original_parent.name.rstrip())[0].label)
            curr_union_parent.add_feature(
                "history_2_label",
                history_2.search_nodes(
                    name=original_parent.name.rstrip())[0].label)
    if debug:
        # dump every node of the finished united tree
        for node in united_tree.traverse("postorder"):
            print("node=", node.name)
            print("label in hist1=", node.history_1_label)
            print("label in hist2=", node.history_2_label)
            print("branch length=", node.dist)
    return united_tree
def __gen_tree(**kwargs):
    """
    Run one birth-death simulation attempt, returning a tree or `None`.

    This is the backend of the user-facing `gen_tree()`. Unlike that
    function, a single call here is not guaranteed to succeed: depending on
    the parameters and on the random draws, every lineage may go extinct
    before any stopping criterion is met, in which case `None` is returned
    and the caller is expected to retry.

    No defaults are filled in and no values are validated. The keys read
    from `kwargs` are `seed`, `birth`, `death`, `max_time`, `min_leaves`,
    `lam`, `prune` and `labels`; their meaning is documented in
    `gen_tree()`.

    Returns the generated tree (pruned and/or labelled if requested), or
    `None` when the attempt failed.
    """

    # Seed the random number generators before anything is drawn.
    utils.set_seeds(kwargs["seed"])

    # `total_rate` is the combined birth + death rate used for the waiting
    # times. `birth_share` is the birth rate rescaled to [0..1], so that a
    # uniform draw can decide directly between birth and death; the death
    # rate itself is not needed again.
    birth_rate = kwargs["birth"]
    total_rate = birth_rate + kwargs["death"]
    birth_share = birth_rate / total_rate
    max_time = kwargs["max_time"]

    # The root starts extant with a zero-length branch, so it is the first
    # lineage to undergo an event.
    tree = Tree()
    tree.extinct = False
    tree.dist = 0.0

    # `elapsed` accumulates the waiting times between events; it is only
    # compared against `max_time` when one was given.
    elapsed = 0.0
    while True:
        extant_now = __extant(tree)

        # Waiting time until the next event, drawn from an exponential
        # distribution whose rate scales with the number of extant leaves.
        wait = random.expovariate(len(extant_now) * total_rate)
        elapsed += wait

        # An event falling beyond the allotted time never happens.
        if max_time and elapsed > max_time:
            break

        # The lineage hit by the event stops being extant in either case;
        # on a birth it is replaced by its children.
        parent = np.random.choice(extant_now)
        parent.extinct = True
        if np.random.random() <= birth_share:
            # At least two children; the Poisson draw on `lam` adds more,
            # which allows hard polytomies. Children start at distance zero
            # and are extended together with all other extant leaves below.
            for _ in range(2 + np.random.poisson(kwargs["lam"])):
                offspring = Tree()
                offspring.extinct = False
                offspring.dist = 0
                parent.add_child(offspring)

        # Recompute the extant leaves (the chosen node is gone, children
        # may have appeared) and extend each branch by the waiting time,
        # capping at `max_time` when one was given.
        survivors = __extant(tree)
        for leaf in survivors:
            grown = leaf.dist + wait
            leaf.dist = min(grown, max_time) if max_time else grown

        # Everything went extinct: report the failure explicitly with
        # `None` instead of retrying here, leaving it to the caller to
        # collect the desired number of valid trees.
        if not survivors:
            tree = None
            break

        # Stop as soon as either requested criterion is satisfied.
        if (kwargs["min_leaves"] and len(survivors) >= kwargs["min_leaves"]) or (
            max_time and elapsed >= max_time
        ):
            break

    # Reject trees with two or fewer extant leaves (this covers the
    # root-only case), signalling failure with `None`.
    if tree and len(__extant(tree)) <= 2:
        tree = None

    # ete3's `prune()` takes the nodes to KEEP, so passing the extant leaves
    # drops the extinct ones.
    if kwargs["prune"] and tree:
        tree.prune(__extant(tree))

    # Apply the provided labels, if any, to a successfully generated tree.
    if kwargs["labels"] and tree:
        label_tree(tree, kwargs["labels"], seed=kwargs["seed"])

    return tree
def disambiguate(tree: Tree, random_state=None) -> Tree:
    """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
    subtrees of consecutive ambiguity codes.

    The tree is modified in place: node sequences are rewritten so that every
    site holds a symbol from ``gctree.utils.bases``, the root branch length is
    set to 0 and every other branch length is recomputed as the Hamming
    distance between a node's sequence and its parent's.

    Note that this function reseeds (or restores the state of) the global
    ``random`` module.

    Args:
        tree: tree whose nodes carry a ``sequence`` string attribute.
        random_state: a state accepted by ``random.setstate``. If ``None``,
            the RNG is seeded with the tree's newick string, so the result is
            reproducible for a given tree.

    Returns:
        The same ``tree`` object, after modification.
    """
    if random_state is None:
        # Seed from the tree itself so repeated calls on the same tree agree.
        random.seed(tree.write(format=1))
    else:
        random.setstate(random_state)
    for node in tree.traverse():
        for site, base in enumerate(node.sequence):
            if base not in gctree.utils.bases:

                # Boundary of the subtree handled for this site: real leaves,
                # plus any node whose base at `site` is already unambiguous.
                # The closure reads `site` from the enclosing loop.
                def is_leaf(node):
                    return (node.is_leaf()) or (node.sequence[site]
                                                in gctree.utils.bases)

                # First pass of Sankoff: compute cost vectors
                for node2 in node.traverse(strategy="postorder",
                                           is_leaf_fn=is_leaf):
                    base2 = node2.sequence[site]
                    # Copy so the shared `code_vectors` entry is not mutated.
                    node2.add_feature("cv", code_vectors[base2].copy())
                    if not is_leaf(node2):
                        # NOTE(review): assumes cost vectors have 5 entries
                        # aligned with gctree.utils.bases - confirm.
                        for i in range(5):
                            for child in node2.children:
                                node2.cv[i] += min([
                                    sum(v) for v in zip(
                                        child.cv, cost_adjust[
                                            gctree.utils.bases[i]])
                                ])
                # Second pass: Choose base and adjust children's cost vectors
                if not node.is_root():
                    # Fold in the adjustment for the parent's base at this
                    # site before choosing a base for `node`.
                    node.cv = [
                        sum(v) for v in zip(
                            node.cv, cost_adjust[node.up.sequence[site]])
                    ]
                # traverse evaluates is_leaf(node) after yielding node.
                # Resolving base makes is_leaf true; must get order before
                # making changes.
                preorder = list(
                    node.traverse(strategy="preorder", is_leaf_fn=is_leaf))
                for node2 in preorder:
                    # Boundary nodes that were already unambiguous stay as is.
                    if node2.sequence[site] in gctree.utils.bases:
                        continue
                    min_cost = min(node2.cv)
                    # Ties between minimum-cost bases are broken at random.
                    base_index = random.choice([
                        i for i, val in enumerate(node2.cv) if val == min_cost
                    ])
                    new_base = gctree.utils.bases[base_index]
                    # Adjust child cost vectors
                    if not is_leaf(node2):
                        for child in node2.children:
                            child.cv = [
                                sum(v)
                                for v in zip(child.cv, cost_adjust[new_base])
                            ]
                    # Strings are immutable: rebuild the sequence with the
                    # chosen base substituted at `site`.
                    node2.sequence = (node2.sequence[:site] + new_base +
                                      node2.sequence[(site + 1):])
    # Remove the temporary cost-vector feature; removal errors (presumably
    # from nodes that never received a `cv`) are deliberately ignored.
    for node in tree.traverse():
        try:
            node.del_feature("cv")
        except (AttributeError, KeyError):
            pass
    # Sequences may have changed, so recompute all branch lengths.
    tree.dist = 0
    for node in tree.iter_descendants():
        node.dist = gctree.utils.hamming_distance(node.up.sequence,
                                                  node.sequence)
    return tree
def get_tree_object_in_newick(tree, id_to_sample_dict=None):
    """Return a newick string for a binary clustering tree.

    `tree` is the root of the source tree; its nodes are expected to expose
    `left`, `right`, `dist`, `id` and `is_leaf()` (presumably scipy
    `ClusterNode` objects -- confirm against the caller). The source tree is
    copied node by node into a new `Tree`, where each child's branch length
    is half of its parent's `dist` in the source tree.

    Leaves are named `id_to_sample_dict[node.id]` when a non-empty mapping is
    given, otherwise `node.id`. Internal nodes are not named; they get a
    branch support of 0.0 instead.
    """

    root = Tree()
    root.name = "root"
    root.dist = 0

    # Depth-first walk over the source tree. Each stack entry pairs a source
    # node with the copy already created for it in the new tree.
    pending = [(tree, root)]
    while pending:
        src_parent, dst_parent = pending.pop()
        half_dist = src_parent.dist / 2.0

        for src_child in (src_parent.left, src_parent.right):
            if not src_child:
                continue

            dst_child = Tree()
            dst_child.dist = half_dist

            if not src_child.is_leaf():
                # We used to export our trees with internal node labels
                # ('Int' + node id) so we could do various interface
                # operations more easily, but our new interface design does
                # not require such additions to dendrograms. We add 0 branch
                # support instead, since we wish to use a standard format to
                # export these data as a tree.
                dst_child.support = 0.0
            elif id_to_sample_dict:
                dst_child.name = id_to_sample_dict[src_child.id]
            else:
                dst_child.name = src_child.id

            dst_parent.add_child(dst_child)
            pending.append((src_child, dst_child))

    # Where an internal node has only zero-distance leaf children, put them
    # in reverse alphabetical order.
    for node in root.traverse("preorder"):
        if node.is_leaf():
            continue

        if any(not kid.is_leaf() or kid.dist > 0 for kid in node.get_children()):
            continue

        # swap children alphabetically
        reordered = list(node.get_children())
        reordered.sort(key=lambda kid: kid.name, reverse=True)
        node.children = reordered

    # NOTE(review): format=2 is presumably ete3's layout with branch lengths,
    # leaf names and internal supports -- confirm against the ete3 docs.
    return root.write(format=2)
#!python3
import os
import argparse
import pandas as pd
from ete3 import Tree

# Rescale a newick tree: every branch length (root included) is divided by
# the distance from the root to its closest leaf, and the result is written
# next to the input file with a ".scaled.nwk" suffix.
if __name__ == '__main__':
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument(
        '-n', '--nwk',
        dest="nwk",
        type=str,
        required=False,
        default="../DataEmpirical/PrimatesBinaryLHTShort/rootedtree.nwk")
    options = cli.parse_args()
    source_path = options.nwk

    tree = Tree(source_path, format=1)

    # The second item returned by get_closest_leaf() is used as the scaling
    # factor (presumably the root-to-closest-leaf distance -- see ete3 docs).
    root_age = tree.get_closest_leaf()[1]

    tree.dist = tree.dist / root_age
    for branch in tree.iter_descendants():
        rescaled = branch.dist / root_age
        # Report each node's scaled branch length before storing it.
        print("{0}: {1}".format(branch.name, rescaled))
        branch.dist = rescaled

    tree.write(format=1, outfile=source_path + ".scaled.nwk")