def test_serialization(self):
    """Check that copying and (de)serializing a tree preserves its nodes.

    For a number of random colored trees, the tree is copied, written to
    a pickle and a JSON file, and loaded again.  The preorder sequence of
    node labels of the copy and of both reloaded trees must be equal to
    that of the original tree.
    """
    N, colors = 30, 5
    repeats = 20
    pickle_file = 'testfile_tree.pickle'
    json_file = 'testfile_tree.json'

    for _ in range(repeats):
        tree = random_colored_tree(N, colors)
        tree1 = tree.copy()

        try:
            tree.serialize(pickle_file)
            tree.serialize(json_file)
            tree2 = Tree.load(pickle_file)
            tree3 = Tree.load(json_file)
        finally:
            # always clean up the scratch files, also if (de)serialization
            # raised, so that a failing run does not leave artifacts behind
            for path in (pickle_file, json_file):
                if os.path.exists(path):
                    os.remove(path)

        tree_nodes = [v.label for v in tree.preorder()]
        for other in (tree1, tree2, tree3):
            other_nodes = [v.label for v in other.preorder()]
            self.assertListEqual(tree_nodes, other_nodes)
def wrong_topology_matrix(OGT):
    """Return a wrong topology matrix by rearranging the edges of a binary tree.

    The edge lengths of `OGT` (all `dist` values in preorder except the
    one of the root) are shuffled and re-used to grow a random binary tree.
    The distance matrix of this random tree, computed for a random order of
    its leaves, is returned.

    Parameters
    ----------
    OGT : Tree
        The observable gene tree; expected to be binary and not planted,
        hence its number of edges should be even.

    Returns
    -------
    The second item returned by `distance_matrix()` for the random tree,
    or None (after printing a message) if the number of edges is odd.
    """
    # do not include the root
    distances = [v.dist for v in OGT.preorder()][1:]

    # the OGT is binary and not planted, hence |E| should be even
    if len(distances) % 2 != 0:
        print("List of distances is not even!")
        return

    random.shuffle(distances)

    random_tree = Tree(TreeNode(label=0, dist=0.0))
    id_counter = 1
    current_leaves = [random_tree.root]

    while distances:
        # turn a random leaf into an inner node with two new children
        v = current_leaves.pop(random.randint(0, len(current_leaves) - 1))
        dist1, dist2 = distances.pop(), distances.pop()
        v.add_child(TreeNode(label=id_counter, dist=dist1))
        v.add_child(TreeNode(label=id_counter + 1, dist=dist2))
        current_leaves.extend(v.children)
        # two labels were consumed; advancing by one only would assign the
        # label 'id_counter + 1' a second time in the next iteration
        id_counter += 2

    # shuffling the leaf order is an implicit random bijection to the
    # leaves of the original tree
    random_leaves = list(random_tree.leaves())
    random.shuffle(random_leaves)

    _, D = distance_matrix(random_tree, leaf_order=random_leaves)
    return D
def _yule(N, birth_rate): if birth_rate is None: birth_rate = 1.0 elif birth_rate <= 0.0: raise ValueError("birth rate must be >0") tree = Tree(TreeNode(label=0, event='S', dist=0.0, tstamp=0.0)) tree.number_of_species = N branches = [(1, tree.root)] forward_time = 0.0 node_counter = 1 while len(branches) < N: rate = len(branches) * birth_rate forward_time += np.random.exponential(1 / rate) i = np.random.randint(len(branches)) branch_id, parent = branches[i] spec_node = TreeNode(label=branch_id, event='S', dist=forward_time - parent.tstamp, tstamp=forward_time) parent.add_child(spec_node) branches[i] = (node_counter, spec_node) branches.append((node_counter + 1, spec_node)) node_counter += 2 # add length for pendant branches (cf. Hartmann et al. 2010) forward_time += np.random.exponential(1 / rate) # finalize the branches for branch_id, parent in branches: parent.add_child( TreeNode(label=branch_id, event='S', dist=forward_time - parent.tstamp, tstamp=forward_time)) _reverse_time_stamps(tree) return tree
def load(directory):
    """Load the pickled scenario trees stored in a directory.

    The number of trees is the number of files that match
    'scenario*.pickle' in `directory`; the files that are actually read are
    'scenario0.pickle', 'scenario1.pickle', ... up to that count.  Every
    file must contain the arguments for `Tree.parse_nx()`.

    NOTE: the files are read with `pickle`, which can execute arbitrary
    code; only use this function on trusted data.

    Parameters
    ----------
    directory : str
        Path of the directory containing the scenario files.

    Returns
    -------
    list
        The parsed trees in the order of their file index.
    """
    number_of_files = len(glob.glob(directory + '/scenario*.pickle'))

    def read_scenario(index):
        path = '{}/scenario{}.pickle'.format(directory, index)
        with open(path, 'rb') as handle:
            nx_data = pickle.load(handle)
            return Tree.parse_nx(*nx_data)

    return [read_scenario(index) for index in range(number_of_files)]
def _finalize(tree, G): if not tree: return None if isinstance(tree, TreeNode): tree = Tree(tree) # assign colors to the leaves reconstruct_color_from_graph(tree, G) return tree
def _yule_age(age, birth_rate): if birth_rate is None: birth_rate = 1.0 elif birth_rate <= 0.0: raise ValueError("birth rate must be >0") tree = Tree(TreeNode(label=0, event='S', dist=0.0, tstamp=0.0)) branches = [(1, tree.root)] forward_time = 0.0 node_counter = 1 while forward_time < age: rate = len(branches) * birth_rate forward_time += np.random.exponential(1 / rate) # do not branch if age is already reached if forward_time >= age: break i = np.random.randint(len(branches)) branch_id, parent = branches[i] spec_node = TreeNode(label=branch_id, event='S', dist=forward_time - parent.tstamp, tstamp=forward_time) parent.add_child(spec_node) branches[i] = (node_counter, spec_node) branches.append((node_counter + 1, spec_node)) node_counter += 2 # finalize the branches for branch_id, parent in branches: parent.add_child( TreeNode(label=branch_id, event='S', dist=age - parent.tstamp, tstamp=age)) # reverse such that t(root) = age and t(leaves) = 0.0 _reverse_time_stamps(tree) return tree
def correct_bmg(bmg_original):
    """Build the LRT (using min cut in BUILD algorithm) and return its BMG.

    A tree is computed separately for the subgraph induced by every weakly
    connected component of `bmg_original`.  If more than one tree is
    obtained, their roots are attached to a new common root.

    Parameters
    ----------
    bmg_original
        The colored directed graph to be corrected.

    Returns
    -------
    The result of `bmg_from_tree()` for the combined tree, or None if no
    tree could be built for any component.
    """
    components = nx.weakly_connected_components(bmg_original)
    candidates = (
        lrt_from_colored_graph(bmg_original.subgraph(component), mincut=True)
        for component in components
    )
    # keep only the components for which a (truthy) tree was obtained
    subtrees = [candidate for candidate in candidates if candidate]

    if not subtrees:
        return None

    if len(subtrees) == 1:
        combined = subtrees[0]
    else:
        # join the trees of all components below a new root
        combined = Tree(TreeNode())
        for subtree in subtrees:
            combined.root.add_child(subtree.root)

    return bmg_from_tree(combined)
def simulate():
    """Simulate sequence pairs for all parameter combinations.

    The function reads the module-level settings `repeats` (arguments for
    `range()`), `distances`, `subst_models`, `lengths` and `directory`.
    For every repeat and distance, a tree consisting of a root and a single
    child at that distance is built.  For every substitution model and
    start length, sequences are evolved along this tree and the alignment
    is written in phylip format to
    '<directory>/<model name>_<distance>_<length>_<repeat>.phylip'.
    """
    padded_models = ('WAG', 'JTT')

    for repeat in range(*repeats):
        for distance in distances:
            root = TreeNode(label=0, dist=0.0)
            root.add_child(TreeNode(label=1, dist=distance))
            T = Tree(root)

            for subst_model in subst_models:
                evolver = Evolver(subst_model, jump_chain=False)
                model_name = subst_model.model_name

                for length in lengths:
                    outfile = "{}/{}_{}_{}_{}.phylip".format(
                        directory, model_name, distance, length, repeat)
                    print("Processing '{}'".format(outfile))

                    evolver.evolve_along_tree(T, start_length=length)

                    if model_name not in padded_models:
                        evolver.true_alignment(include_inner=True,
                                               write_to=outfile)
                        continue

                    # treepuzzle accepts only >= 3 sequences, therefore
                    # the first sequence is added two more times
                    alignment = evolver.true_alignment(include_inner=True)
                    for _, seq in alignment.items():
                        break
                    alignment[TreeNode(label=2)] = seq
                    alignment[TreeNode(label=3)] = seq
                    write_alignment(outfile, alignment,
                                    alignment_format='phylip')
def _initiatialize_tree(self):
    """Create the root of the gene tree and its initial branches.

    The root node gets label 0 and the label of the species tree root as
    its color.  One branch is created for every child of the species tree
    root and registered in `self.ES_to_b` and `self.branches`.

    Returns
    -------
    Tree
        The new tree, which consists of the root node only.
    """
    if len(self.S.root.children) > 1:
        # root is a speciation event
        root = TreeNode(label=0, event='S', color=self.S.root.label,
                        dist=0.0, tstamp=self.S.root.tstamp)
    else:
        # planted species tree
        root = TreeNode(label=0, event=None, color=self.S.root.label,
                        dist=0.0, tstamp=self.S.root.tstamp)
    T = Tree(root)
    self.id_counter += 1

    # discard the first entry of the queue -- presumably the event that
    # belongs to the root of the species tree (TODO confirm)
    self.spec_queue.popleft()

    # event rate of a single new branch
    if self._prohibit_extinction == 'per_species':
        rate = self.d + self.h
    else:
        rate = self.rate_sum

    for S_v in self.S.root.children:
        # the new branch is appended, i.e. its index is the current length
        array_id = len(self.branches)
        new_branch = _Branch(self.id_counter, array_id, rate,
                             T.root, self.S.root, S_v, 0)
        self.ES_to_b[(self.S.root, S_v)].append(new_branch)
        self.branches.append(new_branch)
        self.id_counter += 1

        if self.S_subtree_survivors[S_v]:
            self.surv_non_loss_lineages.add(new_branch)

        # NOTE(review): every new branch is assumed to contribute to the
        # total rate, independent of the survivor check above -- confirm
        self.total_rate += rate

    self.total_surviving = 1

    return T
def parse_newick(newick):
    """Parses trees in Newick format into object of type 'Tree'.

    Parameters
    ----------
    newick : str
        A tree in Newick format.  Everything after the first ';' is
        ignored.

    Returns
    -------
    Tree
        The parsed tree.

    Raises
    ------
    TypeError
        If the input is not a string.
    ValueError
        If the input is not a valid Newick string.

    Notes
    -----
    Do not use this function for serialization and reloading Tree
    objects. Use the `serialize()` function instead.

    Every node may be given as 'label', 'label:distance', 'label<color>'
    or 'label<color>:distance'.  Labels and colors that consist of digits
    only are converted to int, all others remain strings.  A color of the
    form 'a-b' is converted into the tuple (a, b).  Nodes without a
    distance get the distance 1.0, and the distance of the root is always
    set to 0.0.
    """

    # label<color>:distance
    label_col_dist_regex = re.compile(
        r"'?([a-zA-Z0-9_]*)'?<(.*)>:(-?[0-9]*\.?[0-9]*[Ee]?-?[0-9]+)")
    # label<color>
    label_col_regex = re.compile(r"'?([a-zA-Z0-9_]*)'?<(.*)>")
    # label:distance
    label_dist_regex = re.compile(
        r"'?([a-zA-Z0-9_]*)'?:(-?[0-9]*\.?[0-9]*[Ee]?-?[0-9]+)")

    def to_int(item):
        """Tries to convert the string into int."""
        return int(item) if item.isdigit() else item

    def split_children(child_string):
        """Splits a given string by all ',' that are not enclosed by
        parentheses."""
        depth = 0
        children = []
        start = 0
        for pos, c in enumerate(child_string):
            if c == ',' and depth == 0:
                # every child is stripped (not only the last one), since
                # the check for a leading '(' would otherwise fail for
                # subtrees that are preceded by whitespace
                children.append(child_string[start:pos].strip())
                start = pos + 1
            elif c == '(':
                depth += 1
            elif c == ')':
                if depth <= 0:
                    raise ValueError('invalid Newick string')
                depth -= 1
        children.append(child_string[start:].strip())
        return children

    def parse_subtree(subroot, subtree_string):
        """Recursive function to parse the subtrees."""
        for child in split_children(subtree_string):
            node = TreeNode(event='')
            subroot.add_child(node)

            end = -1
            # 'startswith' is also safe for an empty (unlabeled) child
            if child.startswith('('):       # the child has subtrees
                end = child.rfind(')')
                if end == -1:
                    raise ValueError('invalid Newick string')
                parse_subtree(node, child[1:end])
            # what remains is the description of the node itself
            child = child[end + 1:].strip()

            label_col_dist = label_col_dist_regex.match(child)
            if label_col_dist:
                # CASE 1: label<color>:distance
                node.label = to_int(label_col_dist.group(1))
                node.color = to_int(label_col_dist.group(2))
                node.dist = float(label_col_dist.group(3))
            else:
                label_col = label_col_regex.match(child)
                label_dist = label_dist_regex.match(child)
                if label_col:
                    # CASE 2: label<color>
                    node.label = to_int(label_col.group(1))
                    node.color = to_int(label_col.group(2))
                    node.dist = 1.0
                elif label_dist:
                    # CASE 3: label:distance
                    node.label = to_int(label_dist.group(1))
                    node.color = None
                    node.dist = float(label_dist.group(2))
                else:
                    # CASE 4: label
                    node.label = to_int(child)
                    node.color = None
                    node.dist = 1.0

            # color is a tuple
            if (node.color and isinstance(node.color, str)
                    and node.color.find('-') != -1):
                split_color = node.color.split('-')
                node.color = (to_int(split_color[0]),
                              to_int(split_color[1]))

    if not isinstance(newick, str):
        raise TypeError("Newick parser needs a 'str' as input")

    end = newick.find(";")
    if end != -1:
        newick = newick[:end]

    # an empty tree description is reported as invalid input
    if not newick.strip():
        raise ValueError('invalid Newick string')

    temp_root = TreeNode(event='')
    parse_subtree(temp_root, newick)

    if not temp_root.children:
        raise ValueError('invalid Newick string')

    root = temp_root.children[0]
    root.dist = 0.0     # set distance of the root to 0
    root.detach()       # remove the parent temp_root
                        # (important for non-recursive to_newick2)
    return Tree(root)
def random_colored_tree(N, colors, binary=False, force_all_colors=False):
    """Create a random colored tree.

    The number of leaves and the color labels are specified in the
    parameters `N` and `colors`, respectively. Each non-leaf node in the
    resulting tree will have at least two children (property of
    phylogenetic trees).

    Parameters
    ----------
    N : int
        The desired number of leaves.
    colors : int or iterable
        The colors (any iterable, e.g. a list or a set), or the desired
        number of colors in which case the colors {1, ..., colors} are
        used.
    binary : bool, optional
        If True, forces the tree to be binary (the default is False).
    force_all_colors : bool
        If True, the resulting tree is guaranteed to have at least one leaf
        of each color (the default is False).

    Returns
    -------
    Tree
        A random tree with `N` leaves and random leaf coloring.

    Raises
    ------
    TypeError
        If `N` is not an integer > 0, or if `colors` is neither an int nor
        an iterable.
    ValueError
        If the number of colors is greater than `N` and `force_all_colors`
        is true.
    """
    # validate and normalize the colors before any tree is built; the
    # colors are materialized in a list since they are indexed and sampled
    # below, which would fail e.g. for sets and generators
    if isinstance(colors, int):
        colors = list(range(1, colors + 1))
    elif isinstance(colors, collections.abc.Iterable):
        colors = list(colors)
    else:
        raise TypeError("'colors' must be of type 'int' or iterable")

    if force_all_colors and len(colors) > N:
        raise ValueError('cannot force all colors since #colors > N')

    tree = Tree.random_tree(N, binary=binary)
    leaves = list(tree.leaves())

    if force_all_colors:
        # use every color at least once: the first len(colors) leaves of a
        # random permutation get one color each, and the remaining leaves
        # are colored randomly
        permutation = np.random.permutation(len(leaves))
        for i, leaf_index in enumerate(permutation):
            leaf = leaves[leaf_index]
            if i < len(colors):
                leaf.color = colors[i]
            else:
                leaf.color = random.choice(colors)
    else:
        # assign colors completely randomly
        for leaf in leaves:
            leaf.color = random.choice(colors)

    return tree
def _EBDP_age_forward(age, episodes):
    """Episodic birth–death process (EBDP), forward algorithm with max. age.

    Parameters
    ----------
    age : float
        The total duration of the simulated process.
    episodes : list of tuples
        One tuple per episode.  The items at index 0 and 1 are the birth
        and the death rate of the episode.  The items at index 2 and 3 are
        handed to `_EBDP_mass_extinction()`; the item at index 3 is also
        compared with the simulated time, i.e. it is the time at which
        the episode starts (index 2 is presumably the survival probability
        of the mass extinction -- TODO confirm).

    Returns
    -------
    Tree
        The simulated tree with time stamps t(root) = age and
        t(surviving leaves) = 0.0.  Lineages that went extinct end in a
        node with event 'L'.
    """
    tree = Tree(TreeNode(label=0, event='S', dist=0.0, tstamp=0.0))

    # each branch is a pair (label of the node that will close the branch,
    # node at which the branch starts)
    branches = [(1, tree.root)]
    forward_time = 0.0
    # label 1 is already reserved for the initial branch; starting at 1
    # would hand out this label a second time
    node_counter = 2
    i = 0   # current episode

    # may lead to extinction of the single branch at time t=0
    _EBDP_mass_extinction(branches, episodes[i][2], episodes[i][3])

    while forward_time < age:
        birth_rate, death_rate, *_ = episodes[i]

        rate = len(branches) * (birth_rate + death_rate)
        # without any event rate (e.g. no lineage left) nothing happens
        # anymore within the current episode
        waiting_time = (np.random.exponential(1 / rate) if rate > 0.0
                        else float('inf'))

        if (i + 1 < len(episodes)
                and forward_time + waiting_time >= episodes[i + 1][3]):
            # the next episode starts before the next event would occur
            # NOTE(review): this branch is also taken if the next episode
            # starts after 'age' -- confirm that such input cannot occur
            _EBDP_mass_extinction(branches,
                                  episodes[i + 1][2], episodes[i + 1][3])
            forward_time = episodes[i + 1][3]
            i += 1

        elif forward_time + waiting_time >= age:
            break

        else:
            forward_time += waiting_time

            j = np.random.randint(len(branches))
            branch_id, parent = branches[j]

            if birth_rate > np.random.uniform(low=0.0,
                                              high=birth_rate + death_rate):
                # speciation event drawn: the branch is replaced by two
                # new branches
                spec_node = TreeNode(label=branch_id, event='S',
                                     dist=forward_time - parent.tstamp,
                                     tstamp=forward_time)
                parent.add_child(spec_node)
                branches[j] = (node_counter, spec_node)
                branches.append((node_counter + 1, spec_node))
                node_counter += 2
            else:
                # extinction event drawn
                loss_node = TreeNode(label=branch_id, event='L',
                                     dist=forward_time - parent.tstamp,
                                     tstamp=forward_time)
                parent.add_child(loss_node)
                branches.pop(j)

    # finalize the (surviving) branches
    for branch_id, parent in branches:
        parent.add_child(
            TreeNode(label=branch_id, event='S',
                     dist=age - parent.tstamp,
                     tstamp=age))

    # reverse such that t(root) = age and t(surviving leaves) = 0.0
    for v in tree.preorder():
        v.tstamp = age - v.tstamp

    return tree
def _EBDP_backward(N, episodes, max_tries=500): """Episodic birth–death process (EBDP), backward algorithm by Stadler 2001.""" birth_inv_sum = sum([1 / episodes[i][0] for i in range(len(episodes))]) for _ in range(max_tries): tree = None t = 0.0 i = 0 branches = [ TreeNode(label=j, event='S', dist=0.0, tstamp=t) for j in range(N) ] id_counter = N while branches: birth_i, death_i, rho_i, t_i = episodes[i] losses_to_add = round(len(branches) / rho_i) - len(branches) for j in range(losses_to_add): branches.append( TreeNode(label=id_counter, event='L', dist=0.0, tstamp=t)) id_counter += losses_to_add while branches: w = np.random.exponential( 1 / ((birth_i + death_i) * len(branches))) if i + 1 < len(episodes) and t + w > episodes[i + 1][3]: t = episodes[i + 1][3] i += 1 break else: t += w if birth_i > np.random.uniform(low=0.0, high=birth_i + death_i): # speciation event drawn spec_node = TreeNode(label=id_counter, event='S', dist=0.0, tstamp=t) id_counter += 1 if len(branches) > 1: k, l = np.random.choice(len(branches), 2, replace=False) if k > l: k, l = l, k spec_node.add_child(branches[k]) spec_node.add_child(branches[l]) branches[k] = spec_node branches.pop(l) else: spec_node.add_child(branches[0]) tree = Tree(spec_node) branches.clear() else: # extinction event drawn branches.append( TreeNode(label=id_counter, event='L', dist=0.0, tstamp=t)) id_counter += 1 # return tree with the following probability if np.random.random() < (1 / birth_i) / birth_inv_sum: for v in tree.preorder(): if v.parent: v.dist = v.parent.tstamp - v.tstamp return tree print("Could not return a tree after {} simulations!".format(max_tries), file=sys.stderr)
def simulate_species_tree(N, model='innovation', non_binary_prob=0.0,
                          planted=True, remove_extinct=False,
                          rescale_to_height=None, **kwargs):
    """Simulates a species tree S with N leaves.

    Keyword parameters:
        model -- simulation model to be applied ('innovation', 'yule',
            'BDP' or 'EBDP', case-insensitive); default is 'innovation'
        non_binary_prob -- probability that an inner edge is contracted;
            results in non-binary tree; default is 0.0
        planted -- add a planted root that has the canonical root as its
            single neighbor; default is True
        remove_extinct -- remove all branches that lead to extinctions,
            only relevant for some models; default is False
        rescale_to_height -- determines the final distance from the root
            to the (surviving) leaves, default is None, i.e. model
            dependent
        kwargs -- model-specific parameters, e.g. 'birth_rate' for the
            Yule model

    Raises a ValueError for invalid parameters or an unknown model.
    For N = 0, an empty tree is returned.
    """
    # ---- parameter checking ----
    if not isinstance(N, int) or N < 0:
        raise ValueError('N must be an int >=0')
    if N == 0:
        return Tree(None)

    if not isinstance(model, str):
        raise ValueError("model must be of type 'str'")

    if non_binary_prob < 0.0 or non_binary_prob > 1.0:
        raise ValueError('contraction prob. must be in [0.0, 1.0]')

    if rescale_to_height is not None:
        if (not isinstance(rescale_to_height, (int, float))
                or rescale_to_height < 0.0):
            raise ValueError('height must be a number >=0')
        if N == 1 and not planted:
            raise ValueError('rescaling is not applicable to unplanted '
                             'trees with only one leaf')

    # ---- choice of the main simulation algorithm ----
    model_lower = model.lower()
    model_upper = model.upper()

    if model_lower in ('innovation', 'innovations'):
        tree = _innovation_model(N, planted)
    elif model_lower == 'yule':
        tree = _yule(N, kwargs.get('birth_rate'))
    elif model_upper == 'BDP':
        tree = _BDP(N, **kwargs)
    elif model_upper == 'EBDP':
        tree = _EBDP(N, **kwargs)
    else:
        raise ValueError("model '{}' is not available".format(model))

    # ---- post-processing ----
    # remove extinct branches for models that include losses
    if remove_extinct and model_upper in ('BDP', 'EBDP'):
        delete_losses_and_contract(tree, inplace=True)

    # remove planted edge for models that are planted by construction
    if not planted and model_upper in ('YULE', 'BDP', 'EBDP'):
        remove_planted_root(tree, inplace=True)

    # make tree non_binary by random contraction of edges
    if non_binary_prob > 0.0:
        edges = _select_edges_for_contraction(tree, non_binary_prob,
                                              exclude_planted_edge=True)
        tree.contract(edges)

    # rescale to specified height
    if rescale_to_height is not None:
        _rescale(tree, rescale_to_height, inplace=True)

    return tree
def _innovation_model(N, planted, ultrametric=True):
    """Builds a species tree S with N leaves with the innovation model.

    Every extant species is identified by its tuple of features.  As long
    as the loss of a single feature of some species leads to a feature
    combination that does not exist yet, such a loss triggers the next
    speciation; otherwise a new feature is introduced in a random species
    (innovation).

    Keyword arguments:
        ultrametric - if True, `simulate_timing()` and
            `distance_from_timing()` are applied to the tree; default is
            True
    """
    tree = Tree(TreeNode(label=0, event='S'))
    tree.number_of_species = N
    next_label = 1

    # planted tree (root is an implicit outgroup with outdegree = 1)
    if planted:
        root = TreeNode(label=1, event='S')
        tree.root.add_child(root)
        next_label += 1
    else:
        root = tree.root

    features = [0]              # set of available features
    species = {(0, ): root}     # extant species

    def speciate(old_key, new_key):
        """Attach two children to the node of species 'old_key'; the first
        one continues 'old_key', the second one becomes 'new_key'."""
        nonlocal next_label
        ancestor = species[old_key]
        first = TreeNode(label=next_label, event='S')
        ancestor.add_child(first)
        second = TreeNode(label=next_label + 1, event='S')
        ancestor.add_child(second)
        next_label += 2
        species[old_key] = first
        species[new_key] = second

    while len(species) < N:

        # species for which loss of a feature can trigger a speciation
        loss_candidates = {
            s for s in species
            if any(s[:i] + s[i + 1:] not in species for i in range(len(s)))
        }

        if loss_candidates:
            s = random.choice(list(loss_candidates))
            feature_index = random.randint(0, len(s) - 1) if len(s) > 1 else 0
            new_s = s[:feature_index] + s[feature_index + 1:]

            # LOSS EVENT (only if the resulting species does not exist)
            if new_s not in species:
                speciate(s, new_s)
        else:
            # INNOVATION EVENT
            s = random.choice(list(species))
            new_feature = len(features)
            speciate(s, s + (new_feature, ))
            features.append(new_feature)

    if ultrametric:
        simulate_timing(tree)
        distance_from_timing(tree)

    return tree