def insert(scores, threshold, length_f=None):
    """Attach every sufficiently well-scoring query to its target node.

    Parameters
    ----------
    scores : dict
        The result of best(). Each key is a query name and each value is a
        mapping that provides at least ``'score'`` and ``'node'``.
    threshold : float
        The minimum score. A query is inserted when its score is greater
        than or equal to this value.
    length_f : function, optional
        A function which can provide a branch length to set. This function
        must conform to the following signature:

            f(TreeNode, str, float) -> float

        Where TreeNode is the node to insert at, str is the name of the
        node being inserted, and float is the score of the query at the
        node. The default is to set a branch length of 0.0

    Notes
    -----
    The nodes stored in ``scores`` are modified in place; nothing is
    returned.
    """
    if length_f is None:
        def length_f(a, b, c):
            return 0.0

    for query, result in scores.items():
        score = result['score']
        if not score >= threshold:
            # Below the cut: leave the tree untouched for this query.
            continue

        target = result['node']
        branch_length = length_f(target, query, score)
        target.append(skbio.TreeNode(name=query, length=branch_length))
def recursive_subdivide(node, k, count, id_1, tree_node):
    """Recursively split a cell into four quadrants and mirror it in a tree.

    Parameters
    ----------
    node : Node
        The cell to subdivide. Its ``x0``, ``y0``, ``width``, ``height`` and
        ``points`` attributes are read, and its ``children`` attribute is
        set to the four new sub-cells when a split happens.
    k : int
        A cell holding ``k`` points or fewer is not subdivided.
    count : int
        Counter carried down the recursion. It is incremented on entry, so
        the four quadrants created by one call all share the same value.
    id_1 : str
        Identifier prefix accumulated along the path to this cell.
    tree_node : skbio.TreeNode
        Tree node that receives one child per quadrant.

    Notes
    -----
    Besides ``node`` and ``tree_node`` this function mutates the
    module-level ``bin_id``, ``samples`` and ``bin_1`` objects.
    Quadrants are processed in the order sw, nw, se, ne.
    """
    count += 1
    id_1 += str(count)
    bin_label = "H" + str(count)
    bin_id.append(bin_label)
    samples[bin_label] = ""

    if len(node.points) <= k:
        return

    w_ = node.width / 2
    h_ = node.height / 2

    # (suffix, x origin, y origin) of each quadrant, in processing order.
    quadrants = (
        ("sw", node.x0, node.y0),
        ("nw", node.x0, node.y0 + h_),
        ("se", node.x0 + w_, node.y0),
        ("ne", node.x0 + w_, node.y0 + h_),
    )

    cells, subtrees = [], []
    for suffix, x0, y0 in quadrants:
        # presumably `contains` returns the points lying inside the
        # rectangle (x0, y0, w_, h_) -- verify against its definition
        pts = contains(x0, y0, w_, h_, node.points)
        cell_id = id_1 + suffix + ";"
        cell = Node(x0, y0, w_, h_, pts, cell_id)
        subtree = skbio.TreeNode(name=str(count) + suffix)
        for pt in pts:
            bin_1.append((pt.sample_id, count, cell.get_id()))
            subtree.extend([skbio.TreeNode(name=pt.sample_id)])
        recursive_subdivide(cell, k, count, cell_id, subtree)
        cells.append(cell)
        subtrees.append(subtree)

    # Attach the quadrants only after all four have been fully expanded.
    tree_node.extend(subtrees)
    node.children = cells
def _0(ff: TSVTaxonomyFormat) -> skbio.TreeNode:
    """Build a taxonomy tree from a tab-separated taxonomy file.

    The first row is treated as a header and skipped. For every other row
    the first column is the feature id and the second column is a
    ``;``-separated lineage. Each lineage element becomes a node (length 1)
    below a shared ``'root'`` node (length 0); lineage prefixes shared by
    several rows reuse the same nodes. The feature id is appended as a
    child (length 1) of the deepest node of its lineage.
    """
    root = skbio.TreeNode('root', length=0)
    with ff.open() as fh:
        rows = csv.reader(fh, delimiter='\t')
        next(rows)  # discard the header row
        for row in rows:
            id_, lineage = row[:2]
            node = root
            for taxon in lineage.split(';'):
                # Reuse the existing child for this taxon, if there is one.
                found = next(
                    (kid for kid in node.children if kid.name == taxon),
                    None)
                if found is None:
                    found = skbio.TreeNode(taxon, length=1)
                    node.append(found)
                node = found
            node.append(skbio.TreeNode(id_, length=1))
    return root
def subdivide(self, count):
    """Subdivide ``self.root`` and return the matching tree.

    Parameters
    ----------
    count : int
        Ignored; the recursion is always started with a counter of 0.

    Returns
    -------
    skbio.TreeNode
        A node named ``"root"`` populated by ``recursive_subdivide``.
    """
    # NOTE(review): this frame is local and never read afterwards;
    # `recursive_subdivide` writes to a module-level `samples` instead.
    # Confirm whether a `global samples` was intended here.
    sample_frame = pd.DataFrame()
    sample_frame['index'] = df['index']
    sample_frame = sample_frame.set_index('index')

    tree_root = skbio.TreeNode(name="root")
    # Start with counter 0 and an empty identifier prefix.
    recursive_subdivide(self.root, self.threshold, 0, "", tree_root)
    return tree_root
def tree_to_matrix(tree, label, with_repr=False):
    """Encode which labelled features descend from which tree nodes.

    Parameters
    ----------
    tree : skbio.TreeNode
        The tree. It is modified in place: every entry of ``label`` that is
        not the name of a tip is appended as a new child of ``tree``.
    label : sequence of str
        Feature names, in the order that defines the rows of the result.
    with_repr : bool, optional
        Currently unused.

    Returns
    -------
    A : numpy.ndarray of bool
        Array with ``len(label)`` rows and one column per retained node;
        entry ``(i, j)`` is True when node ``j`` is ``label[i]`` itself or
        one of its named ancestors.
    labels : numpy.ndarray
        Names of the retained nodes, in level order.

    Notes
    -----
    Ancestors whose name ends with ``"_"`` are ignored, as are ancestors
    without a name. Nodes whose vector is all True are dropped, along with
    whatever ``remove_same_vect`` marks for removal (presumably duplicated
    vectors -- verify against its definition).
    """
    d = len(label)
    leaf_names = {tip.name for tip in tree.tips()}
    dicti = {}
    # Order in which nodes enter `dicti`; handed to remove_same_vect.
    order = []

    for i, name_leaf in enumerate(label):
        dicti[name_leaf] = np.zeros(d, dtype=bool)
        dicti[name_leaf][i] = True
        order.append(name_leaf)

        if name_leaf not in leaf_names:
            # Add the node if it is not already in the tree.
            tree.append(skbio.TreeNode(name=name_leaf))
            print("The feature {} i not in the leaves of the tree".format(
                name_leaf))

        for n in tree.find(name_leaf).ancestors():
            ancest = n.name
            # Skip unnamed ancestors (they cannot be labelled) and names
            # ending with an underscore.
            if not ancest or ancest.endswith("_"):
                continue
            if ancest not in dicti:
                dicti[ancest] = np.zeros(d, dtype=bool)
                order.append(ancest)
            dicti[ancest][i] = True

    L, label2 = [], []
    seen = set()
    for node in tree.levelorder():
        nam = node.name
        if nam in dicti and nam not in seen:
            seen.add(nam)
            label2.append(nam)
            L.append(dicti[nam])

    to_keep = remove_same_vect(L, label2, order)
    L = np.array(L)
    # A node above every feature carries no information.
    to_keep[np.all(L, axis=1)] = False

    return L[to_keep].T, np.array(label2)[to_keep]
def generate_new_phylogeny(phylogeny_fp, pivot_mapping_dct):
    """Collapse each cluster of tips into a single ``cluster<idx>`` node.

    Parameters
    ----------
    phylogeny_fp
        Passed to ``average_branch_lengths``, which supplies the tree.
        Its nodes are expected to carry ``avg`` and ``dummy_name``
        attributes, both of which are read here.
    pivot_mapping_dct : dict
        Maps a cluster index to the tip names in that cluster. The key
        ``-1`` is skipped.

    Returns
    -------
    The tree, with the lowest common ancestor of every cluster replaced by
    a new node named ``'cluster' + str(idx)`` whose length is the LCA's
    ``avg``.

    Raises
    ------
    ValueError
        If the LCA of a cluster does not have exactly as many tips as the
        cluster has members, or if the same LCA is reached twice.
    """
    tree = average_branch_lengths(phylogeny_fp)
    seen_lcas = set()

    for idx, members in pivot_mapping_dct.items():
        if idx == -1:
            continue

        node_lca = tree.lca(members)
        n_tips = sum(1 for _ in node_lca.tips())
        if n_tips != len(members):
            raise ValueError("The LCA has different tips than expected",
                             n_tips, len(members))

        replacement = skbio.TreeNode('cluster' + str(idx))
        replacement.length = node_lca.avg
        lca_parent = node_lca.parent

        if node_lca.dummy_name in seen_lcas:
            raise ValueError("node_lca was observed before!",
                             node_lca.dummy_name, node_lca, idx)
        seen_lcas.add(node_lca.dummy_name)

        # Swap the whole clade for the single cluster node.
        lca_parent.append(replacement)
        lca_parent.remove(node_lca)

    return tree
def _add_branch_length(bl_sums, tip_set, length):
    """Fold one observed branch length into the (count, sum) of tip_set."""
    if length is None:
        # A branch without a length contributes nothing to the average.
        return
    count, bl_sum = bl_sums.get(tip_set, (0, 0))
    bl_sums[tip_set] = (count + 1, bl_sum + length)


def majority_consensus(trees, cutoff=0.5):
    """Compute the majority consensus tree.

    When a taxon is designated as an outgroup, this algorithm is equivalent
    to tree popping described in The Mathematics of Phylogenetics. Since a
    clade is the subset of a split that does not contain the root, all sets
    of terminal labels obtained via the tips method when traversing the
    tree correspond to the half of the split without the root. Thus
    operating on the clades is implicitly operating on the equivalent
    split.

    The overall approach of the algorithm is to first tally the clades and
    then to progressively build the consensus tree by applying the clades
    in decreasing order of frequency (as long as they are compatible with
    all clades already displayed on the tree).

    The proof presented in The Mathematics of Phylogenetics is considerably
    more complex than the more intuitive algorithm used here. Whereas that
    proof required finding the minimum spanning tree, this approach instead
    uses the idea that as long as clades are applied in order of size
    (which is guaranteed by the principle that a parent clade will appear
    at least as many times in counts as its subclades and subsequent
    sorting on counts then size), then the right node onto which to apply a
    clade is always the smallest one that is compatible.

    This is actually equivalent to finding the minimum spanning tree of the
    nodes colored by each clade (blue for those in the clade and red for
    those not in the clade). The tree induced by a node whose terminals are
    a superset of the clade clearly spans that clade. Thus, by iterating
    through all the nodes in the consensus tree sorted by decreasing size,
    the last node that meets this criterion induces the minimal spanning
    tree (MST) for the blue nodes (those within the clade). The MST for the
    red nodes (those not within the clade and containing the root)
    intersects with this MST at a single node via the tree popping theorem.

    By maintaining the directionality of the parent-child relationships
    away from the root, we know this is the node we have already
    identified. To see why, assume another node in the blue MST is the node
    of intersection. The parent of this node must also be blue because
    otherwise the MST is not minimal. The parent must also lie on the red
    MST since by construction following the chain of parents creates a path
    to the root and this path to the root must be on the red MST as trees
    have no cycles. Thus, we have a contradiction of the blue and red MSTs
    only intersecting at a single node, so the intersection node must be
    the previously identified node.

    Parameters
    ----------
    trees: list of skbio TreeNodes
        Input trees from which to compute the consensus.
    cutoff: float
        The minimum fraction of trees a clade must be displayed on to
        contribute to the consensus. Clades whose frequency is below the
        cutoff are skipped; a clade at exactly the cutoff is still
        considered. Whatever the cutoff, a clade is only added to the
        consensus if it is compatible with all clades already displayed.
        Clades are added in order of decreasing number of counts in the
        input trees.

    Returns
    -------
    root: TreeNode
        Root of the consensus tree. Internal nodes carry the fraction of
        input trees displaying the clade as ``support``. Each branch length
        is the mean of the lengths observed for that clade (or tip) in the
        input trees; a node for which no length was observed keeps a length
        of None.
    """
    n_trees = len(trees)

    # Count nodes and record tip names
    counts, tip_names = {}, set()
    for tree in trees:
        tip_names.update(tip.name for tip in tree.tips())
        for node in tree.traverse():
            if not node.is_tip():
                tip_set = frozenset(tip.name for tip in node.tips())
                counts[tip_set] = counts.get(tip_set, 0) + 1
    # Sort by count then size
    counts = sorted(counts.items(),
                    key=lambda x: (x[1], len(x[0])),
                    reverse=True)

    # Initialize consensus nodes with a tip for each unique taxon label and
    # a root node containing all these tips. (The tips are instantiated
    # first, and then the root node containing them is created.)
    consensus_nodes = {
        frozenset([name]): skbio.TreeNode(name=name) for name in tip_names
    }
    root = skbio.TreeNode(children=list(consensus_nodes.values()))
    consensus_nodes[frozenset(node.name for node in root.children)] = root

    # Add nodes. The first entry is skipped: it is expected to be the
    # clade of all tips (most frequent and largest), which is the root.
    for tip_set1, count in counts[1:]:
        if count / n_trees < cutoff:
            break

        # Find parent
        for tip_set2 in sorted(consensus_nodes, key=len, reverse=True):
            compatible = (tip_set1 <= tip_set2
                          or tip_set2 <= tip_set1
                          or not (tip_set1 & tip_set2))
            if not compatible:
                break  # Break search over parents
            if tip_set2 >= tip_set1:
                # Smallest superset since sorted by size
                parent_set = tip_set2
        if not compatible:
            # Each clade should be compatible with all others in the
            # consensus tree
            continue
        parent_node = consensus_nodes[parent_set]

        # Construct current node. Compatibility requires all tips of a
        # subnode are included if one is.
        children = [
            node for node in parent_node.children
            if any(tip.name in tip_set1
                   for tip in node.tips(include_self=True))
        ]
        # Instantiation automatically updates parents
        node = skbio.TreeNode(children=children, support=count / n_trees)
        consensus_nodes[tip_set1] = node
        parent_node.append(node)

    # Get branch lengths
    bl_sums = {}
    for tree in trees:
        for node in tree.traverse():
            if node.is_tip():
                continue

            # Add clade
            tip_set = frozenset(tip.name for tip in node.tips())
            if tip_set in consensus_nodes:
                _add_branch_length(bl_sums, tip_set, node.length)

            # Add terminal children of clade. The lengths of terminal
            # children are only included if all the non-terminal children
            # are clades displayed in the consensus tree.
            all_displayed = all(
                frozenset(tip.name for tip in child.tips())
                in consensus_nodes
                for child in node.children if not child.is_tip()
            )
            if all_displayed:
                for child_tip in node.children:
                    if child_tip.is_tip():
                        _add_branch_length(bl_sums,
                                           frozenset([child_tip.name]),
                                           child_tip.length)

    # Set branch lengths (every entry has a count of at least one)
    for tip_set, (count, bl_sum) in bl_sums.items():
        consensus_nodes[tip_set].length = bl_sum / count

    return root
def _read_first_line(file_path):
    """Return the first line of a text file, closing the file afterwards."""
    with open(file_path) as handle:
        return handle.readline()


def _name_before_loss_tag(name):
    """Return the text before the loss tag in name, or None if untagged."""
    # Same priority as the original if/elif chain: dl, tl, il, sl.
    for tag in ('_dl', '_tl', '_il', '_sl'):
        if tag in name:
            return name.split(tag)[0]
    return None


def _insert_above(child, new_node, distance):
    """Splice new_node into the branch above child, distance above it."""
    parent = child.parent
    new_node.length = child.length - distance
    new_node.parent = parent
    new_node.children.append(child)
    child.length = distance
    child.parent = new_node
    # The slot to vacate is located by name, not identity.
    for i, sibling in enumerate(parent.children):
        if sibling.name == child.name:
            del parent.children[i]
            break
    parent.children.append(new_node)


def _label_event_nodes(current_tree, path, files, prefix, tag, _id):
    """Rename the node named by each '<prefix>_<index>' file in path."""
    for f in files:
        parts = f.split('_')
        if parts[0] != prefix:
            continue
        _index = '_' + parts[1]
        node_name = _read_first_line(os.path.join(path, f)).split(',')[0]
        for node in current_tree.traverse():
            if node.name == (node_name + _id):
                node.name = node_name + tag + _index + '_id' + _id
                break


def _find_loss_child(current_tree, node_l_name, _id):
    """Return the first node matching a loss record, or None."""
    target = node_l_name + _id
    for node in current_tree.traverse():
        name = node.name
        if not name:
            continue
        parts = name.split('_')
        if _id == '':
            if parts[0] == node_l_name:
                return node
            continue
        if name == target:
            return node
        if parts[0] == node_l_name and ('_' + parts[-1]) == _id:
            return node
        base = _name_before_loss_tag(name)
        if base is None:
            continue
        if base == target:
            return node
        # Retry the prefix/suffix comparison with the loss tag stripped.
        parts = base.split('_')
        if parts[0] == node_l_name and ('_' + parts[-1]) == _id:
            return node
    return None


def build_tree_recurse(gene_tree, path):
    """Graft the events stored under path onto gene_tree, recursively.

    Parameters
    ----------
    gene_tree : skbio.TreeNode
        Tree that receives the events. It is modified in place.
    path : str
        Directory describing one level of events. If ``path`` contains an
        underscore, the text after the last underscore (with a leading
        ``'_'``) is used as the id suffix of this level; otherwise the
        suffix is empty.

    Returns
    -------
    list of skbio.TreeNode
        The loss nodes created at this level and in all nested levels.

    Notes
    -----
    The steps, in order:

    1. When the id suffix is not empty, ``gene_tree.txt`` (newick) is read
       as a subtree and the first line of ``event.txt`` (comma separated:
       node name, distance, event name, ...) says where to attach it. A
       new node named ``<node><_first letter of event><id>`` is spliced
       above the matching node of ``gene_tree``, the subtree is attached
       below it and every subtree node name gets the id suffix. The
       subtree becomes the tree used by the following steps.
    2. Every directory entry whose name ends in ``_<digits>`` and is a
       directory is processed recursively, in ascending numeric order.
    3. Files named ``ils_<index>`` and then ``s_<index>`` rename the node
       given by their first field to ``<node>_i_<index>_id<id>`` and
       ``<node>_s_<index>_id<id>`` respectively.
    4. Files named ``loss...`` (first line: node name, distance, index)
       splice a new loss node above the matching node and tag the sibling
       of that node with ``_sl``/``_dl``/``_tl``/``_il`` according to the
       event marker found in the parent's name.

    Raises
    ------
    AttributeError
        If an event or loss record names a node that cannot be found.
    """
    loss_nodes = []
    current_tree = gene_tree

    path_parts = path.split('_')
    _id = '_' + path_parts[-1] if len(path_parts) > 1 else ''

    parent_parts = gene_tree.name.split('_')
    _parent_id = '_' + parent_parts[-1] if len(parent_parts) > 1 else ''
    for i, part in enumerate(parent_parts):
        # A two-character piece containing 'l' (e.g. 'dl') marks a loss
        # tag; the id is then the piece just before it.
        if 'l' in part and len(part) == 2:
            _parent_id = '_' + parent_parts[i - 1] if i > 1 else ''
            break

    if _id:
        with open(os.path.join(path, 'gene_tree.txt')) as handle:
            subtree = skbio.read(handle, format='newick',
                                 into=skbio.tree.TreeNode)
        current_tree = subtree

        event_line = _read_first_line(os.path.join(path, 'event.txt'))
        fields = event_line.strip().split(',')
        node_name = fields[0]
        distance = float(fields[1])
        event_name = '_' + fields[2][0]

        target = node_name + _parent_id
        child = None
        for node in gene_tree.traverse():
            name = node.name
            if name == target or (
                    name and _name_before_loss_tag(name) == target):
                child = node
                break

        new_dt_node = skbio.TreeNode()
        new_dt_node.name = node_name + event_name + _id
        _insert_above(child, new_dt_node, distance)
        new_dt_node.children.append(subtree)
        subtree.parent = new_dt_node
        for node in subtree.traverse():
            node.name = node.name + _id

    # Nested levels, in ascending order of their numeric suffix.
    numbered = [f for f in os.listdir(path) if f.split('_')[-1].isdigit()]
    numbered.sort(key=lambda x: int(x.split('_')[-1]))
    for entry in numbered:
        entry_path = os.path.join(path, entry)
        if os.path.isdir(entry_path):
            loss_nodes += build_tree_recurse(current_tree, entry_path)

    files = os.listdir(path)
    _label_event_nodes(current_tree, path, files, 'ils', '_i', _id)
    _label_event_nodes(current_tree, path, files, 's', '_s', _id)

    for f in files:
        if f.split('_')[0] != 'loss':
            continue
        fields = _read_first_line(os.path.join(path, f)).split(',')
        node_l_name = fields[0]
        node_l_distance = float(fields[1])
        node_index = int(fields[2])

        new_l_node = skbio.TreeNode()
        child = _find_loss_child(current_tree, node_l_name, _id)
        parent = child.parent

        # The first event marker found in the parent's name decides the
        # tag appended to the sibling of the lost lineage.
        for event in ('s', 'd', 't', 'i'):
            marker = '_' + event + '_'
            if marker in parent.name:
                new_l_node.name = node_l_name + '_l' + '_id' + _id
                if parent.children[0].name == child.name:
                    sibling = parent.children[1]
                else:
                    sibling = parent.children[0]
                sibling.name += (
                    '_' + event + 'l' + '_ind_'
                    + parent.name.split(marker)[1].split('_')[0]
                    + '_' + str(node_index))
                break

        _insert_above(child, new_l_node, node_l_distance)
        loss_nodes.append(new_l_node)

    return loss_nodes