def write_tree(child_lists, name_map, rank_map, options, branch_length=1):
    """Write the taxonomy in ``child_lists`` to ``options.output_tree`` as Newick.

    The tree is rooted at the first child of node ``"0"``.  Each clade is
    labelled with its node id when ``options.name_id`` is truthy, otherwise
    with ``name_map[node_id]``, and gets ``branch_length`` as branch length.
    When ``options.cluster`` is not None, descent stops at a node whose rank
    equals it, and a child is only attached when
    ``rank_map[child_id] <= options.cluster``.
    """
    # Uses Biopython, only load if making tree
    import Bio.Phylo
    from Bio.Phylo import BaseTree

    def _label(node_id):
        return node_id if options.name_id else name_map[node_id]

    def _new_clade(node_id):
        return BaseTree.Clade(name=_label(node_id), branch_length=branch_length)

    def _attach_descendants(parent_id):
        cluster = options.cluster
        if cluster is not None and rank_map[parent_id] == cluster:
            # Short circuit if we found our rank, prevents 'hanging' no ranks from being output
            # e.g. clustering by "species" (Escherichia coli), but have "no rank" below
            # (Escherichia coli K-12) in test_db
            return
        if parent_id not in nodes:
            nodes[parent_id] = _new_clade(parent_id)
        for child_id in child_lists.get(parent_id, []):
            if cluster is not None and not (rank_map[child_id] <= cluster):
                continue
            if child_id not in nodes:
                nodes[child_id] = _new_clade(child_id)
            nodes[parent_id].clades.append(nodes[child_id])
            _attach_descendants(child_id)

    root_id = child_lists["0"][0]
    nodes = {root_id: _new_clade(root_id)}
    _attach_descendants(root_id)
    newick_tree = BaseTree.Tree(root=nodes[root_id])
    Bio.Phylo.write([newick_tree], options.output_tree, 'newick')
def create_upgma_tree(matrix, is_distance=True):
    """Cluster ``matrix`` pairwise and return the result as an unrooted tree.

    The adjacency map from ``matrix.create_adjacency_map()`` is merged
    ``matrix.size - 2`` times; each step merges the entry of the closest-pair
    table with the smallest value (``is_distance=True``) or the largest value
    (``is_distance=False``).  The clusters left over become the children of
    the root, whose ``matrix`` attribute is set to the input matrix.
    """
    adj_map = matrix.create_adjacency_map()
    closest_pairs = create_closest_pairs(adj_map, is_distance=is_distance)
    clade_map = create_clade_map(adj_map)
    pick_best = min if is_distance else max

    for _ in range(matrix.size - 2):
        source, pair_edge = pick_best(closest_pairs.items(), key=lambda item: item[1])
        edge = (source, pair_edge[0])
        merge_closest_edge(adj_map, clade_map, closest_pairs, edge,
                           pair_edge[1], is_distance=is_distance)

    remaining_keys = list(clade_map.keys())
    remaining_clades = list(clade_map.values())
    if len(remaining_keys) <= 1:
        root = remaining_clades[0]
    else:
        # Join the leftover clusters under one root; its branch length is the
        # adjacency value between the first two of them.
        root = BaseTree.Clade(
            branch_length=adj_map[remaining_keys[0]][remaining_keys[1]],
            clades=remaining_clades)
    root.matrix = matrix
    return BaseTree.Tree(root=root, rooted=False)
def upgma(self, distance_matrix):
    """Construct and return an UPGMA tree.

    Constructs and returns an Unweighted Pair Group Method
    with Arithmetic mean (UPGMA) tree.

    :Parameters:
        distance_matrix : DistanceMatrix
            The distance matrix for tree construction.

    Returns a ``BaseTree.Tree`` whose root has branch length 0.  A matrix
    with a single taxon yields a tree consisting of that terminal clade.

    Raises ``TypeError`` if ``distance_matrix`` is not a ``DistanceMatrix``
    and ``ValueError`` if it contains no taxa.
    """
    if not isinstance(distance_matrix, DistanceMatrix):
        raise TypeError("Must provide a DistanceMatrix object.")

    # make a copy of the distance matrix to be used
    dm = copy.deepcopy(distance_matrix)
    # init terminal clades
    clades = [BaseTree.Clade(None, name) for name in dm.names]
    if not clades:
        raise ValueError("Distance matrix must contain at least one taxon.")
    # With a single taxon the merge loop never runs, so that terminal is the
    # root (previously this path failed on an unbound ``inner_clade``).
    root = clades[0]
    # init minimum index
    min_i = 0
    min_j = 0
    inner_count = 0
    while len(dm) > 1:
        min_dist = dm[1, 0]
        # find minimum index; ``>=`` makes the last of several equal minima win
        for i in range(1, len(dm)):
            for j in range(0, i):
                if min_dist >= dm[i, j]:
                    min_dist = dm[i, j]
                    min_i = i
                    min_j = j

        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        # assign branch length: half the merge distance minus the height the
        # child already has
        clade1.branch_length = min_dist * 1.0 / 2 - self._height_of(clade1)
        clade2.branch_length = min_dist * 1.0 / 2 - self._height_of(clade2)

        # update node list
        clades[min_j] = inner_clade
        del clades[min_i]

        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        # (plain mean of the two merged entries, not weighted by cluster size)
        for k in range(0, len(dm)):
            if k != min_i and k != min_j:
                dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2

        dm.names[min_j] = "Inner" + str(inner_count)

        del dm[min_i]
        root = inner_clade
    root.branch_length = 0
    return BaseTree.Tree(root)
def create_ntree(tree):
    """Recursively convert a nested mapping into a ``BaseTree.Clade``.

    Each entry of ``tree`` becomes one child clade: an entry whose value is
    empty becomes a named leaf (the name is the first element of the key),
    any other entry becomes the clade built from its value.
    """
    node = BaseTree.Clade()
    for key, subtree in tree.items():
        if len(subtree.values()) == 0:
            child = BaseTree.Clade(name=list(key)[0])
        else:
            child = create_ntree(subtree)
        node.clades.append(child)
    return node
def fit(self):
    """Build a neighbor-joining tree from ``self.dist_matrix``.

    Returns ``False`` without doing anything when ``self.dist_matrix`` is
    None; otherwise stores the unrooted result in ``self.tree``, clears the
    working attributes (``_nodes``, ``_q_matrix``, ``_d_matrix``) and returns
    ``True``.

    Raises ``AssertionError`` when the matrix is not square, when any
    diagonal entry is truthy, or when the reduction does not end with
    exactly two nodes.
    """
    if self.dist_matrix is None:
        return False
    assert self.dist_matrix.shape[0] == self.dist_matrix.shape[1]
    assert not any(
        [self.dist_matrix[i][i] for i in self.dist_matrix.index])
    self.tree = None
    # One terminal clade per column label.
    self._nodes = {
        n: BaseTree.Clade(None, str(n))
        for n in self.dist_matrix.columns
    }
    self._d_matrix = self.dist_matrix
    while self._d_matrix.shape[0] > 2:
        self._update_q_matrix()
        raw_min, col_min = self._get_pos_min_from_q_matrix()
        range_raw, range_col = self._get_dist_for_neighborhood(
            raw_min, col_min)
        new_name = "{}{}".format(str(raw_min), str(col_min))
        self._update_nodes(raw_min, col_min, range_raw, range_col,
                           new_name)
        new_dist_matrix = self._get_new_dist_matrix(raw_min, col_min)
        dists_node = [
            self._get_dist_for_nodes(raw_min, col_min, index)
            for index in new_dist_matrix.index
        ]
        new_item = pd.Series(dists_node,
                             name=new_name,
                             index=new_dist_matrix.index)
        # DataFrame.append / Series.append were removed in pandas 2.0, so the
        # joined node is added with pd.concat: first as a row, then (after
        # the transpose) as a row again so that it ends up on both axes.
        new_dist_matrix = pd.concat(
            [new_dist_matrix, new_item.to_frame().T]).T
        # Extend the series with the zero self-distance of the new node.
        new_item = pd.concat(
            [new_item, pd.Series(0, name=new_name, index=[new_name])])
        new_dist_matrix = pd.concat(
            [new_dist_matrix, new_item.to_frame().T])
        self._d_matrix = new_dist_matrix
    assert len(self._nodes) == 2
    name1 = self._d_matrix.index[0]
    name2 = self._d_matrix.index[1]
    node1 = self._nodes.pop(name1)
    node2 = self._nodes.pop(name2)
    # The last two nodes are joined directly: node1 hangs below node2 at the
    # remaining distance and node2 becomes the tree root.
    node1.branch_length = self._d_matrix[name1][name2]
    node2.clades.append(node1)
    self.tree = BaseTree.Tree(node2, rooted=False)
    self._nodes = None
    self._q_matrix = None
    self._d_matrix = None
    return True
def _part(clades):
    """Recursive function of the Adam consensus algorithm.

    ``clades`` holds one clade per input tree.  The terminals (and their
    names) are taken from ``clades[0]``.  When that clade has one or two
    terminals it is returned unchanged; otherwise a new clade is built whose
    children correspond to the bitstring partition computed below, recursing
    into every part with more than two terminals.
    """
    new_clade = None
    terms = clades[0].get_terminals()
    term_names = [term.name for term in terms]
    if len(terms) == 1 or len(terms) == 2:
        new_clade = clades[0]
    else:
        # Start from a single all-ones bitstring (one bit per terminal) and
        # refine it with the children of every input clade.
        bitstrs = set([_BitString('1' * len(terms))])
        for clade in clades:
            for child in clade.clades:
                bitstr = _clade_to_bitstr(child, term_names)
                to_remove = set()
                to_add = set()
                for bs in bitstrs:
                    if bs == bitstr:
                        continue
                    elif bs.contains(bitstr):
                        # split bs into bitstr and the remainder
                        to_add.add(bitstr)
                        to_add.add(bs ^ bitstr)
                        to_remove.add(bs)
                    elif bitstr.contains(bs):
                        to_add.add(bs ^ bitstr)
                    elif not bs.independent(bitstr):
                        # partial overlap: the shared part plus what is left
                        # of each side
                        to_add.add(bs & bitstr)
                        to_add.add(bs & bitstr ^ bitstr)
                        to_add.add(bs & bitstr ^ bs)
                        to_remove.add(bs)
                # bitstrs = bitstrs | to_add
                # to_remove only holds members of bitstrs, so the symmetric
                # difference drops exactly those entries.
                bitstrs ^= to_remove
                if to_add:
                    # Candidates are tried smallest first (fewest '1' bits)
                    # and kept only if independent of everything kept so far.
                    for ta in sorted(to_add, key=lambda bs: bs.count('1')):
                        independent = True
                        for bs in bitstrs:
                            if not ta.independent(bs):
                                independent = False
                                break
                        if independent:
                            bitstrs.add(ta)
        new_clade = BaseTree.Clade()
        for bitstr in sorted(bitstrs):
            indices = bitstr.index_one()
            if len(indices) == 1:
                new_clade.clades.append(terms[indices[0]])
            elif len(indices) == 2:
                bifur_clade = BaseTree.Clade()
                bifur_clade.clades.append(terms[indices[0]])
                bifur_clade.clades.append(terms[indices[1]])
                new_clade.clades.append(bifur_clade)
            elif len(indices) > 2:
                # Larger parts are resolved by recursing on the matching
                # sub-clade of every input tree.
                part_names = [term_names[i] for i in indices]
                next_clades = []
                for clade in clades:
                    next_clades.append(_sub_clade(clade, part_names))
                # next_clades = [clade.common_ancestor([clade.find_any(name=name) for name in part_names]) for clade in clades]
                new_clade.clades.append(_part(next_clades))
    return new_clade
def strict_consensus(trees, mcmc=False):
    """Search strict consensus tree from multiple trees.

    :Parameters:
        trees : list
            list of trees to produce consensus tree or a list of tuples
            output of mcmc if mcmc=True, tuples like
            (tree, number of occurrences in MCMC)
        mcmc : Boolean
            True if parameter trees is a tuple, output of mcmc

    Returns a ``BaseTree.Tree`` containing only the clades found in every
    input tree.

    Raises ``ValueError`` when no clade covering all terminals of the first
    tree is present in every tree.
    """
    if mcmc:
        trees = [tree[0] for tree in trees]
    trees_iter = iter(trees)
    first_tree = next(trees_iter)

    terms = first_tree.get_terminals()
    bitstr_counts, tree_count = _count_clades(itertools.chain([first_tree], trees_iter))

    # Store bitstrs for strict clades
    strict_bitstrs = [
        bitstr for bitstr, t in bitstr_counts.items() if t[0] == tree_count
    ]
    # Largest clades first, so parents are created before their children.
    strict_bitstrs.sort(key=lambda bitstr: bitstr.count("1"), reverse=True)
    # Create root
    root = BaseTree.Clade()
    # The emptiness check makes "no shared clade at all" raise the intended
    # ValueError rather than an IndexError.
    if strict_bitstrs and strict_bitstrs[0].count("1") == len(terms):
        root.clades.extend(terms)
    else:
        raise ValueError("Taxons in provided trees should be consistent")
    # make a bitstr to clades dict and store root clade
    bitstr_clades = {strict_bitstrs[0]: root}
    # create inner clades
    for bitstr in strict_bitstrs[1:]:
        clade_terms = [terms[i] for i in bitstr.index_one()]
        clade = BaseTree.Clade()
        clade.clades.extend(clade_terms)
        # Iterate over a snapshot: the dict is modified inside the loop.
        for bs, c in list(bitstr_clades.items()):
            # check if it should be the parent of current clade
            if bs.contains(bitstr):
                # remove old bitstring
                del bitstr_clades[bs]
                # update clade childs
                c.clades = [child for child in c.clades if child not in clade_terms]
                # set current clade as child of c
                c.clades.append(clade)
                # re-register c under the bits it still owns directly
                bitstr_clades[bs ^ bitstr] = c
                break
        # put new clade
        bitstr_clades[bitstr] = clade
    return BaseTree.Tree(root=root)
def recurse_children(parent_id):
    """Attach the descendants of ``parent_id`` to the clades in ``nodes``.

    Works on names from the enclosing scope (``options``, ``rank_map``,
    ``child_lists``, ``nodes``, ``branch_length``, ``_get_name``).  Missing
    clades are created on demand.  With ``options.cluster`` set, descent
    stops at a node whose rank equals it, and a child is only attached when
    ``rank_map[child_id] <= options.cluster``.
    """
    cluster = options.cluster
    if cluster is not None and rank_map[parent_id] == cluster:
        # Short circuit if we found our rank, prevents 'hanging' no ranks from being output
        # e.g. clustering by "species" (Escherichia coli), but have "no rank" below
        # (Escherichia coli K-12) in test_db
        return
    if parent_id not in nodes:
        nodes[parent_id] = BaseTree.Clade(name=_get_name(parent_id), branch_length=branch_length)
    for child_id in child_lists.get(parent_id, []):
        if cluster is not None and not (rank_map[child_id] <= cluster):
            continue
        if child_id not in nodes:
            nodes[child_id] = BaseTree.Clade(name=_get_name(child_id), branch_length=branch_length)
        nodes[parent_id].clades.append(nodes[child_id])
        recurse_children(child_id)
def prettyprint_tree(tree, file=None):
    """Print ``tree`` as ASCII art and try to show a graphical plot as well.

    ``tree`` is an iterable of clades (each supporting ``len``).  The clades
    are sorted from largest to smallest, folded into a nested dictionary with
    ``create_tree_dict`` and converted to a BioPython tree.  The ASCII
    drawing is written to ``file`` (passed through to ``Phylo.draw_ascii``).
    The graphical ``Phylo.draw`` call is best effort: any ordinary exception
    it raises is ignored.
    """
    # Convert the "tree" object (list of clades) to a BioPython tree
    # to take advantage of their output methods
    def create_ntree(tree):
        ntree = BaseTree.Clade()
        for key in tree:
            el = tree[key]
            if len(el.values()) > 0:
                ntree.clades.append(create_ntree(el))
            else:
                ntree.clades.append(BaseTree.Clade(name=list(key)[0]))
        return ntree

    # Sort the clades from largest to smallest
    new_tree = sorted(tree, key=lambda x: -len(x))

    # Build a dictionary representation of the tree
    tree_dict = {}
    for clade in new_tree:
        tree_dict = create_tree_dict(tree_dict, clade)

    # Convert the dictionary representation to a BioPython Tree object
    ntree = BaseTree.Tree(create_ntree(tree_dict))

    # Use the BioPython print method
    Phylo.draw_ascii(ntree, file=file)

    try:
        Phylo.draw(ntree)
    except Exception:
        # Plotting is optional.  Catch Exception rather than using a bare
        # except so KeyboardInterrupt and SystemExit still propagate.
        pass
def _update_nodes(self, n1, n2, d1, d2, new_n):
    """Replace nodes ``n1`` and ``n2`` by a parent clade stored under ``new_n``.

    Both clades are removed from ``self._nodes``, given ``d1`` and ``d2``
    (converted to float) as branch lengths, and become the two children of a
    new clade named ``new_n``.
    """
    first_child = self._nodes.pop(n1)
    second_child = self._nodes.pop(n2)
    first_child.branch_length = float(d1)
    second_child.branch_length = float(d2)
    parent = BaseTree.Clade(None, new_n, [first_child, second_child])
    self._nodes[new_n] = parent
def strict_consensus(trees):
    """Search strict consensus tree from multiple trees.

    :Parameters:
        trees : list
            list of trees to produce consensus tree.

    Returns a ``BaseTree.Tree`` containing only the clades found in every
    input tree.

    Raises ``ValueError`` when no clade covering all terminals of the first
    tree is present in every tree.
    """
    terms = trees[0].get_terminals()
    bitstr_counts = _count_clades(trees)
    # Store bitstrs for strict clades
    strict_bitstrs = [bitstr for bitstr, t in bitstr_counts.items() if t[0] == len(trees)]
    # Largest clades first, so parents are created before their children.
    strict_bitstrs.sort(key=lambda bitstr: bitstr.count('1'), reverse=True)
    # Create root
    root = BaseTree.Clade()
    # The emptiness check makes "no shared clade at all" raise the intended
    # ValueError rather than an IndexError.
    if strict_bitstrs and strict_bitstrs[0].count('1') == len(terms):
        root.clades.extend(terms)
    else:
        raise ValueError('Taxons in provided trees should be consistent')
    # make a bitstr to clades dict and store root clade
    bitstr_clades = {strict_bitstrs[0]: root}
    # create inner clades
    for bitstr in strict_bitstrs[1:]:
        clade_terms = [terms[i] for i in bitstr.index_one()]
        clade = BaseTree.Clade()
        clade.clades.extend(clade_terms)
        # Iterate over a snapshot: the dict is modified inside the loop.
        for bs, c in list(bitstr_clades.items()):
            # check if it should be the parent of current clade
            if bs.contains(bitstr):
                # remove old bitstring
                del bitstr_clades[bs]
                # update clade childs
                c.clades = [child for child in c.clades if child not in clade_terms]
                # set current clade as child of c
                c.clades.append(clade)
                # re-register c under the bits it still owns directly
                bitstr_clades[bs ^ bitstr] = c
                break
        # put new clade
        bitstr_clades[bitstr] = clade
    return BaseTree.Tree(root=root)
def create_tree(self, root: ParsimonyClade):
    """Wrap ``root`` in a rooted ``BaseTree.Tree``.

    root: clade to use as the root of the tree
    returns: the new rooted tree
    """
    rooted_tree = BaseTree.Tree(root, rooted=True)
    return rooted_tree
def create_clade_map(adj_map):
    """Return a dict mapping every key of ``adj_map`` to a fresh clade.

    Each clade is named ``str(key)``, has no children and carries a
    ``matrix`` attribute set to None.
    """
    clade_map = {}
    for cluster in adj_map.keys():
        leaf = BaseTree.Clade(name=str(cluster))
        leaf.matrix = None
        clade_map[cluster] = leaf
    return clade_map
def adam_consensus(trees):
    """Search Adam Consensus tree from multiple trees.

    :Parameters:
        trees : list
            list of trees to produce consensus tree.

    The roots of all trees are handed to ``_part``; its result becomes the
    root of the returned rooted tree.
    """
    roots = []
    for tree in trees:
        roots.append(tree.root)
    consensus_root = _part(roots)
    return BaseTree.Tree(root=consensus_root, rooted=True)
def get_node_by_id(tree: Phylo.BaseTree, postorder_node_id: int) -> Phylo.BaseTree.Clade:
    """
    Return the element at position ``postorder_node_id`` (counting from 0)
    of the tree's post-order traversal.  These IDs are used in .jplace
    formatted files.

    Raises RuntimeError if the traversal ends before that position.
    """
    for position, node in enumerate(tree.find_elements(order='postorder')):
        if position == postorder_node_id:
            return node
    raise RuntimeError(str(postorder_node_id) + " not found.")
def create_tree(self):
    """Construct (once) and return the UPGMA tree for ``self.distances``.

    The result is cached in ``self.tree``; later calls return the cached
    value.  When ``self.distances`` is falsy the cached result is None.
    """
    if hasattr(self, 'tree'):
        return self.tree
    if not self.distances:
        self.tree = None
        return None

    clades = [BaseTree.Clade(None, n) for n in self.distances.names]

    def position_of(name):
        # Index of the first clade carrying this name (IndexError if none).
        return [pos for pos, clade in enumerate(clades) if clade.name == name][0]

    while len(self.distances.names) > 1:
        dist, i, j = _find_min(self.distances)
        pos_i = position_of(i)
        pos_j = position_of(j)
        merged = BaseTree.Clade(0, str(i) + str(j))
        _recalc_height(clades[pos_i], dist)
        _recalc_height(clades[pos_j], dist)
        merged.clades.append(clades[pos_i])
        merged.clades.append(clades[pos_j])
        # Pop the higher position first so the lower one stays valid.
        if pos_j > pos_i:
            higher, lower = pos_j, pos_i
        else:
            higher, lower = pos_i, pos_j
        clades.pop(higher)
        clades.pop(lower)
        clades.append(merged)
        self.distances = _join_clades(i, j, self.distances)

    self.tree = BaseTree.Tree(clades[0])
    return self.tree
def _sub_clade(clade, term_names):
    """Extract a compatible subclade that only contains the given terminal names (PRIVATE).

    The common ancestor of the named terminals is returned as is when it has
    exactly ``len(term_names)`` terminals.  Otherwise a new clade holding only
    the requested terminals is built, regrouped according to the internal
    clades found below that ancestor.
    """
    term_clades = [clade.find_any(name) for name in term_names]
    sub_clade = clade.common_ancestor(term_clades)
    if len(term_names) != sub_clade.count_terminals():
        # The ancestor also holds other terminals: start from a flat clade of
        # the requested terminals and re-create the nesting step by step.
        temp_clade = BaseTree.Clade()
        temp_clade.clades.extend(term_clades)
        for c in sub_clade.find_clades(terminal=False, order="preorder"):
            if c == sub_clade.root:
                continue
            # requested terminals found below this internal clade
            childs = set(c.find_clades(terminal=True)) & set(term_clades)
            if childs:
                for tc in temp_clade.find_clades(terminal=False, order="preorder"):
                    tc_childs = set(tc.clades)
                    tc_new_clades = tc_childs - childs
                    # Only regroup when all of childs are direct children of
                    # tc and something else remains beside them.
                    if childs.issubset(tc_childs) and tc_new_clades:
                        tc.clades = list(tc_new_clades)
                        child_clade = BaseTree.Clade()
                        child_clade.clades.extend(list(childs))
                        tc.clades.append(child_clade)
        sub_clade = temp_clade
    return sub_clade
def adam_consensus(trees, mcmc=False):
    """Search Adam Consensus tree from multiple trees.

    :Parameters:
        trees : list
            list of trees to produce consensus tree or a list of tuples
            output of mcmc if mcmc=True, tuples like
            (tree, number of occurrences in MCMC)
        mcmc : Boolean
            True if parameter trees is a tuple, output of mcmc

    The roots of all trees are handed to ``_part``; its result becomes the
    root of the returned rooted tree.
    """
    if mcmc:
        # keep only the tree of every (tree, count) tuple
        trees = [entry[0] for entry in trees]
    roots = []
    for tree in trees:
        roots.append(tree.root)
    consensus_root = _part(roots)
    return BaseTree.Tree(root=consensus_root, rooted=True)
def merge_closest_edge(adj_map, clade_map, closest_pairs, closest_pair, pair_value, is_distance=True):
    """Merge the two clusters named in ``closest_pair``, updating all maps in place.

    The smaller of the two keys survives as the merged cluster; the other is
    removed from ``adj_map``, ``closest_pairs`` and ``clade_map``.  For every
    other cluster the value towards the merged cluster is the mean of its
    values towards the two originals.  Closest-pair entries that pointed at
    either original are recomputed (minimum when ``is_distance`` is true,
    maximum otherwise).  The surviving key of ``clade_map`` then maps to a
    new clade with branch length ``pair_value`` whose children are the two
    merged clades.
    """
    first, second = closest_pair[0], closest_pair[1]
    if second < first:
        kept, absorbed = second, first
    else:
        kept, absorbed = first, second
    best = min if is_distance else max

    def by_value(item):
        return item[1]

    merged_row = {}
    for cluster, row in adj_map.items():
        if cluster in closest_pair:
            continue
        averaged = (row[kept] + row[absorbed]) / 2
        row[kept] = averaged
        row.pop(absorbed)
        merged_row[cluster] = averaged
        endpoints = (cluster, closest_pairs[cluster][0])
        if kept in endpoints or absorbed in endpoints:
            closest_pairs[cluster] = best(row.items(), key=by_value)

    adj_map.pop(absorbed)
    adj_map[kept] = merged_row
    closest_pairs.pop(absorbed)
    closest_pairs[kept] = best(merged_row.items(), key=by_value)

    absorbed_clade = clade_map.pop(absorbed)
    parent = BaseTree.Clade(
        branch_length=pair_value, clades=[clade_map[kept], absorbed_clade])
    parent.matrix = None
    clade_map[kept] = parent
def nj(self, distance_matrix):
    """Construct and return a Neighbor Joining tree.

    :Parameters:
        distance_matrix : DistanceMatrix
            The distance matrix for tree construction.

    Returns an unrooted ``BaseTree.Tree``.  Matrices with one or two taxa
    are handled as special cases; larger matrices are reduced pair by pair
    until two clades remain, which are then joined.

    Raises ``TypeError`` if ``distance_matrix`` is not a ``DistanceMatrix``.
    """
    if not isinstance(distance_matrix, DistanceMatrix):
        raise TypeError("Must provide a DistanceMatrix object.")

    # make a copy of the distance matrix to be used
    dm = copy.deepcopy(distance_matrix)
    # init terminal clades
    clades = [BaseTree.Clade(None, name) for name in dm.names]
    # init node distance
    node_dist = [0] * len(dm)
    # init minimum index
    min_i = 0
    min_j = 0
    inner_count = 0
    # special cases for Minimum Alignment Matrices
    if len(dm) == 1:
        root = clades[0]

        return BaseTree.Tree(root, rooted=False)
    elif len(dm) == 2:
        # minimum distance will always be [1,0]
        min_i = 1
        min_j = 0
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        # split the single distance evenly between the two terminals
        clade1.branch_length = dm[min_i, min_j] / 2.0
        clade2.branch_length = dm[min_i, min_j] - clade1.branch_length
        inner_clade = BaseTree.Clade(None, "Inner")
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        clades[0] = inner_clade
        root = clades[0]

        return BaseTree.Tree(root, rooted=False)
    while len(dm) > 2:
        # calculate nodeDist: row sum of every node divided by (n - 2)
        for i in range(0, len(dm)):
            node_dist[i] = 0
            for j in range(0, len(dm)):
                node_dist[i] += dm[i, j]
            node_dist[i] = node_dist[i] / (len(dm) - 2)

        # find minimum distance pair
        min_dist = dm[1, 0] - node_dist[1] - node_dist[0]
        min_i = 0
        min_j = 1
        for i in range(1, len(dm)):
            for j in range(0, i):
                temp = dm[i, j] - node_dist[i] - node_dist[j]
                # strict comparison: the first minimum found is kept
                if min_dist > temp:
                    min_dist = temp
                    min_i = i
                    min_j = j
        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        # assign branch length
        clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] - node_dist[min_j]) / 2.0
        clade2.branch_length = dm[min_i, min_j] - clade1.branch_length

        # update node list: the new inner clade takes slot min_j
        clades[min_j] = inner_clade
        del clades[min_i]

        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        for k in range(0, len(dm)):
            if k != min_i and k != min_j:
                dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] - dm[min_i, min_j]) / 2.0

        dm.names[min_j] = "Inner" + str(inner_count)
        del dm[min_i]

    # set the last clade as one of the child of the inner_clade
    root = None
    if clades[0] == inner_clade:
        clades[0].branch_length = 0
        clades[1].branch_length = dm[1, 0]
        clades[0].clades.append(clades[1])
        root = clades[0]
    else:
        clades[0].branch_length = dm[1, 0]
        clades[1].branch_length = 0
        clades[1].clades.append(clades[0])
        root = clades[1]

    return BaseTree.Tree(root, rooted=False)
def main(self, tree_filename, tree_format='newick'):
    # Download the GBIF checklist archive, build a taxonomy tree from its
    # taxon.txt table and write it to tree_filename in tree_format.
    # Synonym rows are only recorded (and later attached to their accepted
    # taxon as alternative labels) when tree_format is 'cdao'.
    # NOTE: Python 2 code (print statements, dict.iteritems).
    col_delimiter = '\t'
    url = 'http://ecat-dev.gbif.org/repository/export/checklist1.zip'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the tables
    extract = 'taxon.txt'
    if os.path.exists(os.path.join(self.data_dir, extract)):
        print 'Using existing copy of %s' % extract
    else:
        print 'Extracting %s from %s...' % (extract, filename)
        archive = zipfile.ZipFile(filename, mode='r')
        archive.extract(extract, path=self.data_dir)
        archive.close()

    # build BioPython clades
    print 'Reading taxonomy...'
    nodes = {}
    with open(os.path.join(self.data_dir, 'taxon.txt')) as taxonomy_file:
        for line in taxonomy_file:
            line = line.strip()
            values = line.split(col_delimiter)
            # only the first seven columns are used; two of them are ignored
            id, parent_id, syn_id, _, name, _, status = values[:7]

            # skip incertae sedis taxa
            if id == '0': continue

            if syn_id and not 'synonym' in status:
                continue
            elif syn_id and 'synonym' in status:
                # keep synonyms as plain tuples; resolved in the second pass
                if tree_format == 'cdao':
                    nodes[id] = ('synonym', name, syn_id)
            elif not syn_id:
                nodes[id] = BaseTree.Clade(name=name)
                # temporary attribute, removed once the node is attached
                nodes[id].parent_id = parent_id

    print 'Found %s OTUs.' % len(nodes)
    # the empty id is the artificial root
    nodes[''] = root_node = BaseTree.Clade()

    # create tree from nodes dictionary
    print 'Building tree...'
    for node_id, this_node in nodes.iteritems():
        if not node_id: continue

        if isinstance(this_node, BaseTree.Clade):
            try:
                parent_node = nodes[this_node.parent_id]
                parent_node.clades.append(this_node)
                del this_node.parent_id
            except (KeyError, AttributeError):
                # nodes whose parent id is unknown are left out of the tree
                pass

        elif this_node[0] == 'synonym':
            _, name, syn_id = this_node
            try:
                accepted_node = nodes[syn_id]
            except KeyError: continue

            # a synonym pointing at another synonym is ignored
            if not isinstance(accepted_node, BaseTree.Clade): continue

            if not hasattr(accepted_node, 'tu_attributes'):
                nodes[syn_id].tu_attributes = []
            nodes[syn_id].tu_attributes.append(
                ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                 Taxonomy.format_rdf_string(name)))
            #print 'Synonym: %s -> %s' % (name, nodes[syn_id].name)

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print 'Writing %s tree to %s...' % (tree_format, tree_filename)
    bp.write([tree], tree_filename, tree_format)

    print 'Done!' ''
def majority_consensus(trees, cutoff=0):
    """Search majority rule consensus tree from multiple trees.

    This is an extended majority rule method, which means that you can set
    any cutoff between 0 ~ 1 instead of 0.5. The default value of cutoff is 0
    to create a relaxed binary consensus tree in any condition (as long as
    one of the provided trees is a binary tree). The branch length of each
    consensus clade in the result consensus tree is the average length of
    all counts for that clade.

    :Parameters:
        trees : iterable
            iterable of trees to produce consensus tree.
        cutoff : number
            clades found in less than this fraction of the trees are
            dropped.

    Raises ``ValueError`` when the most frequent clade does not cover all
    terminals of the first tree.
    """
    tree_iter = iter(trees)
    first_tree = next(tree_iter)

    terms = first_tree.get_terminals()
    bitstr_counts, tree_count = _count_clades(
        itertools.chain([first_tree], tree_iter))

    # Sort bitstrs by descending #occurrences, then #tips, then tip order
    bitstrs = sorted(
        bitstr_counts.keys(),
        key=lambda bitstr: (bitstr_counts[bitstr][0], bitstr.count('1'),
                            str(bitstr)),
        reverse=True)
    root = BaseTree.Clade()
    if bitstrs[0].count('1') == len(terms):
        root.clades.extend(terms)
    else:
        raise ValueError('Taxons in provided trees should be consistent')
    # Make a bitstr-to-clades dict and store root clade
    bitstr_clades = {bitstrs[0]: root}
    # create inner clades
    for bitstr in bitstrs[1:]:
        # apply majority rule
        count_in_trees, branch_length_sum = bitstr_counts[bitstr]
        confidence = 100.0 * count_in_trees / tree_count
        # bitstrs are sorted by occurrence, so everything after the first
        # clade below the cutoff is below it as well
        if confidence < cutoff * 100.0:
            break
        clade_terms = [terms[i] for i in bitstr.index_one()]
        clade = BaseTree.Clade()
        clade.clades.extend(clade_terms)
        clade.confidence = confidence
        clade.branch_length = branch_length_sum / count_in_trees
        bsckeys = sorted(bitstr_clades, key=lambda bs: bs.count('1'),
                         reverse=True)

        # check if current clade is compatible with previous clades and
        # record its possible parent and child clades.
        compatible = True
        parent_bitstr = None
        child_bitstrs = []  # multiple independent childs
        for bs in bsckeys:
            if not bs.iscompatible(bitstr):
                compatible = False
                break
            # assign the closest ancestor as its parent
            # as bsckeys is sorted, it should be the last one
            if bs.contains(bitstr):
                parent_bitstr = bs
            # assign the closest descendant as its child
            # the largest and independent clades
            if (bitstr.contains(bs) and bs != bitstr and
                    all(c.independent(bs) for c in child_bitstrs)):
                child_bitstrs.append(bs)
        if not compatible:
            continue

        if parent_bitstr:
            # insert current clade; remove old bitstring
            parent_clade = bitstr_clades.pop(parent_bitstr)
            # update parent clade childs
            parent_clade.clades = [
                c for c in parent_clade.clades if c not in clade_terms
            ]
            # set current clade as child of parent_clade
            parent_clade.clades.append(clade)
            # update bitstring
            # parent = parent ^ bitstr
            # update clade
            bitstr_clades[parent_bitstr] = parent_clade

        if child_bitstrs:
            # move already-built child clades from the parent into the new
            # clade and drop the terminals they cover from its direct children
            # NOTE(review): parent_clade is only bound when parent_bitstr was
            # set above - confirm a parent is always found at this point.
            remove_list = []
            for c in child_bitstrs:
                remove_list.extend(c.index_one())
                child_clade = bitstr_clades[c]
                parent_clade.clades.remove(child_clade)
                clade.clades.append(child_clade)
            remove_terms = [terms[i] for i in remove_list]
            clade.clades = [c for c in clade.clades if c not in remove_terms]
        # put new clade
        bitstr_clades[bitstr] = clade
        # stop once the number of stored clades reaches the limits below
        if ((len(bitstr_clades) == len(terms) - 1) or
                (len(bitstr_clades) == len(terms) - 2 and len(root.clades) == 3)):
            break
    return BaseTree.Tree(root=root)
def nj_full_gpu(self, distance_matrix):
    """Construct a Neighbor Joining tree, using CUDA kernels inside the loop.

    Same overall procedure as a CPU neighbor-joining loop, but in every
    iteration the per-node distance sums (kernel ``DeviceNodeDist``) and the
    search for the minimum pair (kernel ``findMin``) are run through PyCUDA.

    :Parameters:
        distance_matrix : DistanceMatrix
            The distance matrix for tree construction.

    Returns an unrooted ``BaseTree.Tree``.
    Raises ``TypeError`` if ``distance_matrix`` is not a ``DistanceMatrix``.
    """
    if not isinstance(distance_matrix, DistanceMatrix):
        raise TypeError("Must provide a DistanceMatrix object.")

    # make a copy of the distance matrix to be used
    dm = copy.deepcopy(distance_matrix)
    # init terminal clades
    clades = [BaseTree.Clade(None, name) for name in dm.names]
    # init node distance
    node_dist = [0] * len(dm)
    # init minimum index
    min_i = 0
    min_j = 0
    inner_count = 0
    # timing accumulators; total_time is never updated in this function
    total_time = 0
    total_time2 = 0
    # special cases for Minimum Alignment Matrices
    if len(dm) == 1:
        root = clades[0]

        return BaseTree.Tree(root, rooted=False)
    elif len(dm) == 2:
        # minimum distance will always be [1,0]
        min_i = 1
        min_j = 0
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        clade1.branch_length = dm[min_i, min_j] / 2.0
        clade2.branch_length = dm[min_i, min_j] - clade1.branch_length
        inner_clade = BaseTree.Clade(None, "Inner")
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        clades[0] = inner_clade
        root = clades[0]

        return BaseTree.Tree(root, rooted=False)

    # Kernel 1: one thread per node (indexed along y); adds up the node's
    # entries of the packed lower-triangular matrix and divides by (N - 2).
    mod = SourceModule("""
    #include <stdio.h>
    #include <stdlib.h>
    __global__ void DeviceNodeDist(double *device_dm, double *device_node_dist, int N)
    {
        const int tid = threadIdx.y + blockIdx.y* blockDim.y;
        if (tid >= N) return;
        for(int i = 0; i< N; i++){
            if(tid< i){
                device_node_dist[tid] += device_dm[(i*(i+1))/2 + tid];
            }else{
                device_node_dist[tid] += device_dm[(tid*(tid+1))/2 + i];
            }
        }
        device_node_dist[tid]= (double)(device_node_dist[tid]/ (N-2));
    }""")

    # Kernel 2: each thread scans c consecutive entries of the flattened
    # off-diagonal matrix, maps the flat index back to a pair (x, y) and
    # records its local minimum of dm - node_dist[x] - node_dist[y].
    # NOTE(review): min_dist starts at 0.0 inside the kernel, so only
    # negative values can be selected - confirm this is intended.
    mod1 = SourceModule("""
    __global__ void findMin(double *dm, double *node_dist, long long *index_x, long long *index_y, double *local_min, int c, int l, int dm_length)
    {
        int k = threadIdx.y + blockIdx.y*blockDim.y;
        double min_dist = 0.0;
        int min_x =0;
        int min_y =0;
        int x = 0;
        int y = 0;
        for(int i= k*c ; i< (k+1)*c; i++){
            if(i<l) {
                for(int j=0; j<dm_length; j++){
                    if(i==0){
                        x=1;
                        y=0;
                        break;
                    }else{
                        int t_val = ((j+1)*(j+2))/2 ;
                        if(i < t_val){
                            x=j+1;
                            y= i-(t_val-j-1);
                            break;
                        }else if(i== t_val){
                            x = j+2;
                            y = 0;
                            break;
                        }
                    }
                }
                double temp = dm[i] - (node_dist[x] + node_dist[y] );
                if(min_dist > temp){
                    min_dist = temp;
                    min_x = x;
                    min_y = y;
                }
            }
        }
        local_min[k]=min_dist;
        index_x[k]= min_x;
        index_y[k]= min_y;
    }""")
    # print("Time taken to run SourceModule %s" % (time.time()-in_t1))
    while len(dm) > 2:
        # calculate nodeDist
        host_dm = []  # 1D list for distance matrix
        # flatten the rows of dm.matrix (note: the loop variable shadows the
        # builtin ``list`` for the rest of this function)
        for list in dm.matrix:
            host_dm.extend(list)
        host_dm = np.array(host_dm)
        # host_dm = host_dm.astype(np.float32)
        length = len(dm)
        host_node_dist = np.zeros((length,), dtype=float)
        # host_node_dist = host_node_dist.astype(np.float32)
        ###GPU code
        start = cuda.Event()
        end = cuda.Event()
        # get the optimum block size based on dataset size
        if (length < 128):
            BLOCKSIZE = 128
        elif (length < 256):
            BLOCKSIZE = 256
        elif (length < 512):
            BLOCKSIZE = 512
        else:
            BLOCKSIZE = 1024
        ###Allocate GPU device memory
        device_dm = cuda.mem_alloc(host_dm.nbytes)
        device_node_dist = cuda.mem_alloc(host_node_dist.nbytes)
        ###Memcopy from host to device
        # NOTE(review): only the matrix is copied; the zeroed host_node_dist
        # is never uploaded although the kernel accumulates with += into
        # device_node_dist - confirm the buffer is initialised.
        cuda.memcpy_htod(device_dm, host_dm)

        DeviceNodeDist = mod.get_function("DeviceNodeDist")
        blockDim = (1, BLOCKSIZE, 1)
        # NOTE(review): under Python 3 ``/`` yields a float here - confirm
        # the launch accepts a non-integer grid dimension.
        gridDim = (1, length / BLOCKSIZE + 1, 1)

        start.record()
        DeviceNodeDist(device_dm, device_node_dist, np.int32(length), block=blockDim, grid=gridDim)
        end.record()
        end.synchronize()
        # copy the node distances back into the Python-side list
        node_dist1 = np.empty_like(host_node_dist)
        cuda.memcpy_dtoh(node_dist1, device_node_dist)
        node_dist2 = node_dist1.tolist()
        node_dist[0:len(node_dist2)] = node_dist2
        device_dm.free()

        in_t2 = time.time()
        start1 = cuda.Event()
        end1 = cuda.Event()

        # flatten every row from index 1 on, without its last element
        mat = dm.matrix
        dm_cpu = np.array(mat[1][:-1])
        for i in range(2, len(dm)):
            dm_cpu = np.append(dm_cpu, mat[i][:-1])

        # number of unordered pairs
        combinations = int(((len(dm) - 1) * len(dm)) / 2)
        if combinations < 1024 * 128:
            block_size = int(round((len(dm)) / 2))
        else:
            block_size = 512

        # entries each thread has to scan
        local_count = int(round(combinations / block_size))
        # NOTE(review): the kernel declares the index buffers as long long -
        # confirm that numpy's default int has the same width on the target
        # platform.
        index_x = np.zeros(block_size, dtype=int)
        index_y = np.zeros(block_size, dtype=int)
        min_val = np.zeros(block_size, dtype=float)
        local_min_array_gpu = cuda.mem_alloc(dm_cpu.nbytes)
        local_index_gpux = cuda.mem_alloc(index_x.nbytes)
        local_index_gpuy = cuda.mem_alloc(index_y.nbytes)
        local_min_gpu = cuda.mem_alloc(min_val.nbytes)
        cuda.memcpy_htod(local_min_array_gpu, dm_cpu)
        func = mod1.get_function("findMin")
        start1.record()
        # launched with a block size only; no grid argument is passed
        func(local_min_array_gpu, device_node_dist, local_index_gpux, local_index_gpuy, local_min_gpu,
             np.int32(local_count), np.int32(len(dm_cpu)), np.int32(len(dm)), block=(1, block_size, 1))
        end1.record()
        end1.synchronize()
        cuda.memcpy_dtoh(min_val, local_min_gpu)
        cuda.memcpy_dtoh(index_x, local_index_gpux)
        cuda.memcpy_dtoh(index_y, local_index_gpuy)
        min_val_new = min_val.tolist()
        local_min_array_gpu.free()
        local_min_gpu.free()
        local_index_gpux.free()
        local_index_gpuy.free()
        device_node_dist.free()
        # reduce the per-thread minima on the host; the first thread holding
        # the global minimum supplies the pair
        min_dist = min(min_val_new)
        for i in range(len(min_val)):
            if min_dist == min_val[i]:
                min_i = index_x[i]
                min_j = index_y[i]
                break

        del host_dm
        del host_node_dist
        del dm_cpu
        total_time2 += time.time() - in_t2

        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        # assign branch length
        clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] - node_dist[min_j]) / 2.0
        clade2.branch_length = dm[min_i, min_j] - clade1.branch_length

        # update node list
        clades[min_j] = inner_clade
        del clades[min_i]

        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        for k in range(0, len(dm)):
            if k != min_i and k != min_j:
                dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] - dm[min_i, min_j]) / 2.0

        dm.names[min_j] = "Inner" + str(inner_count)
        del dm[min_i]
    #print("Time taken for min dist node calculation= %s" % total_time2)

    # set the last clade as one of the child of the inner_clade
    root = None
    if clades[0] == inner_clade:
        clades[0].branch_length = 0
        clades[1].branch_length = dm[1, 0]
        clades[0].clades.append(clades[1])
        root = clades[0]
    else:
        clades[0].branch_length = dm[1, 0]
        clades[1].branch_length = 0
        clades[1].clades.append(clades[0])
        root = clades[1]

    return BaseTree.Tree(root, rooted=False)
def main(self, tree_filename, tree_format='newick'):
    # Download the ITIS table dump, build a taxonomy tree from it and write
    # the tree to tree_filename in tree_format.  Synonyms and vernacular
    # names are only read when tree_format is 'cdao'; they end up as
    # alternative labels on the accepted taxon.
    # NOTE: Python 2 code (print statements, dict.iteritems).
    col_delimiter = '|'
    url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the tables
    for extract in ('taxonomic_units', 'longnames', 'synonym_links', 'vernaculars'):
        if os.path.exists(os.path.join(self.data_dir, extract)):
            print 'Using existing copy of %s' % extract
        else:
            print 'Extracting %s from %s...' % (extract, filename)
            archive = tarfile.open(name=filename, mode='r:gz')
            # locate the member by its base name, then rename it so that it
            # is extracted directly into data_dir
            full_extract = [
                x for x in archive.getnames()
                if x.split('/')[-1] == extract
            ][0]
            member = archive.getmember(full_extract)
            member.name = extract
            archive.extract(extract, path=self.data_dir)
            archive.close()

    # get names for all ITIS TSNs from longnames table
    print 'Getting names...'
    names = {}
    with open(os.path.join(self.data_dir, 'longnames')) as names_file:
        for line in names_file:
            line = line.strip()
            values = line.split(col_delimiter)
            # every line must split into exactly two fields
            tax_id, name = values
            names[tax_id] = name

    # read all node info from taxonomic_units
    print 'Reading taxonomy...'
    nodes = {}
    with open(os.path.join(self.data_dir, 'taxonomic_units')) as nodes_file:
        for line in nodes_file:
            line = line.strip()
            values = line.split(col_delimiter)

            # columns 0, 10, 17 and 23 of the table
            (tax_id, usage, parent_id, uncertain_parent) = [
                values[n] for n in (0, 10, 17, 23)
            ]

            #if uncertain_parent: continue
            if not usage in ('accepted', 'valid'): continue

            name = names[tax_id]
            this_node = BaseTree.Clade(name=name)
            nodes[tax_id] = this_node
            # temporary attribute, removed once the node is attached
            this_node.parent_id = parent_id

    other_names = defaultdict(set)
    if tree_format == 'cdao':
        # get synonym definitions
        print 'Getting synonyms...'
        with open(os.path.join(self.data_dir, 'synonym_links')) as synonym_file:
            for line in synonym_file:
                line = line.strip()
                values = line.split(col_delimiter)
                node_id, syn_id, _ = values
                # synonyms are stored as plain tuples, resolved further down
                nodes[node_id] = ('synonym', names[node_id], syn_id)
        with open(os.path.join(self.data_dir, 'vernaculars')) as synonym_file:
            for line in synonym_file:
                line = line.strip()
                values = line.split(col_delimiter)
                tax_id, name = values[:2]
                other_names[tax_id].add(name)

    print 'Found %s OTUs.' % len(nodes)
    # id '0' is the artificial root
    nodes['0'] = root_node = BaseTree.Clade()

    # create tree from nodes dictionary
    print 'Building tree...'
    for node_id, this_node in nodes.iteritems():
        if node_id == '0': continue

        if isinstance(this_node, BaseTree.Clade):
            try:
                parent_node = nodes[this_node.parent_id]
                parent_node.clades.append(this_node)
            except (KeyError, AttributeError):
                # nodes whose parent id is unknown are left out of the tree
                continue

            del this_node.parent_id

            if not hasattr(this_node, 'tu_attributes'):
                this_node.tu_attributes = []
            for name in other_names[node_id]:
                this_node.tu_attributes.append(
                    ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                     Taxonomy.format_rdf_string(name)))

        elif this_node[0] == 'synonym':
            _, name, syn_id = this_node
            try:
                accepted_node = nodes[syn_id]
            except KeyError: continue

            # a synonym pointing at another synonym is ignored
            if not isinstance(accepted_node, BaseTree.Clade): continue

            if not hasattr(accepted_node, 'tu_attributes'):
                nodes[syn_id].tu_attributes = []
            nodes[syn_id].tu_attributes.append(
                ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                 Taxonomy.format_rdf_string(name)))
            #print 'Synonym: %s -> %s' % (name, nodes[syn_id].name)

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print 'Writing %s tree to %s...' % (tree_format, tree_filename)
    bp.write([tree], tree_filename, tree_format)

    print 'Done!' ''
def upgma(self, distance_matrix):
    """Construct a UPGMA tree from a distance matrix.

    The input is deep-copied, so ``distance_matrix`` itself is not modified.
    A second copy, ``dm_count``, holds the weight of every pair of current
    clusters: each pair starts at 1, and when two clusters are merged their
    weights towards every other cluster are added. Distances to the merged
    cluster are the weight-averaged distances of its two parts.

    The wall-clock time spent searching for the minimum entry is added to
    ``self.gap`` (which must already exist on the instance).

    :param distance_matrix: matrix supporting ``len()``, ``dm[i, j]``
        get/set, ``del dm[i]`` and a ``names`` list -- presumably a
        ``Bio.Phylo.TreeConstruction.DistanceMatrix``; verify against callers.
    :returns: a ``BaseTree.Tree`` rooted at the last inner clade created
        (or at the only taxon when the matrix has a single entry).
    :raises ValueError: if the matrix is empty.
    """
    # make a copy of the distance matrix to be used
    dm = copy.deepcopy(distance_matrix)
    if len(dm) == 0:
        raise ValueError("cannot build a UPGMA tree from an empty distance matrix")

    dm_count = copy.deepcopy(dm)
    for i in range(1, len(dm_count)):
        for j in range(i):
            dm_count[i, j] = 1

    # init terminal clades
    clades = [BaseTree.Clade(None, name) for name in dm.names]

    # a single taxon needs no clustering; previously this path fell through
    # to an unbound ``inner_clade``
    if len(dm) == 1:
        clades[0].branch_length = 0
        return BaseTree.Tree(clades[0])

    # init minimum index
    min_i = 0
    min_j = 0
    inner_count = 0
    inner_clade = None
    while len(dm) > 1:
        min_dist = dm[1, 0]
        # find minimum index (timed)
        search_start = time.time()
        for i in range(1, len(dm)):
            for j in range(i):
                # ``>=`` means the last of several equal minima wins, and the
                # (1, 0) cell always resets min_i/min_j on each pass
                if min_dist >= dm[i, j]:
                    min_dist = dm[i, j]
                    min_i = i
                    min_j = j
        self.gap += time.time() - search_start

        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_name = "Inner" + str(inner_count)
        inner_clade = BaseTree.Clade(None, inner_name)
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)

        # assign branch length: half the joining distance, minus the height
        # already accumulated below an internal child
        half_dist = min_dist * 1.0 / 2
        for child in (clade1, clade2):
            if child.is_terminal():
                child.branch_length = half_dist
            else:
                child.branch_length = half_dist - self._height_of(child)

        # update node list
        clades[min_j] = inner_clade
        del clades[min_i]

        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        for k in range(len(dm)):
            if k != min_i and k != min_j:
                weight_i = dm_count[min_i, k]
                weight_j = dm_count[min_j, k]
                total = weight_i + weight_j
                # ``* 1.0`` keeps this a float division on Python 2 even when
                # the matrix holds integers
                dm[min_j, k] = ((dm[min_i, k] * weight_i) +
                                (dm[min_j, k] * weight_j)) * 1.0 / total
                dm_count[min_j, k] = total

        dm_count.names[min_j] = inner_name
        del dm_count[min_i]
        dm.names[min_j] = inner_name
        del dm[min_i]

    inner_clade.branch_length = 0
    return BaseTree.Tree(inner_clade)
def create_tree(self, names, matrix):
    """Build an unrooted neighbor-joining tree and store it in ``self.tree``.

    If ``names`` or ``matrix`` is empty, the current ``self.tree`` is returned
    unchanged. Otherwise a ``DistanceMatrix`` is created from the arguments,
    deep-copied, and reduced by repeatedly joining the pair with the smallest
    Q value until two clusters remain; those two are joined directly.

    :param names: taxon names, passed to ``DistanceMatrix``.
    :param matrix: distances, passed to ``DistanceMatrix`` (the code reads
        ``dm.matrix`` as lower-triangular rows -- presumably the Biopython
        ``DistanceMatrix`` layout; verify against the import).
    :returns: the ``BaseTree.Tree`` now stored in ``self.tree``.
    :raises ValueError: if no joinable pair can be found (e.g. NaN distances).
    """
    if not names or not matrix:
        return self.tree

    # deep copy so the caller's ``names`` / ``matrix`` lists are not mutated
    dm = copy.deepcopy(DistanceMatrix(names, matrix))
    clades = [BaseTree.Clade(None, name) for name in dm.names]

    while len(clades) > 1:
        if len(clades) == 2:
            # final join: hang the second cluster below the first. Done
            # without a Q search, which for a zero distance used to select
            # the diagonal and attach a clade to itself.
            last_clade = clades.pop(1)
            last_clade.branch_length = dm[0][1]
            clades[0].clades.append(last_clade)
            break

        size = len(dm)
        # full symmetric rows and their sums, computed once per iteration
        rows = [dm[idx] for idx in range(size)]
        row_sums = [sum(row) for row in rows]

        # find the pair (min_i < min_j) with the smallest Q value; only
        # distinct pairs are considered, first minimum in row-major order wins
        q_min = float('Inf')
        min_i = None
        min_j = None
        for ind_i in range(size):
            for ind_j in range(ind_i + 1, size):
                q_val = ((size - 2) * rows[ind_j][ind_i]
                         - row_sums[ind_j] - row_sums[ind_i])
                if q_val < q_min:
                    q_min = q_val
                    min_i = ind_i
                    min_j = ind_j
        if min_i is None:
            raise ValueError("no joinable pair found in distance matrix")

        # branch lengths from the joined pair to their new parent
        pair_dist = rows[min_i][min_j]
        dist_i = 0.5 * pair_dist + (
            row_sums[min_i] - row_sums[min_j]) / (2.0 * (size - 2))
        dist_j = pair_dist - dist_i

        clade_i = clades[min_i]
        clade_j = clades[min_j]
        clade_i.branch_length = dist_i
        clade_j.branch_length = dist_j

        joined_name = dm.names[min_i] + dm.names[min_j]
        joined_clade = BaseTree.Clade(None, joined_name)
        joined_clade.clades.append(clade_i)
        joined_clade.clades.append(clade_j)

        # the joined clade takes slot min_j; slot min_i is dropped
        clades[min_j] = joined_clade
        del clades[min_i]

        # distances from the new node to every other node. The entry for
        # min_i is discarded with the row deletion below. Taxon 0 is treated
        # like any other taxon (it used to be forced to distance 0).
        joined_row = [
            0 if k == min_j
            else 0.5 * (rows[min_i][k] + rows[min_j][k] - pair_dist)
            for k in range(size)
        ]
        dm[min_j] = joined_row
        dm.names[min_j] = joined_name
        del dm[min_i]

    self.tree = BaseTree.Tree(clades[0], rooted=False)
    return self.tree
def full_gpu_upgma(self, distance_matrix):
    """Build a UPGMA-style tree, using a CUDA kernel to find the closest pair.

    Works on a deep copy of ``distance_matrix``. On every iteration the
    strictly-lower-triangular part of ``dm.matrix`` is flattened into a NumPy
    array, the ``findMin`` kernel reduces it to per-thread minima, the host
    picks the overall minimum and converts its flat index back to a
    (row, column) pair, and the two clusters are merged exactly as in the
    CPU code path (weighted averaging through ``dm_count``).

    :param distance_matrix: matrix exposing ``names``, ``matrix``, ``len()``,
        ``dm[i, j]`` get/set and ``del dm[i]`` -- presumably a Biopython
        ``DistanceMatrix``; verify against callers.
    :returns: ``BaseTree.Tree`` rooted at the last inner clade created.

    NOTE(review): if the matrix has fewer than two entries the loop never
    runs and ``inner_clade`` is unbound at the end of the function.
    """
    # make a copy of the distance matrix to be used
    dm = copy.deepcopy(distance_matrix)
    # pair weights: every pair of initial taxa starts with weight 1
    dm_count = copy.deepcopy(dm)
    for i in range(1, len(dm_count)):
        for j in range(0, i):
            dm_count[i, j] = 1
    # init terminal clades
    clades = [BaseTree.Clade(None, name) for name in dm.names]
    # init minimum index
    min_i = 0
    min_j = 0
    inner_count = 0
    # GPU kernel to find the minimum index and minimum distance.
    # Thread k scans the flat slice [k*c, (k+1)*c) of dm (bounded by l) and
    # writes its smallest value to local_min[k] and that value's flat index
    # to index[k].
    mod = SourceModule(""" __global__ void findMin(double *dm, long long *index, double *local_min, int c, int l) { int k = threadIdx.y + blockIdx.y*blockDim.y; double min_dist = dm[k*c]; int id = 0 ; for(int i= k*c ; i< (k+1)*c; i++){ if(i<l) { if(min_dist >= dm[i]) { min_dist = dm[i]; id = i; } } } local_min[k]=min_dist; index[k]= id; }""")
    while len(dm) > 1:
        # host array creation
        # NOTE(review): time_gpu_start is assigned but never read in this
        # function
        time_gpu_start = time.time()
        mat = dm.matrix
        # flatten rows 1..n-1, dropping each row's last element (presumably
        # the diagonal of a lower-triangular layout -- TODO confirm)
        dm_cpu = np.array(mat[1][:-1])
        for i in range(2, len(dm)):
            dm_cpu = np.append(dm_cpu, mat[i][:-1])
        # number of flattened entries, n*(n-1)/2
        combinations = int(((len(dm) - 1) * len(dm)) / 2)
        # choose threads per block from the problem size
        if combinations < 1024 * 256:
            block_size = int(round((len(dm)) / 2))
        elif combinations < 1024 * 1024:
            block_size = 512
        else:
            block_size = 1024
        # entries scanned by each thread (passed to the kernel as ``c``)
        local_count = int(round(combinations / block_size))
        if local_count < 1024:
            grid_size = 1
        else:
            grid_size = int(round(local_count / 1024)) + 1
        # NOTE(review): the result buffers hold block_size entries, while the
        # launch below starts block_size * grid_size threads that each write
        # one entry -- looks out of bounds when grid_size > 1; confirm.
        index = np.zeros(block_size, dtype=int)
        min_val = np.zeros(block_size, dtype=float)
        local_min_array_gpu = drv.mem_alloc(dm_cpu.nbytes)
        local_index_gpu = drv.mem_alloc(index.nbytes)
        local_min_gpu = drv.mem_alloc(min_val.nbytes)
        drv.memcpy_htod(local_min_array_gpu, dm_cpu)
        drv.memcpy_htod(local_index_gpu, index)
        drv.memcpy_htod(local_min_gpu, min_val)
        func = mod.get_function("findMin")
        # start.record()
        func(local_min_array_gpu, local_index_gpu, local_min_gpu,
             np.int32(local_count), np.int32(len(dm_cpu)),
             block=(1, block_size, 1), grid=(1, grid_size, 1))
        # end.record()
        # end.synchronize()
        drv.memcpy_dtoh(min_val, local_min_gpu)
        drv.memcpy_dtoh(index, local_index_gpu)
        min_val_new = min_val
        min_val = min_val.tolist()
        # NOTE(review): local_min_array_gpu is not freed explicitly here,
        # unlike the two result buffers
        local_min_gpu.free()
        local_index_gpu.free()
        # overall minimum of the per-thread minima, and the flat index of
        # the first thread that reported it
        min_dist = min(min_val)
        global_id = 0
        for i in range(len(min_val_new)):
            if min_dist == min_val_new[i]:
                global_id = index[i]
                break
        # convert the flat index back to (min_i, min_j); t_val is the flat
        # offset at which row i + 2 starts
        for i in range(1, len(distance_matrix)):
            if global_id == 0:
                min_i = 1
                min_j = 0
                break
            else:
                # NOTE(review): with Python 3 true division t_val is a float,
                # which makes min_j a float below -- confirm the intended
                # interpreter or integer division
                t_val = ((i + 1) * (i + 2)) / 2
                if global_id < t_val:
                    min_i = i + 1
                    min_j = global_id - (t_val - i - 1)
                    break
                elif global_id == t_val:
                    min_i = i + 2
                    min_j = 0
                    break
        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        # assign branch length: half the joining distance, minus the height
        # below an internal child
        if clade1.is_terminal():
            clade1.branch_length = min_dist * 1.0 / 2
        else:
            clade1.branch_length = min_dist * \
                1.0 / 2 - self._height_of(clade1)
        if clade2.is_terminal():
            clade2.branch_length = min_dist * 1.0 / 2
        else:
            clade2.branch_length = min_dist * \
                1.0 / 2 - self._height_of(clade2)
        # update node list
        clades[min_j] = inner_clade
        del clades[min_i]
        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        for k in range(0, len(dm)):
            r = 0
            if k != min_i and k != min_j:
                # weighted average of the two merged rows; r is the combined
                # weight stored for the merged cluster
                r = dm_count[min_i, k] + dm_count[min_j, k]
                dm[min_j, k] = ((dm[min_i, k] * dm_count[min_i, k]) + (dm[min_j, k] * dm_count[min_j, k])) / r
                dm_count[min_j, k] = r
        dm_count.names[min_j] = "Inner" + str(inner_count)
        del dm_count[min_i]
        dm.names[min_j] = "Inner" + str(inner_count)
        del dm[min_i]
    # the last inner clade becomes the root
    inner_clade.branch_length = 0
    del dm_cpu
    return BaseTree.Tree(inner_clade)
def main(self, tree_filename, tree_format='newick', ids=None):
    """Build a taxonomy tree from the NCBI taxdump and write it to a file.

    Steps: download the archive via ``self.download_file``, extract
    ``nodes.dmp`` and ``names.dmp`` into ``self.data_dir`` (skipping files
    that already exist), read all names and nodes, link every node to its
    parent, and write the tree with ``bp.write``. The root is the node whose
    parent id equals its own id.

    :param tree_filename: path of the tree file to write.
    :param tree_format: format name passed through to ``bp.write``; with
        ``'cdao'`` every non-scientific name is attached to its clade as a
        ``skos:altLabel`` entry in ``tu_attributes``.
    :param ids: if truthy, clades are named by taxon id instead of by
        scientific name.
    :raises ValueError: if no self-parented (root) node is present.
    """
    col_delimiter = '\t|\t'
    row_delimiter = '\t|\n'
    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
    alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the text dump
    for extract in ('nodes.dmp', 'names.dmp'):
        if os.path.exists(os.path.join(self.data_dir, extract)):
            print('Using existing copy of %s' % extract)
            continue

        print('Extracting %s from %s...' % (extract, filename))
        archive = tarfile.open(name=filename, mode='r:gz')
        try:
            archive.extract(extract, path=self.data_dir)
        finally:
            # always release the archive handle, even if extraction failed
            archive.close()

    # get names for all tax_ids from names.dmp
    print('Getting names...')
    scientific_names = {}
    other_names = defaultdict(set)
    with open(os.path.join(self.data_dir, 'names.dmp')) as names_file:
        for line in names_file:
            # rstrip treats row_delimiter as a set of characters to strip
            line = line.rstrip(row_delimiter)
            values = line.split(col_delimiter)
            tax_id, name_txt, _, name_type = values[:4]
            if name_type == 'scientific name':
                scientific_names[tax_id] = name_txt
            else:
                other_names[tax_id].add(name_txt)

    # read all node info from nodes.dmp
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'nodes.dmp')) as nodes_file:
        for line in nodes_file:
            line = line.rstrip(row_delimiter)
            values = line.split(col_delimiter)
            tax_id, parent_id = values[:2]

            clade_name = tax_id if ids else scientific_names[tax_id]
            this_node = BaseTree.Clade(name=clade_name)
            nodes[tax_id] = this_node
            # remembered temporarily; removed again once the node is linked
            this_node.parent_id = parent_id

            if tree_format == 'cdao':
                # add common names, synonyms, misspellings, etc. as
                # skos:altLabels
                if not hasattr(this_node, 'tu_attributes'):
                    this_node.tu_attributes = []
                for x in other_names[tax_id]:
                    this_node.tu_attributes.append(
                        (alt_label, Taxonomy.format_rdf_string(x)))

    print('Found %s OTUs.' % len(nodes))

    # create tree from nodes dictionary
    print('Building tree...')
    root_node = None
    for node_id, this_node in nodes.iteritems():
        if node_id == this_node.parent_id:
            root_node = this_node
            print('Found root.')
        else:
            parent_node = nodes[this_node.parent_id]
            parent_node.clades.append(this_node)
        del this_node.parent_id

    if root_node is None:
        # previously surfaced as an unbound-variable error further down
        raise ValueError('no root node (tax_id equal to its parent id) found')

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)

    print('Done!')
def makeNj(score, otuName):
    """Build an unrooted neighbor-joining tree from a square distance matrix.

    Distances are rounded to 6 decimals. The function works on private copies,
    so neither ``score`` nor ``otuName`` is modified (earlier versions rounded
    and shrank the caller's matrix in place and overwrote entries of
    ``otuName``).

    :param score: square list-of-lists of pairwise distances,
        ``score[i][j]`` being the distance between OTU ``i`` and OTU ``j``.
    :param otuName: OTU names, in the same order as the rows of ``score``.
    :returns: ``BaseTree.Tree`` with ``rooted=False``.
    :raises ValueError: if ``score`` is empty.
    """
    size = len(score)
    if size == 0:
        raise ValueError("makeNj requires at least one OTU")

    # private, rounded, square working copy of the matrix
    score = [[round(row[j], 6) for j in range(size)] for row in score]
    clades = [BaseTree.Clade(None, name) for name in otuName]

    # a single OTU is its own tree
    if size == 1:
        return BaseTree.Tree(clades[0], rooted=False)

    # init node distance
    node_dist = [0] * size
    # init minimum index
    min_i = 0
    min_j = 0
    inner_count = 0
    # stays None when only two OTUs are given and no join is performed
    inner_clade = None
    while len(score) > 2:
        # calculate nodeDist
        for i in range(len(score)):
            node_dist[i] = 0
            for j in range(len(score)):
                node_dist[i] += score[i][j]
            node_dist[i] = node_dist[i] / (len(score) - 2)

        # find minimum distance pair
        min_dist = score[1][0] - node_dist[1] - node_dist[0]
        min_i = 0
        min_j = 1
        for i in range(1, len(score)):
            for j in range(i):
                temp = score[i][j] - node_dist[i] - node_dist[j]
                if min_dist > temp:
                    min_dist = temp
                    min_i = i
                    min_j = j

        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)

        # assign branch length
        clade1.branch_length = (score[min_i][min_j] + node_dist[min_i] -
                                node_dist[min_j]) / 2.0
        clade2.branch_length = score[min_i][min_j] - clade1.branch_length

        # update node list
        clades[min_j] = inner_clade
        del clades[min_i]

        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        for k in range(len(score)):
            if k != min_i and k != min_j:
                score[min_j][k] = (score[min_i][k] + score[min_j][k] -
                                   score[min_i][min_j]) / 2.0
                score[k][min_j] = score[min_j][k]

        # drop row and column min_i
        del score[min_i]
        for row in score:
            del row[min_i]

    # set the last clade as one of the child of the inner_clade
    if clades[0] is inner_clade:
        clades[0].branch_length = 0
        clades[1].branch_length = score[1][0]
        clades[0].clades.append(clades[1])
        root = clades[0]
    else:
        clades[0].branch_length = score[1][0]
        clades[1].branch_length = 0
        clades[1].clades.append(clades[0])
        root = clades[1]

    return BaseTree.Tree(root, rooted=False)
def makeUpgma(score, otuName):
    """Build a rooted tree by agglomerative clustering of a distance matrix.

    At every step the closest pair is merged under a new ``InnerN`` clade, and
    the distance from the merged cluster to any other cluster is the plain
    average of the two former distances. Distances are rounded to 6 decimals.

    The function works on a private copy of the matrix, so neither ``score``
    nor ``otuName`` is modified (earlier versions rounded and shrank the
    caller's matrix in place and overwrote entries of ``otuName``).

    :param score: square list-of-lists of pairwise distances.
    :param otuName: OTU names, in the same order as the rows of ``score``.
    :returns: ``BaseTree.Tree`` rooted at the last inner clade (or at the
        only OTU when a single one is given).
    :raises ValueError: if ``score`` is empty.
    """
    size = len(score)
    if size == 0:
        raise ValueError("makeUpgma requires at least one OTU")

    # private, rounded, square working copy of the matrix
    score = [[round(row[j], 6) for j in range(size)] for row in score]
    clades = [BaseTree.Clade(None, name) for name in otuName]

    # a single OTU needs no clustering; previously this path fell through to
    # an unbound ``inner_clade``
    if size == 1:
        clades[0].branch_length = 0
        return BaseTree.Tree(clades[0])

    # init minimum index
    min_i = 0
    min_j = 0
    inner_count = 0
    inner_clade = None
    while len(score) > 1:
        min_dist = score[1][0]
        # find minimum index; ``>=`` means the last of several equal minima
        # wins and the (1, 0) cell always resets min_i/min_j on each pass
        for i in range(1, len(score)):
            for j in range(i):
                if min_dist >= score[i][j]:
                    min_dist = score[i][j]
                    min_i = i
                    min_j = j

        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)

        # assign branch length: half the joining distance, minus the height
        # already accumulated below an internal child (``height_of`` is a
        # module-level helper defined elsewhere in this file)
        half_dist = min_dist * 1.0 / 2
        for child in (clade1, clade2):
            if child.is_terminal():
                child.branch_length = half_dist
            else:
                child.branch_length = half_dist - height_of(child)

        # update node list
        clades[min_j] = inner_clade
        del clades[min_i]

        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        for k in range(len(score)):
            if k != min_i and k != min_j:
                score[min_j][k] = (score[min_i][k] + score[min_j][k]) * 1.0 / 2
                score[k][min_j] = score[min_j][k]

        # drop row and column min_i
        del score[min_i]
        for row in score:
            del row[min_i]

    inner_clade.branch_length = 0
    return BaseTree.Tree(inner_clade)