class TrackedItem(object):
    """Record that couples accumulated statistics with a tree node.

    The constructor sets ``name`` (empty string), ``parent`` (``None``),
    ``data`` (a ``DataStore(float)``), ``leaf`` (``False``) and ``node``
    (a fresh ``Tree``).
    """

    def __init__(self):
        self.name = ''
        self.parent = None
        self.data = DataStore(float)
        self.leaf = False
        self.node = Tree()

    @property
    def root(self):
        """Return the topmost item reached by following ``parent`` links."""
        if self.parent:
            return self.parent.root
        return self

    def update_stats(self, name, parent, data, sf):
        """Merge ``data`` into this item and mirror the totals on its node.

        ``name`` becomes the label of both the item and its node.  When a
        truthy ``parent`` is given and this item's node is not yet among the
        parent's node children, the item is linked below that parent.  The
        node then receives a ``weight`` feature taken from ``self.data[sf]``
        plus one feature per key of ``self.data``.
        """
        self.data.merge(data)
        self.name = name
        self.node.name = name
        self.node.item = self

        must_attach = parent and self.node not in parent.node.children
        if must_attach:
            self.parent = parent
            parent.node.add_child(self.node)

        self.node.add_feature("weight", self.data[sf])
        for stat_key in self.data:
            self.node.add_feature(stat_key, self.data[stat_key])

    def __str__(self):
        """Render as ``"<name>: <int value> <key>,<int value> <key>,..."``."""
        pieces = []
        for stat_key in self.data:
            pieces.append("%d %s" % (self.data[stat_key], stat_key))
        return "%s: %s" % (self.name, ','.join(pieces))
def getGenera(taxonomy_queryset, only_id=False):
    """
    Build a tree of genera, each holding its species, from a spatial queryset.

    Parameters
    ----------
    taxonomy_queryset : gbif.models / GeoquerySet
        Object exposing ``species`` (must support
        ``filter(genus_id__exact=...)``) and ``genera``.  Genus rows are read
        through the keys ``genus_id``, ``name``, ``ab`` and ``points``;
        species rows through ``species_id``, ``name``, ``ab`` and ``points``.
    only_id : bool
        When False (default) nodes are named with the taxon name (species
        names are cut to their first two space-separated words).  When True
        nodes are named with the numeric id instead, which avoids carrying
        long name strings on big data sets.

    Returns
    -------
    Tree
        Root node named ``genus_root`` with one child per genus; every genus
        node has one child per species.
    """
    tax = taxonomy_queryset
    sps = tax.species
    genera = tax.genera
    family_tree = Tree(name='genus_root')
    for genus in genera:
        genus_id = genus['genus_id']
        genus_name = genus_id if only_id else genus['name']
        gn_t = Tree(name=genus_name, support=genus['ab'])
        gn_t.add_feature('genus_id', genus_id)
        gn_t.add_feature('level', 'genus')
        gn_t.add_feature('points', genus['points'])
        for specie in sps.filter(genus_id__exact=genus_id):
            if only_id:
                species_name = specie['species_id']
            else:
                # Keep at most the first two words; a single-word name is
                # used as is instead of raising IndexError.
                species_name = ' '.join(specie['name'].split(' ')[:2])
            s = Tree(name=species_name, support=specie['ab'])
            s.add_feature('species_id', specie['species_id'])
            s.add_feature('level', 'species')
            s.add_feature('points', specie['points'])
            gn_t.add_child(child=s)
        family_tree.add_child(child=gn_t)
    return family_tree
def getGenera(taxonomy_queryset, only_id=False):
    """
    Collapse all *species* of a spatial queryset into a genus-level tree.

    Parameters
    ----------
    taxonomy_queryset : gbif.models / GeoquerySet
        Object exposing ``species`` (must support
        ``filter(genus_id__exact=...)``) and ``genera``.  Genus rows are read
        through the keys ``genus_id``, ``name``, ``ab`` and ``points``;
        species rows through ``species_id``, ``name``, ``ab`` and ``points``.
    only_id : bool
        False (default): name every node with the taxon name; species names
        are reduced to their first two space-separated words.
        True: name every node with its id, which keeps memory use low on big
        data sets.

    Returns
    -------
    Tree
        Root node named ``genus_root``; children are genus nodes whose own
        children are species nodes.
    """
    tax = taxonomy_queryset
    sps = tax.species
    genera = tax.genera
    family_tree = Tree(name='genus_root')
    for genus in genera:
        genus_id = genus['genus_id']
        if only_id:
            node_name = genus_id
        else:
            node_name = genus['name']
        gn_t = Tree(name=node_name, support=genus['ab'])
        gn_t.add_feature('genus_id', genus_id)
        gn_t.add_feature('level', 'genus')
        gn_t.add_feature('points', genus['points'])
        sp_by_gns = sps.filter(genus_id__exact=genus_id)
        for specie in sp_by_gns:
            if only_id:
                node_name = specie['species_id']
            else:
                # A one-word name no longer raises IndexError: slicing keeps
                # whatever words exist, up to two.
                words = specie['name'].split(' ')
                node_name = ' '.join(words[:2])
            s = Tree(name=node_name, support=specie['ab'])
            s.add_feature('species_id', specie['species_id'])
            s.add_feature('level', 'species')
            s.add_feature('points', specie['points'])
            gn_t.add_child(child=s)
        family_tree.add_child(child=gn_t)
    return family_tree
def getClasses(taxonomic_queryset, orders_tree, only_id=False):
    """
    Collapse all *classes* of a spatial queryset into a tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Object exposing ``classes`` and ``orders`` (the latter must support
        ``filter(parent_id__exact=...)``).  Class rows are read through the
        keys ``class_id``, ``name``, ``ab`` and ``points``; order rows through
        ``order_id``.
    orders_tree : Tree
        Tree whose direct children are the order branches; every child must
        carry an ``order_id`` attribute.
    only_id : bool
        False (default): name class nodes with the class name.
        True: name them with the class id (lighter on memory).

    Returns
    -------
    Tree
        Root node named ``phylum_root`` with one child per class; each class
        node has the matching order branches attached as children.

    Raises
    ------
    LookupError
        If an order of a class has no branch in ``orders_tree``.
    """
    tax = taxonomic_queryset
    classes = tax.classes
    orders = tax.orders

    # Index the order branches once instead of rescanning the children for
    # every order.  On duplicated ids the first branch wins.
    branch_by_order_id = {}
    for child in orders_tree.get_children():
        branch_by_order_id.setdefault(child.order_id, child)

    phylumTree = Tree(name='phylum_root')
    logger.info("[gbif.buildtree] Collapsing Classes")
    for class_ in classes:
        class_id = class_['class_id']
        name = class_id if only_id else class_['name']
        classTree = Tree(name=name, support=class_['ab'])
        classTree.add_feature('class_id', class_id)
        classTree.add_feature('level', 'class')
        classTree.add_feature('points', class_['points'])
        for order in orders.filter(parent_id__exact=class_id):
            id_o = order['order_id']
            branch = branch_by_order_id.get(id_o)
            if branch is None:
                raise LookupError(
                    "order_id %r has no branch in orders_tree" % (id_o,))
            # Attach the order branch to its class
            classTree.add_child(child=branch)
        phylumTree.add_child(child=classTree)
    return phylumTree
def getClasses(taxonomic_queryset, orders_tree, only_id=False):
    """
    Collapse all *classes* of a spatial queryset into a tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Object exposing ``classes`` and ``orders`` (the latter must support
        ``filter(parent_id__exact=...)``).  Class rows are read through the
        keys ``parent_id``, ``class_id``, ``name``, ``ab`` and ``points``;
        order rows through ``order_id``.
    orders_tree : Tree
        Tree whose direct children are the order branches; every child must
        carry an ``order_id`` attribute.
    only_id : bool
        False (default): name class nodes with the class name.
        True: name them with the class id (lighter on memory).

    Returns
    -------
    Tree
        Root node named ``phylum_root`` with one child per class.  Class
        nodes carry the features ``id``, ``abundance``, ``parent_id``,
        ``class_id``, ``level`` and ``points`` and have the matching order
        branches attached as children.

    Raises
    ------
    LookupError
        If an order of a class has no branch in ``orders_tree``.
    """
    tax = taxonomic_queryset
    classes = tax.classes
    orders = tax.orders

    # Index the order branches once instead of rescanning the children for
    # every order.  On duplicated ids the first branch wins.
    branch_by_order_id = {}
    for child in orders_tree.get_children():
        branch_by_order_id.setdefault(child.order_id, child)

    phylumTree = Tree(name='phylum_root')
    logger.info("[gbif.buildtree] Collapsing Classes")
    for class_ in classes:
        phylum_id = class_['parent_id']
        class_id = class_['class_id']
        name = class_id if only_id else class_['name']
        ab = class_['ab']
        classTree = Tree(name=name, support=ab)
        classTree.add_feature('id', class_id)
        classTree.add_feature('abundance', ab)
        classTree.add_feature('parent_id', phylum_id)
        classTree.add_feature('class_id', class_id)
        classTree.add_feature('level', 'class')
        classTree.add_feature('points', class_['points'])
        for order in orders.filter(parent_id__exact=class_id):
            id_o = order['order_id']
            branch = branch_by_order_id.get(id_o)
            if branch is None:
                raise LookupError(
                    "order_id %r has no branch in orders_tree" % (id_o,))
            # Attach the order branch to its class
            classTree.add_child(child=branch)
        phylumTree.add_child(child=classTree)
    return phylumTree
def buildTree(taxid_list, nodes_dict, taxids_remove, cursor):
    """Recursively build an ete tree from a list of taxids.

    Each call looks up ``taxid_list`` with ``query_a_list``, creates or
    completes one node per returned row, hangs it below its parent node and
    then recurses on the set of parents until only taxid ``1`` (the root of
    the NCBI tree) is left.

    Requires a cursor connected to a sqlite db build using the script
    /users/rg/didac/NCBI/Taxonomy/update_sqlite_DB.py

    Parameters
    ----------
    taxid_list : list
        Taxids to look up at this level.
    nodes_dict : dict
        taxid -> node cache, filled in place.  Pass an empty dict on the
        first call.
    taxids_remove : list
        Extended in place with the taxids that returned no row.  Pass an
        empty list on the first call.
    cursor
        Database cursor handed to ``query_a_list``.

    Returns
    -------
    tuple
        ``(root_node, nodes_dict, taxids_remove)`` where ``root_node`` is
        ``nodes_dict[1]``.
    """
    results = query_a_list(taxid_list, cursor)

    # check if all taxids returned a result
    if len(set(taxid_list)) != len(results):
        taxids_with_result = set(x[0] for x in results)
        taxids_remove += list(set(map(int, taxid_list)) - taxids_with_result)

    parent_taxids = set()
    for taxid, parent_taxid, rank, name in results:
        parent_taxids.add(parent_taxid)

        if taxid not in nodes_dict:
            c = Tree()
            c.add_feature('name', name)
            nodes_dict[taxid] = c
        node = nodes_dict[taxid]
        # A node first created as somebody's parent only knows its taxid;
        # name and rank are filled in when it is queried itself.
        node.add_features(name=name, taxid=taxid, rank=rank)

        if parent_taxid not in nodes_dict:
            p = Tree()
            p.add_feature('taxid', parent_taxid)
            p.add_child(node)
            nodes_dict[parent_taxid] = p
        else:
            # Attach only if the taxid is not already somewhere below the
            # parent.
            parent = nodes_dict[parent_taxid]
            already_below = any(taxid == descendant.taxid
                                for descendant in parent.iter_descendants())
            if not already_below:
                parent.add_child(node)

    # "1" is the root of the NCBI tree: it is never queried as a child, so
    # the recursion stops once it is the only parent left.
    parent_taxids.discard(1)

    if not parent_taxids:
        nodes_dict[1].add_features(name='Root', rank='Root')
        return nodes_dict[1], nodes_dict, taxids_remove

    t, nodes_dict, taxids_remove = buildTree(
        list(parent_taxids), nodes_dict, taxids_remove, cursor)
    return t, nodes_dict, taxids_remove
def getFamilies(taxonomic_queryset, genera_tree, only_id=False):
    """
    Collapse all *families* of a spatial queryset into a tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Object exposing ``families`` and ``genera`` (the latter must support
        ``filter(parent_id__exact=...)``).  Family rows are read through the
        keys ``parent_id``, ``family_id``, ``name``, ``ab`` and ``points``;
        genus rows through ``genus_id``.
    genera_tree : Tree
        Tree whose direct children are the genus branches (as produced by
        getGenera); every child must carry a ``genus_id`` attribute.
    only_id : bool
        False (default): name family nodes with the family name.
        True: name them with the family id (lighter on memory).

    Returns
    -------
    Tree
        Root node named ``order_root`` with one child per family.  Family
        nodes carry the features ``abundance``, ``id``, ``parent_id``,
        ``family_id``, ``level`` and ``points`` and have the matching genus
        branches attached as children.

    Raises
    ------
    LookupError
        If a genus of a family has no branch in ``genera_tree``.
    """
    tax = taxonomic_queryset
    families = tax.families
    genera = tax.genera

    # Index the genus branches once instead of rescanning the children for
    # every genus.  On duplicated ids the first branch wins.
    branch_by_genus_id = {}
    for child in genera_tree.get_children():
        branch_by_genus_id.setdefault(child.genus_id, child)

    orders_tree = Tree(name='order_root')
    for family in families:
        order_id = family['parent_id']
        family_id = family['family_id']
        name = family_id if only_id else family['name']
        ab = family['ab']
        famTree = Tree(name=name, support=ab)
        famTree.add_feature('abundance', ab)
        famTree.add_feature('id', family_id)
        famTree.add_feature('parent_id', order_id)
        famTree.add_feature('family_id', family_id)
        famTree.add_feature('level', 'family')
        famTree.add_feature('points', family['points'])
        for genus in genera.filter(parent_id__exact=family_id):
            id_g = genus['genus_id']
            branch = branch_by_genus_id.get(id_g)
            if branch is None:
                raise LookupError(
                    "genus_id %r has no branch in genera_tree" % (id_g,))
            # Attach the genus branch to its family
            famTree.add_child(child=branch)
        orders_tree.add_child(child=famTree)
    return orders_tree
def split_rcm(rcm):
    """Create the root node for ``rcm`` and split it with ``_split_rcm``.

    The id generator is reset first, then a root node spanning
    ``[0, rcm.shape[0])`` is created (named ``"0-<n>"``, with ``startpos``
    and ``endpos`` features) and handed to ``_split_rcm``.  Returns the root
    node.
    """
    size = rcm.shape[0]
    idgen.reset()

    top = Tree()
    top.node_id = idgen.generate()
    top.name = "%d-%d" % (0, size)
    for feature, value in (("startpos", 0), ("endpos", size)):
        top.add_feature(feature, value)

    _split_rcm(rcm, top)
    return top
def load_label_tree(noffset_parentidx, noffsets):
    """Build a label tree rooted at the WordNet synset ``physical_entity.n.01``.

    ``noffsets`` is walked in order; for each entry ``noffset_parentidx``
    gives the index (into ``noffsets``) of its parent, or ``-1`` when the
    node hangs directly below the root.  A parent must therefore appear
    before its children.

    Returns ``(prune_root(root), noffset_node)`` where ``noffset_node`` maps
    each noffset to the node created for it.
    """
    top = Tree()
    top_synset = wn.synset('physical_entity.n.01')
    top.name = top_synset.name()
    top.add_feature('synset', top_synset)

    node_by_noffset = {}
    for noffset in noffsets:
        parent_index = noffset_parentidx[noffset]
        if parent_index != -1:
            attach_to = node_by_noffset[noffsets[parent_index]]
        else:
            attach_to = top
        node_by_noffset[noffset] = attach_to.add_child(name=noffset)

    return prune_root(top), node_by_noffset
def parseNodesDump(sfin_node, sfin_name, sfout):
    """Build the taxonomic tree from node and name dumps and write it out.

    Parameters
    ----------
    sfin_node : str
        Path of the node file.  Each line is split on ``|``; the fields used
        are 0 (node id), 1 (parent id) and 2 (rank).
    sfin_name : str
        Path of the name file.  Each line is split on ``|``; field 0 is the
        node id and field 1 its name.
    sfout : str
        NOTE(review): accepted but not used; the tree is always written to
        ``tree_of_life.tree`` in the working directory.  Left as is because
        callers may rely on that file name.

    The tree is grown breadth-first from the node with id ``"1"`` and written
    in newick format 8 with the ``rank`` and ``id`` features.
    """
    nodes_rank_map = {}
    nodes_name_map = {}
    father_son_map = {}

    # 'with' guarantees the handle is closed even if reading fails.
    with open(sfin_node) as fin:
        lines = fin.readlines()
    print("Number nodes:" + repr(len(lines)))

    for line in lines:
        toks = line.strip().split("|")
        son = toks[0]
        father = toks[1]
        nodes_rank_map[son] = toks[2]
        # A node that is its own parent is the root: no edge for it.
        if father != son:
            father_son_map.setdefault(father, []).append(son)

    cnt = sum(len(sons) for sons in father_son_map.values())
    print("Nodes count:" + repr(cnt))
    print("Nodes rank count:" + repr(len(nodes_rank_map)))

    with open(sfin_name) as fnames:
        for line in fnames:
            items = line.strip().split("|")
            nodes_name_map[items[0]] = items[1]

    # construct the NCBI taxonomic tree
    t0 = Tree()
    t0.add_feature("id", "1")
    t0.add_feature("rank", "god")
    t0.add_feature("name", "root")

    k = 0  # number of nodes visited
    nodesque = deque([t0])
    while nodesque:
        p = nodesque.popleft()
        k += 1
        for son in father_son_map.get(p.id, []):
            newnode = p.add_child()
            newnode.add_feature("id", son)
            newnode.add_feature("rank", nodes_rank_map.get(son, "no_rank"))
            newnode.add_feature("name", nodes_name_map.get(son, "no_name"))
            nodesque.append(newnode)
    print(k)

    t0.write(outfile="tree_of_life.tree", format=8, features=["rank", "id"])
def parseNodesDump_idonly(sfin_node):
    """Build an id-only taxonomic tree from a node dump and write it out.

    Parameters
    ----------
    sfin_node : str
        Path of the node file.  Each line is split on ``|``; field 0 is the
        node id and field 1 the parent id.  Further fields are ignored.

    The tree is grown breadth-first from the node named ``"1"``; every node
    is named with its id.  The result is written in newick format 8 to
    ``tree_of_life_id.tree`` in the working directory.
    """
    father_son_map = {}

    # 'with' guarantees the handle is closed even if reading fails.
    with open(sfin_node) as fin:
        lines = fin.readlines()
    print("Number nodes:" + repr(len(lines)))

    for line in lines:
        toks = line.strip().split("|")
        son = toks[0]
        father = toks[1]
        # A node that is its own parent is the root: no edge for it.
        if father != son:
            father_son_map.setdefault(father, []).append(son)

    cnt = sum(len(sons) for sons in father_son_map.values())
    print("Nodes count:" + repr(cnt))

    t0 = Tree()
    t0.add_feature("name", "1")

    k = 0  # number of nodes visited
    nodesque = deque([t0])
    while nodesque:
        p = nodesque.popleft()
        k += 1
        for son in father_son_map.get(p.name, []):
            nodesque.append(p.add_child(name=son))
    print(k)

    t0.write(outfile="tree_of_life_id.tree", format=8)
def getKingdoms(taxonomic_queryset, phyla_tree, only_id=False):
    """
    Collapse all *kingdoms* of a spatial queryset into a tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Object exposing ``kingdoms`` and ``phyla`` (the latter must support
        ``filter(parent_id__exact=...)``).  Kingdom rows are read through the
        keys ``kingdom_id``, ``name``, ``ab`` and ``points``; phylum rows
        through ``phylum_id``.
    phyla_tree : Tree
        Tree whose direct children are the phylum branches; every child must
        carry a ``phylum_id`` attribute.
    only_id : bool
        False (default): name kingdom nodes with the kingdom name.
        True: name them with the kingdom id (lighter on memory).

    Returns
    -------
    Tree
        Root node named ``Life`` with one child per kingdom; each kingdom
        node has the matching phylum branches attached as children.

    Raises
    ------
    LookupError
        If a phylum of a kingdom has no branch in ``phyla_tree``.
    """
    tax = taxonomic_queryset
    kingdoms = tax.kingdoms
    phyla = tax.phyla

    # Index the phylum branches once instead of rescanning the children for
    # every phylum.  On duplicated ids the first branch wins.
    branch_by_phylum_id = {}
    for child in phyla_tree.get_children():
        branch_by_phylum_id.setdefault(child.phylum_id, child)

    TreeOfLife = Tree(name='Life')
    logger.info("[gbif.buildtree] Collapsing Kingdoms")
    for kingdom in kingdoms:
        kingdom_id = kingdom['kingdom_id']
        name = kingdom_id if only_id else kingdom['name']
        kingdomTree = Tree(name=name, support=kingdom['ab'])
        kingdomTree.add_feature('kingdom_id', kingdom_id)
        kingdomTree.add_feature('level', 'kingdom')
        kingdomTree.add_feature('points', kingdom['points'])
        for phylum in phyla.filter(parent_id__exact=kingdom_id):
            id_p = phylum['phylum_id']
            branch = branch_by_phylum_id.get(id_p)
            if branch is None:
                raise LookupError(
                    "phylum_id %r has no branch in phyla_tree" % (id_p,))
            # Attach the phylum branch to its kingdom
            kingdomTree.add_child(child=branch)
        TreeOfLife.add_child(child=kingdomTree)
    return TreeOfLife
def getKingdoms(taxonomic_queryset, phyla_tree, only_id=False):
    """
    Collapse all *kingdoms* of a spatial queryset into a tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Object exposing ``kingdoms`` and ``phyla`` (the latter must support
        ``filter(parent_id__exact=...)``).  Kingdom rows are read through the
        keys ``kingdom_id``, ``name``, ``ab`` and ``points``; phylum rows
        through ``phylum_id``.
    phyla_tree : Tree
        Tree whose direct children are the phylum branches; every child must
        carry a ``phylum_id`` attribute.
    only_id : bool
        False (default): name kingdom nodes with the kingdom name.
        True: name them with the kingdom id (lighter on memory).

    Returns
    -------
    Tree
        Root node named ``Life`` with one child per kingdom; each kingdom
        node has the matching phylum branches attached as children.

    Raises
    ------
    LookupError
        If a phylum of a kingdom has no branch in ``phyla_tree``.
    """
    tax = taxonomic_queryset
    kingdoms = tax.kingdoms
    phyla = tax.phyla

    # One pass over the phylum branches replaces a full rescan per phylum.
    # On duplicated ids the first branch wins.
    phylum_branches = {}
    for child in phyla_tree.get_children():
        phylum_branches.setdefault(child.phylum_id, child)

    TreeOfLife = Tree(name='Life')
    logger.info("[gbif.buildtree] Collapsing Kingdoms")
    for kingdom in kingdoms:
        kingdom_id = kingdom['kingdom_id']
        if only_id:
            name = kingdom_id
        else:
            name = kingdom['name']
        kingdomTree = Tree(name=name, support=kingdom['ab'])
        kingdomTree.add_feature('kingdom_id', kingdom_id)
        kingdomTree.add_feature('level', 'kingdom')
        kingdomTree.add_feature('points', kingdom['points'])
        phyla_by_kingdom = phyla.filter(parent_id__exact=kingdom_id)
        for phylum in phyla_by_kingdom:
            id_p = phylum['phylum_id']
            branch = phylum_branches.get(id_p)
            if branch is None:
                raise LookupError(
                    "phylum_id %r has no branch in phyla_tree" % (id_p,))
            # Attach the phylum branch to its kingdom
            kingdomTree.add_child(child=branch)
        TreeOfLife.add_child(child=kingdomTree)
    return TreeOfLife
def getFamilies(taxonomic_queryset, genera_tree, only_id=False):
    """
    Collapse all *families* of a spatial queryset into a tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Object exposing ``families`` and ``genera`` (the latter must support
        ``filter(parent_id__exact=...)``).  Family rows are read through the
        keys ``family_id``, ``name``, ``ab`` and ``points``; genus rows
        through ``genus_id``.
    genera_tree : Tree
        Tree whose direct children are the genus branches (as produced by
        getGenera); every child must carry a ``genus_id`` attribute.
    only_id : bool
        False (default): name family nodes with the family name.
        True: name them with the family id (lighter on memory).

    Returns
    -------
    Tree
        Root node named ``order_root`` with one child per family; each
        family node has the matching genus branches attached as children.

    Raises
    ------
    LookupError
        If a genus of a family has no branch in ``genera_tree``.
    """
    tax = taxonomic_queryset
    families = tax.families
    genera = tax.genera

    # Index the genus branches once instead of rescanning the children for
    # every genus.  On duplicated ids the first branch wins.
    branch_by_genus_id = {}
    for child in genera_tree.get_children():
        branch_by_genus_id.setdefault(child.genus_id, child)

    orders_tree = Tree(name='order_root')
    for family in families:
        family_id = family['family_id']
        name = family_id if only_id else family['name']
        famTree = Tree(name=name, support=family['ab'])
        famTree.add_feature('family_id', family_id)
        famTree.add_feature('level', 'family')
        famTree.add_feature('points', family['points'])
        for genus in genera.filter(parent_id__exact=family_id):
            id_g = genus['genus_id']
            branch = branch_by_genus_id.get(id_g)
            if branch is None:
                raise LookupError(
                    "genus_id %r has no branch in genera_tree" % (id_g,))
            # Attach the genus branch to its family
            famTree.add_child(child=branch)
        orders_tree.add_child(child=famTree)
    return orders_tree
def getPhyla(taxonomic_queryset, classes_tree, only_id=False):
    """
    Collapse all *phyla* of a spatial queryset into a tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Object exposing ``phyla`` and ``classes`` (the latter must support
        ``filter(parent_id__exact=...)``).  Phylum rows are read through the
        keys ``phylum_id``, ``name``, ``ab`` and ``points``; class rows
        through ``class_id``.
    classes_tree : Tree
        Tree whose direct children are the class branches (as produced by
        getClasses); every child must carry a ``class_id`` attribute.
    only_id : bool
        False (default): name phylum nodes with the phylum name.
        True: name them with the phylum id (lighter on memory).

    Returns
    -------
    Tree
        Root node named ``kingdom_root`` with one child per phylum; each
        phylum node has the matching class branches attached as children.

    Raises
    ------
    LookupError
        If a class of a phylum has no branch in ``classes_tree``.
    """
    tax = taxonomic_queryset
    phyla = tax.phyla
    classes = tax.classes

    # Index the class branches once instead of rescanning the children for
    # every class.  On duplicated ids the first branch wins.
    branch_by_class_id = {}
    for child in classes_tree.get_children():
        branch_by_class_id.setdefault(child.class_id, child)

    kingdomTree = Tree(name='kingdom_root')
    logger.info("[gbif.buildtree] Collapsing Phyla")
    for phylum in phyla:
        phylum_id = phylum['phylum_id']
        name = phylum_id if only_id else phylum['name']
        phylumTree = Tree(name=name, support=phylum['ab'])
        phylumTree.add_feature('phylum_id', phylum_id)
        phylumTree.add_feature('level', 'phylum')
        phylumTree.add_feature('points', phylum['points'])
        for class_ in classes.filter(parent_id__exact=phylum_id):
            id_c = class_['class_id']
            branch = branch_by_class_id.get(id_c)
            if branch is None:
                raise LookupError(
                    "class_id %r has no branch in classes_tree" % (id_c,))
            # Attach the class branch to its phylum
            phylumTree.add_child(child=branch)
        kingdomTree.add_child(child=phylumTree)
    return kingdomTree
def build(self, min_rank=0, max_seqs_per_leaf=1e9, clades_to_include=[], clades_to_ignore=[]):
    """Build the taxonomy tree from ``self.taxonomy`` and return it.

    Parameters
    ----------
    min_rank : index into a sequence's rank list; sequences whose rank at
        this level equals ``Taxonomy.EMPTY_RANK`` are skipped.
    max_seqs_per_leaf : maximum number of sequences counted under one parent
        taxon at the lowest rank level.
    clades_to_include : iterable of ``(rank_level, rank_name)`` pairs; when
        non-empty, only sequences matching at least one pair are kept.
    clades_to_ignore : iterable of ``(rank_level, rank_name)`` pairs;
        sequences matching any pair are dropped.

    Both list defaults are only read here, never mutated.

    Returns
    -------
    ``(t0, seq_ids)``: the root node and the ids of the sequences that were
    added, in insertion order.
    """
    t0 = Tree()
    t0.add_feature("name", TaxTreeBuilder.ROOT_LABEL)
    self.tree_nodes[TaxTreeBuilder.ROOT_LABEL] = t0
    self.leaf_count[TaxTreeBuilder.ROOT_LABEL] = 0
    k = 0       # sequences seen so far
    added = 0   # sequences actually inserted
    seq_ids = []
    # sequences are leafs of the tree, so they always have the lowest taxonomy level (e.g. "species"+1)
    for sid, ranks in self.taxonomy.iteritems():
        k += 1
        if self.config.verbose and k % 1000 == 0:
            print "Processed nodes: ", k, ", added: ", added, ", skipped: ", k - added
        # filter by minimum rank level
        if ranks[min_rank] == Taxonomy.EMPTY_RANK:
            continue
        # filter by rank constraints (e.g. class Clostridia only)
        clade_is_ok = False
        # check against the inclusion list
        if len(clades_to_include) > 0:
            for (rank_level, rank_name) in clades_to_include:
                if ranks[rank_level] == rank_name:
                    clade_is_ok = True
                    break
        else: # default: include all
            clade_is_ok = True
        # if sequence is about to be included, check it against the ignore list
        if clade_is_ok:
            for (rank_level, rank_name) in clades_to_ignore:
                if ranks[rank_level] == rank_name:
                    clade_is_ok = False
                    break
        # final decision
        if not clade_is_ok:
            continue
        tax_seq_level = len(ranks)
        # walk up from the lowest level to the first non-empty rank
        parent_level = tax_seq_level - 1
        while ranks[parent_level] == Taxonomy.EMPTY_RANK:
            parent_level -= 1
        parent_name = Taxonomy.get_rank_uid(ranks, parent_level)
        # NOTE(review): the nesting of the next few statements was
        # reconstructed from flattened source text - confirm against the
        # project history.
        if parent_name in self.tree_nodes:
            # parent_node is assigned but not used afterwards in this method
            parent_node = self.tree_nodes[parent_name]
            # max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level - parent_level)
            if parent_level == tax_seq_level - 1:
                max_seq_per_rank = max_seqs_per_leaf # * (tax_seq_level - parent_level)
                # parent already holds the maximum number of sequences: skip
                if parent_name in self.leaf_count and self.leaf_count[ parent_name] >= max_seq_per_rank:
                    continue
        self.leaf_count[parent_name] = self.leaf_count.get(parent_name, 0) + 1
        # all checks succeeded: add the sequence to the tree
        self.add_tree_node(t0, sid, ranks, parent_level)
        seq_ids += [sid]
        added += 1
    self.config.log.debug("Total nodes in resulting tree: %d", added)
    if self.config.debug:
        # keep a copy of the tree as it is before prune_unifu_nodes runs
        reftax_fname = self.config.tmp_fname("%NAME%_mf_unpruned.tre")
        t0.write(outfile=reftax_fname, format=8)
    self.prune_unifu_nodes(t0)
    return t0, seq_ids
class ncbi_taxa:
    """Holds a taxonomy tree plus id -> name and id -> rank lookup tables."""

    def __init__(self):
        self.__tax_tree_root = None
        self.__id_name_map = {}
        self.__id_rank_map = {}

    def init_tax_tree(self, sftree, sfname, sfrank):
        """Load the tree and both lookup tables.

        sftree -- tree file, read with ``Tree(sftree, format=8)``
        sfname -- ``|``-separated file; field 0 is the id, field 1 the name
        sfrank -- ``|``-separated file; field 0 is the id, field 2 the rank
        """
        self.__tax_tree_root = Tree(sftree, format=8)
        # 'with' closes both files even when a malformed line raises.
        with open(sfname) as fname, open(sfrank) as frank:
            for line in fname:
                items = line.strip().split("|")
                self.__id_name_map[items[0]] = items[1]
            for line in frank:
                items = line.strip().split("|")
                self.__id_rank_map[items[0]] = items[2]

    def extract_sub_tax_tree(self, sname_tax, sfout):
        """Prune the tree to the listed taxa, annotate it and write it out.

        sname_tax -- whitespace-separated file; the second field of each
                     line is a taxon id to keep (the first field is ignored)
        sfout     -- output path; written in newick format 8 with the
                     features ``N`` (name) and ``R`` (rank)

        The loaded tree is pruned in place.  Ids missing from the lookup
        tables are annotated with ``noNCBIname`` / ``noNCBIrank``; spaces in
        names and ranks are replaced by underscores.  The annotated tree is
        also printed.
        """
        with open(sname_tax) as fname_tax:
            taxid = [line.strip().split()[1] for line in fname_tax]

        root = self.__tax_tree_root
        root.prune(taxid)

        # annotate all nodes (root first) for ranks and names
        for node in [root] + root.get_descendants():
            nodeid = node.name
            nodename = self.__id_name_map.get(nodeid, "noNCBIname")
            noderank = self.__id_rank_map.get(nodeid, "noNCBIrank")
            node.add_feature("N", nodename.replace(" ", "_"))
            node.add_feature("R", noderank.replace(" ", "_"))

        root.write(outfile=sfout, format=8, features=["N", "R"])
        print(root)
class phylogeny_annotator:
    """Annotates the nodes of a phylogeny with taxonomic ranks.

    The tree is read from ``sphylogeny`` and the per-sequence ranks come from
    a ``seq_db`` initialised from ``s_seq_db``.  Results are kept in
    dictionaries keyed by the integer ``nid`` feature that the annotation
    methods attach to every node.

    NOTE(review): block nesting below was reconstructed from flattened
    source text - confirm against the project history.
    """

    def __init__(self, sphylogeny, s_seq_db, t=0.95):
        """Load the tree and the sequence database.

        sphylogeny -- tree input, passed to ``Tree(..., format=1)``
        s_seq_db   -- path handed to ``seq_db.init_db_from_file``
        t          -- frequency threshold used by annotate_all_branches_td
        """
        self.tree_input = sphylogeny
        self.taxonomy_file = s_seq_db
        self.threshold = t
        self.root = Tree(sphylogeny, format=1)
        self.seqs = seq_db()
        self.seqs.init_db_from_file(s_seq_db)
        self.max_rank = 5  # upper bound for rank numbers in the bottom-up pass
        rks = self.seqs.get_all_rank_names()
        # keep only the first element of every entry returned above
        self.all_rank_names = []
        for rk in rks:
            self.all_rank_names.append(rk[0])
        self.nid_freq_map = {} # nid : [[r0name, f0],[r1name, f1],...,[r5name,f5]]
        self.nid_assigned_map = {} # nid : True or False, indicate if this node rank has been fully determined
        self.nid_ranks_map = {} # nid : [r0name, r1name, ... ,r5name]
        self.nid_ranknum_map = {} # nid : final_rank_num

    def __get_child_ranks(self, internal_node, rank_num):
        """Count, for the leaves below ``internal_node``, how often each rank
        name occurs at level ``rank_num``.

        Returns a dict mapping rank name -> number of leaves.
        """
        leaves = internal_node.get_leaves()
        rname_cnt_map = {}
        for leaf in leaves:
            seq = self.seqs.get_seq_by_name(leaf.name)
            rank_name = seq.ranks[rank_num]
            if rank_name in rname_cnt_map:
                rname_cnt_map[rank_name] = rname_cnt_map[rank_name] + 1
            else:
                rname_cnt_map[rank_name] = 1
        return rname_cnt_map

    def __sum_rank_num(self):
        """Return the sum of all values stored in ``nid_ranknum_map``."""
        s = 0
        for nid in self.nid_ranknum_map.keys():
            s = s + self.nid_ranknum_map[nid]
        return s

    def __count_miss_labled(self):
        """Print and count the leaves whose assigned ranks differ from the
        ranks stored in the sequence database.

        Requires every leaf to have a ``nid`` present in ``nid_ranks_map``.
        """
        cnt = 0
        leave = self.root.get_leaves()
        for leaf in leave:
            oriranks = self.seqs.get_seq_by_name(leaf.name).ranks
            #oriranks.reverse()
            if self.nid_ranks_map[leaf.nid] != oriranks:
                print(leaf.name)
                print("Correct:" + str(self.nid_ranks_map[leaf.nid]))
                print("Misslab:" + str(oriranks))
                cnt = cnt + 1
        return cnt

    def annotate_all_branches_bu(self):
        """Bottom-up annotation: give every node a ``nid``, a ``rankname``
        and a ``pv`` feature and return the sum of the assigned ``pv`` values
        (the root's own ``pv`` of 1 is not included in the sum).

        Internal nodes are assumed to have at least two children; only the
        first two are inspected when choosing the rank level.
        Uses ``dict.iteritems`` (Python 2).
        """
        i = 0
        n_v_map = {} # node id map to vector with probabilities,
        rank_map = {} # node id to rank number map, rank starting from 0
        # Pass 1 (postorder): number the nodes and compute, per node, a rank
        # level and the frequency of each rank name at that level.
        for node in self.root.traverse("postorder"):
            i = i + 1
            if node.is_leaf():
                seq = self.seqs.get_seq_by_name(node.name)
                rank_num = 0
                rank_name = seq.ranks[rank_num]
                rname_cnt_map = {}
                rname_cnt_map[rank_name]=1.0
                n_v_map[i] = rname_cnt_map
                rank_map[i] = rank_num
                node.add_feature("nid", i)
            else:
                childs = node.get_children()
                lchild = childs[0].nid
                rchild = childs[1].nid
                #decide which rank to go
                if rank_map[lchild] == rank_map[rchild]:
                    rname_cnt_map_l = n_v_map[lchild]
                    rname_cnt_map_r = n_v_map[rchild]
                    sorted_rname_cnt_map_l = sorted(rname_cnt_map_l.iteritems(), key=operator.itemgetter(1), reverse = True)
                    sorted_rname_cnt_map_r = sorted(rname_cnt_map_r.iteritems(), key=operator.itemgetter(1), reverse = True)
                    # same most frequent name on both sides: stay at this level
                    if sorted_rname_cnt_map_l[0][0] == sorted_rname_cnt_map_r[0][0]:
                        rank = rank_map[lchild]
                    else:
                        # otherwise move one level further, capped at max_rank
                        rank = rank_map[lchild] + 1;
                        if rank > self.max_rank:
                            rank = self.max_rank
                else:
                    rank = max(rank_map[lchild], rank_map[rchild])
                rname_cnt_map = self.__get_child_ranks(node, rank)
                num_leaves = sum(rname_cnt_map.values())
                # turn counts into frequencies; the empty rank name keeps its
                # raw count
                for rkname in rname_cnt_map.keys():
                    if rkname == "":
                        continue
                    else:
                        rname_cnt_map[rkname] = float(rname_cnt_map[rkname])/num_leaves
                n_v_map[i] = rname_cnt_map
                rank_map[i] = rank
                node.add_feature("nid", i)
        #assigning taxa rank:
        pvalue = 0
        ch = self.root.get_children()
        self.root.add_feature("rankname", "God")
        self.root.add_feature("pv", 1) #record the max prob of each nodes
        rank_map[self.root.nid] = 666 #change this to be the max rank of childs + 1
        # Pass 2: process the frontier `ch` top-down, always handling the
        # nodes with the highest rank number first.
        while len(ch) != 0:
            maxrank = 0
            for node in ch:
                rank_num = rank_map[node.nid]
                if rank_num >= maxrank:
                    maxrank = rank_num
            high_rank_nodes = []
            for node in ch:
                rank_num = rank_map[node.nid]
                if rank_num == maxrank:
                    high_rank_nodes.append(node)
            for node in high_rank_nodes:
                ch.remove(node)
            #process
            for node in high_rank_nodes:
                assign_flag = 0
                nodefather = node.up
                father_rank_num = rank_map[nodefather.nid]
                if father_rank_num == maxrank:
                    # same level as the parent: inherit its name and pv
                    node.add_feature("rankname", nodefather.rankname)
                    node.add_feature("pv", nodefather.pv)
                    pvalue = pvalue + node.pv
                    #print("assigning node id: " + str(node.nid) + " with " + str(nodefather.rankname))
                    assign_flag = 1
                else:
                    rname_cnt_map = n_v_map[node.nid]
                    sorted_rname_cnt_map = sorted(rname_cnt_map.iteritems(), key=operator.itemgetter(1), reverse = True)
                    # take the most frequent name that is a known rank name
                    for rname_cnt in sorted_rname_cnt_map:
                        if rname_cnt[0] in self.all_rank_names:
                            node.add_feature("rankname", rname_cnt[0])
                            node.add_feature("pv", rname_cnt[1])
                            #Tomas: why we do the following in the first place? I removed it cause it is a bug in curr version
                            #self.all_rank_names.remove(rname_cnt[0])
                            #print("assigning node id: " + str(node.nid) + " with " +rname_cnt[0])
                            assign_flag = 1
                            pvalue = pvalue + node.pv
                            break
                if assign_flag == 0:
                    # nothing matched: fall back to the parent's annotation
                    node.add_feature("rankname", node.up.rankname)
                    node.add_feature("pv", node.up.pv)
                    pvalue = pvalue + node.pv
                    #print("assigning node id: " + str(node.nid) + " with " +node.up.rankname)
            for node in high_rank_nodes:
                ch = ch + node.get_children()
        return pvalue

    def tomas(self):
        """Run ``CMislabel`` on the input files, adopt its tree and rank map
        and return its score."""
        flouri = CMislabel(self.tree_input, self.taxonomy_file)
        self.root = flouri.t
        self.nid_ranks_map = flouri.nid_ranks
        return flouri.score()

    def tomas_rooted(self):
        """Run ``CMislabel`` on the current (already rooted) tree and adopt
        its rank map."""
        flouri = CMislabel(self.tree_input, self.taxonomy_file, self.root)
        # NOTE(review): tomas() reads `nid_ranks`, this reads `nid_rank` -
        # confirm which attribute CMislabel actually provides.
        self.nid_ranks_map = flouri.nid_rank

    def assign_all_descendent_node_rank(self, node, rank_num, rank_name):
        """Set rank ``rank_num`` to ``rank_name`` for ``node`` and everything
        below it.

        Leaves whose database rank at that level differs are flagged with
        ``is_correct = "No"``; if any such leaf is found, the frequency
        tables of the whole subtree are recomputed.
        """
        descent_nodes = node.get_descendants()
        descent_nodes.append(node)
        find_error = False
        for nodei in descent_nodes:
            if nodei.is_leaf():
                if True: #node.is_correct == "yes":
                    seq = self.seqs.get_seq_by_name(nodei.name)
                    ranks = self.nid_ranks_map[nodei.nid]
                    ranks[rank_num] = rank_name
                    self.nid_ranks_map[nodei.nid] = ranks
                    if seq.ranks[rank_num] != rank_name:
                        nodei.add_feature("is_correct", "No")
                        find_error = True
            else:
                ranks = self.nid_ranks_map[nodei.nid]
                ranks[rank_num] = rank_name
                self.nid_ranks_map[nodei.nid] = ranks
        if find_error:
            #recalculate all frequency vectors
            seq_util = seq_db()
            for nodei in node.traverse(strategy = "preorder"):
                leaves = nodei.get_leaves()
                seqs = []
                for leaf in leaves:
                    seqs.append(self.seqs.get_seq_by_name(leaf.name))
                freq_table = seq_util.rank_stas(seqs)
                self.nid_freq_map[nodei.nid] = freq_table

    def annotate_all_branches_td(self):
        """Top-down annotation for the current rooting.

        Resets all per-node maps, numbers the nodes in preorder, computes a
        frequency table per node and then walks the tree in preorder,
        assigning rank names level by level.  Returns the sum of the final
        rank numbers of all nodes.

        Nodes reached in the inner ``else`` branch are assumed to have at
        least two children; only the first two are inspected.
        """
        self.nid_freq_map = {} # nid : [[r0name, f0],[r1name, f1],...,[r5name,f5]]
        self.nid_assigned_map = {} # nid : True or False, indicate if this node rank has been fully determined
        self.nid_ranks_map = {} # nid : [r0name, r1name, ... ,r5name]
        self.nid_ranknum_map = {} # nid : final_rank_num
        all_leaves = self.root.get_leaves()
        for leaf in all_leaves:
            leaf.add_feature("is_correct", "yes")
        #traverse the tree to calculate the frequency profile for each node/branch
        seq_util = seq_db()
        i = 0
        for node in self.root.traverse(strategy = "preorder"):
            i = i + 1
            node.add_feature("nid", i)
            self.nid_assigned_map[i] = False
            ranks = ["-"] * 6
            self.nid_ranks_map[i] = ranks
            leaves = node.get_leaves()
            seqs = []
            for leaf in leaves:
                seqs.append(self.seqs.get_seq_by_name(leaf.name))
            freq_table = seq_util.rank_stas(seqs)
            self.nid_freq_map[i] = freq_table
        #traverse the tree preorder
        for node in self.root.traverse(strategy = "preorder"):
            freq_table = self.nid_freq_map[node.nid]
            ranks = self.nid_ranks_map[node.nid]
            assigning_rank_idx = 0
            if node.is_root():
                # -1 makes the children of the root start at rank index 0
                self.nid_ranknum_map[node.nid] = -1
            else:
                # continue one level below where the parent stopped
                next_rank_idx = self.nid_ranknum_map[node.up.nid] + 1
                flag = True
                while flag:
                    if next_rank_idx < 6:
                        rk_freq = freq_table[next_rank_idx]
                        if rk_freq[1] == 1.0:
                            # frequency 1.0 at this level: assign and go on
                            self.assign_all_descendent_node_rank(node, next_rank_idx, rk_freq[0])
                            #curr_rank_idx = curr_rank_idx + 1
                            next_rank_idx = next_rank_idx + 1
                        else:
                            childs = node.get_children()
                            lchild = childs[0]
                            rchild = childs[1]
                            lfreq_table = self.nid_freq_map[lchild.nid]
                            rfreq_table = self.nid_freq_map[rchild.nid]
                            lrk_freq = lfreq_table[next_rank_idx]
                            rrk_freq = rfreq_table[next_rank_idx]
                            # both children have a higher frequency than this
                            # node: stop here
                            if rk_freq[1] < lrk_freq[1] and rk_freq[1] < rrk_freq[1]:
                                flag = False
                            else:
                                #should check all possibilities here
                                if lrk_freq[0] == rrk_freq[0] and rk_freq[1]>self.threshold:
                                    self.assign_all_descendent_node_rank(node, next_rank_idx, rk_freq[0])
                                    next_rank_idx = next_rank_idx + 1
                                else:
                                    flag = False
                    else:
                        #assign taxonomy to species level
                        assigning_rank_idx = 5
                        rk_freq = freq_table[assigning_rank_idx]
                        flag = False
                        ranks[assigning_rank_idx] = rk_freq[0]
                self.nid_ranknum_map[node.nid] = next_rank_idx - 1
                self.nid_assigned_map[node.nid] = True
        return self.__sum_rank_num()

    def show_tree_with_rank(self):
        """Attach the assigned ranks (and, for leaves, the database ranks) as
        text faces to every descendant of the root and display the tree."""
        allnodes = self.root.get_descendants()
        for node in allnodes:
            #rk_num = rank_map[node.nid]
            #node.add_feature("rank_num", rk_num )
            ranks = self.nid_ranks_map[node.nid]
            node.add_face(TextFace(str(ranks)), column=0, position = "branch-right")
            if node.is_leaf():
                seq = self.seqs.get_seq_by_name(node.name)
                rk = seq.ranks
                #rk.reverse()
                node.add_face(TextFace(str(rk)), column=0, position = "branch-right")
        self.root.show()

    def rooting_by_outgroup_names(self, outgroup_names):
        """Re-root the tree on the node whose leaf set equals
        ``outgroup_names``.

        If no such node is found (or it is the root itself), the tree is
        rooted on the node whose leaf set is the complement instead.
        """
        all_leaves = self.root.get_leaves()
        sog_names = set(outgroup_names)
        ca1 = self.root
        #Traverse all nodes to find the common ancestor of the input outgroup_names
        for node in self.root.traverse():
            currleaves = node.get_leaves()
            currlnames = []
            for lv in currleaves:
                currlnames.append(lv.name)
            scurrnames = set(currlnames)
            if scurrnames == sog_names:
                ca1 = node
                break
        #Check if the found ca is the root, if yes, find the complementary names of the tree
        if ca1!=self.root:
            self.root.set_outgroup(ca1)
        else:
            restnodes = []
            for leaf in all_leaves:
                if leaf.name not in sog_names:
                    restnodes.append(leaf.name)
            srestnodes = set(restnodes)
            for node in self.root.traverse():
                currleaves = node.get_leaves()
                currlnames = []
                for lv in currleaves:
                    currlnames.append(lv.name)
                scurrnames = set(currlnames)
                if scurrnames == srestnodes:
                    self.root.set_outgroup(node)
                    break

    def annotate_td(self):
        """Try every bipartition as rooting, keep the one with the highest
        top-down score and annotate the tree with it.

        Prints the score and the mislabel count for every candidate and the
        final mislabel count.
        """
        #find all bipartitions:
        list_bipar = []
        #all_leaves = self.root.get_leaves()
        for node in self.root.traverse("postorder"):
            if not node.is_root():
                leaves = node.get_leaves()
                leave_names = []
                for leaf in leaves:
                    leave_names.append(leaf.name)
                list_bipar.append(leave_names)
        #find the root:
        maxpv=0
        maxbipar = None  # stays None if no candidate scores above 0
        for bipar in list_bipar:
            #Search the current tree to find the partitions:
            Node0 = self.root.search_nodes(name = bipar[0])[0]
            if len(bipar) == 1:
                self.root.set_outgroup(Node0)
            else:
                self.rooting_by_outgroup_names(bipar)
            pvalue = self.annotate_all_branches_td()
            print(pvalue)
            misscnt = self.__count_miss_labled()
            print(misscnt)
            if pvalue > maxpv:
                maxpv = pvalue
                maxbipar = bipar
        self.rooting_by_outgroup_names(maxbipar)
        self.annotate_all_branches_td()
        misscnt = self.__count_miss_labled()
        print(misscnt)

    def annotate_bu(self):
        """Try every bipartition as rooting, keep the one with the highest
        ``tomas()`` score, re-root on it and add ``rankname`` text faces."""
        #find all bipartitions:
        list_bipar = []
        #all_leaves = self.root.get_leaves()
        for node in self.root.traverse("postorder"):
            if not node.is_root():
                leaves = node.get_leaves()
                leave_names = []
                for leaf in leaves:
                    leave_names.append(leaf.name)
                list_bipar.append(leave_names)
        maxpv=0
        maxbipar = None  # stays None if no candidate scores above 0
        for bipar in list_bipar:
            #Search the current tree to find the partitions:
            Node0 = self.root.search_nodes(name = bipar[0])[0]
            if len(bipar) == 1:
                self.root.set_outgroup(Node0)
            else:
                self.rooting_by_outgroup_names(bipar)
            pvalue = self.tomas()
            if pvalue > maxpv:
                maxpv = pvalue
                maxbipar = bipar
        self.rooting_by_outgroup_names(maxbipar)
        self.tomas_rooted()
        # draw the tree
        allnodes = self.root.get_descendants()
        for node in allnodes:
            #rk_num = rank_map[node.nid]
            #node.add_feature("rank_num", rk_num )
            if hasattr(node, 'rankname'):
                node.add_face(TextFace(node.rankname), column=0, position = "branch-right")
                #node.add_face(TextFace(node.rank_num), column=0, position = "branch-right")
        #self.root.show()

    def correct_leaf_ranks(self):
        """Overwrite each leaf's assigned ranks with its parent's ranks,
        keeping only the leaf's own entry at index 5."""
        leaves = self.root.get_leaves()
        for leaf in leaves:
            if not leaf.is_root():
                father = leaf.up
                lranks = self.nid_ranks_map[leaf.nid]
                franks = self.nid_ranks_map[father.nid]
                lsp = lranks[5]  # saved so it survives the copy below
                for i, rk in enumerate(franks):
                    lranks[i] = rk
                lranks[5] = lsp
                self.nid_ranks_map[leaf.nid] = lranks
def build(self, min_rank=0, max_seqs_per_leaf=1e9, clades_to_include=[], clades_to_ignore=[]):
    """Build the taxonomy tree from ``self.taxonomy`` and return it with the kept sequence ids.

    Every ``(sid, ranks)`` pair yielded by ``self.taxonomy.items()`` is
    filtered and, if accepted, attached below a fresh root node through
    ``self.add_tree_node``.  Before returning, the tree is passed to
    ``self.prune_unifu_nodes``.  The ``"root"`` entries of
    ``self.tree_nodes`` and ``self.leaf_count`` are (re)initialised.

    Parameters
    ----------
    min_rank : int
        Index into ``ranks``; sequences whose entry at this index is the
        empty string are skipped.
    max_seqs_per_leaf : number
        Base cap on sequences per parent taxon.  The effective cap is this
        value multiplied by the number of rank levels between the sequence
        level and the parent, so higher-level parents may hold more.
    clades_to_include : list of (rank_level, rank_name)
        If non-empty, a sequence is kept only when it matches at least one
        pair.  Only read, never modified.
    clades_to_ignore : list of (rank_level, rank_name)
        A sequence matching any pair is skipped.  Only read, never modified.

    Returns
    -------
    (tree, seq_ids)
        The root ``Tree`` and the list of accepted sequence ids, in
        iteration order.
    """
    # A single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Number of nodes: %d" % self.taxonomy.seq_count())

    t0 = Tree()
    t0.add_feature("name", "root")
    self.tree_nodes["root"] = t0
    self.leaf_count["root"] = 0

    added = 0
    seq_ids = []

    # sequences are leaves of the tree, so they always have the lowest
    # taxonomy level (e.g. "species" + 1)
    tax_seq_level = self.taxonomy.max_rank_level() + 1

    for k, (sid, ranks) in enumerate(self.taxonomy.items(), 1):
        if k % 1000 == 0:
            # Spacing reproduces what the former comma-separated print
            # statement emitted.
            print("Processed nodes:  %s , added:  %s , skipped:  %s" % (k, added, k - added))

        # filter by minimum rank level
        if ranks[min_rank] == "":
            continue

        # filter by rank constraints (e.g. class Clostridia only):
        # an empty inclusion list means "include everything"
        if clades_to_include and not any(
                ranks[level] == name for (level, name) in clades_to_include):
            continue
        if any(ranks[level] == name for (level, name) in clades_to_ignore):
            continue

        # the parent is the lowest non-empty rank; the min_rank filter above
        # guarantees at least one non-empty entry at or above it
        parent_level = tax_seq_level - 1
        while ranks[parent_level] == "":
            parent_level -= 1
        parent_name = ranks[parent_level]

        if parent_name in self.tree_nodes:
            # filter by max number of seqs (threshold depends on rank level,
            # i.e. for genus there can be more seqs than for species)
            max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level - parent_level)
            if parent_name in self.leaf_count and self.leaf_count[parent_name] >= max_seq_per_rank:
                continue
            # The per-parent list of existing leaf ids that used to be
            # collected here was only consumed by a disabled uniqueness
            # check; it has been removed together with its int() parsing.

        # first sequence for a node starts the counter at 1
        self.leaf_count[parent_name] = self.leaf_count.get(parent_name, 0) + 1

        # all checks succeeded: add the sequence to the tree
        self.add_tree_node(t0, sid, ranks, parent_level)
        seq_ids.append(sid)
        added += 1

    print("Total nodes in resulting tree:  %s" % added)

    self.prune_unifu_nodes(t0)
    return t0, seq_ids
def _split_rcm(rcm, t): """ | a | a | a | a | a | a | a | a | | | startpos endpos | | x's startpoint x's endpoint endpos - startpos == number of amino acids in the region but the number of break points are one more than the number of amino acids """ chi_sq_vec = np.zeros(t.endpos - t.startpos + 1) for x in xrange(t.startpos, t.endpos + 1): # from the real start position (which is t.startpos) # to the real end position + 1 (which is t.endpos, but in xrange you should specify one past last) i11 = float(np.sum(rcm[t.startpos:x, t.startpos:x])) i22 = float(np.sum(rcm[x:t.endpos, x:t.endpos])) i12 = float(np.sum(rcm[t.startpos:x, x:t.endpos])) i21 = i12 row1 = i11 + i12 row2 = i21 + i22 col1 = i11 + i21 col2 = i12 + i22 # l1 = x-t.startpos # l2 = t.endpos - x a = i11 * i22 - i21 * i12 # print "i11: %1.0f\ti22: %1.0f\ti12 and i21: %1.0f" % (i11, i22, i12) n = row1 * row2 * col1 * col2 if n > 0.0: chi_sq_vec[x - t.startpos] = a * a / n else: chi_sq_vec[x - t.startpos] = 0.0 # print chi_sq_vec # if chi square statistics is 0, return no split if np.max(chi_sq_vec) == 0.0: return else: # the split point xmax = np.argmax(chi_sq_vec) + t.startpos # if x - t.startpos < min_module_length or t.endpos - x < min_module_length: # return if xmax - t.startpos > min_module_length: # from t.startpos to x - 1 c = Tree() c.node_id = idgen.generate() c.name = "%d-%d" % (t.startpos, xmax) c.add_feature("startpos", t.startpos) c.add_feature("endpos", xmax) t.add_child(c) _split_rcm(rcm, c) if t.endpos - xmax > min_module_length: # from x to t.endpos - 1 c = Tree() c.node_id = idgen.generate() c.name = "%d-%d" % (xmax, t.endpos) c.add_feature("startpos", xmax) c.add_feature("endpos", t.endpos) t.add_child(c) _split_rcm(rcm, c) return
class um_tree:
    """Tree wrapper that orders nodes by age and partitions leaves into species.

    The constructor loads a tree, resolves polytomies and annotates every
    node with an ``age`` (its distance from the root).
    ``get_waiting_times`` then walks the nodes in age order, builds the
    list of ``waiting_time`` intervals and records which clades lie past a
    threshold node (``self.coa_roots``) together with the resulting leaf
    partition (``self.species_list``).
    """

    def __init__(self, tree):
        """Load ``tree`` with ``Tree(tree, format=1)`` and index its nodes by age.

        After construction ``self.nodes`` holds every non-leaf descendant
        of the root plus the node returned by ``get_farthest_node()`` (if it
        is a leaf), sorted by ascending ``age``.
        """
        self.tree = Tree(tree, format=1)
        self.tree.resolve_polytomy(default_dist=0.000001, recursive=True)
        self.tree.dist = 0
        self.tree.add_feature("age", 0)

        internal_nodes = []
        cnt = 0
        for n in self.tree.get_descendants():
            n.add_feature("age", n.get_distance(self.tree))
            if not n.is_leaf():
                # internal nodes are numbered in traversal order
                n.add_feature("id", cnt)
                cnt = cnt + 1
                internal_nodes.append(n)
        self.nodes = internal_nodes

        one_leaf = self.tree.get_farthest_node()[0]
        one_leaf.add_feature("id", cnt + 1)
        if one_leaf.is_leaf():
            self.nodes.append(one_leaf)

        self.nodes.sort(key=self.__compare_node)
        self.species_list = []
        self.coa_roots = None

    def __compare_node(self, node):
        """Sort key: the node's ``age``."""
        return node.age

    @staticmethod
    def _find_coa_root(node, coa_roots):
        """Return the first entry of ``coa_roots`` that is a non-root ancestor of ``node``, else ``None``.

        Ancestors are visited from ``node.up`` upwards and compared by ``id``.
        """
        fnode = node.up
        while not fnode.is_root():
            for coa_r in coa_roots:
                if coa_r.id == fnode.id:
                    return coa_r
            fnode = fnode.up
        return None

    @staticmethod
    def _make_line_style(fgcolor, vt_line_color, hz_line_color):
        """Build a ``NodeStyle`` with solid 2px lines, size 0 and the given colours."""
        style = NodeStyle()
        style["fgcolor"] = fgcolor
        style["vt_line_color"] = vt_line_color
        style["hz_line_color"] = hz_line_color
        style["vt_line_width"] = 2
        style["hz_line_width"] = 2
        style["vt_line_type"] = 0  # 0 solid, 1 dashed, 2 dotted
        style["hz_line_type"] = 0
        style["size"] = 0
        return style

    def get_waiting_times(self, threshold_node=None, threshold_node_idx=0):
        """Walk the nodes in age order and build the list of waiting-time intervals.

        Parameters
        ----------
        threshold_node : node or None
            Node at which the process switches regime.  When ``None``,
            ``self.nodes[threshold_node_idx]`` is used.
        threshold_node_idx : int
            Index into ``self.nodes``; only used when ``threshold_node`` is
            ``None``.

        Returns
        -------
        (wt_list, num_spe)
            ``wt_list`` holds one ``waiting_time`` per node whose interval
            is longer than 1e-8 (``count_num_lines()`` has been called on
            each).  ``num_spe`` is the line count at the moment the
            threshold node was reached, or ``-1`` if it never was.

        Side effects: sets ``self.coa_roots`` and ``self.species_list``, and
        adds/updates a ``curr_n`` feature on the nodes stored in
        ``self.coa_roots``.
        """
        wt_list = []
        reach_t = False
        curr_age = 0.0
        curr_spe = 2
        curr_num_coa = 0
        coa_roots = []
        num_spe = -1

        if threshold_node is None:
            threshold_node = self.nodes[threshold_node_idx]

        last_coa_num = 0
        tcnt = 0
        for node in self.nodes:
            times = node.age - curr_age
            if times >= 0:
                curr_age = node.age
                assert curr_spe >= 0

                if reach_t:
                    if tcnt == 0:
                        last_coa_num = 2

                    coa_root = self._find_coa_root(node, coa_roots)

                    wt = waiting_time(length=times, num_coas=curr_num_coa, num_lines=curr_spe)
                    for coa_r in coa_roots:
                        coa = coalescent(num_individual=coa_r.curr_n)
                        wt.coas.add_coalescent(coa)
                    wt.coas.coas_idx = last_coa_num
                    wt.num_curr_coa = last_coa_num

                    if coa_root is None:
                        # here can be modified to use multiple T
                        # node is not below any recorded root: it starts a new one
                        curr_spe = curr_spe - 1
                        curr_num_coa = curr_num_coa + 1
                        node.add_feature("curr_n", 2)
                        coa_roots.append(node)
                        last_coa_num = 2
                    else:
                        # node lies inside an existing clade: grow its count
                        curr_n = coa_root.curr_n
                        coa_root.add_feature("curr_n", curr_n + 1)
                        last_coa_num = curr_n + 1
                    tcnt = tcnt + 1
                elif node.id == threshold_node.id:
                    reach_t = True
                    tcnt = 0
                    wt = waiting_time(length=times, num_coas=0, num_lines=curr_spe)
                    num_spe = curr_spe
                    curr_spe = curr_spe - 1
                    curr_num_coa = 2
                    node.add_feature("curr_n", 2)
                    coa_roots.append(node)
                else:
                    wt = waiting_time(length=times, num_coas=0, num_lines=curr_spe)
                    curr_spe = curr_spe + 1

                # near-zero intervals are not recorded
                if times > 0.00000001:
                    wt_list.append(wt)

        for wt in wt_list:
            wt.count_num_lines()

        # every recorded root contributes its leaves as one group; every
        # remaining leaf is a group of its own
        self.species_list = []
        all_coa_leaves = []
        self.coa_roots = coa_roots
        for coa_r in coa_roots:
            leaves = coa_r.get_leaves()
            all_coa_leaves.extend(leaves)
            self.species_list.append(leaves)

        for leaf in self.tree.get_leaves():
            if leaf not in all_coa_leaves:
                self.species_list.append([leaf])

        return wt_list, num_spe

    def show(self, wt_list):
        """Print every waiting interval in ``wt_list`` with a 1-based counter."""
        for cnt, wt in enumerate(wt_list, 1):
            print("Waitting interval "+ repr(cnt))
            print(wt)

    def get_species(self):
        """Return taxon names grouped by species and restyle the tree for drawing.

        Returns
        -------
        ([all_taxa_name], sp_list)
            A one-element list holding the names of all leaves, and a list
            with one list of leaf names per entry of ``self.species_list``.

        Side effect: every node gets a blue-line style; each node in
        ``self.coa_roots`` gets a red vertical / blue horizontal line style
        and all of its descendants get an all-red line style.  Requires
        ``get_waiting_times`` to have run, since ``self.coa_roots`` is
        iterated.
        """
        sp_list = [[taxa.name for taxa in sp] for sp in self.species_list]

        #self.tree.convert_to_ultrametric(tree_length = 1.0, strategy='balanced')
        all_taxa_name = [leaf.name for leaf in self.tree.get_leaves()]

        style0 = self._make_line_style("#000000", "#0000aa", "#0000aa")
        for node in self.tree.get_descendants():
            node.set_style(style0)
            node.img_style["size"] = 0
        self.tree.set_style(style0)
        self.tree.img_style["size"] = 0

        style1 = self._make_line_style("#000000", "#ff0000", "#0000aa")
        style2 = self._make_line_style("#0f0f0f", "#ff0000", "#ff0000")
        for node in self.coa_roots:
            node.set_style(style1)
            node.img_style["size"] = 0
            for des in node.get_descendants():
                des.set_style(style2)
                des.img_style["size"] = 0

        return [all_taxa_name], sp_list

    def print_species(self):
        """Print each group of ``self.species_list`` as a numbered, comma-separated list of names."""
        for cnt, sp in enumerate(self.species_list, 1):
            print("Species " + repr(cnt) + ":")
            # join() leaves no dangling separator; the previous manual
            # concatenation stripped only one character of the ", " suffix
            # and so left a trailing comma.
            print(" " + ", ".join(taxa.name for taxa in sp))

    def output_species(self, taxa_order = []):
        """Return the species partition as a list of group numbers aligned with ``taxa_order``.

        Parameters
        ----------
        taxa_order : list of str
            Taxon names fixing the output order.  When empty, the tree's
            leaf names are used.  The argument is never modified.

        Returns
        -------
        (taxa_order, partition)
            ``partition[i]`` is the 1-based group number of
            ``taxa_order[i]``.  ``(None, None)`` is returned, after printing
            an error message, when ``taxa_order`` does not have as many
            entries as there are taxa in ``self.species_list``.
        """
        if len(taxa_order) == 0:
            taxa_order = self.tree.get_leaf_names()

        num_taxa = sum(len(sp) for sp in self.species_list)
        if len(taxa_order) != num_taxa:
            print("error error, taxa_order != num_taxa!")
            return None, None

        partion = [-1] * num_taxa
        for cnt, sp in enumerate(self.species_list, 1):
            for taxa in sp:
                partion[taxa_order.index(taxa.name)] = cnt
        return taxa_order, partion

    def num_lineages(self, wt_list):
        """Plot the number of lineages over time, save it as ``Time_Lines`` and show it.

        The x value of each interval is the sum of the ``length`` of all
        preceding intervals; the y value is ``wt.get_num_branches()``.
        """
        nl_list = []
        times = []
        last_time = 0.0
        for wt in wt_list:
            nl_list.append(wt.get_num_branches())
            times.append(last_time)
            last_time = wt.length + last_time

        plt.plot(times, nl_list)
        plt.ylabel('Number of lineages')
        plt.xlabel('Time')
        plt.savefig("Time_Lines")
        plt.show()
def build(self, min_rank=0, max_seqs_per_leaf=1e9, clades_to_include=[], clades_to_ignore=[]):
    """Build the taxonomy tree from ``self.taxonomy`` and return it with the kept sequence ids.

    Every ``(sid, ranks)`` pair yielded by ``self.taxonomy.items()`` is
    filtered and, if accepted, attached below a fresh root node through
    ``self.add_tree_node``.  Before returning, the tree is passed to
    ``self.prune_unifu_nodes``.  The ``"root"`` entries of
    ``self.tree_nodes`` and ``self.leaf_count`` are (re)initialised.

    Parameters
    ----------
    min_rank : int
        Index into ``ranks``; sequences whose entry at this index is the
        empty string are skipped.
    max_seqs_per_leaf : number
        Base cap on sequences per parent taxon.  The effective cap is this
        value multiplied by the number of rank levels between the sequence
        level and the parent, so higher-level parents may hold more.
    clades_to_include : list of (rank_level, rank_name)
        If non-empty, a sequence is kept only when it matches at least one
        pair.  Only read, never modified.
    clades_to_ignore : list of (rank_level, rank_name)
        A sequence matching any pair is skipped.  Only read, never modified.

    Returns
    -------
    (tree, seq_ids)
        The root ``Tree`` and the list of accepted sequence ids, in
        iteration order.
    """
    # A single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Number of nodes: %d" % self.taxonomy.seq_count())

    t0 = Tree()
    t0.add_feature("name", "root")
    self.tree_nodes["root"] = t0
    self.leaf_count["root"] = 0

    added = 0
    seq_ids = []

    # sequences are leaves of the tree, so they always have the lowest
    # taxonomy level (e.g. "species" + 1)
    tax_seq_level = self.taxonomy.max_rank_level() + 1

    for k, (sid, ranks) in enumerate(self.taxonomy.items(), 1):
        if k % 1000 == 0:
            # Spacing reproduces what the former comma-separated print
            # statement emitted.
            print("Processed nodes:  %s , added:  %s , skipped:  %s" % (k, added, k - added))

        # filter by minimum rank level
        if ranks[min_rank] == "":
            continue

        # filter by rank constraints (e.g. class Clostridia only):
        # an empty inclusion list means "include everything"
        if clades_to_include and not any(
                ranks[level] == name for (level, name) in clades_to_include):
            continue
        if any(ranks[level] == name for (level, name) in clades_to_ignore):
            continue

        # the parent is the lowest non-empty rank; the min_rank filter above
        # guarantees at least one non-empty entry at or above it
        parent_level = tax_seq_level - 1
        while ranks[parent_level] == "":
            parent_level -= 1
        parent_name = ranks[parent_level]

        if parent_name in self.tree_nodes:
            # filter by max number of seqs (threshold depends on rank level,
            # i.e. for genus there can be more seqs than for species)
            max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level - parent_level)
            if parent_name in self.leaf_count and self.leaf_count[parent_name] >= max_seq_per_rank:
                continue
            # The per-parent list of existing leaf ids that used to be
            # collected here was only consumed by a disabled uniqueness
            # check; it has been removed together with its int() parsing.

        # first sequence for a node starts the counter at 1
        self.leaf_count[parent_name] = self.leaf_count.get(parent_name, 0) + 1

        # all checks succeeded: add the sequence to the tree
        self.add_tree_node(t0, sid, ranks, parent_level)
        seq_ids.append(sid)
        added += 1

    print("Total nodes in resulting tree:  %s" % added)

    self.prune_unifu_nodes(t0)
    return t0, seq_ids
def build(self, min_rank=0, max_seqs_per_leaf=1e9, clades_to_include=[], clades_to_ignore=[]): if self.config.verbose: print "Number of nodes: %d" % self.taxonomy.seq_count() t0 = Tree() t0.add_feature("name", TaxTreeBuilder.ROOT_LABEL) self.tree_nodes[TaxTreeBuilder.ROOT_LABEL] = t0; self.leaf_count[TaxTreeBuilder.ROOT_LABEL] = 0; k = 0 added = 0 seq_ids = [] # sequences are leafs of the tree, so they always have the lowest taxonomy level (e.g. "species"+1) tax_seq_level = self.taxonomy.max_rank_level() + 1 for sid, ranks in self.taxonomy.iteritems(): k += 1 if self.config.verbose and k % 1000 == 0: print "Processed nodes: ", k, ", added: ", added, ", skipped: ", k - added # filter by minimum rank level if ranks[min_rank] == Taxonomy.EMPTY_RANK: continue # filter by rank contraints (e.g. class Clostridia only) clade_is_ok = False # check against the inclusion list if len(clades_to_include) > 0: for (rank_level, rank_name) in clades_to_include: if ranks[rank_level] == rank_name: clade_is_ok = True break else: # default: include all clade_is_ok = True # if sequence is about to be included, check it against the ignore list if clade_is_ok: for (rank_level, rank_name) in clades_to_ignore: if ranks[rank_level] == rank_name: clade_is_ok = False break # final decision if not clade_is_ok: continue parent_level = tax_seq_level - 1 while ranks[parent_level] == Taxonomy.EMPTY_RANK: parent_level -= 1 parent_name = Taxonomy.get_rank_uid(ranks, parent_level) if parent_name in self.tree_nodes: parent_node = self.tree_nodes[parent_name] # filter by max number of seqs (threshold depends from rank level, # i.e. 
for genus there can be more seqs than for species) max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level - parent_level) if parent_name in self.leaf_count and self.leaf_count[parent_name] >= max_seq_per_rank: continue self.leaf_count[parent_name] = self.leaf_count.get(parent_name, 0) + 1 # all checks succeeded: add the sequence to the tree self.add_tree_node(t0, sid, ranks, parent_level) seq_ids += [sid] added += 1 if self.config.verbose: print "Total nodes in resulting tree: ", added if self.config.debug: reftax_fname = self.config.tmp_fname("%NAME%_mf_unpruned.tre") t0.write(outfile=reftax_fname, format=8) self.prune_unifu_nodes(t0) return t0, seq_ids