class TrackedItem(object):
    """Named record that pairs a DataStore of values with a Tree node.

    Items are linked to each other through ``parent``; the corresponding
    tree nodes are linked when ``update_stats`` is called.
    """

    def __init__(self):
        self.name = ''
        self.parent = None
        self.data = DataStore(float)
        self.leaf = False
        self.node = Tree()

    @property
    def root(self):
        """Return the topmost item reached by following ``parent`` links."""
        if self.parent:
            return self.parent.root
        return self

    def update_stats(self, name, parent, data, sf):
        """Merge ``data`` into this item and mirror the result onto its node.

        name   -- label stored on both the item and its tree node.
        parent -- item to attach under (ignored when falsy, or when this
                  node is already one of the parent's children).
        data   -- handed to ``self.data.merge``.
        sf     -- key of ``self.data`` whose value becomes the node's
                  "weight" feature.
        """
        self.data.merge(data)
        self.name = name
        self.node.name = name
        self.node.item = self

        if parent:
            if self.node not in parent.node.children:
                self.parent = parent
                parent.node.add_child(self.node)

        node = self.node
        node.add_feature("weight", self.data[sf])
        # Every stored value is also exposed as a node feature of the same name.
        for key in self.data:
            node.add_feature(key, self.data[key])

    def __str__(self):
        parts = []
        for key in self.data:
            parts.append("%d %s" % (self.data[key], key))
        return "%s: %s" % (self.name, ','.join(parts))
Beispiel #2
0
def getGenera(taxonomy_queryset, only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *species* under the scope of a spatial queryset.

    Every genus becomes a child of a root node named 'genus_root' and every
    species of that genus becomes a child of the genus node.

    Parameters
    ----------
    taxonomy_queryset gbif.models / GeoquerySet
        Must expose ``genera`` (iterable of mapping-like rows) and
        ``species`` (filterable with ``genus_id__exact``).

    only_id : Boolean (flag)
        False (default) names each node with its taxonomic name; species
        names are reduced to their first two space-separated tokens.
        True names each node with its id (genus_id / species_id) instead.
        Names are strings of variable length, so on big data sets the ids
        are the lighter option memory-wise.

    Returns
    -------
    :genera_tree: derived from ete2.TreeNode()
    """
    tax = taxonomy_queryset
    sps = tax.species
    genera_tree = Tree(name='genus_root')
    for genus in tax.genera:
        genus_id = genus['genus_id']
        genus_name = genus_id if only_id else genus['name']
        gn_t = Tree(name=genus_name, support=genus['ab'])
        gn_t.add_feature('genus_id', genus_id)
        gn_t.add_feature('level', 'genus')
        gn_t.add_feature('points', genus['points'])
        for specie in sps.filter(genus_id__exact=genus_id):
            species_id = specie['species_id']
            if only_id:
                species_name = species_id
            else:
                # Keep at most the first two tokens (genus + epithet).
                # Slicing instead of indexing [0] and [1] means a one-word
                # name no longer raises IndexError.
                species_name = ' '.join(specie['name'].split(' ')[:2])
            s = Tree(name=species_name, support=specie['ab'])
            s.add_feature('species_id', species_id)
            s.add_feature('level', 'species')
            s.add_feature('points', specie['points'])
            gn_t.add_child(child=s)
        genera_tree.add_child(child=gn_t)
    return genera_tree
Beispiel #3
0
def getGenera(taxonomy_queryset,only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *species* under the scope of a spatial queryset.

    The returned root is named 'genus_root'; its children are the genera
    and each genus node holds its species as children.

    Parameters
    ----------
    taxonomy_queryset gbif.models / GeoquerySet
        Must expose ``genera`` (iterable of mapping-like rows) and
        ``species`` (filterable with ``genus_id__exact``).

    only_id : Boolean (flag)
        True means nodes are named with their ids (genus_id / species_id).
        False (default) means nodes are named with the taxonomic name; for
        species only the first two space-separated tokens are kept.
        Names are strings of variable length, so for big data sets the
        id-only mode uses less memory.

    Returns
    -------
    :genera_tree: derived from ete2.TreeNode()
    """
    tax = taxonomy_queryset
    sps = tax.species
    genera = tax.genera
    root = Tree(name='genus_root')
    for genus in genera:
        genus_id = genus['genus_id']
        if only_id:
            label = genus_id
        else:
            label = genus['name']
        genus_node = Tree(name=label, support=genus['ab'])
        genus_node.add_feature('genus_id', genus_id)
        genus_node.add_feature('level', 'genus')
        genus_node.add_feature('points', genus['points'])
        sp_by_gns = sps.filter(genus_id__exact=genus_id)
        for specie in sp_by_gns:
            if only_id:
                label = specie['species_id']
            else:
                # Only genus + epithet are kept. Slicing tolerates names
                # made of a single token, which used to raise IndexError.
                tokens = specie['name'].split(' ')
                label = ' '.join(tokens[:2])
            species_node = Tree(name=label, support=specie['ab'])
            species_node.add_feature('species_id', specie['species_id'])
            species_node.add_feature('level', 'species')
            species_node.add_feature('points', specie['points'])
            genus_node.add_child(child=species_node)
        root.add_child(child=genus_node)
    return root
Beispiel #4
0
def getClasses(taxonomic_queryset, orders_tree, only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *classes* under the scope of a spatial queryset.

    Parameters
    ----------
    taxonomy_queryset gbif.models / GeoquerySet
        Must expose ``classes`` (iterable of mapping-like rows) and
        ``orders`` (filterable with ``parent_id__exact``).
        :orders_tree: Tree derived from getOrders; its direct children must
        carry an ``order_id`` attribute.

    only_id : Boolean (flag)
        False (default) names each class node with its name; True names it
        with its class_id. Names are strings of variable length, so the
        id-only mode is lighter on memory for big data sets.

    Returns
    -------
    :classes_tree: derived from ete2.TreeNode()

    Raises
    ------
    LookupError
        If an order of a class has no matching branch in ``orders_tree``.
    """
    tax = taxonomic_queryset
    classes = tax.classes
    orders = tax.orders
    phylumTree = Tree(name='phylum_root')
    logger.info("[gbif.buildtree] Collapsing Classes")
    for class_ in classes:
        class_id = class_['class_id']
        name = class_id if only_id else class_['name']
        classTree = Tree(name=name, support=class_['ab'])
        classTree.add_feature('class_id', class_id)
        classTree.add_feature('level', 'class')
        classTree.add_feature('points', class_['points'])
        for order in orders.filter(parent_id__exact=class_id):
            id_o = order['order_id']
            # Pick the branch of orders_tree that belongs to this order.
            # (The former reduce() call only worked when exactly one child
            # matched and failed with an unrelated TypeError otherwise.)
            matches = [child for child in orders_tree.get_children()
                       if child.order_id == id_o]
            if not matches:
                raise LookupError(
                    "no branch with order_id %r among the children of "
                    "orders_tree" % (id_o,))
            # Attach the branch to the class tree
            classTree.add_child(child=matches[0])
        phylumTree.add_child(child=classTree)
    return phylumTree
Beispiel #5
0
def getClasses(taxonomic_queryset,orders_tree,only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *classes* under the scope of a spatial queryset.

    Parameters
    ----------
    taxonomy_queryset gbif.models / GeoquerySet
        Must expose ``classes`` (iterable of mapping-like rows) and
        ``orders`` (filterable with ``parent_id__exact``).
        :orders_tree: Tree derived from getOrders; its direct children must
        carry an ``order_id`` attribute.

    only_id : Boolean (flag)
        True means each class node is named with its class_id.
        False (default) means it is named with the class name. Names are
        strings of variable length, so the id-only mode uses less memory
        on big data sets.

    Returns
    -------
    :classes_tree: derived from ete2.TreeNode()

    Raises
    ------
    LookupError
        If an order of a class has no matching branch in ``orders_tree``.
    """
    tax = taxonomic_queryset
    classes = tax.classes
    orders = tax.orders
    phylumTree = Tree(name='phylum_root')
    logger.info("[gbif.buildtree] Collapsing Classes")
    for class_ in classes:
        phylum_id = class_['parent_id']
        class_id = class_['class_id']
        ab = class_['ab']
        if only_id:
            name = class_id
        else:
            name = class_['name']
        classTree = Tree(name=name, support=ab)
        classTree.add_feature('id', class_id)
        classTree.add_feature('abundance', ab)
        classTree.add_feature('parent_id', phylum_id)
        classTree.add_feature('class_id', class_id)
        classTree.add_feature('level', 'class')
        classTree.add_feature('points', class_['points'])
        orders_by_class = orders.filter(parent_id__exact=class_id)
        for order in orders_by_class:
            id_o = order['order_id']
            # Select the first child of orders_tree with this order id.
            # This replaces a reduce() over a one-argument lambda, which
            # broke on zero or several matches and on Python 3.
            branch = None
            for candidate in orders_tree.get_children():
                if candidate.order_id == id_o:
                    branch = candidate
                    break
            if branch is None:
                raise LookupError(
                    "no branch with order_id %r among the children of "
                    "orders_tree" % (id_o,))
            # Attach the branch to the class tree
            classTree.add_child(child=branch)
        phylumTree.add_child(child=classTree)
    return phylumTree
def buildTree(taxid_list, nodes_dict, taxids_remove, cursor):
    """Recursive function, returns a ete tree object from a list of taxids.

    Requires a cursor connected to a sqlite db build using the script
    /users/rg/didac/NCBI/Taxonomy/update_sqlite_DB.py

    taxid_list    -- taxids to look up in this round.
    nodes_dict    -- dict mapping taxid -> tree node; pass an empty dict on
                     the first call, it is filled in place.
    taxids_remove -- list collecting the taxids the query returned nothing
                     for; pass an empty list on the first call, it is
                     extended in place.
    cursor        -- database cursor handed to ``query_a_list``.

    Each round links the queried taxids to their parents and then recurses
    on the set of parents until only taxid 1 (the NCBI root) is left.

    Returns the tuple ``(root_node, nodes_dict, taxids_remove)``.
    Raises KeyError if the recursion ends without taxid 1 ever having been
    registered in ``nodes_dict`` (e.g. nothing was found at all).
    """
    results = query_a_list(taxid_list, cursor)

    # check if all taxids returned a result
    if len(set(taxid_list)) != len(results):
        taxids_with_result = set(x[0] for x in results)
        taxids_remove += list(set(map(int, taxid_list)) - taxids_with_result)

    parent_taxids = set()
    for taxid, parent_taxid, rank, name in results:
        parent_taxids.add(parent_taxid)

        if taxid not in nodes_dict:
            c = Tree()
            c.add_feature('name', name)
            nodes_dict[taxid] = c
        node = nodes_dict[taxid]

        # The parent's scientific name and rank are not known yet; they are
        # filled in on the next round, when the parent is the queried taxid.
        node.add_features(name=name, taxid=taxid, rank=rank)

        if parent_taxid == taxid:
            # A row that is its own parent (the NCBI root) must not be
            # attached to itself: that would create a cycle in the tree.
            continue

        if parent_taxid not in nodes_dict:
            p = Tree()
            p.add_feature('taxid', parent_taxid)
            p.add_child(node)
            nodes_dict[parent_taxid] = p
        else:
            # Only attach the node if it is not already below its parent.
            parent = nodes_dict[parent_taxid]
            if not any(taxid == d.taxid for d in parent.iter_descendants()):
                parent.add_child(node)

    # "1" is the root of the NCBI tree and has no parent to climb to.
    parent_taxids.discard(1)

    if parent_taxids:
        return buildTree(list(parent_taxids), nodes_dict, taxids_remove, cursor)

    nodes_dict[1].add_features(name='Root', rank='Root')
    return nodes_dict[1], nodes_dict, taxids_remove
Beispiel #7
0
def getFamilies(taxonomic_queryset,genera_tree,only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *families* under the scope of a spatial queryset.

    Parameters
    ----------
    taxonomy_queryset gbif.models / GeoquerySet
        Must expose ``families`` (iterable of mapping-like rows) and
        ``genera`` (filterable with ``parent_id__exact``).
        :genera_tree: Tree derived from getGenera; its direct children must
        carry a ``genus_id`` attribute.

    only_id : Boolean (flag)
        True means each family node is named with its family_id.
        False (default) means it is named with the family name. Names are
        strings of variable length, so the id-only mode uses less memory
        on big data sets.

    Returns
    -------
    :families_tree: derived from ete2.TreeNode()

    Raises
    ------
    LookupError
        If a genus of a family has no matching branch in ``genera_tree``.
    """
    tax = taxonomic_queryset
    families = tax.families
    genera = tax.genera
    orders_tree = Tree(name='order_root')
    for family in families:
        order_id = family['parent_id']
        family_id = family['family_id']
        ab = family['ab']
        if only_id:
            name = family_id
        else:
            name = family['name']
        famTree = Tree(name=name, support=ab)
        famTree.add_feature('abundance', ab)
        famTree.add_feature('id', family_id)
        famTree.add_feature('parent_id', order_id)
        famTree.add_feature('family_id', family_id)
        famTree.add_feature('level', 'family')
        famTree.add_feature('points', family['points'])
        gens_by_fam = genera.filter(parent_id__exact=family_id)
        for genus in gens_by_fam:
            id_g = genus['genus_id']
            # Select the first child of genera_tree with this genus id.
            # This replaces a reduce() over a one-argument lambda, which
            # broke on zero or several matches and on Python 3.
            branch = None
            for candidate in genera_tree.get_children():
                if candidate.genus_id == id_g:
                    branch = candidate
                    break
            if branch is None:
                raise LookupError(
                    "no branch with genus_id %r among the children of "
                    "genera_tree" % (id_g,))
            # Attach the branch to the family tree
            famTree.add_child(child=branch)
        orders_tree.add_child(child=famTree)
    return orders_tree
def split_rcm(rcm):
    """Create the root node covering rows 0..n of ``rcm`` and split it.

    ``n`` is ``rcm.shape[0]``. The id generator is reset first, the root
    gets a fresh id plus "startpos"/"endpos" features, and the actual
    splitting is delegated to ``_split_rcm``. Returns the root node.
    """
    size = rcm.shape[0]
    idgen.reset()

    top = Tree()
    top.node_id = idgen.generate()
    top.name = "%d-%d" % (0, size)
    for feature, value in (("startpos", 0), ("endpos", size)):
        top.add_feature(feature, value)

    _split_rcm(rcm, top)
    return top
Beispiel #9
0
def load_label_tree(noffset_parentidx, noffsets):
    """Build a label tree rooted at the 'physical_entity.n.01' synset.

    noffset_parentidx -- mapping from an offset to the index (into
                         ``noffsets``) of its parent, or -1 when the label
                         hangs directly below the root.
    noffsets          -- sequence of offsets; a parent must appear before
                         its children because nodes are looked up as they
                         are created.

    Returns ``(prune_root(root), nodes)`` where ``nodes`` maps every offset
    to the tree node created for it.
    """
    tree_root = Tree()
    entity = wn.synset('physical_entity.n.01')
    tree_root.name = entity.name()
    tree_root.add_feature('synset', entity)

    nodes_by_offset = {}
    for offset in noffsets:
        parent_idx = noffset_parentidx[offset]
        if parent_idx == -1:
            parent = tree_root
        else:
            parent = nodes_by_offset[noffsets[parent_idx]]
        nodes_by_offset[offset] = parent.add_child(name=offset)
    return prune_root(tree_root), nodes_by_offset
def parseNodesDump(sfin_node, sfin_name, sfout):
    """Build the NCBI taxonomic tree from '|'-separated dump files.

    sfin_node -- path of the nodes file; fields 0, 1 and 2 of every line
                 are read as child id, parent id and rank.
    sfin_name -- path of the names file; fields 0 and 1 of every line are
                 read as id and name.
    sfout     -- NOTE(review): currently unused, the tree is always written
                 to "tree_of_life.tree"; confirm with callers before
                 switching the output path to this argument.

    The tree is grown breadth-first from the node with id "1" and written
    in newick format 8 with the "rank" and "id" features. Progress counts
    are printed to stdout.
    """
    nodes_rank_map = {}
    nodes_name_map = {}
    father_son_map = {}

    # "with" guarantees the handle is closed even if reading fails.
    with open(sfin_node) as fin:
        lines = fin.readlines()

    print("Number nodes:" + repr(len(lines)))
    for line in lines:
        toks = line.strip().split("|")
        son, father, rank = toks[0], toks[1], toks[2]

        nodes_rank_map[son] = rank
        # A node that is its own parent (the root) is not its own child.
        if father != son:
            father_son_map.setdefault(father, []).append(son)

    cnt = sum(len(sons) for sons in father_son_map.values())
    print("Nodes count:" + repr(cnt))
    print("Nodes rank count:" + repr(len(nodes_rank_map)))

    with open(sfin_name) as fnames:
        for line in fnames:
            items = line.strip().split("|")
            nodes_name_map[items[0]] = items[1]

    #construct the NCBI taxonomic tree
    t0 = Tree()
    t0.add_feature("id", "1")
    t0.add_feature("rank", "god")
    t0.add_feature("name", "root")
    k = 0  # number of nodes visited
    nodesque = deque([t0])
    while nodesque:
        p = nodesque.popleft()
        k += 1
        for son in father_son_map.get(p.id, []):
            newnode = p.add_child()
            newnode.add_feature("id", son)
            newnode.add_feature("rank", nodes_rank_map.get(son, "no_rank"))
            newnode.add_feature("name", nodes_name_map.get(son, "no_name"))
            nodesque.append(newnode)
    print(k)
    t0.write(outfile="tree_of_life.tree", format=8, features=["rank","id"])
def parseNodesDump_idonly(sfin_node):
    """Build an id-only NCBI taxonomic tree from a '|'-separated nodes file.

    sfin_node -- path of the nodes file; fields 0 and 1 of every line are
                 read as child id and parent id.

    The tree is grown breadth-first from the node named "1", every node is
    named with its id, and the result is written to "tree_of_life_id.tree"
    in newick format 8. Progress counts are printed to stdout.
    """
    father_son_map = {}

    # "with" guarantees the handle is closed even if reading fails.
    with open(sfin_node) as fin:
        lines = fin.readlines()

    print("Number nodes:" + repr(len(lines)))
    for line in lines:
        toks = line.strip().split("|")
        son, father = toks[0], toks[1]
        # A node that is its own parent (the root) is not its own child.
        if father != son:
            father_son_map.setdefault(father, []).append(son)

    cnt = sum(len(sons) for sons in father_son_map.values())
    print("Nodes count:" + repr(cnt))

    t0 = Tree()
    t0.add_feature("name", "1")
    k = 0  # number of nodes visited
    nodesque = deque([t0])
    while nodesque:
        p = nodesque.popleft()
        k += 1
        for son in father_son_map.get(p.name, []):
            nodesque.append(p.add_child(name=son))
    print(k)
    t0.write(outfile="tree_of_life_id.tree", format=8)
Beispiel #12
0
def getKingdoms(taxonomic_queryset, phyla_tree, only_id=False):
    """
    ...
    This function generates a Tree object derived from the collapse
    of all *kingdoms* under the scope of a spatial queryset.

    Parameters
    ----------
        taxonomy_queryset gbif.models / GeoquerySet
            Must expose ``kingdoms`` (iterable of mapping-like rows) and
            ``phyla`` (filterable with ``parent_id__exact``).
            :phyla_tree: Tree derived from getPhyla; its direct children
            must carry a ``phylum_id`` attribute.

        only_id : Boolean (flag)
            False (default) names each kingdom node with its name; True
            names it with its kingdom_id. Names are strings of variable
            length, so the id-only mode is lighter on memory for big data
            sets.

    Returns
    -------
    :kingdoms_tree: derived from ete2.TreeNode()

    Raises
    ------
    LookupError
        If a phylum of a kingdom has no matching branch in ``phyla_tree``.
    """
    tax = taxonomic_queryset
    kingdoms = tax.kingdoms
    phyla = tax.phyla
    TreeOfLife = Tree(name='Life')
    logger.info("[gbif.buildtree] Collapsing Kingdoms")
    for kingdom in kingdoms:
        kingdom_id = kingdom['kingdom_id']
        name = kingdom_id if only_id else kingdom['name']
        kingdomTree = Tree(name=name, support=kingdom['ab'])
        kingdomTree.add_feature('kingdom_id', kingdom_id)
        kingdomTree.add_feature('level', 'kingdom')
        kingdomTree.add_feature('points', kingdom['points'])
        for phylum in phyla.filter(parent_id__exact=kingdom_id):
            id_p = phylum['phylum_id']
            # Pick the branch of phyla_tree that belongs to this phylum.
            # (The former reduce() call only worked when exactly one child
            # matched and failed with an unrelated TypeError otherwise.)
            matches = [child for child in phyla_tree.get_children()
                       if child.phylum_id == id_p]
            if not matches:
                raise LookupError(
                    "no branch with phylum_id %r among the children of "
                    "phyla_tree" % (id_p,))
            # Attach the branch to the kingdom tree
            kingdomTree.add_child(child=matches[0])
        TreeOfLife.add_child(child=kingdomTree)
    return TreeOfLife
Beispiel #13
0
def getKingdoms(taxonomic_queryset,phyla_tree,only_id=False):
    """
    ...
    This function generates a Tree object derived from the collapse
    of all *kingdoms* under the scope of a spatial queryset.

    Parameters
    ----------
        taxonomy_queryset gbif.models / GeoquerySet
            Must expose ``kingdoms`` (iterable of mapping-like rows) and
            ``phyla`` (filterable with ``parent_id__exact``).
            :phyla_tree: Tree derived from getPhyla; its direct children
            must carry a ``phylum_id`` attribute.

        only_id : Boolean (flag)
            True means each kingdom node is named with its kingdom_id.
            False (default) means it is named with the kingdom name.
            Names are strings of variable length, so the id-only mode uses
            less memory on big data sets.

    Returns
    -------
    :kingdoms_tree: derived from ete2.TreeNode()

    Raises
    ------
    LookupError
        If a phylum of a kingdom has no matching branch in ``phyla_tree``.
    """
    tax = taxonomic_queryset
    kingdoms = tax.kingdoms
    phyla = tax.phyla
    TreeOfLife = Tree(name='Life')
    logger.info("[gbif.buildtree] Collapsing Kingdoms")
    for kingdom in kingdoms:
        kingdom_id = kingdom['kingdom_id']
        if only_id:
            name = kingdom_id
        else:
            name = kingdom['name']
        kingdomTree = Tree(name=name, support=kingdom['ab'])
        kingdomTree.add_feature('kingdom_id', kingdom_id)
        kingdomTree.add_feature('level', 'kingdom')
        kingdomTree.add_feature('points', kingdom['points'])
        phyla_by_kingdom = phyla.filter(parent_id__exact=kingdom_id)
        for phylum in phyla_by_kingdom:
            id_p = phylum['phylum_id']
            # Select the first child of phyla_tree with this phylum id.
            # This replaces a reduce() over a one-argument lambda, which
            # broke on zero or several matches and on Python 3.
            branch = None
            for candidate in phyla_tree.get_children():
                if candidate.phylum_id == id_p:
                    branch = candidate
                    break
            if branch is None:
                raise LookupError(
                    "no branch with phylum_id %r among the children of "
                    "phyla_tree" % (id_p,))
            # Attach the branch to the kingdom tree
            kingdomTree.add_child(child=branch)
        TreeOfLife.add_child(child=kingdomTree)
    return TreeOfLife
Beispiel #14
0
def getFamilies(taxonomic_queryset, genera_tree, only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *families* under the scope of a spatial queryset.

    Parameters
    ----------
    taxonomy_queryset gbif.models / GeoquerySet
        Must expose ``families`` (iterable of mapping-like rows) and
        ``genera`` (filterable with ``parent_id__exact``).
        :genera_tree: Tree derived from getGenera; its direct children must
        carry a ``genus_id`` attribute.

    only_id : Boolean (flag)
        False (default) names each family node with its name; True names
        it with its family_id. Names are strings of variable length, so the
        id-only mode is lighter on memory for big data sets.

    Returns
    -------
    :families_tree: derived from ete2.TreeNode()

    Raises
    ------
    LookupError
        If a genus of a family has no matching branch in ``genera_tree``.
    """
    tax = taxonomic_queryset
    families = tax.families
    genera = tax.genera
    orders_tree = Tree(name='order_root')
    for family in families:
        family_id = family['family_id']
        name = family_id if only_id else family['name']
        famTree = Tree(name=name, support=family['ab'])
        famTree.add_feature('family_id', family_id)
        famTree.add_feature('level', 'family')
        famTree.add_feature('points', family['points'])
        for genus in genera.filter(parent_id__exact=family_id):
            id_g = genus['genus_id']
            # Pick the branch of genera_tree that belongs to this genus.
            # (The former reduce() call only worked when exactly one child
            # matched and failed with an unrelated TypeError otherwise.)
            matches = [child for child in genera_tree.get_children()
                       if child.genus_id == id_g]
            if not matches:
                raise LookupError(
                    "no branch with genus_id %r among the children of "
                    "genera_tree" % (id_g,))
            # Attach the branch to the family tree
            famTree.add_child(child=matches[0])
        orders_tree.add_child(child=famTree)
    return orders_tree
Beispiel #15
0
def getPhyla(taxonomic_queryset,classes_tree,only_id=False):
    """
    ...
    This function generates a Tree object derived from the collapse
    of all *phyla* under the scope of a spatial queryset.

    Parameters
    ----------
    taxonomy_queryset gbif.models / GeoquerySet
        Must expose ``phyla`` (iterable of mapping-like rows) and
        ``classes`` (filterable with ``parent_id__exact``).
        :classes_tree: Tree derived from getClasses; its direct children
        must carry a ``class_id`` attribute.

        only_id : Boolean (flag)
            True means each phylum node is named with its phylum_id.
            False (default) means it is named with the phylum name.
            Names are strings of variable length, so the id-only mode uses
            less memory on big data sets.

    Returns
    -------
    :phyla_tree: derived from ete2.TreeNode()

    Raises
    ------
    LookupError
        If a class of a phylum has no matching branch in ``classes_tree``.
    """
    tax = taxonomic_queryset
    phyla = tax.phyla
    classes = tax.classes
    kingdomTree = Tree(name='kingdom_root')
    logger.info("[gbif.buildtree] Collapsing Phyla")
    for phylum in phyla:
        phylum_id = phylum['phylum_id']
        if only_id:
            name = phylum_id
        else:
            name = phylum['name']
        phylumTree = Tree(name=name, support=phylum['ab'])
        phylumTree.add_feature('phylum_id', phylum_id)
        phylumTree.add_feature('level', 'phylum')
        phylumTree.add_feature('points', phylum['points'])
        classes_by_phylum = classes.filter(parent_id__exact=phylum_id)
        for class_ in classes_by_phylum:
            id_c = class_['class_id']
            # Select the first child of classes_tree with this class id.
            # This replaces a reduce() over a one-argument lambda, which
            # broke on zero or several matches and on Python 3.
            branch = None
            for candidate in classes_tree.get_children():
                if candidate.class_id == id_c:
                    branch = candidate
                    break
            if branch is None:
                raise LookupError(
                    "no branch with class_id %r among the children of "
                    "classes_tree" % (id_c,))
            # Attach the branch to the phylum tree
            phylumTree.add_child(child=branch)
        kingdomTree.add_child(child=phylumTree)
    return kingdomTree
Beispiel #16
0
    def build(self,
              min_rank=0,
              max_seqs_per_leaf=1e9,
              clades_to_include=[],
              clades_to_ignore=[]):
        """Build a tree from ``self.taxonomy`` and return it with the kept ids.

        min_rank          -- index into each ``ranks`` list; an entry whose
                             rank at this index equals Taxonomy.EMPTY_RANK
                             is skipped.
        max_seqs_per_leaf -- cap on the sequences accepted below one parent;
                             only enforced when the parent sits at the last
                             rank level and is already in ``self.tree_nodes``.
        clades_to_include -- (rank_level, rank_name) pairs; when non-empty,
                             an entry must match at least one pair.
        clades_to_ignore  -- (rank_level, rank_name) pairs; an entry matching
                             any pair is skipped.

        The two list defaults are only read here, never modified.

        Returns ``(t0, seq_ids)``: the root node and the ids passed to
        ``self.add_tree_node``, in iteration order.
        """

        # Root node, registered under ROOT_LABEL with an empty leaf counter.
        t0 = Tree()
        t0.add_feature("name", TaxTreeBuilder.ROOT_LABEL)
        self.tree_nodes[TaxTreeBuilder.ROOT_LABEL] = t0
        self.leaf_count[TaxTreeBuilder.ROOT_LABEL] = 0
        k = 0  # entries seen so far
        added = 0  # entries actually added to the tree
        seq_ids = []
        # sequences are leafs of the tree, so they always have the lowest taxonomy level (e.g. "species"+1)
        for sid, ranks in self.taxonomy.iteritems():
            k += 1
            if self.config.verbose and k % 1000 == 0:
                print "Processed nodes: ", k, ", added: ", added, ", skipped: ", k - added

            # filter by minimum rank level
            if ranks[min_rank] == Taxonomy.EMPTY_RANK:
                continue

            # filter by rank contraints (e.g. class Clostridia only)
            clade_is_ok = False

            # check against the inclusion list
            if len(clades_to_include) > 0:
                for (rank_level, rank_name) in clades_to_include:
                    if ranks[rank_level] == rank_name:
                        clade_is_ok = True
                        break
            else:  # default: include all
                clade_is_ok = True

            # if sequence is about to be included, check it against the ignore list
            if clade_is_ok:
                for (rank_level, rank_name) in clades_to_ignore:
                    if ranks[rank_level] == rank_name:
                        clade_is_ok = False
                        break

            # final decision
            if not clade_is_ok:
                continue

            # Walk up from the last rank to the deepest non-empty one; that
            # rank is the parent of the sequence.
            tax_seq_level = len(ranks)
            parent_level = tax_seq_level - 1
            while ranks[parent_level] == Taxonomy.EMPTY_RANK:
                parent_level -= 1
            parent_name = Taxonomy.get_rank_uid(ranks, parent_level)
            if parent_name in self.tree_nodes:
                # NOTE(review): parent_node is assigned but not used below.
                parent_node = self.tree_nodes[parent_name]
                #                max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level - parent_level)
                # The cap only applies when the parent is at the last rank level.
                if parent_level == tax_seq_level - 1:
                    max_seq_per_rank = max_seqs_per_leaf  # * (tax_seq_level - parent_level)
                    if parent_name in self.leaf_count and self.leaf_count[
                            parent_name] >= max_seq_per_rank:
                        continue

            # Count this sequence against its parent.
            self.leaf_count[parent_name] = self.leaf_count.get(parent_name,
                                                               0) + 1

            # all checks succeeded: add the sequence to the tree
            self.add_tree_node(t0, sid, ranks, parent_level)
            seq_ids += [sid]
            added += 1

        self.config.log.debug("Total nodes in resulting tree: %d", added)

        # In debug mode the tree is also written out before pruning.
        if self.config.debug:
            reftax_fname = self.config.tmp_fname("%NAME%_mf_unpruned.tre")
            t0.write(outfile=reftax_fname, format=8)

        self.prune_unifu_nodes(t0)
        return t0, seq_ids
class ncbi_taxa:
    """Holds an NCBI taxonomy tree plus id -> name and id -> rank lookups."""

    def __init__(self):
        self.__tax_tree_root = None
        self.__id_name_map = {}
        self.__id_rank_map = {}

    def init_tax_tree(self, sftree, sfname, sfrank):
        """Load the tree and the two lookup tables.

        sftree -- tree source handed to ``Tree(..., format=8)``.
        sfname -- path of a '|'-separated file; field 0 is the id and
                  field 1 the name.
        sfrank -- path of a '|'-separated file; field 0 is the id and
                  field 2 the rank.
        """
        self.__tax_tree_root = Tree(sftree, format=8)
        # Both files are opened up front (as before) and are now closed
        # even when parsing raises.
        with open(sfname) as fname, open(sfrank) as frank:
            for line in fname:
                items = line.strip().split("|")
                self.__id_name_map[items[0]] = items[1]

            for line in frank:
                items = line.strip().split("|")
                self.__id_rank_map[items[0]] = items[2]

    def extract_sub_tax_tree(self, sname_tax, sfout):
        """Prune the tree to the listed taxids, annotate it and write it out.

        sname_tax -- path of a whitespace-separated file whose second
                     column is a taxid.
        sfout     -- output path for the pruned tree (newick format 8 with
                     the "N" and "R" features).

        Every remaining node gets an "N" (name) and an "R" (rank) feature
        looked up by node name, with spaces replaced by underscores and
        "noNCBIname" / "noNCBIrank" as fallbacks. The pruned tree is also
        printed to stdout.
        """
        taxid = []
        with open(sname_tax) as fname_tax:
            for line in fname_tax:
                items = line.strip().split()
                taxid.append(items[1])
        self.__tax_tree_root.prune(taxid)

        #annotate all nodes for ranks and names
        root = self.__tax_tree_root
        for node in [root] + list(root.get_descendants()):
            nodeid = node.name
            nodename = self.__id_name_map.get(nodeid, "noNCBIname")
            noderank = self.__id_rank_map.get(nodeid, "noNCBIrank")
            node.add_feature("N", nodename.replace(" ", "_"))
            node.add_feature("R", noderank.replace(" ", "_"))

        self.__tax_tree_root.write(outfile=sfout, format=8, features=["N","R"])
        print(self.__tax_tree_root)
class phylogeny_annotator:
    
    def __init__(self, sphylogeny, s_seq_db, t=0.95):
        """Load the phylogeny and the sequence database and reset all maps.

        sphylogeny -- passed to ``Tree(sphylogeny, format=1)``; also kept in
                      ``tree_input``.
        s_seq_db   -- passed to ``seq_db.init_db_from_file``; also kept in
                      ``taxonomy_file``.
        t          -- stored as ``threshold`` (default 0.95).
        """
        self.tree_input = sphylogeny
        self.taxonomy_file = s_seq_db
        self.threshold = t
        self.root = Tree(sphylogeny, format=1)

        self.seqs = seq_db()
        self.seqs.init_db_from_file(s_seq_db)
        self.max_rank = 5
        # Only the first field of every entry returned by the database is kept.
        self.all_rank_names = [entry[0] for entry in self.seqs.get_all_rank_names()]

        # nid : [[r0name, f0],[r1name, f1],...,[r5name,f5]]
        self.nid_freq_map = {}
        # nid : True or False, whether the node's rank has been fully determined
        self.nid_assigned_map = {}
        # nid : [r0name, r1name, ... ,r5name]
        self.nid_ranks_map = {}
        # nid : final_rank_num
        self.nid_ranknum_map = {}
    
    def __get_child_ranks(self, internal_node, rank_num):
        """input:internal node, rank_num; output: rankname frequency map """
        leaves = internal_node.get_leaves()
        rname_cnt_map = {}
        for leaf in leaves:
                seq = self.seqs.get_seq_by_name(leaf.name)
                rank_name = seq.ranks[rank_num]
                if rank_name in rname_cnt_map:
                        rname_cnt_map[rank_name] = rname_cnt_map[rank_name] + 1
                else:
                        rname_cnt_map[rank_name] = 1 
        return rname_cnt_map
    
    def __sum_rank_num(self):
        s = 0
        for nid in self.nid_ranknum_map.keys():
            s = s + self.nid_ranknum_map[nid]
        return s
    
    def __count_miss_labled(self):
        cnt = 0
        leave = self.root.get_leaves()
        for leaf in leave:
            oriranks = self.seqs.get_seq_by_name(leaf.name).ranks
            #oriranks.reverse()
            if self.nid_ranks_map[leaf.nid] != oriranks:
                print(leaf.name)
                print("Correct:" + str(self.nid_ranks_map[leaf.nid]))
                print("Misslab:" + str(oriranks))
                cnt = cnt + 1
        return cnt
    
    def annotate_all_branches_bu(self):
        """Bottom-up annotation: pick a rank level per node, then name the nodes.

        Pass 1 (postorder) gives every node a feature "nid" (1-based visit
        index), a rank level in ``rank_map`` and a {rank name: value} map in
        ``n_v_map``. Pass 2 walks down from the root, attaching the features
        "rankname" and "pv" to every non-root node.

        Returns the sum of the "pv" values assigned in pass 2.

        NOTE(review): uses ``dict.iteritems``, which only exists on Python 2.
        NOTE(review): only the first two children of an internal node are
        inspected when choosing its rank level - presumably the tree is
        expected to be bifurcating; confirm.
        """
        i = 0 
        n_v_map = {} # node id map to vector with probabilities,
        rank_map = {} # node id to rank number map, rank starting from 0
        for node in self.root.traverse("postorder"):
            i = i + 1
            if node.is_leaf():
                # A leaf starts at rank level 0 with its own rank name at weight 1.0.
                seq = self.seqs.get_seq_by_name(node.name)
                rank_num = 0
                rank_name = seq.ranks[rank_num]
                rname_cnt_map = {}
                rname_cnt_map[rank_name]=1.0
                n_v_map[i] = rname_cnt_map       
                rank_map[i] = rank_num       
                node.add_feature("nid", i)
            else:
                # Postorder guarantees both children already carry a "nid".
                childs = node.get_children()
                lchild = childs[0].nid
                rchild = childs[1].nid
                #decide which rank to go
                if rank_map[lchild] == rank_map[rchild]:
                    # Same level on both sides: stay on it when the top-valued
                    # names agree, otherwise move one level up (capped at max_rank).
                    rname_cnt_map_l = n_v_map[lchild]
                    rname_cnt_map_r = n_v_map[rchild]
                    sorted_rname_cnt_map_l = sorted(rname_cnt_map_l.iteritems(), key=operator.itemgetter(1), reverse = True)
                    sorted_rname_cnt_map_r = sorted(rname_cnt_map_r.iteritems(), key=operator.itemgetter(1), reverse = True)
                    if sorted_rname_cnt_map_l[0][0] == sorted_rname_cnt_map_r[0][0]:
                        rank = rank_map[lchild]
                    else:
                        rank = rank_map[lchild] + 1;
                    if rank > self.max_rank:
                        rank = self.max_rank
                else:
                    # Different levels: adopt the larger of the two.
                    rank = max(rank_map[lchild], rank_map[rchild])
                    
                rname_cnt_map = self.__get_child_ranks(node, rank)
                num_leaves = sum(rname_cnt_map.values())

                # Turn counts into fractions of all leaves below this node; the
                # entry for the empty rank name is left as a raw count.
                for rkname in rname_cnt_map.keys():
                    if rkname == "":
                        continue
                    else:
                        rname_cnt_map[rkname] = float(rname_cnt_map[rkname])/num_leaves
           
                n_v_map[i] = rname_cnt_map
                rank_map[i] = rank
                node.add_feature("nid", i)
            
        #assigning taxa rank:
        pvalue = 0
        ch = self.root.get_children()
        # The root gets a placeholder name and a rank level (666) above any real one.
        self.root.add_feature("rankname", "God")
        self.root.add_feature("pv", 1) #record the max prob of each nodes
        rank_map[self.root.nid] = 666 #change this to be the max rank of childs + 1 
        # "ch" is the frontier of not-yet-named nodes; each round names the
        # frontier nodes with the highest rank level and then adds their children.
        while len(ch) != 0:
            maxrank = 0
            for node in ch:
                rank_num = rank_map[node.nid]
                if rank_num >= maxrank:
                    maxrank = rank_num 
                    
            high_rank_nodes = []
            for node in ch:
                rank_num = rank_map[node.nid]
                if rank_num == maxrank:
                    high_rank_nodes.append(node)
                   
            for node in high_rank_nodes:
                ch.remove(node)
                
            #process
            for node in high_rank_nodes:
                assign_flag = 0
                nodefather = node.up
                father_rank_num = rank_map[nodefather.nid]
                if father_rank_num == maxrank:
                    # Same rank level as the parent: inherit the parent's name and pv.
                    node.add_feature("rankname", nodefather.rankname)
                    node.add_feature("pv", nodefather.pv)
                    pvalue = pvalue + node.pv
                    #print("assigning node id: " + str(node.nid) + " with " + str(nodefather.rankname))
                    assign_flag = 1
                else:
                    # Otherwise take the highest-valued name that is a known rank name.
                    rname_cnt_map = n_v_map[node.nid]
                    sorted_rname_cnt_map = sorted(rname_cnt_map.iteritems(), key=operator.itemgetter(1), reverse = True)
                    for rname_cnt in sorted_rname_cnt_map:
                        if rname_cnt[0] in self.all_rank_names:
                            node.add_feature("rankname", rname_cnt[0])
                            node.add_feature("pv", rname_cnt[1])
                            #Tomas: why we do the following in the first place? I removed it cause it is a bug in curr version 
                            #self.all_rank_names.remove(rname_cnt[0])
                            #print("assigning node id: " + str(node.nid) + " with " +rname_cnt[0])
                            assign_flag = 1
                            pvalue = pvalue + node.pv
                            break
                    
                if assign_flag == 0:
                    # No candidate name was known: fall back to the parent's annotation.
                    node.add_feature("rankname", node.up.rankname)
                    node.add_feature("pv", node.up.pv)
                    pvalue = pvalue + node.pv
                    #print("assigning node id: " + str(node.nid) + " with " +node.up.rankname)
                    
            for node in high_rank_nodes:
                ch = ch + node.get_children()
        return pvalue        
    
    def tomas(self):
        """Run ``CMislabel`` on the stored input paths and adopt its result.

        Replaces ``self.root`` with the helper's ``t`` attribute and
        ``self.nid_ranks_map`` with its ``nid_ranks`` attribute, then returns
        whatever the helper's ``score()`` returns.
        """
        mislabel = CMislabel(self.tree_input, self.taxonomy_file)
        self.root = mislabel.t
        self.nid_ranks_map = mislabel.nid_ranks
        return mislabel.score()
    
    def tomas_rooted(self):
        """Run ``CMislabel`` on the current (already rooted) tree.

        The helper receives ``self.root`` as a third argument and its
        ``nid_rank`` attribute replaces ``self.nid_ranks_map``.
        """
        mislabel = CMislabel(self.tree_input, self.taxonomy_file, self.root)
        # NOTE(review): tomas() reads "nid_ranks" while this reads "nid_rank";
        # confirm against CMislabel which attribute is intended.
        self.nid_ranks_map = mislabel.nid_rank
    
    def assign_all_descendent_node_rank(self, node, rank_num, rank_name):
        descent_nodes = node.get_descendants()
        descent_nodes.append(node)
        find_error = False
        for nodei in descent_nodes:
            if nodei.is_leaf():
                if True: #node.is_correct == "yes":
                    seq = self.seqs.get_seq_by_name(nodei.name)
                    ranks = self.nid_ranks_map[nodei.nid]
                    ranks[rank_num] = rank_name
                    self.nid_ranks_map[nodei.nid] = ranks
                    if seq.ranks[rank_num] != rank_name:
                        nodei.add_feature("is_correct", "No")
                        find_error = True
            else:
                ranks = self.nid_ranks_map[nodei.nid]
                ranks[rank_num] = rank_name
                self.nid_ranks_map[nodei.nid] = ranks
        if find_error:
            #recalculate all frequency vectors
            seq_util = seq_db()
            for nodei in node.traverse(strategy = "preorder"):
                leaves = nodei.get_leaves()
                seqs = []
                for leaf in leaves:
                    seqs.append(self.seqs.get_seq_by_name(leaf.name))
                    freq_table = seq_util.rank_stas(seqs)
                    self.nid_freq_map[nodei.nid] = freq_table
    
    def annotate_all_branches_td(self):
        """Top-down annotation of the current rooting.

        Resets the four per-node maps, numbers the nodes in preorder (feature
        "nid"), computes a frequency table per node, and then walks the tree
        in preorder deciding for each node how many rank levels beyond its
        parent can be assigned to its whole subtree.

        Returns the sum of the final rank numbers of all nodes
        (see ``__sum_rank_num``); the root contributes -1.

        NOTE(review): the non-unanimous branch reads ``childs[0]`` and
        ``childs[1]`` - presumably it is never reached for a leaf and the
        tree is bifurcating; confirm.
        """
        self.nid_freq_map = {} # nid : [[r0name, f0],[r1name, f1],...,[r5name,f5]]
        self.nid_assigned_map = {} # nid : True or False, indicate if this node rank has been fully determined
        self.nid_ranks_map = {} #  nid : [r0name, r1name, ... ,r5name]
        self.nid_ranknum_map = {} #  nid : final_rank_num
        all_leaves = self.root.get_leaves()
        for leaf in all_leaves:
            leaf.add_feature("is_correct", "yes")
        # traverse the tree to calculate the frequency profile for each node/branch
        seq_util = seq_db()
        i = 0
        for node in self.root.traverse(strategy = "preorder"):
            i = i + 1
            node.add_feature("nid", i)
            self.nid_assigned_map[i] = False
            # Six rank slots per node, "-" meaning "not assigned yet".
            ranks = ["-"] * 6
            self.nid_ranks_map[i] = ranks
            leaves = node.get_leaves()
            seqs = []
            for leaf in leaves:
                seqs.append(self.seqs.get_seq_by_name(leaf.name))
            freq_table = seq_util.rank_stas(seqs)
            self.nid_freq_map[i] = freq_table
        
        # traverse the tree in preorder, so a parent is always settled before its children
        for node in self.root.traverse(strategy = "preorder"):
            freq_table = self.nid_freq_map[node.nid]
            ranks = self.nid_ranks_map[node.nid]
            assigning_rank_idx = 0
            if node.is_root():
                # -1 makes the root's children start at rank index 0.
                self.nid_ranknum_map[node.nid] = -1
            else:
                # Continue from the first rank level the parent did not settle.
                next_rank_idx = self.nid_ranknum_map[node.up.nid] + 1
                flag = True
                while flag:
                    if next_rank_idx < 6:
                        rk_freq = freq_table[next_rank_idx]
                        if rk_freq[1] == 1.0:
                            # Unanimous at this level: assign to the whole subtree.
                            self.assign_all_descendent_node_rank(node, next_rank_idx, rk_freq[0])
                            #curr_rank_idx = curr_rank_idx + 1
                            next_rank_idx = next_rank_idx + 1
                        else:
                            childs = node.get_children()
                            lchild = childs[0]
                            rchild = childs[1]
                            lfreq_table = self.nid_freq_map[lchild.nid]
                            rfreq_table = self.nid_freq_map[rchild.nid]
                            lrk_freq = lfreq_table[next_rank_idx]
                            rrk_freq = rfreq_table[next_rank_idx]
                            # Stop here when both children have a higher
                            # frequency at this level than the node itself.
                            if rk_freq[1] < lrk_freq[1] and rk_freq[1] < rrk_freq[1]:
                                flag = False
                            else: #should check all possibilties here 
                                # Children agree on the name and the node's own
                                # frequency exceeds the threshold: assign anyway.
                                if lrk_freq[0] == rrk_freq[0] and rk_freq[1]>self.threshold:
                                    self.assign_all_descendent_node_rank(node, next_rank_idx, rk_freq[0])
                                    next_rank_idx = next_rank_idx + 1
                                else:
                                    flag = False
                    else:
                        #assign taxonomy to species level
                        assigning_rank_idx = 5
                        rk_freq = freq_table[assigning_rank_idx]
                        flag = False
                        ranks[assigning_rank_idx] = rk_freq[0]
                # Last rank index that was actually assigned for this node.
                self.nid_ranknum_map[node.nid] = next_rank_idx - 1
            self.nid_assigned_map[node.nid] = True
        return self.__sum_rank_num()
    
    def show_tree_with_rank(self):
        """Label every non-root node with its assigned ranks and show the tree.

        Each descendant of the root gets a text face with the rank list from
        ``nid_ranks_map``; leaves additionally get a second face with the
        rank list of their database sequence. Finally ``self.root.show()``
        is called.
        """
        for node in self.root.get_descendants():
            assigned = self.nid_ranks_map[node.nid]
            node.add_face(TextFace(str(assigned)), column=0, position="branch-right")
            if not node.is_leaf():
                continue
            original = self.seqs.get_seq_by_name(node.name).ranks
            node.add_face(TextFace(str(original)), column=0, position="branch-right")
        self.root.show()
    
    def rooting_by_outgroup_names(self, outgroup_names):
        all_leaves = self.root.get_leaves()
        sog_names = set(outgroup_names)
        ca1 = self.root
        #Traversal all nodes to find the common ancestor of the input outgroup_names
        for node in self.root.traverse():
            currleaves = node.get_leaves()
            currlnames = []
            for lv in currleaves:
                currlnames.append(lv.name)
            scurrnames = set(currlnames)
            if scurrnames == sog_names:
                ca1 = node
            break
        #Check if the found ca is the root, if yes, find the complmentary names of the tree
        if ca1!=self.root:
            self.root.set_outgroup(ca1)
        else:
            restnodes = []
            for leaf in all_leaves:
                if leaf.name not in sog_names:
                    restnodes.append(leaf.name)
            srestnodes = set(restnodes)
            for node in self.root.traverse():
                currleaves = node.get_leaves()
                currlnames = []
                for lv in currleaves:
                    currlnames.append(lv.name)
                scurrnames = set(currlnames)
                if scurrnames == srestnodes:
                    self.root.set_outgroup(node)
                    break
    
    def annotate_td(self):
        """Try every candidate rooting with the top-down annotation, keep the best.

        Every non-root node of the current tree defines a candidate outgroup
        (the names of its leaves). For each candidate the tree is re-rooted,
        ``annotate_all_branches_td`` is run and its score as well as the
        number of mislabelled leaves are printed. The tree is finally
        re-rooted on the candidate with the highest score (first one wins on
        ties), annotated once more, and the mislabel count is printed.

        The best score starts undefined rather than at 0, so a tree whose
        scores are all <= 0 - or that has no non-root node at all - no longer
        ends in ``set(None)`` raising TypeError; with no candidates the tree
        is annotated under its current rooting.
        """
        # Leaf-name lists of all bipartitions, collected before any re-rooting.
        list_bipar = [
            [leaf.name for leaf in node.get_leaves()]
            for node in self.root.traverse("postorder")
            if not node.is_root()
        ]

        # Find the best root.
        maxpv = None
        maxbipar = None
        for bipar in list_bipar:
            if len(bipar) == 1:
                # A single leaf can be used as outgroup directly.
                self.root.set_outgroup(self.root.search_nodes(name=bipar[0])[0])
            else:
                self.rooting_by_outgroup_names(bipar)
            pvalue = self.annotate_all_branches_td()
            print(pvalue)
            misscnt = self.__count_miss_labled()
            print(misscnt)
            if maxpv is None or pvalue > maxpv:
                maxpv = pvalue
                maxbipar = bipar

        if maxbipar is not None:
            self.rooting_by_outgroup_names(maxbipar)
        self.annotate_all_branches_td()
        misscnt = self.__count_miss_labled()
        print(misscnt)
    
    def annotate_bu(self):
        """Try every candidate rooting with ``tomas``, keep the best and label it.

        Every non-root node of the current tree defines a candidate outgroup
        (the names of its leaves). For each candidate the tree is re-rooted
        and scored with ``self.tomas()``. The tree is then re-rooted on the
        candidate with the highest score (first one wins on ties),
        ``self.tomas_rooted()`` is run, and every descendant carrying a
        "rankname" feature gets a text face showing it. The tree is not
        displayed here.

        The best score starts undefined rather than at 0, so a run in which
        no score exceeds 0 - or a tree without any non-root node - no longer
        ends in ``set(None)`` raising TypeError; with no candidates the
        current rooting is kept.
        """
        # Leaf-name lists of all bipartitions, collected before any re-rooting.
        list_bipar = [
            [leaf.name for leaf in node.get_leaves()]
            for node in self.root.traverse("postorder")
            if not node.is_root()
        ]

        maxpv = None
        maxbipar = None
        for bipar in list_bipar:
            if len(bipar) == 1:
                # A single leaf can be used as outgroup directly.
                self.root.set_outgroup(self.root.search_nodes(name=bipar[0])[0])
            else:
                self.rooting_by_outgroup_names(bipar)

            pvalue = self.tomas()
            if maxpv is None or pvalue > maxpv:
                maxpv = pvalue
                maxbipar = bipar

        if maxbipar is not None:
            self.rooting_by_outgroup_names(maxbipar)
        self.tomas_rooted()

        # Decorate the tree for drawing.
        for node in self.root.get_descendants():
            if hasattr(node, 'rankname'):
                node.add_face(TextFace(node.rankname), column=0, position="branch-right")
    
    def correct_leaf_ranks(self):
        leaves = self.root.get_leaves()
        for leaf in leaves:
            if not leaf.is_root():
                father = leaf.up
                lranks = self.nid_ranks_map[leaf.nid]
                franks = self.nid_ranks_map[father.nid]
                lsp = lranks[5]
                for i, rk in enumerate(franks):
                    lranks[i] = rk
                lranks[5] = lsp 
                self.nid_ranks_map[leaf.nid] = lranks
Beispiel #19
0
    def build(self,
              min_rank=0,
              max_seqs_per_leaf=1e9,
              clades_to_include=[],
              clades_to_ignore=[]):

        print "Number of nodes: %d" % self.taxonomy.seq_count()
        t0 = Tree()
        t0.add_feature("name", "root")
        self.tree_nodes["root"] = t0
        self.leaf_count["root"] = 0
        k = 0
        added = 0
        seq_ids = []
        # sequences are leafs of the tree, so they always have the lowest taxonomy level (e.g. "species"+1)
        tax_seq_level = self.taxonomy.max_rank_level() + 1
        for sid, ranks in self.taxonomy.items():
            k += 1
            if k % 1000 == 0:
                print "Processed nodes: ", k, ", added: ", added, ", skipped: ", k - added

            # filter by minimum rank level
            if ranks[min_rank] == "":
                continue

            # filter by rank contraints (e.g. class Clostridia only)
            clade_is_ok = False

            # check against the inclusion list
            if len(clades_to_include) > 0:
                for (rank_level, rank_name) in clades_to_include:
                    if ranks[rank_level] == rank_name:
                        clade_is_ok = True
                        break
            else:  # default: include all
                clade_is_ok = True

            # if sequence is about to be included, check it against the ignore list
            if clade_is_ok:
                for (rank_level, rank_name) in clades_to_ignore:
                    if ranks[rank_level] == rank_name:
                        clade_is_ok = False
                        break

            # final decision
            if not clade_is_ok:
                continue

            parent_level = tax_seq_level - 1
            while ranks[parent_level] == "":
                parent_level -= 1
            parent_name = ranks[parent_level]
            if parent_name in self.tree_nodes:
                parent_node = self.tree_nodes[parent_name]
                # filter by max number of seqs (threshold depends from rank level,
                # i.e. for genus there can be more seqs than for species)
                max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level -
                                                        parent_level)
                if parent_name in self.leaf_count and self.leaf_count[
                        parent_name] >= max_seq_per_rank:
                    continue

                old_sid_list = []
                for node in parent_node.children:
                    if node.is_leaf():
                        old_sid_list += [int(node.name)]
            else:
                old_sid_list = []

            # filter non-unique and invalid (e.g. "unaligned") sequences


#            if not self.align_utils.is_unique_sequence(old_sid_list, int(sid)):
#                continue

            if parent_name in self.leaf_count:
                self.leaf_count[parent_name] += 1
            else:
                # it'll be the first seq for a node, so init counter with 1
                self.leaf_count[parent_name] = 1

            # all checks succeeded: add the sequence to the tree
            self.add_tree_node(t0, sid, ranks, parent_level)
            seq_ids += [sid]
            added += 1

        print "Total nodes in resulting tree: ", added

        self.prune_unifu_nodes(t0)
        return t0, seq_ids
def _split_rcm(rcm, t):
    """
    Recursively bisect the region of ``rcm`` described by node ``t``.

    ``t.startpos`` and ``t.endpos`` are break points: the region holds
    ``endpos - startpos`` positions, so there is one more candidate break
    point than positions.

        | a | a | a | a | a | a | a | a |
        ^                               ^
     startpos                         endpos

    For every break point x the region is cut into [startpos, x) and
    [x, endpos) and a chi-square-like score is computed from the sums of the
    two diagonal blocks and the off-diagonal block of ``rcm``. If every score
    is 0 nothing happens. Otherwise the best-scoring break point is taken
    and, for each side longer than ``min_module_length``, a child node
    (with ``node_id``, ``name`` "start-end" and the features ``startpos`` /
    ``endpos``) is added to ``t`` and split recursively.

    Returns None; the result is the tree grown under ``t``.
    """
    lo, hi = t.startpos, t.endpos
    chi_sq_vec = np.zeros(hi - lo + 1)
    # xrange needs "one past last", hence hi + 1 to include the end break point
    for x in xrange(lo, hi + 1):
        i11 = float(np.sum(rcm[lo:x, lo:x]))
        i22 = float(np.sum(rcm[x:hi, x:hi]))
        i12 = float(np.sum(rcm[lo:x, x:hi]))
        i21 = i12

        a = i11 * i22 - i21 * i12
        # product of the two row totals and the two column totals
        n = (i11 + i12) * (i21 + i22) * (i11 + i21) * (i12 + i22)
        chi_sq_vec[x - lo] = a * a / n if n > 0.0 else 0.0

    # if chi square statistics is 0 everywhere, there is no split
    if np.max(chi_sq_vec) == 0.0:
        return

    # the split point
    xmax = np.argmax(chi_sq_vec) + lo

    # left part first, then right part; each is only kept when long enough
    for begin, end in ((lo, xmax), (xmax, hi)):
        if end - begin > min_module_length:
            c = Tree()
            c.node_id = idgen.generate()
            c.name = "%d-%d" % (begin, end)
            c.add_feature("startpos", begin)
            c.add_feature("endpos", end)
            t.add_child(c)
            _split_rcm(rcm, c)
Beispiel #21
0
class um_tree:
	def __init__(self, tree):
		"""Load a tree, resolve polytomies and prepare the age-sorted node list.

		``tree`` is passed to ``Tree(tree, format = 1)``. Every descendant gets
		an "age" feature (its distance to the root); internal nodes also get a
		running "id". ``self.nodes`` ends up holding the internal nodes plus the
		node returned by ``get_farthest_node`` (if it is a leaf), sorted by age.
		"""
		self.tree = Tree(tree, format = 1)
		self.tree.resolve_polytomy(default_dist=0.000001, recursive=True)
		self.tree.dist = 0
		self.tree.add_feature("age", 0)

		internal_nodes = []
		for n in self.tree.get_descendants():
			n.add_feature("age", n.get_distance(self.tree))
			if n.is_leaf():
				continue
			# ids are handed out in visiting order, starting at 0
			n.add_feature("id", len(internal_nodes))
			internal_nodes.append(n)
		self.nodes = internal_nodes

		one_leaf = self.tree.get_farthest_node()[0]
		one_leaf.add_feature("id", len(internal_nodes) + 1)
		if one_leaf.is_leaf():
			self.nodes.append(one_leaf)
		self.nodes.sort(key=self.__compare_node)
		self.species_list = []
		self.coa_roots = None


	def __compare_node(self, node):
		"""Sort key for ``self.nodes``: the node's "age" feature."""
		return node.age


	def get_waiting_times(self, threshold_node = None, threshold_node_idx = 0):
		"""Walk the age-sorted nodes and build the list of waiting intervals.

		threshold_node     -- node at which the walk switches from the first
		                      phase to the coalescent phase; when None,
		                      ``self.nodes[threshold_node_idx]`` is used.
		threshold_node_idx -- index into ``self.nodes`` used as fallback.

		Returns ``(wt_list, num_spe)``: the ``waiting_time`` objects of all
		intervals longer than 1e-8, and the value of the lineage counter at
		the moment the threshold node was reached (-1 if it never was).

		Side effects: ``self.coa_roots`` receives the coalescent root nodes and
		``self.species_list`` one list of leaves per coalescent root plus a
		single-leaf list for every leaf not under any of them.
		"""
		wt_list = []
		reach_t = False
		curr_age = 0.0
		curr_spe = 2
		curr_num_coa = 0
		coa_roots = []
		min_brl = 1000
		num_spe = -1
		
		if threshold_node == None:
			threshold_node = self.nodes[threshold_node_idx]
		
		last_coa_num = 0
		tcnt = 0 
		for node in self.nodes:
			num_children = len(node.get_children())
			wt = None
			# Interval between this node and the previously processed one.
			times = node.age - curr_age
			if times >= 0:
				# Track the smallest strictly positive interval seen so far.
				if times < min_brl and times > 0:
					min_brl = times
				curr_age = node.age
				assert curr_spe >=0
				 
				if reach_t:
					if tcnt == 0:
						last_coa_num = 2
					fnode = node.up
					coa_root = None
					
					# Climb towards the root (exclusive) looking for an ancestor
					# that is already registered as a coalescent root.
					idx = 0
					while not fnode.is_root():
						idx = 0 
						for coa_r in coa_roots:
							if coa_r.id == fnode.id:
								coa_root = coa_r
								break
							idx = idx + 1
						
						if coa_root!=None:
							break
						else:
							fnode = fnode.up
							
					wt = waiting_time(length = times, num_coas =curr_num_coa, num_lines = curr_spe)
					
					# One coalescent per known root, sized by its current "curr_n".
					for coa_r in coa_roots:
						coa = coalescent(num_individual = coa_r.curr_n)
						wt.coas.add_coalescent(coa)
					
					wt.coas.coas_idx = last_coa_num
					wt.num_curr_coa = last_coa_num
					if coa_root == None: #here can be modified to use multiple T
						# Not below any known root: this node starts a new one.
						curr_spe = curr_spe - 1
						curr_num_coa = curr_num_coa + 1
						node.add_feature("curr_n", 2)
						coa_roots.append(node)
						last_coa_num = 2
					else:
						# Below a known root: that root grows by one.
						curr_n = coa_root.curr_n
						coa_root.add_feature("curr_n", curr_n + 1)
						last_coa_num = curr_n + 1
					tcnt = tcnt + 1
				else:
					if node.id == threshold_node.id:
						# Threshold reached: remember the lineage count and make
						# this node the first coalescent root.
						reach_t = True
						tcnt = 0 
						wt = waiting_time(length = times, num_coas = 0, num_lines = curr_spe)
						num_spe = curr_spe
						curr_spe = curr_spe - 1
						curr_num_coa = 2
						node.add_feature("curr_n", 2)
						coa_roots.append(node)
					else:
						# Before the threshold every node adds one lineage.
						wt = waiting_time(length = times, num_coas = 0, num_lines = curr_spe)
						curr_spe = curr_spe + 1
				# Intervals of (near) zero length are not recorded.
				if times > 0.00000001:
					wt_list.append(wt)
		
		
		for wt in wt_list:
			wt.count_num_lines()
		
		self.species_list = []
		all_coa_leaves = []
		self.coa_roots = coa_roots
		for coa_r in coa_roots:
			leaves = coa_r.get_leaves()
			all_coa_leaves.extend(leaves)
			self.species_list.append(leaves)
		
		# Leaves outside every coalescent root form single-leaf groups.
		all_leaves = self.tree.get_leaves()
		for leaf in all_leaves:
			if leaf not in all_coa_leaves:
				self.species_list.append([leaf])
		
		return wt_list, num_spe


	def show(self, wt_list):
		cnt = 1
		for wt in wt_list:
			print("Waitting interval "+ repr(cnt))
			print(wt)
			cnt = cnt + 1


	def get_species(self):
		"""Return the taxa names and the species grouping, and style the tree.

		Returns ``([all leaf names], sp_list)`` where ``sp_list`` holds one list
		of taxa names per entry of ``self.species_list``.

		As a side effect every node gets a drawing style: a default one for the
		whole tree, a second one for each node in ``self.coa_roots`` and a third
		one for all descendants of those nodes.
		"""
		sp_list = [[taxa.name for taxa in sp] for sp in self.species_list]
		all_taxa_name = [leaf.name for leaf in self.tree.get_leaves()]

		def make_style(fgcolor, vt_color, hz_color):
			# All three styles differ only in these colours.
			style = NodeStyle()
			style["fgcolor"] = fgcolor
			style["vt_line_color"] = vt_color
			style["hz_line_color"] = hz_color
			style["vt_line_width"] = 2
			style["hz_line_width"] = 2
			style["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
			style["hz_line_type"] = 0
			style["size"] = 0
			return style

		# default look for every node, root included
		style0 = make_style("#000000", "#0000aa", "#0000aa")
		for node in self.tree.get_descendants():
			node.set_style(style0)
			node.img_style["size"] = 0
		self.tree.set_style(style0)
		self.tree.img_style["size"] = 0

		# coalescent roots and everything below them are drawn differently
		style1 = make_style("#000000", "#ff0000", "#0000aa")
		style2 = make_style("#0f0f0f", "#ff0000", "#ff0000")
		for node in self.coa_roots:
			node.set_style(style1)
			node.img_style["size"] = 0
			for des in node.get_descendants():
				des.set_style(style2)
				des.img_style["size"] = 0

		return [all_taxa_name], sp_list


	def print_species(self):
		cnt = 1
		for sp in self.species_list:
			print("Species " + repr(cnt) + ":")
			cnt = cnt + 1
			taxas = ""
			for taxa in sp:
				taxas = taxas + taxa.name + ", "
			print("	" + taxas[:-1])
	
	
	def output_species(self, taxa_order = []):
		"""taxa_order is a list of taxa names, the paritions will be output as the same order"""
		if len(taxa_order) == 0:
			taxa_order = self.tree.get_leaf_names()
		
		num_taxa = 0
		for sp in self.species_list:
			for taxa in sp:
				num_taxa = num_taxa + 1
		if not len(taxa_order) == num_taxa:
			print("error error, taxa_order != num_taxa!")
			return None, None
		else: 
			partion = [-1] * num_taxa
			cnt = 1
			for sp in self.species_list:
				for taxa in sp:
					idx = taxa_order.index(taxa.name)
					partion[idx] = cnt
				cnt = cnt + 1
			return taxa_order, partion


	def num_lineages(self, wt_list):
		"""Plot the number of branches of each waiting interval against time.

		The x value of an interval is the summed length of all intervals before
		it; the y value is its ``get_num_branches()``. The figure is saved as
		"Time_Lines" and then shown.
		"""
		branch_counts = []
		start_times = []
		elapsed = 0.0
		for wt in wt_list:
			branch_counts.append(wt.get_num_branches())
			start_times.append(elapsed)
			elapsed = wt.length + elapsed

		plt.plot(start_times, branch_counts)
		plt.ylabel('Number of lineages')
		plt.xlabel('Time')
		plt.savefig("Time_Lines")
		plt.show()
    def build(self, min_rank=0, max_seqs_per_leaf=1e9, clades_to_include=[], clades_to_ignore=[]):

        print "Number of nodes: %d" % self.taxonomy.seq_count()
        t0 = Tree()
        t0.add_feature("name", "root")
        self.tree_nodes["root"] = t0;
        self.leaf_count["root"] = 0;
        k = 0
        added = 0
        seq_ids = []
        # sequences are leafs of the tree, so they always have the lowest taxonomy level (e.g. "species"+1)        
        tax_seq_level = self.taxonomy.max_rank_level() + 1
        for sid, ranks in self.taxonomy.items():
            k += 1
            if k % 1000 == 0:
                print "Processed nodes: ", k, ", added: ", added, ", skipped: ", k - added

            # filter by minimum rank level            
            if ranks[min_rank] == "":
                continue       
    
            # filter by rank contraints (e.g. class Clostridia only)
            clade_is_ok = False

            # check against the inclusion list            
            if len(clades_to_include) > 0:
                for (rank_level, rank_name) in clades_to_include:            
                    if ranks[rank_level] == rank_name:
                        clade_is_ok = True
                        break
            else: # default: include all
                clade_is_ok = True

            # if sequence is about to be included, check it against the ignore list
            if clade_is_ok:
                for (rank_level, rank_name) in clades_to_ignore:
                    if ranks[rank_level] == rank_name:
                        clade_is_ok = False
                        break

            # final decision
            if not clade_is_ok:
                continue

            parent_level = tax_seq_level - 1            
            while ranks[parent_level] == "":
                parent_level -= 1
            parent_name = ranks[parent_level]
            if parent_name in self.tree_nodes:
                parent_node = self.tree_nodes[parent_name]
                # filter by max number of seqs (threshold depends from rank level, 
                # i.e. for genus there can be more seqs than for species)
                max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level - parent_level)                
                if parent_name in self.leaf_count and self.leaf_count[parent_name] >= max_seq_per_rank:
                    continue

                old_sid_list = []
                for node in parent_node.children:
                    if node.is_leaf():
                        old_sid_list += [int(node.name)]
            else:
                old_sid_list = []

            # filter non-unique and invalid (e.g. "unaligned") sequences
#            if not self.align_utils.is_unique_sequence(old_sid_list, int(sid)):
#                continue

            if parent_name in self.leaf_count:
                self.leaf_count[parent_name] += 1
            else:
                # it'll be the first seq for a node, so init counter with 1                
                self.leaf_count[parent_name] = 1

            # all checks succeeded: add the sequence to the tree
            self.add_tree_node(t0, sid, ranks, parent_level)
            seq_ids += [sid]
            added += 1

        print "Total nodes in resulting tree: ", added

        self.prune_unifu_nodes(t0)
        return t0, seq_ids
    def build(self, min_rank=0, max_seqs_per_leaf=1e9, clades_to_include=[], clades_to_ignore=[]):
        """Build a tree from the entries of ``self.taxonomy``.

        Every ``(sid, ranks)`` pair yielded by ``self.taxonomy.iteritems()``
        is run through a chain of filters; entries that pass are attached to
        the tree via ``self.add_tree_node`` and their ids are collected.

        Parameters
        ----------
        min_rank : index into ``ranks``; entries whose ``ranks[min_rank]``
            equals ``Taxonomy.EMPTY_RANK`` are skipped.
        max_seqs_per_leaf : base threshold for the number of sequences per
            parent node. The effective limit is this value multiplied by
            ``tax_seq_level - parent_level``. It is only enforced for parents
            that are already present in ``self.tree_nodes``.
        clades_to_include : iterable of ``(rank_level, rank_name)`` pairs.
            When non-empty, an entry is kept only if
            ``ranks[rank_level] == rank_name`` for at least one pair.
            When empty, every entry is accepted by this filter.
            (The default list is never mutated here.)
        clades_to_ignore : iterable of ``(rank_level, rank_name)`` pairs.
            An entry matching any pair is dropped.
            (The default list is never mutated here.)

        Returns
        -------
        (t0, seq_ids) : the root ``Tree`` and the list of ids of all entries
            that were added, in iteration order.

        Side effects
        ------------
        Registers the root under ``TaxTreeBuilder.ROOT_LABEL`` in
        ``self.tree_nodes``, resets the root's entry in ``self.leaf_count``
        and increments ``self.leaf_count`` for each accepted entry's parent.
        When ``self.config.debug`` is set, the tree is written to a temporary
        file before ``self.prune_unifu_nodes`` is applied.
        """
        if self.config.verbose:
            print("Number of nodes: %d" % self.taxonomy.seq_count())

        t0 = Tree()
        t0.add_feature("name", TaxTreeBuilder.ROOT_LABEL)
        self.tree_nodes[TaxTreeBuilder.ROOT_LABEL] = t0
        self.leaf_count[TaxTreeBuilder.ROOT_LABEL] = 0
        added = 0
        seq_ids = []
        # sequences are leaves of the tree, so they always sit one level below
        # the lowest taxonomy rank (e.g. "species"+1)
        tax_seq_level = self.taxonomy.max_rank_level() + 1
        for k, (sid, ranks) in enumerate(self.taxonomy.iteritems(), 1):
            if self.config.verbose and k % 1000 == 0:
                # Format string reproduces the spacing of the original
                # comma-separated Python 2 print statement.
                print("Processed nodes:  %d , added:  %d , skipped:  %d" % (k, added, k - added))

            # filter by minimum rank level
            if ranks[min_rank] == Taxonomy.EMPTY_RANK:
                continue

            # filter by rank constraints (e.g. class Clostridia only):
            # an empty inclusion list means "include everything"
            if clades_to_include:
                clade_is_ok = any(ranks[rank_level] == rank_name
                                  for (rank_level, rank_name) in clades_to_include)
            else:
                clade_is_ok = True

            # the ignore list is only consulted for entries that passed the
            # inclusion check; a single match vetoes the entry
            if clade_is_ok and any(ranks[rank_level] == rank_name
                                   for (rank_level, rank_name) in clades_to_ignore):
                clade_is_ok = False

            # final decision
            if not clade_is_ok:
                continue

            # walk up from the lowest rank to the first non-empty one: this is
            # the node the sequence will be attached to
            parent_level = tax_seq_level - 1
            while ranks[parent_level] == Taxonomy.EMPTY_RANK:
                parent_level -= 1
            parent_name = Taxonomy.get_rank_uid(ranks, parent_level)
            if parent_name in self.tree_nodes:
                # filter by max number of seqs (threshold depends on rank level,
                # i.e. for genus there can be more seqs than for species)
                max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level - parent_level)
                if parent_name in self.leaf_count and self.leaf_count[parent_name] >= max_seq_per_rank:
                    continue

            self.leaf_count[parent_name] = self.leaf_count.get(parent_name, 0) + 1

            # all checks succeeded: add the sequence to the tree
            self.add_tree_node(t0, sid, ranks, parent_level)
            seq_ids.append(sid)
            added += 1

        if self.config.verbose:
            print("Total nodes in resulting tree:  %d" % added)

        if self.config.debug:
            # keep a copy of the tree as it looked before pruning
            reftax_fname = self.config.tmp_fname("%NAME%_mf_unpruned.tre")
            t0.write(outfile=reftax_fname, format=8)

        self.prune_unifu_nodes(t0)
        return t0, seq_ids