Example #1
0
 def build_bid_taxonomy_map(self):
     """Rebuild self.bid_taxonomy_map and self.ranks_set from self.tax_tree.

     Every non-root node that carries a "B" feature is stored under the key node.B as the tuple
     (rank uid of the node, lowest assigned rank level of the node minus that of its parent,
     node.dist). The rank uid of each such node is also collected in self.ranks_set.
     """
     self.bid_taxonomy_map = {}
     self.ranks_set = set()
     for node in self.tax_tree.traverse("postorder"):
         # the root and nodes without a "B" feature contribute nothing
         if node.is_root() or not hasattr(node, "B"):
             continue
         parent = node.up
         node_level = Taxonomy.lowest_assigned_rank_level(node.ranks)
         parent_level = Taxonomy.lowest_assigned_rank_level(parent.ranks)
         rank_uid = Taxonomy.get_rank_uid(node.ranks)
         self.bid_taxonomy_map[node.B] = (rank_uid, node_level - parent_level, node.dist)
         self.ranks_set.add(rank_uid)
Example #2
0
 def build_bid_taxonomy_map(self):
     """Populate the branch-id lookup table and the set of rank uids seen on branches.

     Traverses self.tax_tree in postorder. For each non-root node with a "B" feature, the entry
     self.bid_taxonomy_map[node.B] is set to (rank uid, rank-level difference, branch length), where
     the difference is the node's lowest assigned rank level minus its parent's, and the branch
     length is node.dist. The same rank uid is added to self.ranks_set.
     """
     self.bid_taxonomy_map = {}
     self.ranks_set = set()
     for node in self.tax_tree.traverse("postorder"):
         if node.is_root() or not hasattr(node, "B"):
             continue
         parent = node.up
         own_level = Taxonomy.lowest_assigned_rank_level(node.ranks)
         rdiff = own_level - Taxonomy.lowest_assigned_rank_level(parent.ranks)
         uid = Taxonomy.get_rank_uid(node.ranks)
         record = (uid, rdiff, node.dist)
         self.bid_taxonomy_map[node.B] = record
         self.ranks_set.add(uid)
Example #3
0
    def strip_missing_ranks(self, ranks):
        """Return the longest prefix of ranks whose rank uid is contained in self.ranks_set.

        Trailing ranks are dropped one at a time until the uid of the remaining prefix is known;
        if no prefix matches, the empty prefix is returned.
        """
        depth = len(ranks)
        while True:
            # the uid is looked up before the depth is tested, also for the empty prefix
            if Taxonomy.get_rank_uid(ranks[:depth]) in self.ranks_set:
                break
            if depth <= 0:
                break
            depth -= 1

        return ranks[:depth]
Example #4
0
    def get_seq_ranks_from_tree(self, seq_name):
        """Return the ranks encoded in the name of the parent node of sequence seq_name.

        If seq_name is not a key of self.name2taxnode, the problem is reported through
        self.cfg.exit_fatal_error() before the lookup is attempted.
        """
        if seq_name not in self.name2taxnode:
            self.cfg.exit_fatal_error("FATAL ERROR: Sequence %s is not found in the taxonomic tree!" % seq_name)

        # the name of the node above the sequence node is decoded as a rank uid
        parent_node = self.name2taxnode[seq_name].up
        return Taxonomy.split_rank_uid(parent_node.name)
Example #5
0
    def get_seq_ranks_from_tree(self, seq_name):
        """Look up a sequence in the taxonomic tree and return the ranks of its parent node.

        The ranks are obtained by splitting the parent node's name with Taxonomy.split_rank_uid().
        A sequence name missing from self.name2taxnode is passed, inside an error message, to
        self.cfg.exit_fatal_error().
        """
        known = seq_name in self.name2taxnode
        if not known:
            message = "FATAL ERROR: Sequence %s is not found in the taxonomic tree!" % seq_name
            self.cfg.exit_fatal_error(message)

        node = self.name2taxnode[seq_name]
        return Taxonomy.split_rank_uid(node.up.name)
Example #6
0
    def label_bf_tree_with_ranks(self):
        """Label all nodes of self.bf_rooted_tree with taxonomic ranks.

        Leaves take their ranks from self.origin_taxonomy and get the lowest assigned rank appended
        to their name. Each inner node gets the leading ranks shared by both children (padded with
        Taxonomy.EMPTY_RANK) and is named after the lowest shared rank, or "Undefined" if the
        children share none. Afterwards the tree becomes self.tax_tree and init_taxnode_map() is
        called.

        Raises:
            AssertionError: if self.bf_rooted_tree is not set, or an inner node does not have
                exactly two children.
        """
        if not self.bf_rooted_tree:
            raise AssertionError(
                "self.bf_rooted_tree is not set: TaxTreeHelper.set_bf_unrooted_tree() must be called before!"
            )

        # postorder guarantees both children are labeled before their parent is visited
        for node in self.bf_rooted_tree.traverse("postorder"):
            if node.is_leaf():
                leaf_ranks = self.origin_taxonomy[node.name]
                leaf_level = Taxonomy.lowest_assigned_rank_level(leaf_ranks)
                node.add_feature("rank_level", leaf_level)
                node.add_feature("ranks", leaf_ranks)
                node.name += "__" + leaf_ranks[leaf_level]
                continue

            if len(node.children) != 2:
                raise AssertionError("FATAL ERROR: tree is not bifurcating!")
            left, right = node.children[0], node.children[1]

            # walk upwards from the shallower child's level until both children agree
            shared_level = min(left.rank_level, right.rank_level)
            while shared_level >= 0 and left.ranks[shared_level] != right.ranks[shared_level]:
                shared_level -= 1
            node.add_feature("rank_level", shared_level)

            inner_ranks = [Taxonomy.EMPTY_RANK] * max(len(left.ranks), len(right.ranks))
            if shared_level < 0:
                node.name = "Undefined"
                if hasattr(node, "B"):
                    self.cfg.log.debug(
                        "INFO: empty taxonomic annotation for branch %s (child nodes have no common ranks)",
                        node.B)
            else:
                prefix_len = shared_level + 1
                inner_ranks[:prefix_len] = left.ranks[:prefix_len]
                node.name = left.ranks[shared_level]

            node.add_feature("ranks", inner_ranks)

        self.tax_tree = self.bf_rooted_tree
        self.init_taxnode_map()
Example #7
0
    def label_bf_tree_with_ranks(self):
        """Attach "ranks" and "rank_level" features to every node of the rooted bifurcating tree.

        A leaf receives the ranks stored for its name in self.origin_taxonomy; its name is extended
        with "__" plus its lowest assigned rank. An inner node receives the ranks common to both of
        its children, with all other positions set to Taxonomy.EMPTY_RANK; it is named after the
        lowest common rank, or "Undefined" when there is none. Finally the labeled tree is stored as
        self.tax_tree and init_taxnode_map() is invoked.

        Raises:
            AssertionError: when self.bf_rooted_tree is unset or the tree is not bifurcating.
        """
        if not self.bf_rooted_tree:
            raise AssertionError(
                "self.bf_rooted_tree is not set: TaxTreeHelper.set_bf_unrooted_tree() must be called before!"
            )

        for node in self.bf_rooted_tree.traverse("postorder"):
            if not node.is_leaf():
                if len(node.children) != 2:
                    raise AssertionError("FATAL ERROR: tree is not bifurcating!")
                first = node.children[0]
                second = node.children[1]

                # deepest level at which both children carry the same rank (-1 if none)
                level = min(first.rank_level, second.rank_level)
                while level >= 0:
                    if first.ranks[level] == second.ranks[level]:
                        break
                    level -= 1
                node.add_feature("rank_level", level)

                merged = [Taxonomy.EMPTY_RANK] * max(len(first.ranks), len(second.ranks))
                if level >= 0:
                    merged[0 : level + 1] = first.ranks[0 : level + 1]
                    node.name = first.ranks[level]
                else:
                    node.name = "Undefined"
                    if hasattr(node, "B"):
                        self.cfg.log.debug(
                            "INFO: empty taxonomic annotation for branch %s (child nodes have no common ranks)", node.B
                        )
                node.add_feature("ranks", merged)
            else:
                own_ranks = self.origin_taxonomy[node.name]
                own_level = Taxonomy.lowest_assigned_rank_level(own_ranks)
                node.add_feature("rank_level", own_level)
                node.add_feature("ranks", own_ranks)
                node.name += "__" + own_ranks[own_level]

        self.tax_tree = self.bf_rooted_tree
        self.init_taxnode_map()
Example #8
0
    def assign_taxonomy_maxsum(self, edges, minlw):
        """Sum up the likelihood weights (LHW) per rank and pick the rank with the max. sum.

        In an EPA result, each placement (= branch) has a "weight". Since we are interested in
        taxonomic placement, we do not compare branch vs. branch but rank vs. rank
        (e.g. G1 S1 vs. G1 S2 vs. G1). Two measures are accumulated for each rank:
          "own" weight   = sum of weights of all placements EXACTLY to this rank (e.g. for G1: G1 only)
          "total" weight = own weight + own weight of all children (for G1: G1 or G1 S1 or G1 S2)

        Args:
            edges: iterable of placement records; item 0 is the branch id (its str() form is the key
                into self.bid_taxonomy_map) and item 2 is the likelihood weight.
            minlw: minimum "total" weight a rank needs in order to be selected by its "own" weight.

        Returns:
            A tuple (ranks, confidences). If no rank accumulated any weight, the ranks of the last
            branch with non-zero weight are returned (or [Taxonomy.EMPTY_RANK] if there was none),
            each with confidence 1.0. Otherwise the ranks of the selected rank uid are returned with
            their "total" weights as confidences (0.0 for empty ranks).
        """
        rw_own = {}
        rw_total = {}

        ranks = [Taxonomy.EMPTY_RANK]

        for edge in edges:
            br_id = str(edge[0])
            lweight = edge[2]

            if lweight == 0.:
                continue

            lowest_rank = None
            lowest_rank_lvl = None

            # accumulate weight for the current sequence
            br_rank_id, rdiff, _brlen = self.bid_taxonomy_map[br_id]
            ranks = Taxonomy.split_rank_uid(br_rank_id)
            for i, rank in enumerate(ranks):
                # assigned ranks form a prefix: stop at the first empty one
                if rank == Taxonomy.EMPTY_RANK:
                    break
                rank_id = Taxonomy.get_rank_uid(ranks, i)
                rw_total[rank_id] = rw_total.get(rank_id, 0) + lweight
                lowest_rank_lvl = i
                lowest_rank = rank_id

            # branches without any assigned rank contribute nothing
            if not lowest_rank:
                continue

            if rdiff > 0:
                # if ranks of 'upper' and 'lower' adjacent nodes of a branch are non-equal, split LHW among them
                parent_share = lweight * self.parent_lhw_coeff
                parent_rank = Taxonomy.get_rank_uid(ranks, lowest_rank_lvl - rdiff)
                rw_own[lowest_rank] = rw_own.get(lowest_rank, 0) + lweight * (1 - self.parent_lhw_coeff)
                rw_own[parent_rank] = rw_own.get(parent_rank, 0) + parent_share
                # correct total lhw for all levels between "parent" and "lowest"
                # NOTE: all those intermediate ranks are in fact indistinguishable,
                # e.g. a family which contains a single genus
                for r in range(rdiff):
                    interim_rank = Taxonomy.get_rank_uid(ranks, lowest_rank_lvl - r)
                    rw_total[interim_rank] = rw_total.get(interim_rank, 0) - parent_share
            else:
                rw_own[lowest_rank] = rw_own.get(lowest_rank, 0) + lweight

        # if all branches have empty ranks only, just return this placement
        if not rw_total:
            return ranks, [1.] * len(ranks)

        # we assign the sequence to a rank, which has the max "own" weight AND
        # whose "total" weight is greater than a confidence threshold
        # (plain dict iteration instead of iterkeys(), which does not exist on Python 3)
        max_rw = 0.
        ass_rank_id = None
        for r in rw_own:
            if rw_own[r] > max_rw and rw_total[r] >= minlw:
                ass_rank_id = r
                max_rw = rw_own[r]
        if not ass_rank_id:
            # no rank passed the threshold: fall back to the rank with the largest "total" weight
            ass_rank_id = max(rw_total, key=(lambda key: rw_total[key]))

        a_ranks = Taxonomy.split_rank_uid(ass_rank_id)

        # "total" weight is considered as confidence value for now
        a_conf = [0.] * len(a_ranks)
        for i, rank in enumerate(a_ranks):
            if rank != Taxonomy.EMPTY_RANK:
                a_conf[i] = rw_total[Taxonomy.get_rank_uid(a_ranks, i)]

        return a_ranks, a_conf
Example #9
0
 def get_branch_ranks(self, br_id):
     """Return the ranks of branch br_id.

     The first item of the branch's record in self.bid_taxonomy_map is taken as its rank uid and
     split into ranks with Taxonomy.split_rank_uid().
     """
     rank_uid = self.bid_taxonomy_map[br_id][0]
     return Taxonomy.split_rank_uid(rank_uid)
Example #10
0
    def assign_taxonomy_maxsum(self, edges, minlw):
        """Sum up all likelihood weights (LHW) for each rank and take the rank with the max. sum.

        In an EPA result, each placement (= branch) has a "weight". Since we are interested in
        taxonomic placement, we do not care about branch vs. branch comparisons, but only consider
        rank vs. rank (e.g. G1 S1 vs. G1 S2 vs. G1). Thus weights are accumulated for each rank,
        using two measures:
          "own" weight   = sum of weights of all placements EXACTLY to this rank (e.g. for G1: G1 only)
          "total" weight = own weight + own weight of all children (for G1: G1 or G1 S1 or G1 S2)

        Args:
            edges: iterable of placement records; item 0 is the branch id (converted with str() to
                look it up in self.bid_taxonomy_map) and item 2 is the likelihood weight.
            minlw: threshold the "total" weight of a rank must reach for that rank to be selected
                by its "own" weight.

        Returns:
            A tuple (ranks, confidences). When no rank received any weight, the ranks of the last
            branch with non-zero weight (or [Taxonomy.EMPTY_RANK] if there was none) are returned
            with confidence 1.0 each. Otherwise the ranks of the selected rank uid are returned,
            with the "total" weight of every non-empty rank as its confidence and 0.0 elsewhere.
        """
        rw_own = {}
        rw_total = {}

        ranks = [Taxonomy.EMPTY_RANK]

        for edge in edges:
            br_id = str(edge[0])
            lweight = edge[2]

            if lweight == 0.0:
                continue

            lowest_rank = None
            lowest_rank_lvl = None

            # accumulate weight for the current sequence
            br_rank_id, rdiff, _brlen = self.bid_taxonomy_map[br_id]
            ranks = Taxonomy.split_rank_uid(br_rank_id)
            for i, rank in enumerate(ranks):
                # only the leading run of non-empty ranks is counted
                if rank == Taxonomy.EMPTY_RANK:
                    break
                rank_id = Taxonomy.get_rank_uid(ranks, i)
                rw_total[rank_id] = rw_total.get(rank_id, 0) + lweight
                lowest_rank_lvl = i
                lowest_rank = rank_id

            # a branch with no assigned rank adds nothing to the "own" weights
            if not lowest_rank:
                continue

            if rdiff > 0:
                # if ranks of 'upper' and 'lower' adjacent nodes of a branch are non-equal, split LHW among them
                parent_share = lweight * self.parent_lhw_coeff
                parent_rank = Taxonomy.get_rank_uid(ranks, lowest_rank_lvl - rdiff)
                rw_own[lowest_rank] = rw_own.get(lowest_rank, 0) + lweight * (1 - self.parent_lhw_coeff)
                rw_own[parent_rank] = rw_own.get(parent_rank, 0) + parent_share
                # correct total lhw for all levels between "parent" and "lowest"
                # NOTE: all those intermediate ranks are in fact indistinguishable,
                # e.g. a family which contains a single genus
                for r in range(rdiff):
                    interim_rank = Taxonomy.get_rank_uid(ranks, lowest_rank_lvl - r)
                    rw_total[interim_rank] = rw_total.get(interim_rank, 0) - parent_share
            else:
                rw_own[lowest_rank] = rw_own.get(lowest_rank, 0) + lweight

        # if all branches have empty ranks only, just return this placement
        if not rw_total:
            return ranks, [1.0] * len(ranks)

        # we assign the sequence to a rank, which has the max "own" weight AND
        # whose "total" weight is greater than a confidence threshold
        # (the dict is iterated directly: iterkeys() is not available on Python 3)
        max_rw = 0.0
        ass_rank_id = None
        for r in rw_own:
            if rw_own[r] > max_rw and rw_total[r] >= minlw:
                ass_rank_id = r
                max_rw = rw_own[r]
        if not ass_rank_id:
            # nothing passed the threshold: take the rank with the largest "total" weight instead
            ass_rank_id = max(rw_total, key=(lambda key: rw_total[key]))

        a_ranks = Taxonomy.split_rank_uid(ass_rank_id)

        # "total" weight is considered as confidence value for now
        a_conf = [0.0] * len(a_ranks)
        for i, rank in enumerate(a_ranks):
            if rank != Taxonomy.EMPTY_RANK:
                a_conf[i] = rw_total[Taxonomy.get_rank_uid(a_ranks, i)]

        return a_ranks, a_conf
Example #11
0
 def get_branch_ranks(self, br_id):
     """Look up branch br_id in self.bid_taxonomy_map and return its ranks.

     The rank uid is the first element of the stored record; it is decoded by
     Taxonomy.split_rank_uid().
     """
     record = self.bid_taxonomy_map[br_id]
     return Taxonomy.split_rank_uid(record[0])
Example #12
0
    def strip_missing_ranks(self, ranks):
        """Cut ranks down to its longest prefix whose rank uid occurs in self.ranks_set.

        Starting from the full list, one trailing rank is removed per step until the uid of the
        prefix is found in self.ranks_set or the prefix is empty. The resulting prefix is returned.
        """
        keep = len(ranks)
        # the membership test runs first, exactly once per candidate prefix length
        while Taxonomy.get_rank_uid(ranks[:keep]) not in self.ranks_set:
            if keep <= 0:
                break
            keep -= 1

        return ranks[:keep]