Beispiel #1
0
    def get_parent_tip_ranks(self, tax_tree):
        """Collect leaf names and parent ranks for eligible inner nodes.

        The tree is walked in post-order. A node is recorded only if all of
        the following hold:
          * it is neither a leaf nor the root;
          * the lowest assigned rank level parsed from its name is >= 2;
          * the lowest assigned rank level parsed from its parent's name
            is >= 1;
          * it has at least 2 and at most `self.reftree_size - 4` leaves.

        Args:
            tax_tree: tree object providing `traverse("postorder")`; its nodes
                must provide `is_leaf()`, `is_root()`, `name`, `up` and
                `get_leaf_names()`.

        Returns:
            A tuple `(rank_parent, rank_tips)` of two dicts keyed by node
            name: `rank_parent` holds the ranks split from the parent's name,
            `rank_tips` holds the node's leaf names. If several eligible nodes
            share a name, the one visited last overwrites the earlier ones.
        """
        rank_tips = {}
        rank_parent = {}
        for node in tax_tree.traverse("postorder"):
            if node.is_leaf() or node.is_root():
                continue

            tax_path = node.name
            ranks = Taxonomy.split_rank_uid(tax_path)
            if Taxonomy.lowest_assigned_rank_level(ranks) < 2:
                continue

            # node is not the root here, so node.up is always available
            parent_ranks = Taxonomy.split_rank_uid(node.up.name)
            if Taxonomy.lowest_assigned_rank_level(parent_ranks) < 1:
                continue

            rank_seqs = node.get_leaf_names()
            rank_size = len(rank_seqs)
            if rank_size < 2 or rank_size > self.reftree_size - 4:
                continue

            # reuse the list collected above instead of walking the subtree again
            rank_tips[tax_path] = rank_seqs
            rank_parent[tax_path] = parent_ranks

        return rank_parent, rank_tips
Beispiel #2
0
    def get_parent_tip_ranks(self, tax_tree):
        """Gather, per eligible inner node, its tips and its parent's ranks.

        Nodes are visited in post-order and skipped when any of these applies:
          * the node is a leaf or the root;
          * the lowest assigned rank level parsed from the node name is < 2;
          * the lowest assigned rank level parsed from the parent name is < 1;
          * the node has fewer than 2 leaves or more than
            `self.reftree_size - 4` leaves.

        Args:
            tax_tree: tree object providing `traverse("postorder")`; its nodes
                must provide `is_leaf()`, `is_root()`, `name`, `up` and
                `get_leaf_names()`.

        Returns:
            `(rank_parent, rank_tips)`: two dicts keyed by node name, mapping
            to the parent's split ranks and to the node's leaf names
            respectively. A later node with the same name replaces an earlier
            entry.
        """
        rank_tips = {}
        rank_parent = {}
        for node in tax_tree.traverse("postorder"):
            if node.is_leaf() or node.is_root():
                continue

            tax_path = node.name
            node_ranks = Taxonomy.split_rank_uid(tax_path)
            if Taxonomy.lowest_assigned_rank_level(node_ranks) < 2:
                continue

            # the root was skipped above, so a parent node exists
            parent_ranks = Taxonomy.split_rank_uid(node.up.name)
            if Taxonomy.lowest_assigned_rank_level(parent_ranks) < 1:
                continue

            rank_seqs = node.get_leaf_names()
            rank_size = len(rank_seqs)
            if rank_size < 2 or rank_size > self.reftree_size - 4:
                continue

            # store the leaf list obtained above; no second subtree walk needed
            rank_tips[tax_path] = rank_seqs
            rank_parent[tax_path] = parent_ranks

        return rank_parent, rank_tips
Beispiel #3
0
    def check_seq_tax_labels(self, seq_name, orig_ranks, ranks, lws):
        """Compare a sequence's original ranks with its newly assigned ranks.

        A mislabel record is created in two situations:
          * `ranks` is empty: the record gets level name "[NotIngroup]",
            'orig_level' -1, 'real_level' 0 and a confidence of 1.0;
          * at some level present in both lists, `ranks` holds a value that is
            not `Taxonomy.EMPTY_RANK` and differs from `orig_ranks`. Only the
            first such level is reported.

        Every record created is also appended to `self.mislabels`.

        Args:
            seq_name: stored under the record's 'name' key.
            orig_ranks: original rank list of the sequence.
            ranks: newly assigned rank list.
            lws: sequence indexed by rank level; the entry at the mislabelled
                level becomes the record's 'conf' (presumably likelihood
                weights -- TODO confirm).

        Returns:
            The mislabel record (a dict) or None if no mismatch was found.
        """
        mis_rec = None

        if len(ranks) == 0:
            mis_rec = {
                'name': seq_name,
                'orig_level': -1,
                'real_level': 0,
                'level_name': "[NotIngroup]",
                'inv_level': 0,  # negated real_level, just for sorting
                'orig_ranks': orig_ranks,
                'ranks': [],
                'lws': [1.0],
                'conf': 1.0,
            }
        else:
            # first level where a non-empty new rank disagrees with the original;
            # zip() limits the comparison to levels present in both lists
            mislabel_lvl = -1
            for rank_lvl, (new_rank, orig_rank) in enumerate(zip(ranks, orig_ranks)):
                if new_rank != Taxonomy.EMPTY_RANK and new_rank != orig_rank:
                    mislabel_lvl = rank_lvl
                    break

            if mislabel_lvl >= 0:
                real_lvl = self.tax_code.guess_rank_level(orig_ranks, mislabel_lvl)
                mis_rec = {
                    'name': seq_name,
                    'orig_level': mislabel_lvl,
                    'real_level': real_lvl,
                    'level_name': self.tax_code.rank_level_name(real_lvl)[0],
                    'inv_level': -1 * real_lvl,  # just for sorting
                    'orig_ranks': orig_ranks,
                    'ranks': ranks,
                    'lws': lws,
                    'conf': lws[mislabel_lvl],
                }

        if mis_rec is not None:
            self.mislabels.append(mis_rec)

        return mis_rec
    def label_bf_tree_with_ranks(self):
        """Label every node of the rooted bifurcating tree with taxonomic ranks.

        Leaves take their ranks from `self.origin_taxonomy` (looked up by the
        leaf name) and get "__<rank at their lowest assigned level>" appended
        to their name. For an inner node the rank level starts at the smaller
        of its two children's levels and is decreased until both children
        carry the same rank at that level; the node is then named after that
        rank and inherits the left child's ranks up to and including that
        level. If no such level exists the node is named "Undefined".

        Each node receives the features "rank_level" and "ranks". On success
        `self.tax_tree` is set to the labelled tree.

        Raises:
            AssertionError: if `self.bf_rooted_tree` is not set, or if an
                inner node does not have exactly two children.
        """
        if not self.bf_rooted_tree:
            raise AssertionError(
                "self.bf_rooted_tree is not set: TaxTreeHelper.set_bf_unrooted_tree() must be called before!"
            )

        # post-order: both children are annotated before their parent is visited
        for node in self.bf_rooted_tree.traverse("postorder"):
            if node.is_leaf():
                seq_ranks = self.origin_taxonomy[node.name]
                rank_level = Taxonomy.lowest_assigned_rank_level(seq_ranks)
                node.add_feature("rank_level", rank_level)
                node.add_feature("ranks", seq_ranks)
                # NOTE(review): if rank_level can be negative here, this picks
                # the last entry of seq_ranks -- confirm that is intended
                node.name += "__" + seq_ranks[rank_level]
            else:
                if len(node.children) != 2:
                    raise AssertionError("FATAL ERROR: tree is not bifurcating!")
                lchild = node.children[0]
                rchild = node.children[1]
                rank_level = min(lchild.rank_level, rchild.rank_level)
                while rank_level >= 0 and lchild.ranks[rank_level] != rchild.ranks[rank_level]:
                    rank_level -= 1
                node.add_feature("rank_level", rank_level)
                # 7 slots: presumably the number of taxonomic levels -- TODO confirm
                node_ranks = [Taxonomy.EMPTY_RANK] * 7
                if rank_level >= 0:
                    node_ranks[0 : rank_level + 1] = lchild.ranks[0 : rank_level + 1]
                    node.name = lchild.ranks[rank_level]
                else:
                    node.name = "Undefined"
                    if hasattr(node, "B") and self.cfg.verbose:
                        # single parenthesized argument: prints the same text
                        # under both Python 2 and Python 3
                        print("INFO: no taxonomic annotation for branch %s (reason: children belong to different kingdoms)" % node.B)

                node.add_feature("ranks", node_ranks)

        self.tax_tree = self.bf_rooted_tree
Beispiel #5
0
    def check_seq_tax_labels(self, seq_name, orig_ranks, ranks, lws):
        """Detect a mismatch between original and newly assigned ranks.

        Two cases produce a mislabel record:
          * `ranks` is empty -- reported with level name "[NotIngroup]",
            'orig_level' -1, 'real_level' 0 and confidence 1.0;
          * the first level (among those present in both lists) at which
            `ranks` is not `Taxonomy.EMPTY_RANK` and differs from
            `orig_ranks`.

        A created record is appended to `self.mislabels` before it is
        returned.

        Args:
            seq_name: stored under the record's 'name' key.
            orig_ranks: original rank list of the sequence.
            ranks: newly assigned rank list.
            lws: sequence indexed by rank level; `lws[level]` of the
                mislabelled level is stored as 'conf' (presumably likelihood
                weights -- TODO confirm).

        Returns:
            The mislabel record (a dict), or None when nothing was detected.
        """
        mis_rec = None

        if len(ranks) == 0:
            mis_rec = {
                'name': seq_name,
                'orig_level': -1,
                'real_level': 0,
                'level_name': "[NotIngroup]",
                'inv_level': 0,  # negated real_level, just for sorting
                'orig_ranks': orig_ranks,
                'ranks': [],
                'lws': [1.0],
                'conf': 1.0,
            }
        else:
            # scan only the levels both lists have; stop at the first conflict
            mislabel_lvl = -1
            for rank_lvl, (new_rank, orig_rank) in enumerate(zip(ranks, orig_ranks)):
                if new_rank != Taxonomy.EMPTY_RANK and new_rank != orig_rank:
                    mislabel_lvl = rank_lvl
                    break

            if mislabel_lvl >= 0:
                real_lvl = self.tax_code.guess_rank_level(orig_ranks, mislabel_lvl)
                mis_rec = {
                    'name': seq_name,
                    'orig_level': mislabel_lvl,
                    'real_level': real_lvl,
                    'level_name': self.tax_code.rank_level_name(real_lvl)[0],
                    'inv_level': -1 * real_lvl,  # just for sorting
                    'orig_ranks': orig_ranks,
                    'ranks': ranks,
                    'lws': lws,
                    'conf': lws[mislabel_lvl],
                }

        if mis_rec is not None:
            self.mislabels.append(mis_rec)

        return mis_rec
Beispiel #6
0
    def run_leave_subtree_out_test(self):
        """Run the leave-one-subtree-out EPA test over candidate subtrees.

        Steps, in order:
          1. obtain candidate subtrees from `self.get_parent_tip_ranks`;
          2. write one line per subtree (space-separated tip names) to a
             temporary list file;
          3. run EPA in "l1o_subtree" mode with that file;
          4. classify each returned placement and pass it, together with the
             ranks of the subtree's parent node, to
             `self.check_rank_tax_labels`; the confidence of any resulting
             record is stored in `self.misrank_conf_map` under the subtree's
             name.

        Placements are matched to subtrees by position only: the k-th
        placement seen is paired with the k-th line of the list file
        (presumably EPA preserves that order -- TODO confirm).

        Returns:
            Number of placements processed, or 0 when there are no candidate
            subtrees (EPA is not run in that case).
        """
        job_name = self.cfg.subst_name("l1out_rank_%NAME%")

        parent_by_path, tips_by_path = self.get_parent_tip_ranks(self.tax_tree)
        subtrees = list(tips_by_path.items())
        if not subtrees:
            return 0

        # one subtree per line, tips separated by single spaces
        list_fname = self.cfg.tmp_fname("treelist_%NAME%.txt")
        with open(list_fname, "w") as fout:
            for _, tip_names in subtrees:
                fout.write("%s\n" % " ".join(tip_names))

        epa_results = self.raxml.run_epa(
            job_name,
            self.refalign_fname,
            self.reftree_fname,
            self.optmod_fname,
            mode="l1o_subtree",
            subtree_fname=list_fname,
        )

        processed = 0
        for jp in epa_results:
            for place in jp.get_placement():
                ranks, lws = self.classify_seq(place)
                tax_path = subtrees[processed][0]
                orig_ranks = Taxonomy.split_rank_uid(tax_path)
                rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks)
                prefix = self.tax_code.guess_rank_level_name(orig_ranks, rank_level)[0]
                bare_name = orig_ranks[rank_level]
                # prepend the level prefix unless the rank name already carries it
                rank_name = bare_name if bare_name.startswith(prefix) else prefix + bare_name
                mis_rec = self.check_rank_tax_labels(
                    rank_name, parent_by_path[tax_path], ranks, lws)
                if mis_rec:
                    self.misrank_conf_map[tax_path] = mis_rec['conf']
                processed += 1

        return processed
Beispiel #7
0
    def run_leave_subtree_out_test(self):
        """Run the leave-one-subtree-out EPA test over candidate subtrees.

        Candidate subtrees come from `self.get_parent_tip_ranks`. Their tip
        names are written, one subtree per line, to a temporary list file
        which is handed to EPA in "l1o_subtree" mode. Every returned placement
        is classified and passed, together with the ranks of the subtree's
        parent node, to `self.check_rank_tax_labels`; the confidence of any
        resulting record is stored in `self.misrank_conf_map` under the
        subtree's name.

        Placements are matched to subtrees by position only: the k-th
        placement seen is paired with the k-th line of the list file
        (presumably EPA preserves that order -- TODO confirm).

        Returns:
            Number of placements processed, or 0 when there are no candidate
            subtrees (EPA is not run in that case).
        """
        job_name = self.cfg.subst_name("l1out_rank_%NAME%")

        # get_parent_tip_ranks is a method (it takes self), so it must be
        # looked up on the instance rather than called as a bare name
        rank_parent, rank_tips = self.get_parent_tip_ranks(self.tax_tree)

        # Snapshot as a list: entries are indexed by position below, which a
        # dict view does not support on Python 3.
        subtree_list = list(rank_tips.items())
        if len(subtree_list) == 0:
            return 0

        # create file with subtrees
        subtree_list_file = self.cfg.tmp_fname("treelist_%NAME%.txt")
        with open(subtree_list_file, "w") as fout:
            for _, tips in subtree_list:
                fout.write("%s\n" % " ".join(tips))

        jp_list = self.raxml.run_epa(job_name, self.refalign_fname, self.reftree_fname, self.optmod_fname,
            mode="l1o_subtree", subtree_fname=subtree_list_file)

        subtree_count = 0
        for jp in jp_list:
            placements = jp.get_placement()
            for place in placements:
                ranks, lws = self.classify_seq(place)
                tax_path = subtree_list[subtree_count][0]
                orig_ranks = Taxonomy.split_rank_uid(tax_path)
                rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks)
                # NOTE(review): other code in this file reaches rank-level
                # helpers through self.tax_code -- confirm that
                # guess_rank_level_name really lives on self here
                rank_prefix = self.guess_rank_level_name(orig_ranks, rank_level)[0]
                rank_name = orig_ranks[rank_level]
                # prepend the level prefix unless the rank name already carries it
                if not rank_name.startswith(rank_prefix):
                    rank_name = rank_prefix + rank_name
                parent_ranks = rank_parent[tax_path]
                mis_rec = self.check_rank_tax_labels(rank_name, parent_ranks, ranks, lws)
                if mis_rec:
                    self.misrank_conf_map[tax_path] = mis_rec['conf']
                subtree_count += 1

        return subtree_count
Beispiel #8
0
    def run_leave_subtree_out_test(self):
        """Run the leave-one-subtree-out EPA test over candidate subtrees.

        First, `self.tax_tree` is walked in post-order and an inner node is
        kept as a candidate subtree when:
          * it is neither a leaf nor the root;
          * the lowest assigned rank level parsed from its name is >= 2;
          * the lowest assigned rank level parsed from its parent's name
            is >= 1;
          * it has at least 2 and at most `self.reftree_size - 4` leaves.

        The tip names of each candidate are written, one subtree per line, to
        a temporary list file which is handed to EPA in "l1o_subtree" mode.
        Every returned placement is classified and passed, together with the
        ranks of the subtree's parent node, to `self.check_rank_tax_labels`;
        the confidence of any resulting record is stored in
        `self.misrank_conf_map` under the subtree's name.

        Placements are matched to subtrees by position only: the k-th
        placement seen is paired with the k-th line of the list file
        (presumably EPA preserves that order -- TODO confirm).

        Returns:
            Number of placements processed, or 0 when there are no candidate
            subtrees (EPA is not run in that case).
        """
        job_name = self.cfg.subst_name("l1out_rank_%NAME%")

        # collect candidate subtrees
        rank_tips = {}
        rank_parent = {}
        for node in self.tax_tree.traverse("postorder"):
            if node.is_leaf() or node.is_root():
                continue
            tax_path = node.name
            ranks = Taxonomy.split_rank_uid(tax_path)
            rank_lvl = Taxonomy.lowest_assigned_rank_level(ranks)
            if rank_lvl < 2:
                continue

            parent_ranks = Taxonomy.split_rank_uid(node.up.name)
            parent_lvl = Taxonomy.lowest_assigned_rank_level(parent_ranks)
            if parent_lvl < 1:
                continue

            rank_seqs = node.get_leaf_names()
            rank_size = len(rank_seqs)
            if rank_size < 2 or rank_size > self.reftree_size - 4:
                continue

            # reuse the list collected above instead of walking the subtree again
            rank_tips[tax_path] = rank_seqs
            rank_parent[tax_path] = parent_ranks

        # Snapshot as a list: entries are indexed by position below, which a
        # dict view does not support on Python 3.
        subtree_list = list(rank_tips.items())

        if len(subtree_list) == 0:
            return 0

        # create file with subtrees
        subtree_list_file = self.cfg.tmp_fname("treelist_%NAME%.txt")
        with open(subtree_list_file, "w") as fout:
            for _, tips in subtree_list:
                fout.write("%s\n" % " ".join(tips))

        jp_list = self.raxml.run_epa(job_name, self.refalign_fname, self.reftree_fname, self.optmod_fname,
            mode="l1o_subtree", subtree_fname=subtree_list_file)

        subtree_count = 0
        for jp in jp_list:
            placements = jp.get_placement()
            for place in placements:
                ranks, lws = self.classify_seq(place)
                tax_path = subtree_list[subtree_count][0]
                orig_ranks = Taxonomy.split_rank_uid(tax_path)
                rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks)
                # NOTE(review): other code in this file reaches rank-level
                # helpers through self.tax_code -- confirm that
                # guess_rank_level_name really lives on self here
                rank_prefix = self.guess_rank_level_name(orig_ranks, rank_level)[0]
                rank_name = orig_ranks[rank_level]
                # prepend the level prefix unless the rank name already carries it
                if not rank_name.startswith(rank_prefix):
                    rank_name = rank_prefix + rank_name
                parent_ranks = rank_parent[tax_path]
                mis_rec = self.check_rank_tax_labels(rank_name, parent_ranks, ranks, lws)
                if mis_rec:
                    self.misrank_conf_map[tax_path] = mis_rec['conf']
                subtree_count += 1

        return subtree_count
Beispiel #9
0
    def run_leave_subtree_out_test(self):
        """Run the leave-one-subtree-out EPA test over candidate subtrees.

        Candidate subtrees are the inner nodes of `self.tax_tree` (visited in
        post-order) that pass all of these filters:
          * neither a leaf nor the root;
          * lowest assigned rank level parsed from the node name >= 2;
          * lowest assigned rank level parsed from the parent name >= 1;
          * between 2 and `self.reftree_size - 4` leaves, inclusive.

        One line of space-separated tip names per candidate is written to a
        temporary list file, which is passed to EPA in "l1o_subtree" mode.
        Each returned placement is classified and passed, together with the
        ranks of the subtree's parent node, to `self.check_rank_tax_labels`;
        if that yields a record, its 'conf' value is stored in
        `self.misrank_conf_map` under the subtree's name.

        Placements are matched to subtrees by position only: the k-th
        placement seen is paired with the k-th line of the list file
        (presumably EPA preserves that order -- TODO confirm).

        Returns:
            Number of placements processed, or 0 when there are no candidate
            subtrees (EPA is not run in that case).
        """
        job_name = self.cfg.subst_name("l1out_rank_%NAME%")

        # collect candidate subtrees
        rank_tips = {}
        rank_parent = {}
        for node in self.tax_tree.traverse("postorder"):
            if node.is_leaf() or node.is_root():
                continue
            tax_path = node.name
            ranks = Taxonomy.split_rank_uid(tax_path)
            rank_lvl = Taxonomy.lowest_assigned_rank_level(ranks)
            if rank_lvl < 2:
                continue

            parent_ranks = Taxonomy.split_rank_uid(node.up.name)
            parent_lvl = Taxonomy.lowest_assigned_rank_level(parent_ranks)
            if parent_lvl < 1:
                continue

            rank_seqs = node.get_leaf_names()
            rank_size = len(rank_seqs)
            if rank_size < 2 or rank_size > self.reftree_size - 4:
                continue

            # store the leaf list obtained above; no second subtree walk needed
            rank_tips[tax_path] = rank_seqs
            rank_parent[tax_path] = parent_ranks

        # Snapshot as a list: entries are indexed by position below, which a
        # dict view does not support on Python 3.
        subtree_list = list(rank_tips.items())

        if len(subtree_list) == 0:
            return 0

        # create file with subtrees
        subtree_list_file = self.cfg.tmp_fname("treelist_%NAME%.txt")
        with open(subtree_list_file, "w") as fout:
            for _, tips in subtree_list:
                fout.write("%s\n" % " ".join(tips))

        jp_list = self.raxml.run_epa(job_name,
                                     self.refalign_fname,
                                     self.reftree_fname,
                                     self.optmod_fname,
                                     mode="l1o_subtree",
                                     subtree_fname=subtree_list_file)

        subtree_count = 0
        for jp in jp_list:
            placements = jp.get_placement()
            for place in placements:
                ranks, lws = self.classify_seq(place)
                tax_path = subtree_list[subtree_count][0]
                orig_ranks = Taxonomy.split_rank_uid(tax_path)
                rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks)
                # NOTE(review): other code in this file reaches rank-level
                # helpers through self.tax_code -- confirm that
                # guess_rank_level_name really lives on self here
                rank_prefix = self.guess_rank_level_name(
                    orig_ranks, rank_level)[0]
                rank_name = orig_ranks[rank_level]
                # prepend the level prefix unless the rank name already carries it
                if not rank_name.startswith(rank_prefix):
                    rank_name = rank_prefix + rank_name
                parent_ranks = rank_parent[tax_path]
                mis_rec = self.check_rank_tax_labels(rank_name, parent_ranks,
                                                     ranks, lws)
                if mis_rec:
                    self.misrank_conf_map[tax_path] = mis_rec['conf']
                subtree_count += 1

        return subtree_count