Exemple #1
0
 def test_merge_ranks(self):
     """Check that merging the ranks of two sequences yields one shared rank id.

     Builds a fresh Taxonomy from the fixture's sequence-to-ranks map, merges
     the rank ids of two sequences and verifies that the merged rank id lists
     exactly those sequences and that each sequence now resolves to it.
     """
     taxonomy = Taxonomy(tax_map=self.taxonomy.seq_ranks_map)
     seq_ids = ["UnpC[Ceti]", "UnpSomer,"]

     # Collect the rank id of every sequence that is going to be merged.
     ids_to_merge = []
     for seq_id in seq_ids:
         ids_to_merge.append(taxonomy.seq_rank_id(seq_id))

     merged_id = taxonomy.merge_ranks(ids_to_merge)

     # The merged rank must list the same sequences, in the same order ...
     self.assertEqual(seq_ids, taxonomy.get_rank_seqs(merged_id))
     # ... and every one of them must now map to the merged rank id.
     for seq_id in seq_ids:
         self.assertEqual(taxonomy.seq_rank_id(seq_id), merged_id)
Exemple #2
0
 def test_normalize_seq_ids(self):
     """Check that normalize_seq_ids() renames sequence ids in seq_ranks_map.

     Before the call the original ids must be present; afterwards each
     original id must be gone and its expected replacement must be present.
     """
     taxonomy = Taxonomy(tax_map=self.taxonomy.seq_ranks_map)
     # (original id, id expected after normalization)
     renames = [
         ("UnpC[Ceti]", "UnpC_Ceti_"),
         ("UnpSomer,", "UnpSomer_"),
     ]

     for raw_id, _ in renames:
         self.assertTrue(raw_id in taxonomy.seq_ranks_map)

     taxonomy.normalize_seq_ids()

     for raw_id, clean_id in renames:
         self.assertFalse(raw_id in taxonomy.seq_ranks_map)
         self.assertTrue(clean_id in taxonomy.seq_ranks_map)
Exemple #3
0
 def setUp(self):
     """Write TAX_DICT to a taxonomy file and load it into a Taxonomy.

     The file "test.tax" is created next to this test module, one line per
     sequence in the form "<seq id><TAB><ranks joined by ';'>".  While
     writing, PREFIXED_TAX_DICT is filled with the same ranks keyed by the
     sequence id prefixed with EpacConfig.REF_SEQ_PREFIX.
     """
     here = os.path.dirname(os.path.abspath(__file__))
     self.tax_fname = os.path.join(here, "test.tax")
     self.PREFIXED_TAX_DICT = {}
     with open(self.tax_fname, "w") as tax_file:
         for seq_id, rank_list in self.TAX_DICT.iteritems():
             line = "%s\t%s\n" % (seq_id, ";".join(rank_list))
             tax_file.write(line)
             prefixed_id = EpacConfig.REF_SEQ_PREFIX + seq_id
             self.PREFIXED_TAX_DICT[prefixed_id] = rank_list
     # Load the freshly written file without any sequence id prefix.
     self.taxonomy = Taxonomy("", self.tax_fname)
Exemple #4
0
 def test_subst_ranks(self):
     """Check that subst_synonyms() replaces a rank name by its synonym.

     Loads "testfiles/test.tax", verifies that the second-to-last rank of
     sequence "WgeSangu" is 'Sneathia', applies a synonym map that maps
     'Sneathia' to 'Sebaldella', and verifies that the ranks fetched *after*
     the substitution carry the new name.
     """
     testfile_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testfiles")
     tax_fname = os.path.join(testfile_dir, "test.tax")
     tax = Taxonomy("", tax_fname)
     old_ranks = tax.get_seq_ranks("WgeSangu")
     self.assertEqual(old_ranks[-2], 'Sneathia')
     syn_map = {'Sneathia' : 'Sebaldella'}
     tax.subst_synonyms(syn_map)
     new_ranks = tax.get_seq_ranks("WgeSangu")
     # Assert on the ranks re-read after the substitution; checking the
     # pre-substitution list would only pass if it were mutated in place.
     self.assertEqual(new_ranks[-2], 'Sebaldella')
Exemple #5
0
    def load_refjson(self, refjson_fname):
        """Parse the reference JSON file and set up the state derived from it.

        refjson_fname -- path handed to RefJsonParser.

        A ValueError from the parser, or a failed validation, is reported
        through self.cfg.exit_user_error().  Afterwards the parsed values are
        copied onto this object (rate, node height, taxonomy, trees, branch
        taxonomy map), the configuration is adjusted to the reference tree,
        and the helper objects and mislabel counters are created.
        """
        cfg = self.cfg

        try:
            self.refjson = RefJsonParser(refjson_fname)
        except ValueError:
            cfg.exit_user_error("ERROR: Invalid json file format!")

        refjson = self.refjson

        # validate input json format
        is_valid, err_msg = refjson.validate()
        if not is_valid:
            cfg.log.error(
                "ERROR: Parsing reference JSON file failed:\n%s", err_msg)
            cfg.exit_user_error()

        # Plain values taken over from the reference file.
        self.rate = refjson.get_rate()
        self.node_height = refjson.get_node_height()
        self.origin_taxonomy = refjson.get_origin_taxonomy()
        self.tax_tree = refjson.get_tax_tree()
        cfg.compress_patterns = refjson.get_pattern_compression()

        self.bid_taxonomy_map = refjson.get_branch_tax_map()
        if not self.bid_taxonomy_map:
            # old file format (before 1.6), need to rebuild this map from scratch
            rebuild_helper = TaxTreeHelper(cfg, self.origin_taxonomy)
            rebuild_helper.set_mf_rooted_tree(self.tax_tree)
            rebuild_helper.set_bf_unrooted_tree(refjson.get_reftree())
            self.bid_taxonomy_map = rebuild_helper.get_bid_taxonomy_map()

        self.write_bid_tax_map(self.bid_taxonomy_map, final=False)

        self.reftree = Tree(refjson.get_raxml_readable_tree())
        self.reftree_size = len(self.reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        cfg.resolve_auto_settings(self.reftree_size)
        # If we're loading the pre-optimized model, we MUST set the same rate het. mode as in the ref file
        if cfg.epa_load_optmod:
            cfg.raxml_model = refjson.get_ratehet_model()

        self.classify_helper = TaxClassifyHelper(
            cfg, self.bid_taxonomy_map, self.rate, self.node_height)
        self.taxtree_helper = TaxTreeHelper(
            cfg, self.origin_taxonomy, self.tax_tree)

        self.tax_code = TaxCode(refjson.get_taxcode())

        self.taxonomy = Taxonomy(
            prefix=EpacConfig.REF_SEQ_PREFIX,
            tax_map=self.origin_taxonomy)
        self.tax_common_ranks = self.taxonomy.get_common_ranks()

        # One counter slot per taxonomic level, all starting at zero.
        level_count = TaxCode.UNI_TAX_LEVELS
        self.mislabels_cnt = [0] * level_count
        self.rank_mislabels_cnt = [0] * level_count
Exemple #6
0
    def setUp(self):
        """Create the TaxTreeHelper under test and load the expected outgroup.

        The taxonomy is read from "testfiles/test_clean.tax" with the
        reference sequence prefix; the expected outgroup tree is read from
        "testfiles/outgroup.nw".
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.testfile_dir = os.path.join(module_dir, "testfiles")
        self.tax_fname = os.path.join(self.testfile_dir, "test_clean.tax")
        self.taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX, self.tax_fname)

        seq_ranks = self.taxonomy.get_map()
        config = EpacConfig()
        self.taxtree_helper = TaxTreeHelper(config, seq_ranks)

        self.expected_outgr = Tree(
            os.path.join(self.testfile_dir, "outgroup.nw"))
Exemple #7
0
 def test_taxtree_builder(self):
     """Check TaxTreeBuilder.build() against a stored reference tree.

     The taxonomy from "testfiles/test.tax" is turned into a tree; the
     returned sequence ids must equal the keys of the taxonomy map and the
     tree, written with format=8, must equal "testfiles/taxtree.nw" written
     the same way.
     """
     config = EpacConfig()
     module_dir = os.path.dirname(os.path.abspath(__file__))
     fixtures = os.path.join(module_dir, "testfiles")

     taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX,
                         os.path.join(fixtures, "test.tax"))
     reference_tree = Tree(os.path.join(fixtures, "taxtree.nw"), format=8)

     builder = TaxTreeBuilder(config, taxonomy)
     built_tree, built_ids = builder.build()

     self.assertEqual(built_ids, taxonomy.get_map().keys())
     self.assertEqual(built_tree.write(format=8),
                      reference_tree.write(format=8))
Exemple #8
0
 def test_normalize_rank_names(self):
     """Check that normalize_rank_names() rewrites the first three rank names.

     The first three ranks of sequence "UpbRectu" are compared with the raw
     names before the call and with the cleaned names after it; the call
     itself must return a collection of length 3.
     """
     taxonomy = Taxonomy(tax_map=self.taxonomy.seq_ranks_map)
     raw_names = ("[Bacteria]", "'Firmicutes'", "Clostridia(1)")
     clean_names = ("_Bacteria_", "_Firmicutes_", "Clostridia_1_")

     before = taxonomy.get_seq_ranks("UpbRectu")
     for level, name in enumerate(raw_names):
         self.assertEqual(before[level], name)

     corrected = taxonomy.normalize_rank_names()
     self.assertEqual(len(corrected), 3)

     after = taxonomy.get_seq_ranks("UpbRectu")
     for level, name in enumerate(clean_names):
         self.assertEqual(after[level], name)
    def setUp(self):
        """Build the InputValidator under test and the values tests compare to.

        The validator is created from "testfiles/test.tax" (taxonomy, with
        the reference sequence prefix) and "testfiles/test.phy" (phylip
        alignment), using a trainer configuration with debug enabled.
        """
        config = EpacTrainerConfig()
        config.debug = True

        module_dir = os.path.dirname(os.path.abspath(__file__))
        fixtures = os.path.join(module_dir, "testfiles")
        tax_path = os.path.join(fixtures, "test.tax")
        phy_path = os.path.join(fixtures, "test.phy")

        taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX, tax_path)
        alignment = SeqGroup(sequences=phy_path, format="phylip")
        self.inval = InputValidator(config, taxonomy, alignment, False)

        self.expected_mis_ids = ["Missing1", "Missing2"]
        self.expected_dups = ["DupSeq(01)", "DupSeq02"]

        # Rank ids of the duplicate sequences, looked up through the
        # validator's own taxonomy object.
        merge_ids = []
        for seq_id in self.expected_dups:
            merge_ids.append(self.inval.taxonomy.seq_rank_id(seq_id))
        self.expected_merges = merge_ids
Exemple #10
0
    def build_ref_tree(self):
        """Run every step of the reference build, from taxonomy to JSON file.

        The steps run strictly in the order written; each one is announced
        through self.cfg.log before the method implementing it is called:

        1. load the taxonomy from self.cfg.taxonomy_fname
        2. load the reference alignment
        3. validate taxonomy and alignment
        4. build the multifurcating tree
        5. export the reference alignment and taxonomy
        6. save the rooting
        7. resolve the multifurcation and load the reduced alignment
        8. run EPA branch labeling
        9. post-process the EPA tree and compute node heights
        10. write the reference JSON file

        All log calls pass their values as logger arguments instead of
        pre-formatting the message with ``%``, so the whole method uses one
        logging style.
        """
        self.cfg.log.info("=> Loading taxonomy from file: %s ...\n",
                          self.cfg.taxonomy_fname)
        self.taxonomy = Taxonomy(prefix=EpacConfig.REF_SEQ_PREFIX,
                                 tax_fname=self.cfg.taxonomy_fname)
        self.cfg.log.info(
            "==> Loading reference alignment from file: %s ...\n",
            self.cfg.align_fname)
        self.load_alignment()
        self.cfg.log.info("===> Validating taxonomy and alignment ...\n")
        self.validate_taxonomy()
        # The sequence count is read only after validation has run.
        self.cfg.log.info(
            "====> Building a multifurcating tree from taxonomy with %d seqs ...\n",
            self.taxonomy.seq_count())
        self.build_multif_tree()
        self.cfg.log.info("=====> Building the reference alignment ...\n")
        self.export_ref_alignment()
        self.export_ref_taxonomy()
        self.cfg.log.info(
            "======> Saving the outgroup for later re-rooting ...\n")
        self.save_rooting()
        self.cfg.log.info(
            "=======> Resolving multifurcation: choosing the best topology from %d independent RAxML runs ...\n",
            self.cfg.rep_num)
        self.resolve_multif()
        self.load_reduced_refalign()
        self.cfg.log.info(
            "========> Calling RAxML-EPA to obtain branch labels ...\n")
        self.epa_branch_labeling()
        self.cfg.log.info(
            "=========> Post-processing the EPA tree (re-rooting, taxonomic labeling etc.) ...\n"
        )
        self.epa_post_process()
        self.calc_node_heights()

        # Debug-level comparison of the common ranks reported by the taxonomy
        # with the result of mono_index() after training.
        self.cfg.log.debug("\n==========> Checking branch labels ...")
        self.cfg.log.debug("shared rank names before training: %s",
                           repr(self.taxonomy.get_common_ranks()))
        self.cfg.log.debug("shared rank names after  training: %s\n",
                           repr(self.mono_index()))

        self.cfg.log.info("==========> Saving the reference JSON file: %s\n",
                          self.cfg.refjson_fname)
        self.write_json()
Exemple #11
0
 def test_load(self):
     """Check the loaded sequence-to-ranks map with and without id prefix.

     The fixture taxonomy (loaded without prefix) must equal TAX_DICT; a
     taxonomy loaded from the same file with EpacConfig.REF_SEQ_PREFIX must
     equal PREFIXED_TAX_DICT.
     """
     self.assertEqual(self.TAX_DICT, self.taxonomy.seq_ranks_map)

     with_prefix = Taxonomy(EpacConfig.REF_SEQ_PREFIX, self.tax_fname)
     self.assertEqual(self.PREFIXED_TAX_DICT, with_prefix.seq_ranks_map)
     # Drop the reference to the second taxonomy once it has been checked.
     del with_prefix