def test_merge_ranks(self):
    """Check that merging the ranks of two sequences gives them one shared rank id.

    After the merge, the new rank id must list exactly the merged sequence
    ids, and each of those sequences must resolve to the new rank id.
    """
    taxonomy = Taxonomy(tax_map=self.taxonomy.seq_ranks_map)
    seqs_to_merge = ["UnpC[Ceti]", "UnpSomer,"]

    old_rank_ids = []
    for seq_id in seqs_to_merge:
        old_rank_ids.append(taxonomy.seq_rank_id(seq_id))

    merged_rank_id = taxonomy.merge_ranks(old_rank_ids)

    self.assertEqual(seqs_to_merge, taxonomy.get_rank_seqs(merged_rank_id))
    for seq_id in seqs_to_merge:
        self.assertEqual(taxonomy.seq_rank_id(seq_id), merged_rank_id)
def test_normalize_seq_ids(self):
    """Check that normalize_seq_ids() renames sequence ids with special characters.

    The ids "UnpC[Ceti]" and "UnpSomer," are expected to be present before
    normalization and to be replaced by "UnpC_Ceti_" and "UnpSomer_"
    afterwards.
    """
    tax = Taxonomy(tax_map=self.taxonomy.seq_ranks_map)

    # assertIn/assertNotIn report the missing/unexpected key and the
    # container on failure, unlike assertTrue(x in y).
    self.assertIn("UnpC[Ceti]", tax.seq_ranks_map)
    self.assertIn("UnpSomer,", tax.seq_ranks_map)

    tax.normalize_seq_ids()

    self.assertNotIn("UnpC[Ceti]", tax.seq_ranks_map)
    self.assertIn("UnpC_Ceti_", tax.seq_ranks_map)
    self.assertNotIn("UnpSomer,", tax.seq_ranks_map)
    self.assertIn("UnpSomer_", tax.seq_ranks_map)
def setUp(self):
    """Write TAX_DICT to a taxonomy file and load it without a prefix.

    Creates "test.tax" next to this test module, one line per sequence in
    the form "<seq id><TAB><rank;rank;...>". While writing, it also builds
    PREFIXED_TAX_DICT, which maps each sequence id prefixed with
    EpacConfig.REF_SEQ_PREFIX to the same rank list.
    """
    test_dir = os.path.dirname(os.path.abspath(__file__))
    self.tax_fname = os.path.join(test_dir, "test.tax")
    self.PREFIXED_TAX_DICT = {}
    with open(self.tax_fname, "w") as outf:
        # items() is available on both Python 2 and Python 3, whereas
        # iteritems() exists on Python 2 only.
        for sid, ranks in self.TAX_DICT.items():
            outf.write("%s\t%s\n" % (sid, ";".join(ranks)))
            self.PREFIXED_TAX_DICT[EpacConfig.REF_SEQ_PREFIX + sid] = ranks
    # Load only after the file has been closed, so all lines are on disk.
    self.taxonomy = Taxonomy("", self.tax_fname)
def test_subst_ranks(self):
    """Check that subst_synonyms() replaces a rank name with its synonym.

    The second-to-last rank of "WgeSangu" is expected to be 'Sneathia'
    before the substitution and 'Sebaldella' after it.
    """
    testfile_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "testfiles")
    tax_fname = os.path.join(testfile_dir, "test.tax")
    tax = Taxonomy("", tax_fname)

    old_ranks = tax.get_seq_ranks("WgeSangu")
    self.assertEqual(old_ranks[-2], 'Sneathia')

    syn_map = {'Sneathia': 'Sebaldella'}
    tax.subst_synonyms(syn_map)

    # Assert on the ranks fetched after the substitution; the original
    # checked the stale pre-substitution list and left new_ranks unused.
    new_ranks = tax.get_seq_ranks("WgeSangu")
    self.assertEqual(new_ranks[-2], 'Sebaldella')
def load_refjson(self, refjson_fname):
    """Parse the reference JSON file and set up all state derived from it.

    Sets, in order: the parsed file (refjson), rate, node_height,
    origin_taxonomy, tax_tree, the branch-id -> taxonomy map, the reference
    tree and its leaf count, the classification and tax-tree helpers, the
    taxonomic code, the prefixed taxonomy with its common ranks, and the
    zero-initialised mislabel counters. It also updates several settings on
    self.cfg from the file contents.

    A ValueError from the parser or a failed validation is reported through
    self.cfg.exit_user_error() (presumably terminating the program --
    confirm against EpacConfig).
    """
    cfg = self.cfg

    try:
        self.refjson = RefJsonParser(refjson_fname)
    except ValueError:
        cfg.exit_user_error("ERROR: Invalid json file format!")

    refjson = self.refjson

    # Validate the input JSON format before reading any values from it.
    is_valid, err_msg = refjson.validate()
    if not is_valid:
        cfg.log.error(
            "ERROR: Parsing reference JSON file failed:\n%s", err_msg)
        cfg.exit_user_error()

    self.rate = refjson.get_rate()
    self.node_height = refjson.get_node_height()
    self.origin_taxonomy = refjson.get_origin_taxonomy()
    self.tax_tree = refjson.get_tax_tree()
    cfg.compress_patterns = refjson.get_pattern_compression()

    self.bid_taxonomy_map = refjson.get_branch_tax_map()
    if not self.bid_taxonomy_map:
        # Old file format (before 1.6): the map is not stored in the file,
        # so it has to be rebuilt from scratch.
        rebuild_helper = TaxTreeHelper(cfg, self.origin_taxonomy)
        rebuild_helper.set_mf_rooted_tree(self.tax_tree)
        rebuild_helper.set_bf_unrooted_tree(refjson.get_reftree())
        self.bid_taxonomy_map = rebuild_helper.get_bid_taxonomy_map()

    self.write_bid_tax_map(self.bid_taxonomy_map, final=False)

    self.reftree = Tree(refjson.get_raxml_readable_tree())
    self.reftree_size = len(self.reftree.get_leaves())

    # IMPORTANT: set EPA heuristic rate based on tree size!
    cfg.resolve_auto_settings(self.reftree_size)

    # If we're loading the pre-optimized model, we MUST set the same
    # rate het. mode as in the ref file.
    if cfg.epa_load_optmod:
        cfg.raxml_model = refjson.get_ratehet_model()

    self.classify_helper = TaxClassifyHelper(
        cfg, self.bid_taxonomy_map, self.rate, self.node_height)
    self.taxtree_helper = TaxTreeHelper(
        cfg, self.origin_taxonomy, self.tax_tree)

    self.tax_code = TaxCode(refjson.get_taxcode())

    self.taxonomy = Taxonomy(prefix=EpacConfig.REF_SEQ_PREFIX,
                             tax_map=self.origin_taxonomy)
    self.tax_common_ranks = self.taxonomy.get_common_ranks()

    # One counter per taxonomic level, all starting at zero.
    level_count = TaxCode.UNI_TAX_LEVELS
    self.mislabels_cnt = [0] * level_count
    self.rank_mislabels_cnt = [0] * level_count
def setUp(self):
    """Load the clean test taxonomy and the expected outgroup tree.

    Builds a TaxTreeHelper from the taxonomy map read from
    "testfiles/test_clean.tax" and reads the expected outgroup from
    "testfiles/outgroup.nw".
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    self.testfile_dir = os.path.join(module_dir, "testfiles")

    self.tax_fname = os.path.join(self.testfile_dir, "test_clean.tax")
    self.taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX, self.tax_fname)

    seq_ranks = self.taxonomy.get_map()
    self.taxtree_helper = TaxTreeHelper(EpacConfig(), seq_ranks)

    self.expected_outgr = Tree(
        os.path.join(self.testfile_dir, "outgroup.nw"))
def test_taxtree_builder(self):
    """Check that TaxTreeBuilder reproduces the stored taxonomic tree.

    The built tree, written with format=8, must equal the tree read from
    "testfiles/taxtree.nw", and the returned sequence ids must equal the
    keys of the taxonomy map.
    """
    config = EpacConfig()
    module_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(module_dir, "testfiles")

    taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX,
                        os.path.join(data_dir, "test.tax"))
    reference_tree = Tree(os.path.join(data_dir, "taxtree.nw"), format=8)

    builder = TaxTreeBuilder(config, taxonomy)
    built_tree, built_seq_ids = builder.build()

    self.assertEqual(built_seq_ids, taxonomy.get_map().keys())
    self.assertEqual(built_tree.write(format=8),
                     reference_tree.write(format=8))
def test_normalize_rank_names(self):
    """Check that normalize_rank_names() rewrites rank names with special characters.

    The first three ranks of "UpbRectu" are compared against their expected
    values before and after normalization, and the call is expected to
    return a collection of length 3.
    """
    taxonomy = Taxonomy(tax_map=self.taxonomy.seq_ranks_map)

    raw_names = ("[Bacteria]", "'Firmicutes'", "Clostridia(1)")
    clean_names = ("_Bacteria_", "_Firmicutes_", "Clostridia_1_")

    seq_ranks = taxonomy.get_seq_ranks("UpbRectu")
    for level, expected in enumerate(raw_names):
        self.assertEqual(seq_ranks[level], expected)

    corrected = taxonomy.normalize_rank_names()
    self.assertEqual(len(corrected), 3)

    seq_ranks = taxonomy.get_seq_ranks("UpbRectu")
    for level, expected in enumerate(clean_names):
        self.assertEqual(seq_ranks[level], expected)
def setUp(self):
    """Create an InputValidator over the test taxonomy and alignment.

    Also records the expected results used by the tests: the ids of missing
    sequences, the ids of duplicate sequences, and the rank ids of those
    duplicates as reported by the validator's taxonomy.
    """
    trainer_cfg = EpacTrainerConfig()
    trainer_cfg.debug = True

    module_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(module_dir, "testfiles")
    tax_path = os.path.join(data_dir, "test.tax")
    phy_path = os.path.join(data_dir, "test.phy")

    taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX, tax_path)
    alignment = SeqGroup(sequences=phy_path, format="phylip")
    self.inval = InputValidator(trainer_cfg, taxonomy, alignment, False)

    self.expected_mis_ids = ["Missing1", "Missing2"]
    self.expected_dups = ["DupSeq(01)", "DupSeq02"]

    validator_tax = self.inval.taxonomy
    self.expected_merges = []
    for dup_id in self.expected_dups:
        self.expected_merges.append(validator_tax.seq_rank_id(dup_id))
def build_ref_tree(self):
    """Run the reference-building pipeline, logging each stage.

    Stages, in order: load the taxonomy, load the alignment, validate both,
    build the multifurcating tree, export the reference alignment and
    taxonomy, save the rooting, resolve multifurcations, reload the reduced
    alignment, run EPA branch labeling, post-process the EPA tree, compute
    node heights, log the shared rank names, and write the reference JSON.
    """
    cfg = self.cfg
    log = cfg.log

    # All log calls pass their values as logger arguments rather than
    # pre-formatting with %, so formatting is consistent throughout.
    log.info("=> Loading taxonomy from file: %s ...\n", cfg.taxonomy_fname)
    self.taxonomy = Taxonomy(prefix=EpacConfig.REF_SEQ_PREFIX,
                             tax_fname=cfg.taxonomy_fname)

    log.info("==> Loading reference alignment from file: %s ...\n",
             cfg.align_fname)
    self.load_alignment()

    log.info("===> Validating taxonomy and alignment ...\n")
    self.validate_taxonomy()

    log.info(
        "====> Building a multifurcating tree from taxonomy with %d seqs ...\n",
        self.taxonomy.seq_count())
    self.build_multif_tree()

    log.info("=====> Building the reference alignment ...\n")
    self.export_ref_alignment()
    self.export_ref_taxonomy()

    log.info("======> Saving the outgroup for later re-rooting ...\n")
    self.save_rooting()

    log.info(
        "=======> Resolving multifurcation: choosing the best topology from %d independent RAxML runs ...\n",
        cfg.rep_num)
    self.resolve_multif()
    self.load_reduced_refalign()

    log.info("========> Calling RAxML-EPA to obtain branch labels ...\n")
    self.epa_branch_labeling()

    log.info(
        "=========> Post-processing the EPA tree (re-rooting, taxonomic labeling etc.) ...\n"
    )
    self.epa_post_process()
    self.calc_node_heights()

    log.debug("\n==========> Checking branch labels ...")
    log.debug("shared rank names before training: %s",
              repr(self.taxonomy.get_common_ranks()))
    log.debug("shared rank names after training: %s\n",
              repr(self.mono_index()))

    log.info("==========> Saving the reference JSON file: %s\n",
             cfg.refjson_fname)
    self.write_json()
def test_load(self):
    """Check that the taxonomy file loads correctly with and without a prefix.

    The unprefixed taxonomy built in setUp must match TAX_DICT; a taxonomy
    loaded with EpacConfig.REF_SEQ_PREFIX must match PREFIXED_TAX_DICT.
    """
    # Taxonomy loaded without a prefix keeps the original sequence ids.
    self.assertEqual(self.TAX_DICT, self.taxonomy.seq_ranks_map)

    # Loading the same file with a prefix must yield prefixed sequence ids.
    tax_with_prefix = Taxonomy(EpacConfig.REF_SEQ_PREFIX, self.tax_fname)
    self.assertEqual(self.PREFIXED_TAX_DICT, tax_with_prefix.seq_ranks_map)

    # Drop the reference explicitly, as the original test did.
    tax_with_prefix = None