Esempio n. 1
0
class TestCli(unittest.TestCase):
    """Tests that drive the workflow steps exposed by ``OptionsParser``.

    ``setUp`` builds one shared options object, an ``OptionsParser`` and a
    fresh temporary output directory; ``tearDown`` removes that directory.
    Each test writes into ``<generic_out_path>/<random name>/<step>``.
    """

    def setUp(self):
        """Create the shared options, the parser under test and a scratch directory."""
        self.identify_dir_reference = os.path.join(os.path.dirname(__file__), 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # The ArgumentParser instance is only used as a mutable attribute
        # container here; nothing is ever parsed with it.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False
        self.options.force = False
        self.options.genes = False
        self.options.write_single_copy_genes = False

        # align options
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.skip_trimming = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50
        self.options.rnd_seed = 42
        self.options.outgroup_taxon = None

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None
        self.options.pplacer_cpus = None
        self.options.min_af = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Delete the scratch directory created in ``setUp``."""
        shutil.rmtree(self.generic_out_path)

    # ------------------------------------------------------------------
    # Private helpers shared by the tests below.
    # ------------------------------------------------------------------

    @staticmethod
    def _random_folder_name(length=10):
        """Return a random name made of upper-case ASCII letters and digits."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(length))

    @staticmethod
    def _read_last_line(path):
        """Return the last line of the text file at ``path`` (without newline)."""
        with open(path, 'r') as f:
            return f.read().splitlines()[-1]

    def _assert_user_msa(self, out_dir, prefix):
        """Check that the AR53 user MSA exists and that its last line looks like an alignment row."""
        path_user_msa = os.path.join(out_dir, PATH_AR53_USER_MSA.format(prefix=prefix))
        self.assertTrue(os.path.isfile(path_user_msa))
        last_line = self._read_last_line(path_user_msa)
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _assert_archaeal_summary(self, out_dir, prefix):
        """Check that the AR53 summary exists and its last row has 20 fields classified as Archaea."""
        summary_out = os.path.join(out_dir, PATH_AR53_SUMMARY_OUT.format(prefix=prefix))
        # Assert on the file's existence, not on the (always truthy) path string.
        self.assertTrue(os.path.isfile(summary_out))
        infos = self._read_last_line(summary_out).split('\t')
        self.assertEqual(len(infos), 20)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_identify(self):
        """Run ``identify`` and check both marker summaries are written."""
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'identify')
        self.optionparser.identify(identify_options)

        bac120_marker_path = os.path.join(
            identify_options.out_dir,
            PATH_BAC120_MARKER_SUMMARY.format(prefix=identify_options.prefix))
        ar53_marker_path = os.path.join(
            identify_options.out_dir,
            PATH_AR53_MARKER_SUMMARY.format(prefix=identify_options.prefix))
        self.assertTrue(os.path.isfile(bac120_marker_path))
        self.assertTrue(os.path.isfile(ar53_marker_path))

        results = {}
        with open(ar53_marker_path, 'r') as f:
            f.readline()  # discard the first line (presumably a header row)
            for line in f:
                infos = line.split('\t', 1)
                results[infos[0]] = infos[1]
        # Fail with a clear assertion rather than an AttributeError on None.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """Run ``align`` on the reference identify directory and check the user MSA."""
        align_options = self.options
        align_options.identify_dir = self.identify_dir_reference
        align_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa(align_options.out_dir, align_options.prefix)

    def test_identify_align(self):
        """Chain ``identify`` and ``align`` and check the resulting user MSA."""
        tmp_folder = self._random_folder_name()
        identify_dir = os.path.join(self.generic_out_path, tmp_folder, 'identify')

        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = identify_dir
        self.optionparser.identify(identify_options)

        # Same underlying options object; only the directories are re-pointed.
        align_options = self.options
        align_options.identify_dir = identify_dir
        align_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa(align_options.out_dir, align_options.prefix)

    def test_identify_align_classify(self):
        """Chain ``identify``, ``align`` and ``classify``; then remove intermediate files."""
        tmp_folder = self._random_folder_name()
        identify_dir = os.path.join(self.generic_out_path, tmp_folder, 'identify')

        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = identify_dir
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = identify_dir
        align_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa(align_options.out_dir, align_options.prefix)

        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.full_tree = True
        # Must be read before out_dir is re-pointed: both names alias self.options.
        classify_options.align_dir = align_options.out_dir
        classify_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'classify')
        classify_options.recalculate_red = False
        self.optionparser.classify(classify_options)
        self._assert_archaeal_summary(classify_options.out_dir, classify_options.prefix)

        intermediate_dirs = [
            os.path.join(classify_options.out_dir, name)
            for name in (DIR_IDENTIFY_INTERMEDIATE, DIR_ALIGN_INTERMEDIATE, DIR_CLASSIFY_INTERMEDIATE)
        ]
        for path in intermediate_dirs:
            self.assertTrue(os.path.isdir(path))
        self.optionparser.remove_intermediate_files(classify_options.out_dir, 'classify_wf')
        for path in intermediate_dirs:
            self.assertFalse(os.path.exists(path))

    def test_classify_wf(self):
        """Run identify, align and classify into a single output directory."""
        classify_wf_options = self.options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'classify_wf')
        self.optionparser.identify(classify_wf_options)

        classify_wf_options.identify_dir = classify_wf_options.out_dir
        classify_wf_options.align_dir = classify_wf_options.out_dir
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        classify_wf_options.recalculate_red = False
        classify_wf_options.full_tree = True
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)
        self._assert_archaeal_summary(classify_wf_options.out_dir, classify_wf_options.prefix)

    def test_infer(self):
        """Run ``infer`` on the reference MSA and check the tree log and unrooted tree."""
        infer_options = self.options
        path_user_msa = PATH_AR53_USER_MSA.format(prefix=self.options.prefix)
        infer_options.msa_file = os.path.join(self.align_dir_reference, path_user_msa)
        infer_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'infer')
        infer_options.gamma = False
        self.optionparser.infer(infer_options)

        tree_log = os.path.join(
            infer_options.out_dir, PATH_TREE_LOG.format(prefix=self.options.prefix))
        self.assertEqual(self._read_last_line(tree_log).strip(), 'TreeCompleted')

        unrooted_tree = os.path.join(
            infer_options.out_dir, PATH_UNROOTED_TREE.format(prefix=self.options.prefix))
        last_line = self._read_last_line(unrooted_tree)
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, last_line)

    def test_de_novo_wf(self):
        """Run identify, align and infer in sequence; passes if none of them raises."""
        de_novo_wf_options = self.options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar53"
        de_novo_wf_options.gamma = False
        de_novo_wf_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir,
            de_novo_wf_options.prefix + de_novo_wf_options.suffix + ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)

    def test_root(self):
        """Test that rooting is successful when called through the CLI"""
        options = argparse.ArgumentParser()
        options.input_tree = 'tests/data/pplacer_dir_reference/gtdbtk.ar53.classify.tree'
        options.outgroup_taxon = 'p__Altiarchaeota'
        options.output_tree = os.path.join(self.generic_out_path, 'test.rooted.tree')
        options.custom_taxonomy_file = None
        options.gtdbtk_classification_file = None
        self.optionparser.root(options)
        self.assertTrue(os.path.isfile(options.output_tree))
Esempio n. 2
0
class TestCli(unittest.TestCase):
    """Tests that drive the workflow steps exposed by ``OptionsParser``.

    ``setUp`` builds one shared options object, an ``OptionsParser`` and a
    fresh temporary output directory; ``tearDown`` removes that directory.
    Most tests write into ``<generic_out_path>/<random name>/<step>``.
    """

    def setUp(self):
        """Create the shared options, the parser under test and a scratch directory."""
        self.identify_dir_reference = os.path.join(
            os.path.dirname(__file__), 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # The ArgumentParser instance is only used as a mutable attribute
        # container here; nothing is ever parsed with it.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False
        self.options.force = False
        self.options.genes = False
        self.options.write_single_copy_genes = False

        # align options
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.skip_trimming = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50
        self.options.rnd_seed = 42

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None
        self.options.pplacer_cpus = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    # ------------------------------------------------------------------
    # Private helpers shared by the tests below.
    # ------------------------------------------------------------------

    @staticmethod
    def _random_folder_name(length=10):
        """Return a random name made of upper-case ASCII letters and digits."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(length))

    @staticmethod
    def _read_last_line(path):
        """Return the last line of the text file at ``path`` (without newline)."""
        with open(path, 'r') as f:
            return f.read().splitlines()[-1]

    def _assert_user_msa(self, out_dir, prefix):
        """Check that the AR122 user MSA exists and that its last line looks like an alignment row."""
        path_user_msa = os.path.join(
            out_dir, PATH_AR122_USER_MSA.format(prefix=prefix))
        self.assertTrue(os.path.isfile(path_user_msa))
        last_line = self._read_last_line(path_user_msa)
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _assert_archaeal_summary(self, out_dir, prefix):
        """Check that the AR122 summary exists and its last row has 19 fields classified as Archaea."""
        summary_out = os.path.join(
            out_dir, PATH_AR122_SUMMARY_OUT.format(prefix=prefix))
        # Assert on the file's existence, not on the (always truthy) path string.
        self.assertTrue(os.path.isfile(summary_out))
        infos = self._read_last_line(summary_out).split('\t')
        # assertEqual: the assertEquals alias is deprecated and gone in Python 3.12.
        self.assertEqual(len(infos), 19)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_identify(self):
        """Run ``identify`` and check both marker summaries are written."""
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'identify')
        self.optionparser.identify(identify_options)

        bac120_marker_path = os.path.join(
            identify_options.out_dir,
            PATH_BAC120_MARKER_SUMMARY.format(prefix=identify_options.prefix))
        ar122_marker_path = os.path.join(
            identify_options.out_dir,
            PATH_AR122_MARKER_SUMMARY.format(prefix=identify_options.prefix))
        self.assertTrue(os.path.isfile(bac120_marker_path))
        self.assertTrue(os.path.isfile(ar122_marker_path))

        results = {}
        with open(ar122_marker_path, 'r') as f:
            f.readline()  # discard the first line (presumably a header row)
            for line in f:
                infos = line.split('\t', 1)
                results[infos[0]] = infos[1]
        # Fail with a clear assertion rather than an AttributeError on None.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """Run ``align`` on the reference identify directory and check the user MSA."""
        align_options = self.options
        align_options.identify_dir = self.identify_dir_reference
        align_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa(align_options.out_dir, align_options.prefix)

    def test_classify(self):
        """Run ``classify`` on the reference alignment and compare the three classifications."""
        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.align_dir = self.align_dir_reference
        classify_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'classify')
        classify_options.recalculate_red = False
        classify_options.split_tree = False
        self.optionparser.classify(classify_options)

        summary_fh = ClassifySummaryFileAR122(classify_options.out_dir,
                                              classify_options.prefix)
        summary_fh.read()

        vadinca11 = 'd__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomethylophilaceae;g__VadinCA11;s__VadinCA11 sp002498365'
        expected = {
            'genome_1': 'd__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter;s__Methanobrevibacter ruminantium',
            'genome_2': vadinca11,
            'genome_3': vadinca11,
        }
        for genome_id, classification in expected.items():
            self.assertEqual(classification,
                             summary_fh.rows[genome_id].classification)

    def test_identify_align(self):
        """Chain ``identify`` and ``align`` and check the resulting user MSA."""
        tmp_folder = self._random_folder_name()
        identify_dir = os.path.join(self.generic_out_path, tmp_folder,
                                    'identify')

        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = identify_dir
        self.optionparser.identify(identify_options)

        # Same underlying options object; only the directories are re-pointed.
        align_options = self.options
        align_options.identify_dir = identify_dir
        align_options.out_dir = os.path.join(self.generic_out_path, tmp_folder,
                                             'align')
        self.optionparser.align(align_options)
        self._assert_user_msa(align_options.out_dir, align_options.prefix)

    def test_identify_align_classify(self):
        """Chain ``identify``, ``align`` and ``classify`` and check the summary."""
        tmp_folder = self._random_folder_name()
        identify_dir = os.path.join(self.generic_out_path, tmp_folder,
                                    'identify')

        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = identify_dir
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = identify_dir
        align_options.out_dir = os.path.join(self.generic_out_path, tmp_folder,
                                             'align')
        self.optionparser.align(align_options)
        self._assert_user_msa(align_options.out_dir, align_options.prefix)

        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.split_tree = False
        # Must be read before out_dir is re-pointed: both names alias self.options.
        classify_options.align_dir = align_options.out_dir
        classify_options.out_dir = os.path.join(self.generic_out_path,
                                                tmp_folder, 'classify')
        classify_options.recalculate_red = False
        self.optionparser.classify(classify_options)
        self._assert_archaeal_summary(classify_options.out_dir,
                                      classify_options.prefix)

    def test_classify_wf(self):
        """Run identify, align and classify into a single output directory."""
        classify_wf_options = self.options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'classify_wf')
        self.optionparser.identify(classify_wf_options)

        classify_wf_options.identify_dir = classify_wf_options.out_dir
        classify_wf_options.align_dir = classify_wf_options.out_dir
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        classify_wf_options.recalculate_red = False
        classify_wf_options.split_tree = False
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)
        self._assert_archaeal_summary(classify_wf_options.out_dir,
                                      classify_wf_options.prefix)

    def test_infer(self):
        """Run ``infer`` on the reference MSA and check the tree log and unrooted tree."""
        infer_options = self.options
        path_user_msa = PATH_AR122_USER_MSA.format(prefix=self.options.prefix)
        infer_options.msa_file = os.path.join(self.align_dir_reference,
                                              path_user_msa)
        infer_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'infer')
        infer_options.gamma = False
        self.optionparser.infer(infer_options)

        tree_log = os.path.join(
            infer_options.out_dir,
            PATH_TREE_LOG.format(prefix=self.options.prefix))
        self.assertEqual(self._read_last_line(tree_log).strip(),
                         'TreeCompleted')

        unrooted_tree = os.path.join(
            infer_options.out_dir,
            PATH_UNROOTED_TREE.format(prefix=self.options.prefix))
        last_line = self._read_last_line(unrooted_tree)
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, last_line)

    def test_de_novo_wf(self):
        """Run identify, align and infer in sequence; passes if none of them raises."""
        de_novo_wf_options = self.options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar122"
        de_novo_wf_options.gamma = False
        de_novo_wf_options.out_dir = os.path.join(
            self.generic_out_path, self._random_folder_name(), 'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir, de_novo_wf_options.prefix +
            de_novo_wf_options.suffix + ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)

    def test_identify_gzipped_genomes(self):
        """ Test that gene calling is successful when using gzipped genomes """
        options = argparse.ArgumentParser()
        options.genome_dir = 'tests/data/genomes_gz/'
        options.cpus = 5
        options.batchfile = None
        options.extension = 'fna.gz'
        options.write_single_copy_genes = False
        options.prefix = 'gtdbtk'
        options.force = None
        options.genes = False
        options.out_dir = self.generic_out_path
        self.optionparser.identify(options)

        # Each produced summary must match its reference, ignoring row order.
        summary_templates = (PATH_BAC120_MARKER_SUMMARY,
                             PATH_AR122_MARKER_SUMMARY,
                             PATH_TLN_TABLE_SUMMARY)
        for template in summary_templates:
            relative_path = template.format(prefix='gtdbtk')
            self.assertTrue(
                are_files_equal(
                    os.path.join(self.identify_dir_reference, relative_path),
                    os.path.join(self.generic_out_path, relative_path),
                    ignore_order=True))

    def test_root(self):
        """Test that rooting is successful when called through the CLI"""
        options = argparse.ArgumentParser()
        options.input_tree = 'tests/data/pplacer_dir_reference/gtdbtk.ar122.classify.tree'
        options.outgroup_taxon = 'p__Altarchaeota'
        options.output_tree = os.path.join(self.generic_out_path,
                                           'test.rooted.tree')
        options.custom_taxonomy_file = None
        options.gtdbtk_classification_file = None
        self.optionparser.root(options)
        self.assertTrue(os.path.isfile(options.output_tree))

    def tearDown(self):
        """Delete the scratch directory created in ``setUp``."""
        shutil.rmtree(self.generic_out_path)
Esempio n. 3
0
class TestCli(unittest.TestCase):
    """Drive the GTDB-Tk steps through ``OptionsParser`` and check their output.

    Every test writes below a randomly named sub-folder of
    ``self.generic_out_path``; ``tearDown`` removes that root afterwards.
    All tests mutate the single ``self.options`` object, which is rebuilt
    by ``setUp`` before each test.
    """

    def setUp(self):
        """Create the shared options object and the parser under test."""
        self.identify_dir_reference = 'tests/data/identify_dir_reference/'
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # The ArgumentParser instance is used only as an attribute container
        # that is passed to the OptionsParser methods.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False

        # align options
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50

        # classify options
        self.options.scratch_dir = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        self.generic_out_path = 'tests/data/results'

    def tearDown(self):
        """Remove everything the test wrote under ``generic_out_path``."""
        # setUp does not create this directory, so a test that fails early
        # may leave nothing to delete; cleanup must not add a second error.
        shutil.rmtree(self.generic_out_path, ignore_errors=True)

    # ------------------------------------------------------------------
    # helpers (names do not start with "test", so unittest skips them)
    # ------------------------------------------------------------------

    @staticmethod
    def _random_tag(length=10):
        """Return a random upper-case/digit string used as a folder name."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(length))

    def _out_dir(self, tag, step):
        """Return ``<generic_out_path>/<tag>/<step>``."""
        return os.path.join(self.generic_out_path, tag, step)

    def _last_line(self, path):
        """Return the last line of ``path``; fail the test if it is empty."""
        with open(path, 'r') as f:
            lines = f.read().splitlines()
        self.assertTrue(lines, '%s is empty' % path)
        return lines[-1]

    def _run_identify(self, tag):
        """Run the identify step into ``<tag>/identify``."""
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._out_dir(tag, 'identify')
        self.optionparser.identify(identify_options)

    def _run_align(self, tag, identify_dir):
        """Run the align step into ``<tag>/align``, check the MSA, return the dir."""
        align_options = self.options
        align_options.identify_dir = identify_dir
        align_options.out_dir = self._out_dir(tag, 'align')
        self.optionparser.align(align_options)
        self._check_user_msa(align_options.out_dir)
        return align_options.out_dir

    def _run_classify(self, tag, align_dir):
        """Run the classify step into ``<tag>/classify`` and check the summary."""
        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.align_dir = align_dir
        classify_options.out_dir = self._out_dir(tag, 'classify')
        self.optionparser.classify(classify_options)
        self._check_summary(classify_options.out_dir)

    def _check_user_msa(self, out_dir):
        """Assert the ar122 user MSA exists and its last line looks like a
        gapped, digit-free sequence of 4501-5499 characters."""
        msa_path = os.path.join(out_dir, 'gtdbtk.ar122.user_msa.fasta')
        self.assertTrue(os.path.isfile(msa_path))
        last_line = self._last_line(msa_path)
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _check_summary(self, out_dir):
        """Assert the ar122 summary exists and its last row has 17
        tab-separated fields, the second starting with ``d__Archaea``."""
        summary_path = os.path.join(out_dir, 'gtdbtk.ar122.summary.tsv')
        self.assertTrue(os.path.isfile(summary_path))
        infos = self._last_line(summary_path).split('\t')
        self.assertEqual(len(infos), 17)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    # ------------------------------------------------------------------
    # tests
    # ------------------------------------------------------------------

    def test_identify(self):
        """identify writes both marker summaries; genome_1's ar122 row
        starts with ``120<TAB>2<TAB>0<TAB>``."""
        self._run_identify(self._random_tag())
        out_dir = self.options.out_dir

        bac120_summary = os.path.join(out_dir,
                                      'gtdbtk_bac120_markers_summary.tsv')
        ar122_summary = os.path.join(out_dir,
                                     'gtdbtk_ar122_markers_summary.tsv')
        self.assertTrue(os.path.isfile(bac120_summary))
        self.assertTrue(os.path.isfile(ar122_summary))

        results = {}
        with open(ar122_summary, 'r') as f:
            f.readline()  # first line is skipped (presumably the header)
            for line in f:
                genome_id, _, remainder = line.partition('\t')
                results[genome_id] = remainder
        # Fail with a readable message instead of AttributeError on None.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """align on the reference identify directory yields a plausible MSA."""
        self._run_align(self._random_tag(), self.identify_dir_reference)

    def test_classify(self):
        """classify on the reference align directory yields an Archaea row."""
        self._run_classify(self._random_tag(), self.align_dir_reference)

    def test_identify_align(self):
        """identify followed by align on its output yields a plausible MSA."""
        tag = self._random_tag()
        self._run_identify(tag)
        self._run_align(tag, self._out_dir(tag, 'identify'))

    def test_identify_align_classify(self):
        """identify, align and classify chained through separate folders."""
        tag = self._random_tag()
        self._run_identify(tag)
        align_dir = self._run_align(tag, self._out_dir(tag, 'identify'))
        self._run_classify(tag, align_dir)

    def test_classify_wf(self):
        """identify, align and classify sharing one output directory."""
        classify_wf_options = self.options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = self._out_dir(self._random_tag(),
                                                    'classify_wf')
        self.optionparser.identify(classify_wf_options)

        # All three steps read from and write to the same directory.
        classify_wf_options.identify_dir = classify_wf_options.out_dir
        classify_wf_options.align_dir = classify_wf_options.out_dir
        # Re-assert the align settings before the align step.
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)

        self._check_summary(classify_wf_options.out_dir)

    def test_infer(self):
        """infer on the reference MSA logs ``TreeCompleted`` and writes a
        tree whose last line names genome_1, genome_2 and genome_3."""
        infer_options = self.options
        infer_options.msa_file = os.path.join(self.align_dir_reference,
                                              'gtdbtk.ar122.user_msa.fasta')
        infer_options.out_dir = self._out_dir(self._random_tag(), 'infer')
        self.optionparser.infer(infer_options)

        log_tail = self._last_line(
            os.path.join(infer_options.out_dir, 'gtdbtk.tree.log'))
        self.assertEqual(log_tail.strip(), 'TreeCompleted')

        tree_tail = self._last_line(
            os.path.join(infer_options.out_dir, 'gtdbtk.unrooted.tree'))
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, tree_tail)

    def test_de_novo_wf(self):
        """identify, align and infer in one directory; passes if none raises."""
        de_novo_wf_options = self.options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar122"
        de_novo_wf_options.out_dir = self._out_dir(self._random_tag(),
                                                   'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        # MSA path is <out_dir>/<prefix><suffix>.user_msa.fasta, set up front
        # so that the infer step can pick it up after align has run.
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir, de_novo_wf_options.prefix +
            de_novo_wf_options.suffix + ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)
Esempio n. 4
0
class TestCli(unittest.TestCase):
    """Drive the GTDB-Tk steps through ``OptionsParser`` and check their output.

    Output file names come from the ``PATH_*`` templates formatted with the
    options' prefix. Every test writes below a randomly named sub-folder of
    ``self.generic_out_path``; ``tearDown`` removes that root afterwards.
    All tests mutate the single ``self.options`` object, which is rebuilt
    by ``setUp`` before each test.
    """

    def setUp(self):
        """Create the shared options object and the parser under test."""
        self.identify_dir_reference = os.path.join(
            os.path.dirname(__file__), 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # The ArgumentParser instance is used only as an attribute container
        # that is passed to the OptionsParser methods.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False
        self.options.force = False

        # align options
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.skip_trimming = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50
        self.options.rnd_seed = 42

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        self.generic_out_path = '/tmp/GTDBTk/tests'

    def tearDown(self):
        """Remove everything the test wrote under ``generic_out_path``."""
        # setUp does not create this directory, so a test that fails early
        # may leave nothing to delete; cleanup must not add a second error.
        shutil.rmtree(self.generic_out_path, ignore_errors=True)

    # ------------------------------------------------------------------
    # helpers (names do not start with "test", so unittest skips them)
    # ------------------------------------------------------------------

    @staticmethod
    def _random_tag(length=10):
        """Return a random upper-case/digit string used as a folder name."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(length))

    def _out_dir(self, tag, step):
        """Return ``<generic_out_path>/<tag>/<step>``."""
        return os.path.join(self.generic_out_path, tag, step)

    def _last_line(self, path):
        """Return the last line of ``path``; fail the test if it is empty."""
        with open(path, 'r') as f:
            lines = f.read().splitlines()
        self.assertTrue(lines, '%s is empty' % path)
        return lines[-1]

    def _run_identify(self, tag):
        """Run the identify step into ``<tag>/identify``."""
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._out_dir(tag, 'identify')
        self.optionparser.identify(identify_options)

    def _run_align(self, tag, identify_dir):
        """Run the align step into ``<tag>/align``, check the MSA, return the dir."""
        align_options = self.options
        align_options.identify_dir = identify_dir
        align_options.out_dir = self._out_dir(tag, 'align')
        self.optionparser.align(align_options)
        self._check_user_msa(align_options)
        return align_options.out_dir

    def _run_classify(self, tag, align_dir):
        """Run the classify step into ``<tag>/classify`` and check the summary."""
        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.align_dir = align_dir
        classify_options.out_dir = self._out_dir(tag, 'classify')
        self.optionparser.classify(classify_options)
        self._check_summary(classify_options)

    def _check_user_msa(self, options):
        """Assert the ar122 user MSA exists and its last line looks like a
        gapped, digit-free sequence of 4501-5499 characters."""
        path_user_msa = os.path.join(
            options.out_dir,
            PATH_AR122_USER_MSA.format(prefix=options.prefix))
        self.assertTrue(os.path.isfile(path_user_msa))
        last_line = self._last_line(path_user_msa)
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _check_summary(self, options):
        """Assert the ar122 summary exists and its last row has 18
        tab-separated fields, the second starting with ``d__Archaea``."""
        summary_out = os.path.join(
            options.out_dir,
            PATH_AR122_SUMMARY_OUT.format(prefix=options.prefix))
        # Check the file on disk; a bare path string is always truthy.
        self.assertTrue(os.path.isfile(summary_out))
        infos = self._last_line(summary_out).split('\t')
        self.assertEqual(len(infos), 18)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    # ------------------------------------------------------------------
    # tests
    # ------------------------------------------------------------------

    def test_identify(self):
        """identify writes both marker summaries; genome_1's ar122 row
        starts with ``120<TAB>2<TAB>0<TAB>``."""
        self._run_identify(self._random_tag())
        out_dir = self.options.out_dir
        prefix = self.options.prefix

        bac120_marker_path = os.path.join(
            out_dir, PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))
        ar122_marker_path = os.path.join(
            out_dir, PATH_AR122_MARKER_SUMMARY.format(prefix=prefix))
        self.assertTrue(os.path.isfile(bac120_marker_path))
        self.assertTrue(os.path.isfile(ar122_marker_path))

        results = {}
        with open(ar122_marker_path, 'r') as f:
            f.readline()  # first line is skipped (presumably the header)
            for line in f:
                genome_id, _, remainder = line.partition('\t')
                results[genome_id] = remainder
        # Fail with a readable message instead of AttributeError on None.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """align on the reference identify directory yields a plausible MSA."""
        self._run_align(self._random_tag(), self.identify_dir_reference)

    def test_classify(self):
        """classify on the reference align directory yields an Archaea row."""
        self._run_classify(self._random_tag(), self.align_dir_reference)

    def test_identify_align(self):
        """identify followed by align on its output yields a plausible MSA."""
        tag = self._random_tag()
        self._run_identify(tag)
        self._run_align(tag, self._out_dir(tag, 'identify'))

    def test_identify_align_classify(self):
        """identify, align and classify chained through separate folders."""
        tag = self._random_tag()
        self._run_identify(tag)
        align_dir = self._run_align(tag, self._out_dir(tag, 'identify'))
        self._run_classify(tag, align_dir)

    def test_classify_wf(self):
        """identify, align and classify sharing one output directory."""
        classify_wf_options = self.options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = self._out_dir(self._random_tag(),
                                                    'classify_wf')
        self.optionparser.identify(classify_wf_options)

        # All three steps read from and write to the same directory.
        classify_wf_options.identify_dir = classify_wf_options.out_dir
        classify_wf_options.align_dir = classify_wf_options.out_dir
        # Re-assert the align settings before the align step.
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)

        self._check_summary(classify_wf_options)

    def test_infer(self):
        """infer on the reference MSA logs ``TreeCompleted`` and writes a
        tree whose last line names genome_1, genome_2 and genome_3."""
        infer_options = self.options
        path_user_msa = PATH_AR122_USER_MSA.format(prefix=self.options.prefix)
        infer_options.msa_file = os.path.join(self.align_dir_reference,
                                              path_user_msa)
        infer_options.out_dir = self._out_dir(self._random_tag(), 'infer')
        self.optionparser.infer(infer_options)

        log_tail = self._last_line(
            os.path.join(infer_options.out_dir,
                         PATH_TREE_LOG.format(prefix=self.options.prefix)))
        self.assertEqual(log_tail.strip(), 'TreeCompleted')

        tree_tail = self._last_line(
            os.path.join(infer_options.out_dir,
                         PATH_UNROOTED_TREE.format(prefix=self.options.prefix)))
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, tree_tail)

    def test_de_novo_wf(self):
        """identify, align and infer in one directory; passes if none raises."""
        de_novo_wf_options = self.options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar122"
        de_novo_wf_options.out_dir = self._out_dir(self._random_tag(),
                                                   'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        # MSA path is <out_dir>/<prefix><suffix>.user_msa.fasta, set up front
        # so that the infer step can pick it up after align has run.
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir, de_novo_wf_options.prefix +
            de_novo_wf_options.suffix + ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)