Example #1
0
    def __init__(self, threads, pfam_top_hit_suffix, tigrfam_top_hit_suffix,
                 protein_file_suffix, pfam_hmm_dir, tigrfam_hmm_dir,
                 bac120_markers, ar122_markers):
        """Store the configuration after checking that hmmalign is required.

        Parameters
        ----------
        threads
            Stored unchanged as ``self.threads``.
        pfam_top_hit_suffix, tigrfam_top_hit_suffix, protein_file_suffix
            File suffixes, stored unchanged on the instance.
        pfam_hmm_dir, tigrfam_hmm_dir
            Locations of the Pfam / TIGRFAM HMM data; also used to derive
            the ``individual_hmms`` directories in ``marker_path_prefix``.
        bac120_markers, ar122_markers
            Marker set definitions, stored unchanged on the instance.
        """

        # Dependency check comes first so nothing is set up when it fails
        # (presumably the helper aborts on a missing program -- confirm).
        check_dependencies(['hmmalign'])

        self.logger = logging.getLogger('timestamp')

        # Plain configuration, kept verbatim.
        self.threads = threads
        self.pfam_top_hit_suffix = pfam_top_hit_suffix
        self.tigrfam_top_hit_suffix = tigrfam_top_hit_suffix
        self.protein_file_suffix = protein_file_suffix
        self.pfam_hmm_dir = pfam_hmm_dir
        self.tigrfam_hmm_dir = tigrfam_hmm_dir

        self.bac120_markers = bac120_markers
        self.ar122_markers = ar122_markers

        # Pfam HMMs live below the Pfam directory itself, whereas the
        # TIGRFAM ones live below the *parent* of ``tigrfam_hmm_dir``.
        pfam_individual = os.path.join(self.pfam_hmm_dir, 'individual_hmms')
        tigrfam_parent = os.path.dirname(self.tigrfam_hmm_dir)
        tigrfam_individual = os.path.join(tigrfam_parent, 'individual_hmms')
        self.marker_path_prefix = {
            "PFAM": pfam_individual,
            "TIGRFAM": tigrfam_individual,
        }

        # Not known at construction time; start out unset.
        self.ar122_marker_sizes = None
        self.bac120_marker_sizes = None

        # Resolved last, once every attribute above is in place.
        self.version = self.get_version()
Example #2
0
    def identify(self, genomes, tln_tables, out_dir, prefix, force):
        """Call genes with Prodigal, then search them against TIGRFAM and Pfam.

        Parameters
        ----------
        genomes : dict
            Genome IDs as the key, path to genome file as value.
        tln_tables: Dict[str, int]
            Genome ID -> translation table mapping for those user-specified.
        out_dir : str
            Path to the output directory.
        prefix : str
            Prefix passed on to the marker gene report.
        force : bool
            Overwrite any existing files (forwarded to Prodigal).

        Raises
        ------
        GTDBTkException
            If an exception is encountered during the identify step.

        """
        check_dependencies(['prodigal', 'hmmsearch'])

        genome_count = len(genomes)
        self.logger.info('Identifying markers in %d genomes with %d threads.' %
                         (genome_count, self.cpus))

        # Every tool below writes into this directory.
        self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)

        # Step 1: gene calling.
        gene_caller = Prodigal(self.cpus, self.marker_gene_dir,
                               self.protein_file_suffix,
                               self.nt_gene_file_suffix,
                               self.gff_file_suffix, force)
        prodigal_msg = "Running Prodigal {} to identify genes.".format(
            gene_caller.version)
        self.logger.info(prodigal_msg)
        called = gene_caller.run(genomes, tln_tables)

        # Step 2: annotate the called genes against TIGRFAM and Pfam.
        self.logger.info("Identifying TIGRFAM protein families.")
        protein_paths = [entry['aa_gene_path'] for entry in called.values()]

        tigrfam = TigrfamSearch(self.cpus, self.tigrfam_hmms,
                                self.protein_file_suffix,
                                self.tigrfam_suffix,
                                self.tigrfam_top_hit_suffix,
                                self.checksum_suffix, self.marker_gene_dir)
        tigrfam.run(protein_paths)

        self.logger.info("Identifying Pfam protein families.")
        pfam = PfamSearch(self.cpus, self.pfam_hmm_dir,
                          self.protein_file_suffix, self.pfam_suffix,
                          self.pfam_top_hit_suffix,
                          self.checksum_suffix, self.marker_gene_dir)
        pfam.run(protein_paths)

        # The HMMER version is taken from the TIGRFAM searcher.
        hmmer_msg = "Annotations done using HMMER {}.".format(tigrfam.version)
        self.logger.info(hmmer_msg)

        # Step 3: summarise what was found.
        self._report_identified_marker_genes(called, out_dir, prefix)
Example #3
0
File: main.py Project: 31380/GTDBTk
    def infer(self, options):
        """Infer a tree from a user specified MSA.

        Output paths are built inside ``options.out_dir``. When ``options``
        carries a ``suffix`` attribute the marker-specific path templates are
        used, otherwise the generic ones. When invoked as the ``infer``
        sub-command, a symlink to the tree is also created directly in
        ``options.out_dir``.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.
        """

        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)

        # The binary that FastTree.run is expected to use depends on the
        # requested CPU count, so check for that one specifically.
        if options.cpus > 1:
            check_dependencies(['FastTreeMP'])
        else:
            check_dependencies(['FastTree'])

        if hasattr(options, 'suffix'):
            output_tree = os.path.join(
                options.out_dir,
                PATH_MARKER_UNROOTED_TREE.format(prefix=options.prefix,
                                                 marker=options.suffix))
            tree_log = os.path.join(
                options.out_dir,
                PATH_MARKER_TREE_LOG.format(prefix=options.prefix,
                                            marker=options.suffix))
            fasttree_log = os.path.join(
                options.out_dir,
                PATH_MARKER_FASTTREE_LOG.format(prefix=options.prefix,
                                                marker=options.suffix))
        else:
            output_tree = os.path.join(
                options.out_dir,
                PATH_UNROOTED_TREE.format(prefix=options.prefix))
            tree_log = os.path.join(
                options.out_dir, PATH_TREE_LOG.format(prefix=options.prefix))
            fasttree_log = os.path.join(
                options.out_dir,
                PATH_FASTTREE_LOG.format(prefix=options.prefix))

        fasttree = FastTree()
        fasttree.run(output_tree, tree_log, fasttree_log, options.prot_model,
                     options.no_support, options.no_gamma, options.msa_file,
                     options.cpus)
        self.logger.info(f'FastTree version: {fasttree.version}')

        if hasattr(options,
                   'subparser_name') and options.subparser_name == 'infer':
            # The link lives in out_dir, so its target must be expressed
            # relative to out_dir. relpath is used instead of slicing off
            # len(out_dir) + 1 characters, which produced a truncated target
            # whenever out_dir ended with a path separator.
            link_target = os.path.relpath(output_tree, options.out_dir)
            link_path = os.path.join(options.out_dir,
                                     os.path.basename(output_tree))
            symlink_f(link_target, link_path)

        self.logger.info('Done.')
Example #4
0
    def check_dependencies(no_mash):
        """Check that the programs needed for this step are available.

        fastANI is always checked; mash is only checked when ``no_mash`` is
        falsy. The actual check is delegated to the ``check_dependencies``
        helper from the enclosing module scope (presumably it exits when a
        program is missing -- confirm against its definition).

        Parameters
        ----------
        no_mash : bool
            True if Mash will NOT be used (so it is not checked),
            False otherwise.
        """
        required = ['fastANI'] if no_mash else ['fastANI', 'mash']
        check_dependencies(required)
Example #5
0
    def __init__(self, threads, pfam_top_hit_suffix, tigrfam_top_hit_suffix,
                 protein_file_suffix, pfam_hmm_dir, tigrfam_hmm_dir,
                 bac120_markers, ar122_markers, rps23_markers):
        """Store the configuration after checking that hmmalign is required.

        Every argument is kept unchanged on the instance under an attribute
        of the same name.
        """

        # Checked before any state is stored (presumably the helper aborts
        # on a missing program -- confirm against its definition).
        check_dependencies(['hmmalign'])

        # Run-time settings.
        self.threads = threads

        # File suffixes.
        self.pfam_top_hit_suffix = pfam_top_hit_suffix
        self.tigrfam_top_hit_suffix = tigrfam_top_hit_suffix
        self.protein_file_suffix = protein_file_suffix

        # HMM locations.
        self.pfam_hmm_dir = pfam_hmm_dir
        self.tigrfam_hmm_dir = tigrfam_hmm_dir

        # Marker sets.
        self.bac120_markers = bac120_markers
        self.ar122_markers = ar122_markers
        self.rps23_markers = rps23_markers
Example #6
0
    def check_install(self):
        """Report on third-party programs and verify the reference package.

        First logs, for each third-party program, whether
        ``check_dependencies`` found it; a missing program is only reported
        and never fails the check. Then hashes every entry of
        ``Config.REF_HASHES`` with ``sha1_dir`` and compares the result with
        the expected value, logging one status line per entry.

        Nothing is returned: success is signalled by returning normally.

        Raises
        ------
        GTDBTkExit
            If at least one reference entry does not match its expected hash.
        """

        def report(label, status, fg):
            # One aligned status line per checked item.
            self.logger.info("         |-- {:16} {}".format(
                label, colour(status, ['bright'], fg=fg)))

        # Check that all programs are on the system path.
        self.logger.info(
            'Checking that all third-party software are on the system path:')
        names = {
            'prodigal', 'hmmsearch', 'fastANI', 'mash', 'pplacer', 'guppy',
            'FastTree', 'FastTreeMP', 'hmmalign'
        }
        for name in sorted(names):
            try:
                on_path = check_dependencies([name], exit_on_fail=False)
            except Exception:
                # Best effort: a probe that blows up is reported as missing.
                # Only Exception is caught so Ctrl-C still interrupts.
                on_path = False

            if on_path:
                report(name, 'OK', 'green')
            else:
                report(name, 'NOT FOUND', 'yellow')

        # Assume this was successful unless otherwise observed.
        ok = True

        # Compute the hash for each reference entry.
        self.logger.info(
            f'Checking integrity of reference package: {Config.GENERIC_PATH}')
        for obj_path, expected_hash in Config.REF_HASHES.items():
            # Display name: last path component, ignoring one trailing slash.
            base_name = obj_path[:-1] if obj_path.endswith('/') else obj_path
            base_name = base_name.split('/')[-1]
            user_hash = sha1_dir(obj_path, progress=True)

            if user_hash != expected_hash:
                report(base_name, f'HASH MISMATCH {user_hash}', 'yellow')
                ok = False
            else:
                report(base_name, 'OK', 'green')

        if not ok:
            raise GTDBTkExit(
                'Unexpected files were seen, or the reference package is corrupt.'
            )
Example #7
0
    def __init__(self, threads, pfam_top_hit_suffix, tigrfam_top_hit_suffix,
                 protein_file_suffix, pfam_hmm_dir, tigrfam_hmm_dir,
                 bac120_markers, ar122_markers):
        """Store the configuration after checking that hmmalign is required.

        Every argument is kept unchanged on the instance under an attribute
        of the same name; a logger and ``self.version`` are set up as well.
        """

        # Checked before any state is stored (presumably the helper aborts
        # on a missing program -- confirm against its definition).
        check_dependencies(['hmmalign'])

        self.logger = logging.getLogger('timestamp')

        # Run-time settings.
        self.threads = threads

        # File suffixes.
        self.pfam_top_hit_suffix = pfam_top_hit_suffix
        self.tigrfam_top_hit_suffix = tigrfam_top_hit_suffix
        self.protein_file_suffix = protein_file_suffix

        # HMM locations.
        self.pfam_hmm_dir = pfam_hmm_dir
        self.tigrfam_hmm_dir = tigrfam_hmm_dir

        # Marker sets.
        self.bac120_markers = bac120_markers
        self.ar122_markers = ar122_markers

        # Resolved last, once every attribute above is in place.
        self.version = self.get_version()
Example #8
0
    def parse_options(self, options):
        """Normalise the CLI options and run the requested command.

        ``options.subparser_name`` selects what is run. The two workflows
        (``de_novo_wf`` and ``classify_wf``) chain several steps and fill in
        the option attributes each step reads; every other command maps to
        a single method call.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.

        Returns
        -------
        int
            Always 0 when a known command finished without raising.

        Raises
        ------
        GTDBTkExit
            If run under Python 2.
        SystemExit
            If the command name is not recognised.
        """

        def in_out_dir(template):
            # Fill the prefix into a path template and anchor it in out_dir.
            return os.path.join(options.out_dir,
                                template.format(prefix=options.prefix))

        def force_split_tree_off():
            # TODO: Remove this helper once the split_tree function is implemented.
            if hasattr(options, 'split_tree') and options.split_tree:
                self.logger.warning('The split tree option is not yet '
                                    ' supported, overriding value to False.')
            options.split_tree = False

        # Commands that are a plain call to the method of the same name.
        pass_through = ('identify', 'align', 'infer', 'root', 'decorate',
                        'infer_ranks', 'ani_rep', 'trim_msa', 'export_msa')

        # Stop processing if python 2 is being used.
        if sys.version_info.major < 3:
            raise GTDBTkExit('Python 2 is no longer supported.')

        # Correct user paths
        if hasattr(options, 'out_dir') and options.out_dir:
            options.out_dir = os.path.expanduser(options.out_dir)

        # The number of CPUs must be a positive integer; clamp it otherwise.
        if hasattr(options, 'cpus') and options.cpus < 1:
            self.logger.warning(
                'You cannot use less than 1 CPU, defaulting to 1.')
            options.cpus = 1

        command = options.subparser_name

        if command == 'de_novo_wf':
            check_dependencies(['prodigal', 'hmmalign'])
            fasttree_binary = 'FastTreeMP' if options.cpus > 1 else 'FastTree'
            check_dependencies([fasttree_binary])

            # identify
            options.write_single_copy_genes = False
            self.identify(options)

            # align
            options.identify_dir = options.out_dir
            options.skip_trimming = False
            self.align(options)

            # infer: pick the MSA matching the marker set and reference use
            options.suffix = "bac120" if options.bacteria else "ar122"

            user_genomes_only = options.skip_gtdb_refs
            if options.suffix == 'bac120':
                msa_template = (PATH_BAC120_USER_MSA
                                if user_genomes_only else PATH_BAC120_MSA)
            elif options.suffix == 'ar122':
                msa_template = (PATH_AR122_USER_MSA
                                if user_genomes_only else PATH_AR122_MSA)
            else:
                self.logger.error(
                    'There was an error determining the marker set.')
                raise GenomeMarkerSetUnknown(
                    'Unknown marker set: {}'.format(options.suffix))
            options.msa_file = in_out_dir(msa_template)

            self.infer(options)

            # root: unrooted tree in, rooted tree out
            if options.suffix == 'bac120':
                options.input_tree = in_out_dir(PATH_BAC120_UNROOTED_TREE)
                options.output_tree = in_out_dir(PATH_BAC120_ROOTED_TREE)
            elif options.suffix == 'ar122':
                options.input_tree = in_out_dir(PATH_AR122_UNROOTED_TREE)
                options.output_tree = in_out_dir(PATH_AR122_ROOTED_TREE)

            self.root(options)

            # decorate: rooted tree in, decorated tree out
            if options.suffix == 'bac120':
                options.input_tree = in_out_dir(PATH_BAC120_ROOTED_TREE)
                options.output_tree = in_out_dir(PATH_BAC120_DECORATED_TREE)
            elif options.suffix == 'ar122':
                options.input_tree = in_out_dir(PATH_AR122_ROOTED_TREE)
                options.output_tree = in_out_dir(PATH_AR122_DECORATED_TREE)

            self.decorate(options)

        elif command == 'classify_wf':
            force_split_tree_off()

            check_dependencies(['prodigal', 'hmmalign', 'pplacer', 'guppy',
                                'fastANI'])

            options.write_single_copy_genes = False
            self.identify(options)

            # Attributes read by align/classify that this workflow's parser
            # does not provide itself.
            options.identify_dir = options.out_dir
            options.align_dir = options.out_dir
            workflow_defaults = (
                ('taxa_filter', None),
                ('custom_msa_filters', False),
                # Added here due to the other mutex argument being include above.
                ('skip_trimming', False),
                ('min_consensus', None),
                ('min_perc_taxa', None),
                ('skip_gtdb_refs', False),
                ('cols_per_gene', None),
                ('max_consensus', None),
                ('rnd_seed', None),
                ('scratch_dir', None),
                ('recalculate_red', False),
            )
            for attr_name, attr_value in workflow_defaults:
                setattr(options, attr_name, attr_value)

            self.align(options)

            self.classify(options)

        elif command == 'classify':
            force_split_tree_off()

            if options.recalculate_red and options.split_tree:
                raise GTDBTkExit('--split_tree and --recalculate_red are mutually exclusive.')
            self.classify(options)

        elif command in pass_through:
            getattr(self, command)(options)

        elif command == 'test':
            check_dependencies(['prodigal', 'hmmalign', 'pplacer', 'guppy',
                                'fastANI'])
            self.run_test(options)

        elif command == 'check_install':
            self.check_install()

        else:
            self.logger.error('Unknown GTDB-Tk command: "' +
                              options.subparser_name + '"\n')
            sys.exit(1)

        return 0
Example #9
0
    def identify(self, genomes, tln_tables, out_dir, prefix, force, genes,
                 write_single_copy_genes):
        """Obtain protein sequences, then search them against TIGRFAM and Pfam.

        Proteins come from Prodigal unless ``genes`` is set, in which case
        the supplied files are used directly as called genes.

        Parameters
        ----------
        genomes : dict
            Genome IDs as the key, path to genome file as value.
        tln_tables: Dict[str, int]
            Genome ID -> translation table mapping for those user-specified.
        out_dir : str
            Path to the output directory.
        prefix : str
            Prefix used when naming generated files.
        force : bool
            Overwrite any existing files (forwarded to Prodigal).
        genes : bool
            True if the supplied genomes are called genes, False otherwise.
        write_single_copy_genes : bool
            Forwarded to the marker gene report (write unique AR53/BAC120
            marker files to disk).

        Raises
        ------
        GTDBTkException
            If an exception is encountered during the identify step.

        """
        check_dependencies(['prodigal', 'hmmsearch'])

        genome_count = len(genomes)
        self.logger.info(
            f'Identifying markers in {genome_count:,} genomes with '
            f'{self.cpus} threads.')

        self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)
        self.failed_genomes = os.path.join(out_dir,
                                           PATH_FAILS.format(prefix=prefix))

        if genes:
            # Inputs are already protein files: build the same record shape
            # Prodigal would have produced, with the unknown fields blank.
            self.logger.info(
                'Using supplied genomes as called genes, skipping Prodigal.')
            called = {
                gid: {
                    'aa_gene_path': gpath,
                    'translation_table_path': None,
                    'nt_gene_path': None,
                    'best_translation_table': 'user_supplied',
                    'gff_path': None
                }
                for gid, gpath in genomes.items()
            }
        else:
            gene_caller = Prodigal(self.cpus, self.failed_genomes,
                                   self.marker_gene_dir,
                                   self.protein_file_suffix,
                                   self.nt_gene_file_suffix,
                                   self.gff_file_suffix, force)
            self.logger.log(
                Config.LOG_TASK,
                f'Running Prodigal {gene_caller.version} to identify genes.')
            called = gene_caller.run(genomes, tln_tables)

        # Annotate the proteins against the TIGRFAM and Pfam databases.
        self.logger.log(Config.LOG_TASK,
                        'Identifying TIGRFAM protein families.')
        protein_paths = [entry['aa_gene_path'] for entry in called.values()]

        tigrfam = TigrfamSearch(self.cpus, self.tigrfam_hmms,
                                self.protein_file_suffix,
                                self.tigrfam_suffix,
                                self.tigrfam_top_hit_suffix,
                                self.checksum_suffix, self.marker_gene_dir)
        tigrfam.run(protein_paths)

        self.logger.log(Config.LOG_TASK, 'Identifying Pfam protein families.')
        pfam = PfamSearch(self.cpus, self.pfam_hmm_dir,
                          self.protein_file_suffix, self.pfam_suffix,
                          self.pfam_top_hit_suffix,
                          self.checksum_suffix, self.marker_gene_dir)
        pfam.run(protein_paths)

        # The HMMER version is taken from the TIGRFAM searcher.
        self.logger.info(
            f'Annotations done using HMMER {tigrfam.version}.')

        self.logger.log(Config.LOG_TASK,
                        'Summarising identified marker genes.')
        self._report_identified_marker_genes(called, out_dir, prefix,
                                             write_single_copy_genes)
Example #10
0
    def run(self, output_tree, tree_log, fasttree_log, prot_model, no_support, gamma, msa_file, cpus=1):
        """Build and execute a FastTree command line, then validate its output.

        The tree is captured from the program's stdout into ``output_tree``
        and its stderr into ``fasttree_log``. A gzipped MSA is first
        decompressed into a temporary directory.

        Parameters
        ----------
        output_tree : str
            The path where the resulting tree should be written to.
        tree_log : str
            The path passed to FastTree's ``-log`` option.
        fasttree_log : str
            The path where FastTree's stderr should be written to.
        prot_model : str
            'WAG' adds ``-wag`` and 'LG' adds ``-lg``; any other value
            (e.g. 'JTT') adds no model flag.
        no_support : bool
            True to pass ``-nosupport``, False otherwise.
        gamma : bool
            True to pass ``-gamma``, False otherwise.
        msa_file : str
            The path to the input MSA (optionally ending in ``.gz``).
        cpus : int
            FastTreeMP is used when greater than 1, with OMP_NUM_THREADS
            set to this value; FastTree is used otherwise.

        Raises
        ------
        FastTreeException
            If FastTree exits non-zero, or the tree file is missing or empty.

        """
        env = os.environ.copy()
        multi_threaded = cpus > 1
        cmd = 'FastTreeMP' if multi_threaded else 'FastTree'
        if multi_threaded:
            env['OMP_NUM_THREADS'] = str(cpus)
        check_dependencies([cmd])

        # All three outputs may live in directories that do not exist yet.
        for out_path in (output_tree, tree_log, fasttree_log):
            make_sure_path_exists(os.path.dirname(out_path))

        # Assemble the command line and a human-readable model description.
        args = [cmd]
        model_out = [prot_model]
        if prot_model == 'WAG':
            args += ['-wag']
        elif prot_model == 'LG':
            args += ['-lg']

        if gamma:
            args += ['-gamma']
            model_out += ['+G']

        if not no_support:
            model_out += ['SH support values']
        else:
            args += ['-nosupport']

        args += ['-log', tree_log]

        description = ', '.join(model_out)
        self.logger.info('Inferring FastTree ({}) using a maximum of {} CPUs.'.format(
            description, cpus))

        # The temporary directory is only written to for gzipped input, but
        # it must stay alive until FastTree has finished reading the MSA.
        with tempfile.TemporaryDirectory(prefix='gtdbtk_') as tmp_dir:

            msa_path = msa_file
            if msa_file.endswith('.gz'):
                # Decompress next to nothing else, dropping the ".gz" suffix.
                msa_path = os.path.join(tmp_dir,
                                        os.path.basename(msa_file[:-3]))
                with gzip.open(msa_file, 'rb') as packed, \
                        open(msa_path, 'wb') as unpacked:
                    shutil.copyfileobj(packed, unpacked)
            args.append(msa_path)

            with open(output_tree, 'w') as tree_fh, \
                    open(fasttree_log, 'w') as err_fh:
                proc = subprocess.Popen(
                    args, stdout=tree_fh, stderr=err_fh, env=env)
                proc.communicate()

        # Validate results: first problem found wins.
        if proc.returncode != 0:
            failure = 'FastTree returned a non-zero exit code.'
        elif not os.path.isfile(output_tree):
            failure = 'Tree output file is missing: {}'.format(output_tree)
        elif os.path.getsize(output_tree) < 1:
            failure = 'Tree output file is empty: {}'.format(output_tree)
        else:
            failure = None

        if failure is not None:
            self.logger.error(
                'An error was encountered while running FastTree.')
            raise FastTreeException(failure)
Example #11
0
    def parse_options(self, options):
        """Run the command selected by ``options.subparser_name``.

        The two workflows (``de_novo_wf`` and ``classify_wf``) chain several
        steps and fill in the option attributes each step reads; every other
        command maps to a single method call.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.

        Returns
        -------
        int
            Always 0 when a known command finished without raising.

        Raises
        ------
        GenomeMarkerSetUnknown
            If the marker set suffix is neither 'bac120' nor 'ar122'.
        SystemExit
            If the command name is not recognised.
        """

        def in_out_dir(template):
            # Fill the prefix into a path template and anchor it in out_dir.
            return os.path.join(options.out_dir,
                                template.format(prefix=options.prefix))

        def unknown_marker_set():
            # Log, then build the exception for an unrecognised suffix.
            self.logger.error(
                'There was an error determining the marker set.')
            return GenomeMarkerSetUnknown('Unknown marker set: {}'.format(
                options.suffix))

        # Commands that are a plain call to the method of the same name.
        pass_through = ('identify', 'align', 'infer', 'classify', 'root',
                        'decorate', 'trim_msa', 'export_msa')

        command = options.subparser_name

        if command == 'de_novo_wf':
            check_dependencies(['prodigal', 'hmmalign'])
            fasttree_binary = 'FastTreeMP' if options.cpus > 1 else 'FastTree'
            check_dependencies([fasttree_binary])

            self.identify(options)

            options.identify_dir = options.out_dir
            options.skip_trimming = False
            self.align(options)

            # infer: pick the MSA matching the marker set and reference use
            options.suffix = "bac120" if options.bac120_ms else "ar122"

            user_genomes_only = options.skip_gtdb_refs
            if options.suffix == 'bac120':
                msa_template = (PATH_BAC120_USER_MSA
                                if user_genomes_only else PATH_BAC120_MSA)
            elif options.suffix == 'ar122':
                msa_template = (PATH_AR122_USER_MSA
                                if user_genomes_only else PATH_AR122_MSA)
            else:
                raise unknown_marker_set()
            options.msa_file = in_out_dir(msa_template)

            self.infer(options)

            # root: unrooted tree in, rooted tree out
            if options.suffix == 'bac120':
                options.input_tree = in_out_dir(PATH_BAC120_UNROOTED_TREE)
                options.output_tree = in_out_dir(PATH_BAC120_ROOTED_TREE)
            elif options.suffix == 'ar122':
                options.input_tree = in_out_dir(PATH_AR122_UNROOTED_TREE)
                options.output_tree = in_out_dir(PATH_AR122_ROOTED_TREE)
            else:
                raise unknown_marker_set()

            self.root(options)
            self.decorate(options)

        elif command == 'classify_wf':
            check_dependencies(
                ['prodigal', 'hmmalign', 'pplacer', 'guppy', 'fastANI'])
            self.identify(options)

            # Attributes read by align/classify that this workflow's parser
            # does not provide itself.
            options.identify_dir = options.out_dir
            options.align_dir = options.out_dir
            workflow_defaults = (
                ('taxa_filter', None),
                ('custom_msa_filters', False),
                # Added here due to the other mutex argument being include above.
                ('skip_trimming', False),
                ('min_consensus', None),
                ('min_perc_taxa', None),
                ('skip_gtdb_refs', False),
                ('cols_per_gene', None),
                ('max_consensus', None),
                ('rnd_seed', None),
            )
            for attr_name, attr_value in workflow_defaults:
                setattr(options, attr_name, attr_value)
            self.align(options)

            self.classify(options)

        elif command in pass_through:
            getattr(self, command)(options)

        elif command == 'test':
            check_dependencies(
                ['prodigal', 'hmmalign', 'pplacer', 'guppy', 'fastANI'])
            self.run_test(options)

        elif command == 'check_install':
            self.check_install()

        else:
            self.logger.error('Unknown GTDB-Tk command: "' +
                              options.subparser_name + '"\n')
            sys.exit(1)

        return 0
Example #12
0
 def check_dependencies(no_mash):
     """Check that fastANI, and mash unless ``no_mash`` is set, are available.

     The check itself is delegated to the ``check_dependencies`` helper from
     the enclosing module scope (presumably it exits when a program is
     missing -- confirm against its definition).
     """
     wanted = ['fastANI'] + ([] if no_mash else ['mash'])
     check_dependencies(wanted)
Example #13
0
    def identify(self, genomes, out_dir, prefix, force, genes):
        """Obtain protein sequences, then search them against TIGRFAM and Pfam.

        Proteins come from Prodigal unless ``genes`` is set, in which case
        the supplied files are used directly as called genes.

        Parameters
        ----------
        genomes : dict
            Genome IDs as the key, path to genome file as value.
        out_dir : str
            Path to the output directory.
        prefix : str
            Prefix passed on to the marker gene report.
        force : bool
            Overwrite any existing files (forwarded to Prodigal).
        genes : bool
            True if the supplied genomes are called genes, False otherwise.

        Raises
        ------
        GTDBTkException
            If an exception is encountered during the identify step.

        """
        check_dependencies(['prodigal', 'hmmsearch'])

        genome_count = len(genomes)
        self.logger.info('Identifying markers in %d genomes with %d threads.' %
                         (genome_count, self.cpus))
        marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)

        if genes:
            # Inputs are already protein files: build the same record shape
            # Prodigal would have produced, with the unknown fields blank.
            self.logger.info(
                'Using supplied genomes as called genes, skipping Prodigal.')
            called = {
                gid: {
                    'aa_gene_path': gpath,
                    'translation_table_path': None,
                    'nt_gene_path': None,
                    'best_translation_table': 'user_supplied',
                    'gff_path': None
                }
                for gid, gpath in genomes.items()
            }
        else:
            self.logger.info("Running Prodigal to identify genes.")
            gene_caller = Prodigal(self.cpus, False, marker_gene_dir,
                                   self.protein_file_suffix,
                                   self.nt_gene_file_suffix,
                                   self.gff_file_suffix, force)
            called = gene_caller.run(genomes)

        # Each search receives (genome ID, protein file path) pairs.
        id_and_protein_path = [(gid, entry['aa_gene_path'])
                               for gid, entry in called.items()]

        # Annotate the proteins against the TIGRFAM and Pfam databases.
        self.logger.info("Identifying TIGRFAM protein families.")
        tigrfam = TigrfamSearch(self.cpus, self.tigrfam_hmms,
                                self.protein_file_suffix,
                                self.tigrfam_suffix,
                                self.tigrfam_top_hit_suffix,
                                self.checksum_suffix, marker_gene_dir)
        tigrfam.run(id_and_protein_path)

        self.logger.info("Identifying Pfam protein families.")
        pfam = PfamSearch(self.cpus, self.pfam_hmm_dir,
                          self.protein_file_suffix, self.pfam_suffix,
                          self.pfam_top_hit_suffix,
                          self.checksum_suffix, marker_gene_dir)
        pfam.run(id_and_protein_path)

        self._report_identified_marker_genes(called, out_dir, prefix)
Example #14
0
    def run(self,
            output_tree,
            tree_log,
            fasttree_log,
            prot_model,
            no_support,
            no_gamma,
            msa_file,
            cpus=1):
        """Infer a tree from an MSA by invoking FastTree.

        ``FastTreeMP`` is used when more than one CPU is requested (with
        ``OMP_NUM_THREADS`` set in the child environment), otherwise
        ``FastTree``. The tree is captured from the program's standard output
        and its standard error is written to ``fasttree_log``.

        Parameters
        ----------
        output_tree : str
            Destination path for the inferred tree.
        tree_log : str
            Path handed to FastTree's ``-log`` option.
        fasttree_log : str
            Path that receives FastTree's standard error stream.
        prot_model : str
            Either 'JTT', 'WAG', or 'LG'. Only 'WAG' and 'LG' add a model
            flag; any other value adds none.
        no_support : bool
            True to pass ``-nosupport``.
        no_gamma : bool
            True to omit ``-gamma``.
        msa_file : str
            Path to the input MSA.
        cpus : int
            The maximum number of CPUs for FastTree to use.

        Raises
        ------
        FastTreeException
            If FastTree exits with a non-zero code, or the tree file is
            missing or empty afterwards.

        """
        child_env = os.environ.copy()
        multi_threaded = cpus > 1
        if multi_threaded:
            child_env['OMP_NUM_THREADS'] = str(cpus)
        cmd = 'FastTreeMP' if multi_threaded else 'FastTree'
        check_dependencies([cmd])

        # Make sure the parent directory of every output file is present.
        for out_path in (output_tree, tree_log, fasttree_log):
            make_sure_path_exists(os.path.dirname(out_path))

        # Assemble the command line.
        args = [cmd]
        for model_name, model_flag in (('WAG', '-wag'), ('LG', '-lg')):
            if prot_model == model_name:
                args.append(model_flag)
                break
        if no_support:
            args.append('-nosupport')
        if not no_gamma:
            args.append('-gamma')
        args.extend(['-log', tree_log, msa_file])

        # Human-readable summary of the chosen options for the log.
        gamma_sign = '-' if no_gamma else '+'
        support_prefix = 'no' if no_support else ''
        model_summary = ', '.join(
            [prot_model, gamma_sign + 'gamma', support_prefix + 'support'])
        self.logger.info(
            'Inferring FastTree ({}) using a maximum of {} CPUs.'.format(
                model_summary, cpus))
        self.logger.info('FastTree version: {}'.format(self.version))

        with open(output_tree, 'w') as tree_fh, \
                open(fasttree_log, 'w') as stderr_fh:
            proc = subprocess.Popen(args,
                                    stdout=tree_fh,
                                    stderr=stderr_fh,
                                    env=child_env)
            proc.communicate()

        # Validate results: the first failing check determines the message.
        failure = None
        if proc.returncode != 0:
            failure = 'FastTree returned a non-zero exit code.'
        elif not os.path.isfile(output_tree):
            failure = 'Tree output file is missing: {}'.format(output_tree)
        elif os.path.getsize(output_tree) < 1:
            failure = 'Tree output file is empty: {}'.format(output_tree)

        if failure is not None:
            self.logger.error(
                'An error was encountered while running FastTree.')
            raise FastTreeException(failure)