Beispiel #1
0
    def identify(self, genomes, out_dir, prefix, force):
        """Call genes and annotate marker genes for a set of genomes.

        Runs Prodigal on `genomes`, searches the predicted proteins against
        the TIGRFAM and Pfam HMMs, and passes the results to
        `_report_identified_marker_genes`. The marker gene directory
        (`out_dir` joined with DIR_MARKER_GENE) is stored on
        `self.marker_gene_dir`.

        An IOError raised by any step is logged and swallowed; any other
        exception is logged and re-raised.
        """

        check_dependencies(['prodigal', 'hmmsearch'])

        try:
            genome_count = len(genomes)
            self.logger.info(
                'Identifying markers in %d genomes with %d threads.'
                % (genome_count, self.cpus))

            # Gene calling.
            self.logger.info("Running Prodigal to identify genes.")
            marker_dir = os.path.join(out_dir, DIR_MARKER_GENE)
            self.marker_gene_dir = marker_dir
            gene_caller = Prodigal(self.cpus,
                                   False,
                                   marker_dir,
                                   self.protein_file_suffix,
                                   self.nt_gene_file_suffix,
                                   self.gff_file_suffix,
                                   force)
            called_genomes = gene_caller.run(genomes)

            # Protein files produced by gene calling, one per genome; both
            # HMM searches below run on the same list.
            protein_files = [called_genomes[genome_id]['aa_gene_path']
                             for genome_id in called_genomes]

            # Annotate genes against the TIGRFAM database.
            self.logger.info("Identifying TIGRFAM protein families.")
            tigrfam = TigrfamSearch(self.cpus,
                                    self.tigrfam_hmms,
                                    self.protein_file_suffix,
                                    self.tigrfam_suffix,
                                    self.tigrfam_top_hit_suffix,
                                    self.checksum_suffix,
                                    marker_dir)
            tigrfam.run(protein_files)

            # Annotate genes against the Pfam database.
            self.logger.info("Identifying Pfam protein families.")
            pfam = PfamSearch(self.cpus,
                              self.pfam_hmm_dir,
                              self.protein_file_suffix,
                              self.pfam_suffix,
                              self.pfam_top_hit_suffix,
                              self.checksum_suffix,
                              marker_dir)
            pfam.run(protein_files)

            self._report_identified_marker_genes(called_genomes,
                                                 out_dir,
                                                 marker_dir,
                                                 prefix)

        except IOError as io_err:
            # I/O failures are reported but not propagated to the caller.
            self.logger.error(str(io_err))
            self.logger.error("GTDB-Tk has encountered an error.")

        except Exception as err:
            self.logger.error(str(err))
            raise
Beispiel #2
0
    def infer(self, options):
        """Infer an unrooted tree from an MSA with FastTree.

        Reads `msa_file`, `out_dir`, `prefix`, `cpus`, `prot_model`,
        `no_support` and `no_gamma` from `options`. If `options` has a
        `suffix` attribute it is appended to `prefix` in every output file
        name. FastTreeMP is used when `options.cpus > 1`, FastTree otherwise.

        Files written to `options.out_dir`:
          <prefix>[<suffix>].unrooted.tree  - FastTree stdout (the tree)
          <prefix>[<suffix>].tree.log       - FastTree's own `-log` file
          <prefix>[<suffix>].fasttree.log   - FastTree stderr

        A non-zero FastTree exit status is reported through the logger as an
        error; no exception is raised for it.

        Raises:
            ValueError: if `options.prot_model` is not 'JTT', 'WAG' or 'LG'.
        """

        # Function-scope import so this method does not depend on the
        # module's import block.
        import subprocess

        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)

        # FastTree uses JTT when no model flag is given.
        model_flags = {'JTT': None, 'WAG': '-wag', 'LG': '-lg'}
        if options.prot_model not in model_flags:
            raise ValueError(
                'Unsupported protein model: %s (expected JTT, WAG or LG).'
                % options.prot_model)

        if options.cpus > 1:
            executable = 'FastTreeMP'
        else:
            executable = 'FastTree'
        check_dependencies([executable])

        # Only advertise +GAMMA when the -gamma flag is actually passed.
        gamma_info = '' if options.no_gamma else '+GAMMA'
        self.logger.info('Inferring tree with FastTree using %s%s.' %
                         (options.prot_model, gamma_info))

        stem = options.prefix
        if hasattr(options, 'suffix'):
            stem = stem + options.suffix
        output_tree = os.path.join(options.out_dir, stem + '.unrooted.tree')
        tree_log = os.path.join(options.out_dir, stem + '.tree.log')
        fasttree_log = os.path.join(options.out_dir, stem + '.fasttree.log')

        # Build an argument list rather than a shell string so that paths
        # containing spaces or shell metacharacters are passed through intact.
        args = [executable, '-quiet']
        if options.no_support:
            args.append('-nosupport')
        if model_flags[options.prot_model]:
            args.append(model_flags[options.prot_model])
        if not options.no_gamma:
            args.append('-gamma')
        args.extend(['-log', tree_log, options.msa_file])

        self.logger.info('Running: %s > %s 2> %s' %
                         (' '.join(args), output_tree, fasttree_log))

        # stdout carries the tree, stderr carries FastTree's progress output.
        with open(output_tree, 'w') as tree_fh, \
                open(fasttree_log, 'w') as err_fh:
            exit_code = subprocess.call(args, stdout=tree_fh, stderr=err_fh)

        if exit_code != 0:
            self.logger.error(
                'FastTree returned a non-zero exit code (%d); see %s.' %
                (exit_code, fasttree_log))
            return

        self.logger.info('Done.')
Beispiel #3
0
    def parse_options(self, options):
        """Run the pipeline step(s) selected by `options.subparser_name`.

        The two workflow commands chain several steps and fill in the
        `options` attributes that the later steps read:

          de_novo_wf  - identify, align, infer, root, decorate
          classify_wf - identify, align, classify

        Every other recognised command calls the method of the same name
        (`test` calls `run_test`).

        Returns 0 after the selected command has run. An unrecognised command
        is logged as an error and terminates the process with exit status 1.
        """

        command = options.subparser_name

        if command == 'de_novo_wf':
            check_dependencies(['prodigal', 'hmmalign'])
            if options.cpus > 1:
                check_dependencies(['FastTreeMP'])
            else:
                check_dependencies(['FastTree'])

            self.identify(options)

            # The align step reads the identify output from the same directory.
            options.identify_dir = options.out_dir
            self.align(options)

            if options.bac120_ms:
                options.suffix = ".bac120"
            else:
                options.suffix = ".ar122"

            # The file name differs when the GTDB reference genomes were
            # skipped during alignment.
            if options.skip_gtdb_refs:
                msa_ext = ".user_msa.fasta"
            else:
                msa_ext = ".msa.fasta"
            options.msa_file = os.path.join(
                options.out_dir, Config.INTERMEDIATE_RESULTS,
                options.prefix + options.suffix + msa_ext)
            self.infer(options)

            options.input_tree = os.path.join(
                options.out_dir, Config.INTERMEDIATE_RESULTS,
                options.prefix + options.suffix + ".unrooted.tree")
            options.output_tree = os.path.join(
                options.out_dir,
                options.prefix + options.suffix + ".rooted.tree")
            self.root(options)

            self.decorate(options)
        elif command == 'classify_wf':
            check_dependencies(
                ['prodigal', 'hmmalign', 'pplacer', 'guppy', 'fastANI'])
            self.identify(options)

            # Attributes set here before the align step runs in this workflow.
            options.identify_dir = options.out_dir
            options.align_dir = options.out_dir
            options.taxa_filter = None
            options.custom_msa_filters = False
            options.min_consensus = None
            options.min_perc_taxa = None
            options.skip_gtdb_refs = False
            options.cols_per_gene = None
            options.max_consensus = None
            options.rnd_seed = None
            self.align(options)

            self.classify(options)
        elif command == 'identify':
            self.identify(options)
        elif command == 'align':
            self.align(options)
        elif command == 'infer':
            self.infer(options)
        elif command == 'classify':
            self.classify(options)
        elif command == 'root':
            self.root(options)
        elif command == 'decorate':
            self.decorate(options)
        elif command == 'trim_msa':
            self.trim_msa(options)
        elif command == 'test':
            self.run_test(options)
        elif command == 'check_install':
            self.check_install()
        else:
            self.logger.error('Unknown GTDB-Tk command: "' +
                              command + '"\n')
            # A bare sys.exit() would report success (status 0) to the
            # calling shell even though the command was rejected.
            sys.exit(1)

        return 0
Beispiel #4
0
    def parse_options(self, options):
        """Run the pipeline step(s) selected by `options.subparser_name`.

        The two workflow commands chain several steps and fill in the
        `options` attributes that the later steps read:

          de_novo_wf  - identify, align, infer, root, decorate
          classify_wf - identify, align, classify

        Every other recognised command calls the method of the same name
        (`test` calls `run_test`).

        Returns 0 after the selected command has run. An unrecognised command
        is logged as an error and terminates the process with exit status 1.

        Raises:
            GenomeMarkerSetUnknown: if the marker set chosen in `de_novo_wf`
                has no entry in the path-template table.
        """

        command = options.subparser_name

        if command == 'de_novo_wf':
            check_dependencies(['prodigal', 'hmmalign'])
            if options.cpus > 1:
                check_dependencies(['FastTreeMP'])
            else:
                check_dependencies(['FastTree'])

            self.identify(options)

            # The align step reads the identify output from the same directory.
            options.identify_dir = options.out_dir
            self.align(options)

            if options.bac120_ms:
                options.suffix = "bac120"
            else:
                options.suffix = "ar122"

            # Path templates per marker set, in the order:
            # (user-only MSA, full MSA, unrooted tree, rooted tree).
            marker_paths = {
                'bac120': (PATH_BAC120_USER_MSA,
                           PATH_BAC120_MSA,
                           PATH_BAC120_UNROOTED_TREE,
                           PATH_BAC120_ROOTED_TREE),
                'ar122': (PATH_AR122_USER_MSA,
                          PATH_AR122_MSA,
                          PATH_AR122_UNROOTED_TREE,
                          PATH_AR122_ROOTED_TREE),
            }
            templates = marker_paths.get(options.suffix)
            if templates is None:
                # Defensive: only fires if the suffix assignment above and
                # the table drift apart.
                self.logger.error(
                    'There was an error determining the marker set.')
                raise GenomeMarkerSetUnknown
            user_msa, full_msa, unrooted_tree, rooted_tree = templates

            # A different MSA template applies when the GTDB reference
            # genomes were skipped.
            if options.skip_gtdb_refs:
                msa_template = user_msa
            else:
                msa_template = full_msa
            options.msa_file = os.path.join(
                options.out_dir,
                msa_template.format(prefix=options.prefix))

            self.infer(options)

            options.input_tree = os.path.join(
                options.out_dir,
                unrooted_tree.format(prefix=options.prefix))
            options.output_tree = os.path.join(
                options.out_dir,
                rooted_tree.format(prefix=options.prefix))

            self.root(options)
            self.decorate(options)

        elif command == 'classify_wf':
            check_dependencies(
                ['prodigal', 'hmmalign', 'pplacer', 'guppy', 'fastANI'])
            self.identify(options)

            # Attributes set here before the align step runs in this workflow.
            options.identify_dir = options.out_dir
            options.align_dir = options.out_dir
            options.taxa_filter = None
            options.custom_msa_filters = False
            # skip_trimming is the mutually exclusive partner of
            # custom_msa_filters, so it is set here as well.
            options.skip_trimming = False
            options.min_consensus = None
            options.min_perc_taxa = None
            options.skip_gtdb_refs = False
            options.cols_per_gene = None
            options.max_consensus = None
            options.rnd_seed = None
            self.align(options)

            self.classify(options)
        elif command == 'identify':
            self.identify(options)
        elif command == 'align':
            self.align(options)
        elif command == 'infer':
            self.infer(options)
        elif command == 'classify':
            self.classify(options)
        elif command == 'root':
            self.root(options)
        elif command == 'decorate':
            self.decorate(options)
        elif command == 'trim_msa':
            self.trim_msa(options)
        elif command == 'export_msa':
            self.export_msa(options)
        elif command == 'test':
            self.run_test(options)
        elif command == 'check_install':
            self.check_install()
        else:
            self.logger.error('Unknown GTDB-Tk command: "' +
                              command + '"\n')
            # A bare sys.exit() would report success (status 0) to the
            # calling shell even though the command was rejected.
            sys.exit(1)

        return 0
Beispiel #5
0
    def infer(self, options):
        """Infer an unrooted tree from an MSA with FastTree.

        Reads `msa_file`, `out_dir`, `prefix`, `cpus`, `prot_model`,
        `no_support` and `no_gamma` from `options`. If `options` has a
        `suffix` attribute, the marker-specific PATH_MARKER_* templates are
        used for the output files; otherwise the generic PATH_* templates are
        used. All output paths are relative to `options.out_dir`, and their
        parent directories are created as needed.

        FastTreeMP is used when `options.cpus > 1`, with OMP_NUM_THREADS set
        to that value in this process's environment; otherwise FastTree is
        used.

        A non-zero FastTree exit status is reported through the logger as an
        error; no exception is raised for it.

        Raises:
            ValueError: if `options.prot_model` is not 'JTT', 'WAG' or 'LG'.
        """

        # Function-scope import so this method does not depend on the
        # module's import block.
        import subprocess

        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)

        # FastTree uses JTT when no model flag is given.
        model_flags = {'JTT': None, 'WAG': '-wag', 'LG': '-lg'}
        if options.prot_model not in model_flags:
            raise ValueError(
                'Unsupported protein model: %s (expected JTT, WAG or LG).'
                % options.prot_model)

        if options.cpus > 1:
            executable = 'FastTreeMP'
            check_dependencies([executable])
            # Inherited by the FastTreeMP child process started below.
            os.environ['OMP_NUM_THREADS'] = '%d' % options.cpus
        else:
            executable = 'FastTree'
            check_dependencies([executable])

        if hasattr(options, 'suffix'):
            output_tree = os.path.join(
                options.out_dir,
                PATH_MARKER_UNROOTED_TREE.format(prefix=options.prefix,
                                                 marker=options.suffix))
            tree_log = os.path.join(
                options.out_dir,
                PATH_MARKER_TREE_LOG.format(prefix=options.prefix,
                                            marker=options.suffix))
            fasttree_log = os.path.join(
                options.out_dir,
                PATH_MARKER_FASTTREE_LOG.format(prefix=options.prefix,
                                                marker=options.suffix))
        else:
            output_tree = os.path.join(
                options.out_dir,
                PATH_UNROOTED_TREE.format(prefix=options.prefix))
            tree_log = os.path.join(
                options.out_dir, PATH_TREE_LOG.format(prefix=options.prefix))
            fasttree_log = os.path.join(
                options.out_dir,
                PATH_FASTTREE_LOG.format(prefix=options.prefix))

        # The templates may place files in sub-directories of out_dir.
        for path in (output_tree, tree_log, fasttree_log):
            make_sure_path_exists(os.path.dirname(path))

        # Report the model once, and only mention +GAMMA when the -gamma
        # flag is actually passed.
        gamma_info = '' if options.no_gamma else '+GAMMA'
        self.logger.info('Inferring tree with FastTree using {}{}.'.format(
            options.prot_model, gamma_info))

        # Build an argument list rather than a shell string so that paths
        # containing spaces or shell metacharacters are passed through intact.
        args = [executable, '-quiet']
        if options.no_support:
            args.append('-nosupport')
        if model_flags[options.prot_model]:
            args.append(model_flags[options.prot_model])
        if not options.no_gamma:
            args.append('-gamma')
        args.extend(['-log', tree_log, options.msa_file])

        self.logger.info('Running: %s > %s 2> %s' %
                         (' '.join(args), output_tree, fasttree_log))

        # stdout carries the tree, stderr carries FastTree's progress output.
        with open(output_tree, 'w') as tree_fh, \
                open(fasttree_log, 'w') as err_fh:
            exit_code = subprocess.call(args, stdout=tree_fh, stderr=err_fh)

        if exit_code != 0:
            self.logger.error(
                'FastTree returned a non-zero exit code (%d); see %s.' %
                (exit_code, fasttree_log))
            return

        self.logger.info('Done.')