Example 1
    def process(self):
        """This is the function that goes through each bin loaded in the class and proecesses them."""
        self.sanity_check()

        filesnpaths.gen_output_directory(self.output_directory)

        self.run.warning("Anvi'o is about to start splitting your bins into individual, self-contained anvi'o profiles. "
                         "This is quite a tricky operation, and even if it finishes successfully, you must double-check "
                         "everything in the resulting profiles to make sure things worked as expected. Although we are "
                         "doing our best to test all these, variation between projects makes it impossible to be 100% sure.")

        for bin_name in self.bin_names_of_interest:
            b = BinSplitter(bin_name,
                            self.summary,
                            self.args,
                            run=self.run,
                            progress=self.progress)
            b.do_contigs_db()
            b.do_profile_db()

            if self.summary.auxiliary_contigs_data_available:
                b.do_auxiliary_contigs_data()

            if self.summary.auxiliary_profile_data_available:
                b.do_auxiliary_profile_data()

        self.run.info('Num bins processed', len(self.bin_names_of_interest))
        self.run.info("Output directory", self.output_directory)
Example 2
    def check_params(self):
        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(self.output_dir, delete_if_exists = self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 20 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 20%% and 100%%. Although your %.2f%% is "
                              "pretty cute, too." % self.min_percent_identity)

        if any('contigs_db_path' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for "
                              "the anvi'o class Pangenome.")

        for genome_name in self.genomes:
            if not os.path.exists(self.genomes[genome_name]['contigs_db_path']):
                raise ConfigError("The contigs database for genome %s is not where the input data suggested "
                                  "it would be." % genome_name)
            if genome_name in self.internal_genome_names and not os.path.exists(self.genomes[genome_name]['profile_db_path']):
                raise ConfigError("The profile database for genome %s is not where the input data suggested "
                                  "it would be." % genome_name)
Example 3
    def check_output_directory(self):
        if os.path.exists(self.output_directory) and not self.delete_output_directory_if_exists:
            raise ConfigError, "AdHocRunGenerator will not work with an existing directory. Please provide a new\
                                path, or use the bool member 'delete_output_directory_if_exists' to overwrite\
                                any existing directory."

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=self.delete_output_directory_if_exists)
Example 4
    def init_dirs_and_dbs(self):
        if not self.contigs_db_path:
            raise ConfigError, "You can not run profiling without a contigs database. You can create\
                                      one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                      tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/"

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(self.output_directory, self.progress, delete_if_exists = self.overwrite_output_destinations)

        self.progress.update('Creating a new single profile database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        meta_values = {'db_type': 'profile',
                       'sample_id': self.sample_id,
                       'samples': self.sample_id,
                       'merged': False,
                       'contigs_clustered': self.contigs_shall_be_clustered,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'default_view': 'single',
                       'min_contig_length': self.min_contig_length,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.a_meta['contigs_db_hash'],
                       'gene_coverages_computed': self.a_meta['genes_are_called']}
        profile_db.create(meta_values)

        self.progress.end()
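
These snippets lean on two terminal objects throughout: run (with .info and .warning) and progress (with .new, .update, .end). To run any of them standalone, here is a minimal sketch of stand-ins for anvio.terminal.Run and anvio.terminal.Progress, covering just the call signatures used in these examples; the real classes do far more formatting work.

import sys


class Run:
    """Bare-bones stand-in for anvio.terminal.Run."""
    def info(self, key, value):
        print('%s .....: %s' % (key, value))

    def warning(self, message, header=None, lc=None):
        print('\nWARNING: %s\n' % message, file=sys.stderr)


class Progress:
    """Bare-bones stand-in for anvio.terminal.Progress."""
    def __init__(self):
        self.title = None

    def new(self, title, progress_total_items=None):
        self.title = title

    def update(self, message, increment=False):
        print('[%s] %s' % (self.title, message))

    def reset(self):
        pass

    def end(self):
        self.title = None


run = Run()
progress = Progress()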
Example 5
    def __init__(self,
                 args,
                 client_version,
                 ignore_version=False,
                 mailer=None,
                 run=run,
                 progress=progress):
        self.args = args
        self.orig_args = copy.deepcopy(args)
        self.run = run
        self.progress = progress

        self.mailer = mailer

        self.users_data_dir = args.users_data_dir
        if not os.path.exists(self.users_data_dir):
            self.run.warning(
                'A new directory is being initiated to hold the users database ...'
            )
            filesnpaths.gen_output_directory(self.users_data_dir)

        # create an instance from the DB class
        self.users_db_path = os.path.join(self.users_data_dir, 'USERS.db')
        self.users_db = UsersDB(client_version,
                                self.users_db_path,
                                ignore_version=ignore_version,
                                run=self.run,
                                progress=self.progress)
Example 6
    def __init__(self, args, run=run, progress=progress):
        self.args = args
        self.run = run
        self.progress = progress
        self.pfam_data_dir = args.pfam_data_dir

        filesnpaths.is_program_exists('hmmpress')

        if self.pfam_data_dir and args.reset:
            raise ConfigError(
                "You are attempting to run Pfam setup on a non-default data directory (%s) using the --reset flag. "
                "To avoid automatically deleting a directory that may be important to you, anvi'o refuses to reset "
                "directories that have been specified with --pfam-data-dir. If you really want to get rid of this "
                "directory and regenerate it with Pfam data inside, then please remove the directory yourself using "
                "a command like `rm -r %s`. We are sorry to make you go through this extra trouble, but it really is "
                "the safest way to handle things." %
                (self.pfam_data_dir, self.pfam_data_dir))

        if not self.pfam_data_dir:
            self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__),
                                              'data/misc/Pfam')

        filesnpaths.is_output_dir_writable(os.path.dirname(self.pfam_data_dir))

        if not args.reset and not anvio.DEBUG:
            self.is_database_exists()

        filesnpaths.gen_output_directory(self.pfam_data_dir,
                                         delete_if_exists=args.reset)

        self.database_url = "http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release"
        self.files = [
            'Pfam-A.hmm.gz', 'Pfam.version.gz', 'Pfam-A.clans.tsv.gz'
        ]
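
The class above records database_url and the three files to fetch, but the download itself happens elsewhere. As a rough sketch of that step using only the standard library (the real class presumably goes through anvi'o's own download utilities and then runs hmmpress on the unpacked profiles):

import gzip
import os
import shutil
import urllib.request

database_url = "http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release"
files = ['Pfam-A.hmm.gz', 'Pfam.version.gz', 'Pfam-A.clans.tsv.gz']
pfam_data_dir = '/tmp/Pfam'  # hypothetical target directory

os.makedirs(pfam_data_dir, exist_ok=True)

for file_name in files:
    gz_path = os.path.join(pfam_data_dir, file_name)
    urllib.request.urlretrieve('%s/%s' % (database_url, file_name), gz_path)

    # decompress Pfam-A.hmm.gz into Pfam-A.hmm, and so on
    with gzip.open(gz_path, 'rb') as gz, open(gz_path[:-3], 'wb') as out:
        shutil.copyfileobj(gz, out)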
Example 7
    def get_output_file_handle(self, sub_directory=None, prefix='output.txt', overwrite=False, within=None):
        if sub_directory:
            output_directory = os.path.join(self.output_directory, sub_directory)
        else:
            output_directory = self.output_directory

        if not os.path.exists(output_directory):
            filesnpaths.gen_output_directory(output_directory)

        if within:
            file_path = os.path.join(output_directory, '%s_%s' % (within, prefix))
        else:
            file_path = os.path.join(output_directory, prefix)

        if os.path.exists(file_path) and not overwrite:
            raise ConfigError('get_output_file_handle: well, this file already exists: "%s"' % file_path)

        key = prefix.split('.')[0].replace('-', '_')

        if within:
            if within not in self.summary['files']:
                self.summary['files'][within] = {}
            self.summary['files'][within][key] = file_path[len(self.output_directory):].strip('/')
        else:
            self.summary['files'][key] = file_path[len(self.output_directory):].strip('/')

        return open(file_path, 'w')
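
A short usage sketch for the handle above; the summarizer object and its inputs are hypothetical. With within='Bin_1' and prefix='gene-calls.txt', the method creates <output_directory>/bins/Bin_1_gene-calls.txt, records its relative path under summary['files']['Bin_1']['gene_calls'] (the key is the prefix stem with dashes turned into underscores), and hands back an open handle the caller must close.

handle = summarizer.get_output_file_handle(sub_directory='bins', prefix='gene-calls.txt', within='Bin_1')
handle.write('gene_callers_id\tcontig\n')
handle.close()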
Example 8
    def check_params(self):
        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(
                self.output_dir,
                delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 20 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 20%% and 100%%. Although your %.2f%% is "
                              "pretty cute, too." % self.min_percent_identity)

        if any('contigs_db_path' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for "
                              "the anvi'o class Pangenome.")

        for genome_name in self.genomes:
            if not os.path.exists(self.genomes[genome_name]['contigs_db_path']):
                raise ConfigError("The contigs database for genome %s is not where the input data suggested "
                                  "it would be." % genome_name)
            if genome_name in self.internal_genome_names and not os.path.exists(self.genomes[genome_name]['profile_db_path']):
                raise ConfigError("The profile database for genome %s is not where the input data suggested "
                                  "it would be." % genome_name)
Example 9
    def process(self):
        """This is the function that goes through each bin loaded in the class and proecesses them."""
        self.sanity_check()

        filesnpaths.gen_output_directory(self.output_directory)

        self.run.warning("Anvi'o is about to start splitting your bins into individual, self-contained anvi'o profiles. This\
                          is quite a tricky operation, and even if it finishes successfully, you must double check everyting\
                          in the resulting profiles to make sure things worked as expected. Although we are doing our best to\
                          test all these, variation between projects make it impossible to be 100% sure.")

        if self.skip_variability_tables:
            self.run.warning("Since you asked so nicely, anvi'o will not migrate variability table data into split profiles.")

        for bin_name in self.bin_names_of_interest:
            b = BinSplitter(bin_name, self.summary, self.args, run=self.run, progress=self.progress)
            b.do_contigs_db()

            if self.summary.p_meta['blank']:
                self.run.warning("It seems your profile database is a blank one. That's fine. Anvi'o assumes that your actual\
                                  intention is to split your contigs database only. This warning message is here to make sure\
                                  you will not be upset when you realize your split profile missing a profile database :(")
            else:
                b.do_profile_db()

                if self.summary.auxiliary_profile_data_available:
                    b.do_auxiliary_profile_data()

        self.run.info('Num bins processed', len(self.bin_names_of_interest))
        self.run.info("Output directory", self.output_directory)
Example 10
    def get_output_file_handle(self,
                               sub_directory=None,
                               prefix='output.txt',
                               overwrite=False,
                               within=None):
        if sub_directory:
            output_directory = os.path.join(self.output_directory,
                                            sub_directory)
        else:
            output_directory = self.output_directory

        if not os.path.exists(output_directory):
            filesnpaths.gen_output_directory(output_directory)

        if within:
            file_path = os.path.join(output_directory,
                                     '%s_%s' % (within, prefix))
        else:
            file_path = os.path.join(output_directory, prefix)

        if os.path.exists(file_path) and not overwrite:
            raise ConfigError('get_output_file_handle: well, this file already exists: "%s"' % file_path)

        key = prefix.split('.')[0].replace('-', '_')

        if within:
            if within not in self.summary['files']:
                self.summary['files'][within] = {}
            self.summary['files'][within][key] = file_path[len(self.output_directory):].strip('/')
        else:
            self.summary['files'][key] = file_path[len(self.output_directory):].strip('/')

        return open(file_path, 'w')
Example 11
    def __init__(self, args, client_version, ignore_version=False, mailer=None,
                 run=run, progress=progress):
        self.args = args
        self.run = run
        self.progress = progress

        self.users_data_dir = args.users_data_dir
        self.users_db_path = os.path.join(self.users_data_dir, 'USERS.db')
        self.mailer = mailer

        self.version = None

        if not os.path.exists(self.users_data_dir):
            self.run.warning('A new directory is being initiated to hold the users database ...')
            filesnpaths.gen_output_directory(self.users_data_dir)
            self.create_self(client_version)

        if not os.path.exists(self.users_db_path):
            self.run.warning('An empty users database is being initiated in an already existing directory ...')
            self.create_self(client_version)

        self.conn = sqlite3.connect(self.users_db_path)
        self.conn.text_factory = str
        self.conn.row_factory = dict_factory

        self.cursor = self.conn.cursor()

        self.version = self.get_version()

        if str(self.version) != str(client_version) and not ignore_version:
            raise ConfigError("It seems the database '%s' was generated when your client was at version %s; "
                              "however, your client now is at version %s. This database file can no longer be "
                              "used with this client, and needs to be upgraded to version %s :/"
                              % (self.users_db_path, self.version, client_version, client_version))
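
The constructor above assigns conn.row_factory = dict_factory without defining it. A conventional definition of that sqlite3 idiom, assuming it matches what the class expects (each fetched row becomes a {column_name: value} dict):

def dict_factory(cursor, row):
    """Map a sqlite3 row onto a dict keyed by column name."""
    return {column[0]: row[idx] for idx, column in enumerate(cursor.description)}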
Example 12
    def check_output_directory(self):
        if os.path.exists(self.output_directory) and not self.delete_output_directory_if_exists:
            raise ConfigError, "AdHocRunGenerator will not work with an existing directory. Please provide a new\
                                path, or use the bool member 'delete_output_directory_if_exists' to overwrite\
                                any existing directory."

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=self.delete_output_directory_if_exists)
Example 13
    def __init__(self, args, client_version, ignore_version=False, mailer=None, run=run, progress=progress):
        self.args = args
        self.orig_args = copy.deepcopy(args)
        self.run = run
        self.progress = progress

        self.mailer = mailer

        self.users_data_dir = args.users_data_dir
        if not os.path.exists(self.users_data_dir):
            self.run.warning('A new directory is being initiated to hold the users database ...')
            filesnpaths.gen_output_directory(self.users_data_dir)

        # create an instance from the DB class
        self.users_db_path = os.path.join(self.users_data_dir, 'USERS.db')
        self.users_db = UsersDB(client_version, self.users_db_path, ignore_version=ignore_version, run=self.run, progress=self.progress)

        # figure out the hostname.
        self.hostname = "localhost"
        if self.args.hostname:
            self.hostname = self.args.hostname
        elif self.args.ip_address:
            self.hostname = self.args.ip_address

        if self.args.port_number:
            self.hostname = '%s:%d' % (self.hostname, self.args.port_number)
Example 14
    def create_bin_dir(self):
        self.progress.update('Creating the output directory ...')

        if not self.output_directory:
            self.progress.end()
            raise ConfigError("You called Bin.create() before setting an output directory. Anvi'o says 'nope, thanks'.")

        filesnpaths.gen_output_directory(self.output_directory)
Example 15
    def get_structures(self):
        """Populate self.template_pdb_dir with template structure PDBs"""

        self.template_pdb_dir = os.path.join(
            self.directory,
            "%s_TEMPLATE_PDBS" % str(self.corresponding_gene_call))
        filesnpaths.gen_output_directory(
            self.template_pdb_dir)  # does nothing if already exists

        pdb_paths = {}
        for code, chain in self.list_of_template_code_and_chain_ids:
            five_letter_id = code + chain
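            # NOTE: 'J' is presumably a module-level alias for os.path.join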
            requested_path = J(self.template_pdb_dir, '%s.pdb' % code)

            if self.use_pdb_db and five_letter_id in self.pdb_db.stored_structure_ids:
                # This chain exists in the external database. Export it and get the path
                path = self.pdb_db.export_pdb(five_letter_id, requested_path)
                source = 'Offline DB'

            elif not self.offline_mode:
                # This chain doesn't exist in an external database, and internet access is assumed.
                # We try and download the protein from the RCSB PDB server. If downloading fails,
                # path is None
                path = utils.download_protein_structure(
                    code,
                    chain=chain,
                    output_path=requested_path,
                    raise_if_fail=False)
                source = 'RCSB PDB Server'

            else:
                # Internet access is not assumed, and the chain wasn't in the external database
                path = None
                source = 'Nowhere'

            self.run.info('%s obtained from' % five_letter_id, source)

            if path:
                pdb_paths[five_letter_id] = path

        # remove templates whose structures are not available
        self.list_of_template_code_and_chain_ids = [
            (code, chain_code)
            for code, chain_code in self.list_of_template_code_and_chain_ids
            if code + chain_code in pdb_paths
        ]

        if not self.list_of_template_code_and_chain_ids:
            self.run.warning(
                "No structures of the homologous proteins (templates) were available. Probably something "
                "is wrong. Stopping here.")
            raise self.EndModeller

        self.run.info(
            "Structures obtained for", ", ".join([
                code[0] + code[1]
                for code in self.list_of_template_code_and_chain_ids
            ]))
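
utils.download_protein_structure is called above with raise_if_fail=False so that a failed download simply yields path = None. A hedged stand-in for that helper built on RCSB's public download endpoint; the real anvi'o version presumably also isolates the requested chain, which this sketch ignores.

import urllib.request


def download_protein_structure(protein_code, chain=None, output_path=None, raise_if_fail=True):
    """Fetch a PDB entry from files.rcsb.org; return output_path, or None on failure."""
    url = 'https://files.rcsb.org/download/%s.pdb' % protein_code

    try:
        urllib.request.urlretrieve(url, output_path)
    except Exception:
        if raise_if_fail:
            raise
        return None

    # chain extraction is left out of this sketch; the caller gets the whole entry
    return output_path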
Example 16
    def init_dirs_and_dbs(self):
        if not self.contigs_db_path:
            raise ConfigError("You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            self.description = open(os.path.abspath(self.description_file_path), 'r').read()

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations)

        self.progress.update('Creating a new single profile database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        if self.skip_SNV_profiling:
            self.profile_SCVs = False

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': self.sample_id,
                       'merged': False,
                       'blank': self.blank,
                       'items_ordered': False,
                       'default_view': 'single',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': not self.skip_SNV_profiling,
                       'SCVs_profiled': self.profile_SCVs,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.a_meta['contigs_db_hash'],
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        self.progress.update('Creating a new auxiliary database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.auxiliary_db_path = self.generate_output_destination('AUXILIARY-DATA.db')
        self.auxiliary_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(self.auxiliary_db_path,
                                                                            self.a_meta['contigs_db_hash'],
                                                                            create_new=True,
                                                                            run=null_run,
                                                                            progress=null_progress)

        self.progress.end()

        if self.skip_SNV_profiling:
            self.run.warning('Single-nucleotide variation will not be characterized for this profile.')

        if not self.profile_SCVs:
            self.run.warning('Amino acid linkmer frequencies will not be characterized for this profile.')
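
profile_db.create(meta_values) persists those key-value pairs inside the new database; anvi'o-style databases keep such metadata in a key-value 'self' table. A minimal sqlite3 sketch of that idea, under the assumption of that layout (the real dbops.ProfileDatabase also creates all the actual profile tables):

import sqlite3


def create_profile_db(path, meta_values):
    """Sketch: persist meta_values the way an anvi'o-style 'self' table would."""
    conn = sqlite3.connect(path)
    conn.execute('CREATE TABLE self (key text, value text)')
    conn.executemany('INSERT INTO self VALUES (?, ?)',
                     [(key, str(value)) for key, value in meta_values.items()])
    conn.commit()
    conn.close()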
Example 17
    def check_params(self):
        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(
                self.output_dir,
                delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.minbit, float):
            raise ConfigError("minbit value must be of type float :(")

        if self.minbit < 0 or self.minbit > 1:
            raise ConfigError(
                "Well. minbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError(
                "Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError(
                "Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is "
                "pretty cute, too." % self.min_percent_identity)

        if any('genome_hash' not in c for c in self.genomes.values()):
            raise ConfigError(
                "self.genomes does not seem to be a properly formatted dictionary for "
                "the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering "
                "while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            self.description = open(os.path.abspath(self.description_file_path), 'r').read()

        self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
Example 18
    def load_from_files(self, args):
        if (not self.fasta_file) or (not self.metadata) or (not self.tree) or (not self.output_dir):
            raise ConfigError, "If you do not have a RUNINFO dict, you must declare each of\
                                           '-f', '-m', '-t' and '-o' parameters. Please see '--help' for\
                                           more detailed information on them."

        if self.view:
            raise ConfigError, "You can't use '-v' parameter when this program is not called with a RUNINFO.cp"

        if self.show_views:
            raise ConfigError, "Sorry, there are no views to show when there is no RUNINFO.cp :/"

        metadata_path = os.path.abspath(self.metadata)
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
        self.p_meta['output_dir'] = os.path.abspath(self.output_dir)
        self.p_meta['views'] = {}
        self.p_meta['default_view'] = 'single'
        self.p_meta['default_clustering'] = 'default'
        self.p_meta['available_clusterings'] = ['default']
        self.p_meta['clusterings'] = {'default': {'newick': open(os.path.abspath(self.tree)).read()}}

        self.default_view = self.p_meta['default_view']

        if self.summary_index:
            self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)
            self.splits_summary_index = dictio.read_serialized_object(self.p_meta['profile_summary_index'])

        # sanity of the metadata
        filesnpaths.is_file_tab_delimited(metadata_path)
        metadata_columns = utils.get_columns_of_TAB_delim_file(metadata_path, include_first_column=True)
        if metadata_columns[0] != "contig":
            raise ConfigError("The first row of the first column of the metadata file must "
                              "say 'contig', which is not the case for your metadata file "
                              "('%s'). Please make sure this is a properly formatted metadata "
                              "file." % metadata_path)

        # store metadata as view:
        self.views[self.default_view] = {'header': metadata_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(metadata_path)}
        self.split_names_ordered = list(self.views[self.default_view]['dict'].keys())

        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # setup a mock splits_basic_info dict
        self.splits_basic_info = {}
        for split_id in self.split_names_ordered:
            self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

        # reminder: this is being stored in the output dir provided as a commandline parameter:
        self.p_meta['self_path'] = os.path.join(self.p_meta['output_dir'], 'RUNINFO.cp')

        filesnpaths.gen_output_directory(self.p_meta['output_dir'])
Example 19
    def sanity_check(self):
        filesnpaths.is_file_tab_delimited(self.metadata_file_path)

        if os.path.exists(self.output_directory_path):
            filesnpaths.is_output_dir_writable(self.output_directory_path)
        else:
            filesnpaths.gen_output_directory(self.output_directory_path)

        filesnpaths.is_output_file_writable(self.output_fasta_descriptor)
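
This is the tersest variant of the directory-handling idiom in this set: reuse the directory when it already exists and is writable, create it otherwise. A dependency-free sketch of the same branch:

import os

output_directory_path = 'my-output'  # hypothetical

if os.path.exists(output_directory_path):
    if not os.access(output_directory_path, os.W_OK):
        raise PermissionError("Output directory '%s' is not writable :(" % output_directory_path)
else:
    os.makedirs(output_directory_path)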
Example 20
    def check_params(self):
        # check the project name:
        if not self.project_name:
            raise ConfigError("Please set a project name, and be prepared to see it around as (1) anvi'o will use\
                                that name to set the output directory and to name various output files such as the\
                                databases that will be generated at the end of the process. If you set your own output\
                                directory name, you can have multiple projects in it and all of those projects can use\
                                the same intermediate files whenever possible.")

        utils.is_this_name_OK_for_database('pan project name', self.project_name, stringent=False)

        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.maxbit, float):
            raise ConfigError("maxbit value must be of type float :(")

        if self.maxbit < 0 or self.maxbit > 1:
            raise ConfigError("Well. maxbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is "
                              "pretty cute, too." % self.min_percent_identity)

        if any('genome_hash' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for "
                              "the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            self.description = open(os.path.abspath(self.description_file_path), 'r').read()

        self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
Example 21
    def init_dirs_and_dbs(self):
        if not self.contigs_db_path:
            raise ConfigError, "You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/"

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(
            self.output_directory,
            self.progress,
            delete_if_exists=self.overwrite_output_destinations)

        self.progress.update(
            'Creating a new single profile database with contigs hash "%s" ...'
            % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        if self.skip_SNV_profiling:
            self.profile_AA_frequencies = False

        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': self.sample_id,
            'merged': False,
            'blank': self.blank,
            'contigs_clustered': self.contigs_shall_be_clustered,
            'default_view': 'single',
            'min_contig_length': self.min_contig_length,
            'SNVs_profiled': not self.skip_SNV_profiling,
            'AA_frequencies_profiled': self.profile_AA_frequencies,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'report_variability_full': self.report_variability_full,
            'contigs_db_hash': self.a_meta['contigs_db_hash'],
            'gene_coverages_computed': self.a_meta['genes_are_called']
        }
        profile_db.create(meta_values)

        self.progress.end()

        if self.skip_SNV_profiling:
            self.run.warning(
                'Single-nucleotide variation will not be characterized for this profile.'
            )

        if not self.profile_AA_frequencies:
            self.run.warning(
                'Amino acid linkmer frequencies will not be characterized for this profile.'
            )
Example 22
    def __init__(self,
                 bin_name,
                 summary_object,
                 args,
                 run=run,
                 progress=progress):
        """A class to split a single bin from its parent.

        The class is not really useful without a summary object, but it makes logistic sense to keep it
        separate since the inheritance from anvio/summarizer.Bin is much easier and sane this way."""
        summarizer.Bin.__init__(self, summary_object, bin_name, run, progress)

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.output_directory = A('output_dir')
        self.skip_variability_tables = A('skip_variability_tables')
        self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
        self.enforce_hierarchical_clustering = A('enforce_hierarchical_clustering')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default
        self.compress_auxiliary_data = A('compress_auxiliary_data')

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)
        self.clustering_configs = constants.clustering_configs['merged']
        self.database_paths = {
            'CONTIGS.db': os.path.abspath(self.contigs_db_path)
        }

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering "
                              "while also asking it to enforce it.")

        # set the output directory, and output file paths
        self.bin_output_directory = os.path.join(self.output_directory,
                                                 bin_name)
        filesnpaths.gen_output_directory(self.bin_output_directory)

        # let's see whether we are going to do any hierarchical clustering:
        self.max_num_splits_for_hierarchical_clustering = constants.max_num_items_for_hierarchical_clustering
        self.skip_hierarchical_clustering = self.is_hierarchical_clustering_for_bin_OK()

        # set your own db paths
        self.bin_contigs_db_path = os.path.join(self.bin_output_directory,
                                                'CONTIGS.db')
        self.bin_profile_db_path = os.path.join(self.bin_output_directory,
                                                'PROFILE.db')
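
The A = lambda line above is a recurring anvi'o convention for reading argparse attributes that may be absent; getattr expresses the same fallback without touching __dict__. A small self-contained sketch (the Namespace contents are hypothetical):

from argparse import Namespace

args = Namespace(profile_db='PROFILE.db', distance=None)

A = lambda x: getattr(args, x, None)

profile_db_path = A('profile_db')                 # 'PROFILE.db'
distance = A('distance') or 'euclidean'           # hypothetical default, standing in for constants.distance_metric_default
skip_variability = A('skip_variability_tables')   # None, since the attribute is absent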
Example 23
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Setup a Pfam database for anvi'o

        Parameters
        ==========
        args : argparse.Namespace
            See `bin/anvi-setup-interacdome` for available arguments
        """

        self.run = run
        self.progress = progress
        self.interacdome_data_dir = args.interacdome_data_dir

        self.pfam_setup = None

        self.interacdome_files = {
            # NOTE These are mirror links to the InteracDome dataset taken from
            # https://interacdome.princeton.edu/ on July 21st, tagged as v0.3. The reason for this
            # was to create a static, permanent link
            'representable_interactions.txt': 'https://ndownloader.figshare.com/files/24019757',
            'confident_interactions.txt': 'https://ndownloader.figshare.com/files/24019694',
        }

        if self.interacdome_data_dir and args.reset:
            raise ConfigError(
                "You are attempting to run InteracDome setup on a non-default data directory (%s) using the --reset flag. "
                "To avoid automatically deleting a directory that may be important to you, anvi'o refuses to reset "
                "directories that have been specified with --interacdome-data-dir. If you really want to get rid of this "
                "directory and regenerate it with InteracDome data inside, then please remove the directory yourself using "
                "a command like `rm -r %s`. We are sorry to make you go through this extra trouble, but it really is "
                "the safest way to handle things." %
                (self.interacdome_data_dir, self.interacdome_data_dir))

        if not self.interacdome_data_dir:
            self.interacdome_data_dir = constants.default_interacdome_data_path

        self.run.warning('', header='Setting up InteracDome', lc='yellow')
        self.run.info('Data directory', self.interacdome_data_dir)
        self.run.info('Reset contents', args.reset)

        filesnpaths.is_output_dir_writable(
            os.path.dirname(os.path.abspath(self.interacdome_data_dir)))

        if not args.reset and not anvio.DEBUG:
            self.is_database_exists()

        filesnpaths.gen_output_directory(self.interacdome_data_dir,
                                         delete_if_exists=args.reset)
Example 24
    def init_dirs_and_dbs(self):
        if not self.annotation_db_path:
            raise ConfigError, "You can not run profiling without an annotation database. You can create\
                                      one using 'anvi-gen-annotation-database'. Not sure how? Please see the\
                                      user manual."

        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory or self.input_file_path + "-ANVIO_PROFILE",
            ok_if_exists=self.overwrite_output_destinations,
        )

        self.progress.new("Initializing")

        self.progress.update("Creating the output directory ...")
        filesnpaths.gen_output_directory(
            self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations
        )

        self.progress.update("Initializing the annotation database ...")
        annotation_db = dbops.AnnotationDatabase(self.annotation_db_path)
        self.split_length = int(annotation_db.meta["split_length"])
        self.annotation_hash = annotation_db.meta["annotation_hash"]
        self.contig_names_in_annotation_db = set(
            annotation_db.db.get_table_as_dict(t.contigs_info_table_name, string_the_key=True).keys()
        )
        annotation_db.disconnect()

        self.progress.update(
            'Creating a new single profile database with annotation hash "%s" ...' % self.annotation_hash
        )
        self.profile_db_path = self.generate_output_destination("PROFILE.db")
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        meta_values = {
            "db_type": "profile",
            "sample_id": self.sample_id,
            "samples": self.sample_id,
            "merged": False,
            "contigs_clustered": self.contigs_shall_be_clustered,
            "min_coverage_for_variability": self.min_coverage_for_variability,
            "default_view": "single",
            "min_contig_length": self.min_contig_length,
            "report_variability_full": self.report_variability_full,
            "annotation_hash": self.annotation_hash,
        }
        profile_db.create(meta_values)

        self.progress.end()
Example 25
    def sanity_check(self, skip_warnings=False):
        A = lambda x, t: t(self.args.__dict__[x]) if x in self.args.__dict__ else None
        null = lambda x: x

        # the directory files will be dumped into (can exist but must be empty)
        if filesnpaths.is_file_exists(self.directory, dont_raise=True):
            filesnpaths.is_output_dir_writable(self.directory)
            if not filesnpaths.is_dir_empty(self.directory):
                raise ModellerError(
                    "You cannot give MODELLER a non-empty directory to work in."
                )
        else:
            filesnpaths.gen_output_directory(self.directory)

        if not self.lazy_init:
            self.executable = check_MODELLER(self.executable)

        # does target_fasta_path point to a fasta file?
        utils.filesnpaths.is_file_fasta_formatted(self.target_fasta_path)

        # make sure target_fasta is valid
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        if target_fasta.total_seq != 1:
            raise ConfigError("MODELLER :: The input FASTA file must have exactly one sequence. "
                              "You provided one with {} sequences.".format(target_fasta.total_seq))
        try:
            while next(target_fasta):
                int(target_fasta.id)
        except:
            raise ConfigError(
                "MODELLER :: The defline of this fasta file must be an integer"
            )
        target_fasta.close()

        # parameter consistencies
        if self.deviation < 0.5 or self.deviation > 20:
            self.run.warning(
                "You realize that deviation is given in angstroms, right? You chose {}"
                .format(self.deviation))

        if self.very_fast and self.num_models > 1:
            self.num_models = 1
            self.run.warning(
                "Since you chose --very-fast, there will be little difference, if at all, between models. Anvi'o "
                "authoritatively sets --num-models to 1 to save you time.")
Example 26
    def merge_split_summaries(self):
        merged_summary_index = {}
        merged_summary_index_path = os.path.join(self.output_directory, 'SUMMARY.cp')
        summary_dir = filesnpaths.gen_output_directory(os.path.join(self.output_directory, 'SUMMARY'), delete_if_exists=True)

        # read all index files per run into a dict here, so the access is easier from within
        # the for loop below
        run_sum_indices = {}
        for runinfo in self.input_runinfo_dicts.values():
            run_sum_indices[runinfo['sample_id']] = dictio.read_serialized_object(runinfo['profile_summary_index'])

        for i in range(0, len(self.split_names)):
            self.progress.update('merging summaries for split %s of %s' % (i + 1, len(self.split_names)))
            split_name = self.split_names[i]

            merged_summary = {}
            for runinfo in self.input_runinfo_dicts.values(): 
                run_split_summary = dictio.read_serialized_object(os.path.join(runinfo['input_dir'], run_sum_indices[runinfo['sample_id']][split_name]))
                merged_summary[runinfo['sample_id']] = run_split_summary[runinfo['sample_id']]

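            # NOTE: 'runinfo' below is whatever the inner loop left behind; this assumes every
            # run stores a given split's summary file under the same basename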
            merged_split_summary_path = os.path.join(summary_dir, os.path.basename(run_sum_indices[runinfo['sample_id']][split_name]))
            dictio.write_serialized_object(merged_summary, merged_split_summary_path)
            merged_summary_index[split_name] = merged_split_summary_path

        self.progress.update('Serializing merged split summary index ...')
        dictio.write_serialized_object(dictio.strip_prefix_from_dict_values(merged_summary_index, self.output_directory),
                                       merged_summary_index_path)

        return summary_dir, merged_summary_index_path
Example 27
    def __init__(self, args, client_version, ignore_version=False, mailer=None, run=run, progress=progress):
        self.args = args
        self.orig_args = copy.deepcopy(args)
        self.run = run
        self.progress = progress

        self.mailer = mailer

        self.users_data_dir = args.users_data_dir
        if not os.path.exists(self.users_data_dir):
            self.run.warning('A new directory is being initiated to hold the users database ...')
            filesnpaths.gen_output_directory(self.users_data_dir)

        # create an instance from the DB class
        self.users_db_path = os.path.join(self.users_data_dir, 'USERS.db')
        self.users_db = UsersDB(client_version, self.users_db_path, ignore_version=ignore_version, run=self.run, progress=self.progress)
Example 28
    def check_params(self):
        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.minbit, float):
            raise ConfigError("minbit value must be of type float :(")

        if self.minbit < 0 or self.minbit > 1:
            raise ConfigError("Well. minbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is "
                              "pretty cute, too." % self.min_percent_identity)

        if any('genome_hash' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for "
                              "the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            self.description = open(os.path.abspath(self.description_file_path), 'r').read()

        self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
Example 29
    def __init__(self, args, run=run, progress=progress):
        self.args = args
        self.run = run
        self.progress = progress
        self.pfam_data_dir = args.pfam_data_dir

        filesnpaths.is_program_exists('hmmpress')

        if not self.pfam_data_dir:
            self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/Pfam')

        if not args.reset:
            self.is_database_exists()

        filesnpaths.gen_output_directory(self.pfam_data_dir, delete_if_exists=args.reset)

        self.database_url = "http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release"
        self.files = ['Pfam-A.hmm.gz', 'Pfam.version.gz', 'Pfam-A.clans.tsv.gz']
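
Both Pfam setup variants verify hmmpress exists before touching anything: after download and decompression, Pfam-A.hmm has to be pressed into its binary auxiliary files before hmmscan can use it. A hedged sketch of that step via subprocess (the path is hypothetical; the real class presumably drives this through its own runner):

import subprocess

hmm_path = '/tmp/Pfam/Pfam-A.hmm'  # hypothetical location of the decompressed profiles
subprocess.run(['hmmpress', hmm_path], check=True)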
Example 30
    def generate_pages_for_artifacts(self):
        """Generates static pages for artifacts in the output directory"""

        self.progress.new("Rendering artifact pages",
                          progress_total_items=len(ANVIO_ARTIFACTS))
        self.progress.update('...')

        for artifact in ANVIO_ARTIFACTS:
            self.progress.update(f"'{artifact}' ...", increment=True)

            d = {
                'artifact': ANVIO_ARTIFACTS[artifact],
                'meta': {
                    'summary_type': 'artifact',
                    'version': '\n'.join(['|%s|%s|' % (t[0], t[1]) for t in anvio.get_version_tuples()]),
                    'date': utils.get_date(),
                    'version_short_identifier': self.version_short_identifier
                }
            }

            d['artifact']['name'] = artifact
            d['artifact']['required_by'] = [(r, '../../programs/%s' % r) for r in self.artifacts_info[artifact]['required_by']]
            d['artifact']['provided_by'] = [(r, '../../programs/%s' % r) for r in self.artifacts_info[artifact]['provided_by']]
            d['artifact']['description'] = self.artifacts_info[artifact]['description']
            d['artifact']['icon'] = '../../images/icons/%s.png' % ANVIO_ARTIFACTS[artifact]['type']

            if anvio.DEBUG:
                self.progress.reset()
                run.warning(None, 'THE OUTPUT DICT')
                import json
                print(json.dumps(d, indent=2))

            self.progress.update(f"'{artifact}' ... rendering ...",
                                 increment=False)
            artifact_output_dir = filesnpaths.gen_output_directory(
                os.path.join(self.artifacts_output_dir, artifact))
            output_file_path = os.path.join(artifact_output_dir, 'index.md')
            open(output_file_path,
                 'w').write(SummaryHTMLOutput(d, r=run, p=progress).render())

        self.progress.end()
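
A side note on the `meta['version']` field built above: judging from the `(t[0], t[1])` usage, `anvio.get_version_tuples()` is assumed to yield `(name, version)` pairs, so the join produces one markdown-style table row per pair. A standalone illustration with made-up values:

    # Illustration only; the tuples below are made-up values, not real anvi'o output:
    version_tuples = [('anvio', '7.1'), ('profile db', '38')]
    print('\n'.join('|%s|%s|' % (t[0], t[1]) for t in version_tuples))
    # |anvio|7.1|
    # |profile db|38|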
Example 33
    def __init__(self, args=None, r=run, p=progress):
        self.summary = {}

        self.debug = False
        self.quick = False
        self.profile_db_path = None
        self.contigs_db_path = None
        self.output_directory = None
        self.split_names_per_bin = None
        self.completeness_data_available = False
        self.gene_coverages_data_available = False
        self.non_single_copy_gene_hmm_data_available = False

        self.run = r
        self.progress = p

        DatabasesMetaclass.__init__(self, args, self.run, self.progress)

        # databases initiated, let's make sure we have gene coverages data available.
        if self.gene_coverages_dict:
            self.gene_coverages_data_available = True

        self.collections = ccollections.Collections()
        self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
        self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

        self.collection_name = None

        if args:
            if args.list_collections:
                self.collections.list_collections()
                sys.exit()

            self.collection_name = args.collection_name
            self.output_directory = args.output_dir
            self.quick = args.quick_summary
            self.debug = args.debug

        self.sanity_check()

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=True)
Example 34
    def __init__(self, args=None, r=run, p=progress):
        self.summary = {}

        self.debug = False
        self.quick = False
        self.profile_db_path = None
        self.contigs_db_path = None
        self.output_directory = None
        self.split_names_per_bin = None
        self.completeness_data_available = False
        self.gene_coverages_data_available = False
        self.non_single_copy_gene_hmm_data_available = False

        self.run = r
        self.progress = p

        DatabasesMetaclass.__init__(self, args, self.run, self.progress)

        # databases initiated, let's make sure we have gene coverages data available.
        if self.gene_coverages_dict:
            self.gene_coverages_data_available = True

        self.collections = ccollections.Collections()
        self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
        self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

        self.collection_name = None

        if args:
            if args.list_collections:
                self.collections.list_collections()
                sys.exit()

            self.collection_name = args.collection_name
            self.output_directory = args.output_dir
            self.quick = args.quick_summary
            self.debug = args.debug
            self.taxonomic_level = args.taxonomic_level

        self.sanity_check()

        self.init_splits_taxonomy(self.taxonomic_level)

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=True)
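
A hedged sketch of the argparse namespace this constructor expects (the attribute names come from the constructor above; the class name `Summarizer` and all values are assumptions):

    # Hypothetical usage sketch; only the attribute names come from the constructor above:
    import argparse

    args = argparse.Namespace(list_collections=False,
                              collection_name='MERGED',   # assumed collection name
                              output_dir='SUMMARY',       # assumed output path; recreated with delete_if_exists=True
                              quick_summary=False,
                              debug=False,
                              taxonomic_level='t_genus')  # assumed taxonomic level string
    summary = Summarizer(args)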
Example 35
    def __init__(self, args, r=terminal.Run(), p=terminal.Progress()):
        self.args = args
        self.run = r
        self.progress = p

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.output_directory_path = A("output_dir") or 'ANVIO-HELP'

        if not os.path.exists(anvio.DOCS_PATH):
            raise ConfigError(
                "The anvi'o docs path is not where it should be :/ Something funny is going on."
            )

        filesnpaths.gen_output_directory(self.output_directory_path,
                                         delete_if_exists=True,
                                         dont_warn=True)

        self.artifacts_output_dir = filesnpaths.gen_output_directory(
            os.path.join(self.output_directory_path, 'artifacts'))
        self.programs_output_dir = filesnpaths.gen_output_directory(
            os.path.join(self.output_directory_path, 'programs'))

        self.version_short_identifier = 'm' if anvio.anvio_version_for_help_docs == 'main' else anvio.anvio_version_for_help_docs
        self.base_url = os.path.join("/help",
                                     anvio.anvio_version_for_help_docs)
        self.anvio_markdown_variables_conversion_dict = {}

        AnvioPrograms.__init__(self, args, r=self.run, p=self.progress)
        self.init_programs()

        AnvioArtifacts.__init__(self, args, r=self.run, p=self.progress)
        self.init_artifacts()

        if not len(self.programs):
            raise ConfigError(
                "AnvioDocs is asked ot process the usage statements of some programs, but the "
                "`self.programs` dictionary seems to be empty :/")

        self.images_source_directory = os.path.join(
            os.path.dirname(anvio.__file__), 'docs/images/png')

        self.sanity_check()
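
The `A = lambda x: ...` idiom used here (and in several of the examples that follow) is a null-safe accessor over the parsed arguments: it returns the attribute when the caller supplied it and `None` otherwise, so defaults can be attached with `or`, as in `A("output_dir") or 'ANVIO-HELP'`. A tiny standalone demonstration:

    # Standalone demonstration of the accessor idiom; the namespace contents are made up:
    import argparse

    args = argparse.Namespace(output_dir='MY-HELP')
    A = lambda x: args.__dict__[x] if x in args.__dict__ else None

    print(A('output_dir') or 'ANVIO-HELP')  # -> MY-HELP
    print(A('num_threads') or 4)            # -> 4 (attribute absent, so A returns None)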
Example 36
    def process(self):
        """This is the function that goes through each bin loaded in the class and proecesses them."""
        self.sanity_check()

        filesnpaths.gen_output_directory(self.output_directory)

        self.run.warning(
            "Anvi'o is about to start splitting your bins into individual, self-contained anvi'o profiles. This\
                          is quite a tricky operation, and even if it finishes successfully, you must double check everything\
                          in the resulting profiles to make sure things worked as expected. Although we are doing our best to\
                          test all these, variation between projects makes it impossible to be 100% sure."
        )

        if self.skip_variability_tables:
            self.run.warning(
                "Since you asked so nicely, anvi'o will not migrate variability table data into split profiles."
            )

        for bin_name in self.bin_names_of_interest:
            b = BinSplitter(bin_name,
                            self.summary,
                            self.args,
                            run=self.run,
                            progress=self.progress)
            b.do_contigs_db()

            if self.summary.p_meta['blank']:
                self.run.warning(
                    "It seems your profile database is a blank one. That's fine. Anvi'o assumes that your actual\
                                  intention is to split your contigs database only. This warning message is here to make sure\
                                  you will not be upset when you realize your split profile is missing a profile database :("
                )
            else:
                b.do_profile_db()

                if self.summary.auxiliary_profile_data_available:
                    b.do_auxiliary_profile_data()

        self.run.info('Num bins processed', len(self.bin_names_of_interest))
        self.run.info("Output directory", self.output_directory)
Example 37
    def generate_pages_for_programs(self):
        """Generates static pages for programs in the output directory"""

        program_provides_requires_dict = self.get_program_requires_provides_dict()

        for program_name in self.programs:
            program = self.programs[program_name]
            d = {
                'program': {},
                'meta': {
                    'summary_type': 'program',
                    'version': '\n'.join(['|%s|%s|' % (t[0], t[1]) for t in anvio.get_version_tuples()]),
                    'date': utils.get_date()
                }
            }

            d['program']['name'] = program_name
            d['program']['usage'] = program.usage
            d['program']['description'] = program.meta_info['description']['value']
            d['program']['resources'] = program.meta_info['resources']['value']
            d['program']['requires'] = program_provides_requires_dict[program_name]['requires']
            d['program']['provides'] = program_provides_requires_dict[program_name]['provides']
            d['program']['icon'] = '../../images/icons/%s.png' % 'PROGRAM'
            d['artifacts'] = self.artifacts_info

            if anvio.DEBUG:
                run.warning(None, 'THE OUTPUT DICT')
                import json
                print(json.dumps(d, indent=2))

            program_output_dir = filesnpaths.gen_output_directory(os.path.join(self.programs_output_dir, program_name))
            output_file_path = os.path.join(program_output_dir, 'index.md')
            open(output_file_path, 'w').write(SummaryHTMLOutput(d, r=run, p=progress).render())

            # create the program network, too
            program_network = ProgramsNetwork(argparse.Namespace(output_file=os.path.join(program_output_dir, "network.json"),
                                                                 program_names_to_focus=program_name),
                                              r=terminal.Run(verbose=False))
            program_network.generate()
Example 38
    def __init__(self, args, run=run, progress=progress):
        self.args = args
        self.run = run
        self.progress = progress
        self.pfam_data_dir = args.pfam_data_dir

        filesnpaths.is_program_exists('hmmpress')

        if not self.pfam_data_dir:
            self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__),
                                              'data/misc/Pfam')

        if not args.reset:
            self.is_database_exists()

        filesnpaths.gen_output_directory(self.pfam_data_dir,
                                         delete_if_exists=args.reset)

        self.database_url = "http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release"
        self.files = ['Pfam-A.hmm.gz', 'Pfam.version.gz', 'Pfam-A.clans.tsv.gz']
Example 39
    def __init__(self, bin_name, summary_object, args, run=run, progress=progress):
        """A class to split a single bin from its parent.

        The class is not really useful without a summary object, but it makes logistic sense to keep it
        separate since the inheritance from anvio/summarizer.Bin is much easier and sane this way."""
        summarizer.Bin.__init__(self, summary_object, bin_name, run, progress)

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.output_directory = A('output_dir')
        self.skip_variability_tables = A('skip_variability_tables')
        self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
        self.enforce_hierarchical_clustering = A('enforce_hierarchical_clustering')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default
        self.compress_auxiliary_data = A('compress_auxiliary_data')

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)
        self.clustering_configs = constants.clustering_configs['merged']
        self.database_paths = {'CONTIGS.db': os.path.abspath(self.contigs_db_path)}

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        # set the output directory, and output file paths
        self.bin_output_directory = os.path.join(self.output_directory, bin_name)
        filesnpaths.gen_output_directory(self.bin_output_directory)

        # let's see whether we are going to do any hierarchical clustering:
        self.max_num_splits_for_hierarchical_clustering = constants.max_num_items_for_hierarchical_clustering
        self.skip_hierarchical_clustering = self.is_hierarchical_clustering_for_bin_OK()

        # set your own db paths
        self.bin_contigs_db_path = os.path.join(self.bin_output_directory, 'CONTIGS.db')
        self.bin_profile_db_path = os.path.join(self.bin_output_directory, 'PROFILE.db')
Example 40
    def generate_pages_for_artifacts(self):
        """Generates static pages for artifacts in the output directory"""

        for artifact in ANVIO_ARTIFACTS:
            d = {
                'artifact': ANVIO_ARTIFACTS[artifact],
                'meta': {
                    'summary_type': 'artifact',
                    'version': '\n'.join(['|%s|%s|' % (t[0], t[1]) for t in anvio.get_version_tuples()]),
                    'date': utils.get_date()
                }
            }

            d['artifact']['name'] = artifact
            d['artifact']['required_by'] = [(r, '../../programs/%s' % r) for r in self.artifacts_info[artifact]['required_by']]
            d['artifact']['provided_by'] = [(r, '../../programs/%s' % r) for r in self.artifacts_info[artifact]['provided_by']]
            d['artifact']['description'] = self.artifacts_info[artifact]['description']
            d['artifact']['icon'] = '../../images/icons/%s.png' % ANVIO_ARTIFACTS[artifact]['type']

            if anvio.DEBUG:
                run.warning(None, 'THE OUTPUT DICT')
                import json
                print(json.dumps(d, indent=2))

            artifact_output_dir = filesnpaths.gen_output_directory(os.path.join(self.artifacts_output_dir, artifact))
            output_file_path = os.path.join(artifact_output_dir, 'index.md')
            open(output_file_path, 'w').write(SummaryHTMLOutput(d, r=run, p=progress).render())
Example 41
    def sanity_check(self):
        A = lambda x, t: t(self.args.__dict__[x]) if x in self.args.__dict__ else None
        null = lambda x: x

        # the directory files will be dumped into (can exist but must be empty)
        if filesnpaths.is_file_exists(self.directory, dont_raise=True):
            filesnpaths.is_output_dir_writable(self.directory)
            if not filesnpaths.is_dir_empty(self.directory):
                raise ModellerError("You cannot give MODELLER a non-empty directory to work in.")
        else:
            filesnpaths.gen_output_directory(self.directory)

        # All MODELLER scripts are housed in self.scripts_folder
        self.scripts_folder = J(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/scripts')
        if utils.filesnpaths.is_dir_empty(self.scripts_folder):
            raise ConfigError("Anvi'o houses all its MODELLER scripts in {}, but your directory \
                               contains no scripts. Why you do dat?")

        # check that MODELLER exists
        if self.args.__dict__.get('modeller_executable'):
            self.run.info_single("As per your request, anvi'o will use `%s` to run MODELLER." % self.executable, nl_before=1)
            utils.is_program_exists(self.executable)
        else:
            try:
                utils.is_program_exists(self.executable)
            except ConfigError as e:
                raise ConfigError("Anvi'o needs a MODELLER program to be installed on your system. You didn't specify one\
                                   (which can be done with `--modeller-executable`), so anvi'o tried the most recent version\
                                   it knows about: '%s'. If you are certain you have it on your system (for instance you can run it\
                                   by typing '%s' in your terminal window), you may want to send a detailed bug report. If you\
                                   don't have it on your system, check out these installation instructions on our website:\
                                   http://merenlab.org/2016/06/18/installing-third-party-software/#modeller" % (self.executable, self.executable))

            self.run.info_single("Anvi'o found the default executable for MODELLER, `%s`, and will\
                                  use it." % self.executable, nl_before=1)
        self.is_executable_a_MODELLER_program()

        # does target_fasta_path point to a fasta file?
        utils.filesnpaths.is_file_fasta_formatted(self.target_fasta_path)

        # make sure target_fasta is valid
        target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False)
        if target_fasta.total_seq != 1:
            raise ConfigError("MODELLER::The input FASTA file must have exactly one sequence.\
                               You provided one with {}.".format(target_fasta.total_seq))

        # (not sanity check but we get self.corresponding_gene_call since target_fasta is opened)
        while next(target_fasta):
            self.corresponding_gene_call = target_fasta.id
        target_fasta.close()

        # parameter consistencies
        if self.deviation < 0.5 or self.deviation > 20:
            self.run.warning("You realize that deviation is given in angstroms, right? You chose {}".format(self.deviation))

        if self.very_fast and self.num_models > 1:
            self.run.warning("Since you chose --very-fast, there will be little difference, if at all, between models. You \
                              can potentially save a lot of time by setting --num-models to 1.")

        if self.percent_identical_cutoff <= 20:
            self.run.warning("Two completely unrelated sequences of the same length are expected to share around 10% \
                              sequence identity... Setting this parameter below 20% is probably a bad idea.")
Example 42
    def merge(self):
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=self.overwrite_output_destinations)

        # init profile database
        self.profile_db_path = os.path.join(self.output_directory, 'PROFILE.db')

        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        self.contigs_db_hash = list(self.input_runinfo_dicts.values())[0]['contigs_db_hash']
        self.min_contig_length = list(self.input_runinfo_dicts.values())[0]['min_contig_length']
        self.num_contigs = list(self.input_runinfo_dicts.values())[0]['num_contigs']
        self.num_splits = list(self.input_runinfo_dicts.values())[0]['num_splits']
        self.min_coverage_for_variability = list(self.input_runinfo_dicts.values())[0]['min_coverage_for_variability']
        self.total_length = list(self.input_runinfo_dicts.values())[0]['total_length']
        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': ','.join(self.merged_sample_ids),
            'merged': True,
            'contigs_clustered': not self.skip_hierarchical_clustering,
            'default_view': 'mean_coverage',
            'min_contig_length': self.min_contig_length,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'num_contigs': self.num_contigs,
            'num_splits': self.num_splits,
            'total_length': self.total_length,
            'contigs_db_hash': self.contigs_db_hash
        }
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()
        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('merged_sample_ids', self.merged_sample_ids)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('num_runs_processed', len(self.merged_sample_ids))
        self.run.info('clustering_performed',
                      not self.skip_hierarchical_clustering)

        self.set_normalization_multiplier()

        self.progress.new('Merging gene coverages tables')
        self.merge_gene_coverages_tables()
        self.progress.end()

        self.progress.new('Merging split coverage values')
        self.merge_split_coverage_data()
        self.progress.end()

        self.progress.new('Merging variable positions tables')
        self.merge_variable_positions_tables()
        self.progress.end()

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # store everything
        runinfo_serialized = os.path.join(self.output_directory, 'RUNINFO.mcp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.run.quit()
Example 43
    def merge(self):
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=self.overwrite_output_destinations)

        # init profile database
        self.profile_db_path = os.path.join(self.output_directory, 'PROFILE.db')

        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        C = lambda x: list(self.input_runinfo_dicts.values())[0][x]
        self.contigs_db_hash = C('contigs_db_hash')
        self.min_contig_length = C('min_contig_length')
        self.num_contigs = C('num_contigs')
        self.num_splits = C('num_splits')
        self.total_reads_mapped = C('total_reads_mapped')
        self.min_coverage_for_variability = C('min_coverage_for_variability')
        self.report_variability_full = C('report_variability_full')
        self.gene_coverages_computed = C('gene_coverages_computed')
        self.AA_frequencies_profiled = C('profile_AA_frequencies')
        self.SNVs_profiled = not C('skip_SNV_profiling')
        self.total_length = C('total_length')

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and not self.enforce_hierarchical_clustering:
            self.run.warning("It seems you have more than %s splits in your samples to be merged. This is the\
                              soft limit for anvi'o to attempt to create a hierarchical clustering of your splits\
                              (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                              clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`.\
                              But more importantly, please take a look at the anvi'o tutorial to make sure you know\
                              your better options to analyze large metagenomic datasets with anvi'o." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))
            self.skip_hierarchical_clustering = True

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and self.enforce_hierarchical_clustering:
            self.run.warning("Becasue you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                              to create a hierarchical clustering of your %s splits. It may take a bit of time..." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))

        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': ','.join(self.merged_sample_ids),
            'merged': True,
            'blank': False,
            'contigs_clustered': not self.skip_hierarchical_clustering,
            'default_view': 'mean_coverage',
            'min_contig_length': self.min_contig_length,
            'SNVs_profiled': self.SNVs_profiled,
            'AA_frequencies_profiled': self.AA_frequencies_profiled,
            'num_contigs': self.num_contigs,
            'num_splits': self.num_splits,
            'total_length': self.total_length,
            'total_reads_mapped': self.total_reads_mapped,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'report_variability_full': self.report_variability_full,
            'contigs_db_hash': self.contigs_db_hash,
            'gene_coverages_computed': self.gene_coverages_computed
        }
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()
        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('merged_sample_ids', self.merged_sample_ids)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('num_runs_processed', len(self.merged_sample_ids))
        self.run.info('clustering_performed',
                      not self.skip_hierarchical_clustering)

        self.set_normalization_multiplier()

        self.progress.new('Merging gene coverages tables')
        self.merge_gene_coverages_tables()
        self.progress.end()

        self.progress.new('Merging split coverage values')
        self.merge_split_coverage_data()
        self.progress.end()

        if self.SNVs_profiled:
            self.progress.new('Merging variable positions tables')
            self.merge_variable_nts_tables()
            self.progress.end()
        else:
            self.run.warning(
                "SNVs were not profiled, variable nt positions tables will be empty in the merged profile database."
            )

        if self.AA_frequencies_profiled:
            self.progress.new('Merging variable AAs tables')
            self.merge_variable_aas_tables()
            self.progress.end()
        else:
            self.run.warning(
                "AA frequencies were not profiled, these tables will be empty in the merged profile database."
            )

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # store everything
        runinfo_serialized = os.path.join(self.output_directory, 'RUNINFO.mcp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.run.quit()
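
The `C = lambda x: list(self.input_runinfo_dicts.values())[0][x]` accessor above pulls shared metadata out of the first run's info dict, on the implicit assumption that values such as `num_splits` are identical across all runs being merged. In isolation the pattern looks like this:

    # Standalone illustration of the metadata-extraction pattern (made-up values):
    input_runinfo_dicts = {'sample_01': {'num_splits': 120, 'total_length': 4000000},
                           'sample_02': {'num_splits': 120, 'total_length': 4000000}}

    C = lambda x: list(input_runinfo_dicts.values())[0][x]
    num_splits = C('num_splits')  # 120, read from the first dict; assumed equal across runs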
Example 44
    def sanity_check(self):
        A = lambda x, t: t(self.args.__dict__[x]) if x in self.args.__dict__ else None
        null = lambda x: x

        # the directory files will be dumped into (can exist but must be empty)
        if filesnpaths.is_file_exists(self.directory, dont_raise=True):
            filesnpaths.is_output_dir_writable(self.directory)
            if not filesnpaths.is_dir_empty(self.directory):
                raise ModellerError("You cannot give MODELLER a non-empty directory to work in.")
        else:
            filesnpaths.gen_output_directory(self.directory)

        # All MODELLER scripts are housed in self.scripts_folder
        self.scripts_folder = J(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/scripts')
        if utils.filesnpaths.is_dir_empty(self.scripts_folder):
            raise ConfigError("Anvi'o houses all its MODELLER scripts in {}, but your directory \
                               contains no scripts. Why you do dat?")

        # check that MODELLER exists
        if self.args.__dict__.get('modeller_executable'):
            self.run.info_single("As per your request, anvi'o will use `%s` to run MODELLER." % self.executable, nl_before=1)
            utils.is_program_exists(self.executable)
        else:
            try:
                utils.is_program_exists(self.executable)
            except ConfigError as e:
                raise ConfigError("Anvi'o needs a MODELLER program to be installed on your system. You didn't specify one\
                                   (which can be done with `--modeller-executable`), so anvi'o tried the most recent version\
                                   it knows about: '%s'. If you are certain you have it on your system (for instance you can run it\
                                   by typing '%s' in your terminal window), you may want to send a detailed bug report. If you\
                                   don't have it on your system, check out these installation instructions on our website:\
                                   http://merenlab.org/2016/06/18/installing-third-party-software/#modeller" % (self.executable, self.executable))

            self.run.info_single("Anvi'o found the default executable for MODELLER, `%s`, and will\
                                  use it." % self.executable, nl_before=1)
        self.is_executable_a_MODELLER_program()

        # does target_fasta_path point to a fasta file?
        utils.filesnpaths.is_file_fasta_formatted(self.target_fasta_path)

        # make sure target_fasta is valid
        target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False)
        if target_fasta.total_seq != 1:
            raise ConfigError("MODELLER::The input FASTA file must have exactly one sequence.\
                               You provided one with {}.".format(target_fasta.total_seq))

        # (not sanity check but we get self.corresponding_gene_call since target_fasta is opened)
        while next(target_fasta):
            self.corresponding_gene_call = target_fasta.id
        target_fasta.close()

        # parameter consistencies
        if self.deviation < 0.5 or self.deviation > 20:
            self.run.warning("You realize that deviation is given in angstroms, right? You chose {}".format(self.deviation))

        if self.very_fast and self.num_models > 1:
            self.run.warning("Since you chose --very-fast, there will be little difference, if at all, between models. You \
                              can potentially save a lot of time by setting --num-models to 1.")

        if self.percent_identical_cutoff <= 20:
            self.run.warning("Two completely unrelated sequences of the same length are expected to share around 10% \
                              sequence identity... Setting this parameter below 20% is probably a bad idea.")
Example 45
    def merge(self):
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=self.overwrite_output_destinations)

        self.run.log_file_path = os.path.join(self.output_directory, 'RUNLOG.txt')

        # set database paths
        self.merged_profile_db_path = os.path.join(self.output_directory, 'PROFILE.db')
        self.database_paths['PROFILE.db'] = os.path.abspath(self.merged_profile_db_path)

        profile_db = dbops.ProfileDatabase(self.merged_profile_db_path)

        C = lambda x: list(self.profile_dbs_info_dict.values())[0][x]
        self.contigs_db_hash = C('contigs_db_hash')
        self.min_contig_length = C('min_contig_length')
        self.max_contig_length = C('max_contig_length')
        self.num_contigs = C('num_contigs')
        self.num_splits = C('num_splits')
        self.min_coverage_for_variability = C('min_coverage_for_variability')
        self.report_variability_full = C('report_variability_full')
        self.SCVs_profiled = C('SCVs_profiled')
        self.SNVs_profiled = C('SNVs_profiled')
        self.total_length = C('total_length')

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and not self.enforce_hierarchical_clustering:
            self.run.warning("It seems you have more than %s splits in your samples to be merged. This is the\
                              soft limit for anvi'o to attempt to create a hierarchical clustering of your splits\
                              (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                              clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`.\
                              But more importantly, please take a look at the anvi'o tutorial to make sure you know\
                              your better options to analyze large metagenomic datasets with anvi'o." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))
            self.skip_hierarchical_clustering = True

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and self.enforce_hierarchical_clustering:
            self.run.warning("Becasue you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                              to create a hierarchical clustering of your %s splits. It may take a bit of time..." \
                                                                % pp(self.num_splits))

        self.total_reads_mapped_per_sample = dict([(s, self.layer_additional_data_dict['default'][s]['total_reads_mapped']) for s in self.layer_additional_data_dict['default']])

        sample_ids_list = ', '.join(sorted(self.sample_ids_found_in_input_dbs))
        total_reads_mapped_list = ', '.join([str(self.total_reads_mapped_per_sample[sample_id]) for sample_id in self.sample_ids_found_in_input_dbs])

        # we run this now because we change default flags in this function
        # depending on the number of reads characterized within each single profile.
        self.set_normalization_multiplier()

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': sample_ids_list,
                       'total_reads_mapped': total_reads_mapped_list,
                       'merged': True,
                       'blank': False,
                       'items_ordered': False,
                       'default_view': 'mean_coverage',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': self.SNVs_profiled,
                       'SCVs_profiled': self.SCVs_profiled,
                       'num_contigs': self.num_contigs,
                       'num_splits': self.num_splits,
                       'total_length': self.total_length,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.contigs_db_hash,
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()

        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('description', 'Found (%d characters)' % len(self.description) if self.description else None)
        self.run.info('profile_db', self.merged_profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('num_runs_processed',
                      len(self.sample_ids_found_in_input_dbs))
        self.run.info('merged_sample_ids', sample_ids_list)
        self.run.info("Common layer additional data keys",
                      ', '.join(self.layer_additional_data_keys))
        self.run.info('total_reads_mapped', total_reads_mapped_list)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('clustering_performed',
                      not self.skip_hierarchical_clustering)

        self.merge_split_coverage_data()

        if self.SNVs_profiled:
            self.progress.new('Merging variable positions tables')
            self.progress.update('...')
            self.merge_variable_nts_tables()
            self.progress.end()
        else:
            self.run.warning(
                "SNVs were not profiled, variable nt positions tables will be empty in the merged profile database."
            )

        if self.SCVs_profiled:
            self.progress.new('Merging variable codons tables')
            self.progress.update('...')
            self.merge_variable_codons_tables()
            self.progress.end()
        else:
            self.run.warning(
                "Codon frequencies were not profiled, hence, these tables will be empty in the merged profile database."
            )

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        self.populate_misc_data_tables()

        self.run.info_single('Happy ☘ ', nl_before=1, nl_after=1)

        self.run.quit()
Example 46
    def merge(self):
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists = self.overwrite_output_destinations)

        # init profile database
        self.profile_db_path = os.path.join(self.output_directory, 'PROFILE.db')

        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        self.contigs_db_hash = list(self.input_runinfo_dicts.values())[0]['contigs_db_hash']
        self.min_contig_length = list(self.input_runinfo_dicts.values())[0]['min_contig_length']
        self.num_contigs = list(self.input_runinfo_dicts.values())[0]['num_contigs']
        self.num_splits = list(self.input_runinfo_dicts.values())[0]['num_splits']
        self.min_coverage_for_variability = list(self.input_runinfo_dicts.values())[0]['min_coverage_for_variability']
        self.total_length = list(self.input_runinfo_dicts.values())[0]['total_length']
        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': ','.join(self.merged_sample_ids),
                       'merged': True,
                       'contigs_clustered': not self.skip_hierarchical_clustering,
                       'default_view': 'mean_coverage',
                       'min_contig_length': self.min_contig_length,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'num_contigs': self.num_contigs,
                       'num_splits': self.num_splits,
                       'total_length': self.total_length,
                       'contigs_db_hash': self.contigs_db_hash}
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()
        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('merged_sample_ids', self.merged_sample_ids)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('num_runs_processed', len(self.merged_sample_ids))
        self.run.info('clustering_performed', not self.skip_hierarchical_clustering)

        self.set_normalization_multiplier()

        self.progress.new('Merging gene coverages tables')
        self.merge_gene_coverages_tables()
        self.progress.end()

        self.progress.new('Merging split coverage values')
        self.merge_split_coverage_data()
        self.progress.end()

        self.progress.new('Merging variable positions tables')
        self.merge_variable_positions_tables()
        self.progress.end()

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # store everything
        runinfo_serialized = os.path.join(self.output_directory, 'RUNINFO.mcp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized, strip_prefix = self.output_directory)

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.run.quit()
Example 47
    def init(self):
        """This function is called from within the snakefile to initialize parameters."""
        super().init()

        self.run_iu_merge_pairs = self.get_param_value_from_config(['iu_merge_pairs', 'run'])
        self.gzip_iu_merge_pairs_output = self.get_param_value_from_config(['iu_merge_pairs', '--gzip-output']) if self.run_iu_merge_pairs else False
        self.run_anvi_reformat_fasta = self.get_param_value_from_config(['anvi_reformat_fasta', 'run'])
        self.gzip_anvi_reformat_output = self.get_param_value_from_config(['anvi_reformat_fasta', '--gzip-output']) if self.run_anvi_reformat_fasta else False
        self.run_anvi_trnaseq = self.get_param_value_from_config(['anvi_trnaseq', 'run'])
        self.run_anvi_merge_trnaseq = self.get_param_value_from_config(['anvi_merge_trnaseq', 'run'])
        self.run_anvi_run_trna_taxonomy = self.get_param_value_from_config(['anvi_run_trna_taxonomy', 'run'])
        self.run_anvi_tabulate_trnaseq = self.get_param_value_from_config(['anvi_tabulate_trnaseq', 'run'])

        # Load table of sample info from samples_txt.
        self.samples_txt_file = self.get_param_value_from_config(['samples_txt'])
        filesnpaths.is_file_exists(self.samples_txt_file)
        try:
            # An error will subsequently be raised in `check_samples_txt` if there is no header.
            self.sample_info = pd.read_csv(self.samples_txt_file, sep='\t', index_col=False)
        except IndexError as e:
            raise ConfigError(
                "The samples_txt file, '%s', does not appear to be properly formatted. "
                "This is the error from trying to load it: '%s'" %
                (self.samples_txt_file, e))
        self.check_samples_txt()

        self.sample_names = self.sample_info['sample'].tolist()
        if 'treatment' not in self.sample_info:
            # The treatment is the same for each sample and is set in the config file.
            self.sample_info['treatment'] = [self.get_param_value_from_config(['anvi_trnaseq', '--treatment'])] * len(self.sample_names)
        if self.run_iu_merge_pairs:
            self.treatments = self.sample_info['treatment']
            self.r1_paths = self.sample_info['r1'].tolist()
            self.r2_paths = self.sample_info['r2'].tolist()
            self.r1_prefixes = self.get_r1_prefixes()
            self.r2_prefixes = self.get_r2_prefixes()
            self.fasta_paths = None
        else:
            self.treatments = self.sample_info['treatment']
            self.r1_paths = None
            self.r2_paths = None
            self.r1_prefixes = None
            self.r2_prefixes = None
            self.fasta_paths = self.sample_info['fasta'].tolist()

        self.target_files = self.get_target_files()

        # The `anvi-run-workflow --cluster` option, which submits each rule as a separate job,
        # requires that the rule's log directory exist before running the rule. This workflow
        # differs from others by writing log files for each sample to log directories for each
        # sample.
        for sample_name in self.sample_names:
            filesnpaths.gen_output_directory(os.path.join(self.dirs_dict['LOGS_DIR'], sample_name))
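
Based purely on the columns this `init` reads ('sample', optional 'treatment', and either 'r1'/'r2' or 'fasta'), a samples_txt file for the paired-end branch would be a tab-delimited table along these lines (all file and treatment names below are made up):

    sample       treatment     r1                 r2
    sample_01    untreated     s01_R1.fastq.gz    s01_R2.fastq.gz
    sample_02    demethylase   s02_R1.fastq.gz    s02_R2.fastq.gz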
Example 48
    def merge(self):
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=self.overwrite_output_destinations)

        self.run.log_file_path = os.path.join(self.output_directory, 'RUNLOG.txt')

        # set database paths
        self.merged_profile_db_path = os.path.join(self.output_directory, 'PROFILE.db')
        self.database_paths['PROFILE.db'] = os.path.abspath(self.merged_profile_db_path)

        profile_db = dbops.ProfileDatabase(self.merged_profile_db_path)

        C = lambda x: list(self.profile_dbs_info_dict.values())[0][x]
        self.contigs_db_hash = C('contigs_db_hash')
        self.min_contig_length = C('min_contig_length')
        self.max_contig_length = C('max_contig_length')
        self.num_contigs = C('num_contigs')
        self.num_splits = C('num_splits')
        self.min_coverage_for_variability = C('min_coverage_for_variability')
        self.report_variability_full = C('report_variability_full')
        self.SCVs_profiled = C('SCVs_profiled')
        self.SNVs_profiled = C('SNVs_profiled')
        self.total_length = C('total_length')

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and not self.enforce_hierarchical_clustering:
            self.run.warning("It seems you have more than %s splits in your samples to be merged. This is the\
                              soft limit for anvi'o to attempt to create a hierarchical clustering of your splits\
                              (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                              clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`.\
                              But more importantly, please take a look at the anvi'o tutorial to make sure you know\
                              your better options to analyze large metagenomic datasets with anvi'o." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))
            self.skip_hierarchical_clustering = True

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and self.enforce_hierarchical_clustering:
            self.run.warning("Becasue you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                              to create a hierarchical clustering of your %s splits. It may take a bit of time..." \
                                                                % pp(self.num_splits))

        self.total_reads_mapped_per_sample = dict([(s, self.layer_additional_data_dict['default'][s]['total_reads_mapped']) for s in self.layer_additional_data_dict['default']])

        sample_ids_list = ', '.join(sorted(self.sample_ids_found_in_input_dbs))
        total_reads_mapped_list = ', '.join([str(self.total_reads_mapped_per_sample[sample_id]) for sample_id in self.sample_ids_found_in_input_dbs])

        # we run this now because we change default flags in this function
        # depending on the number of reads characterized within each single profile.
        self.set_normalization_multiplier()

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': sample_ids_list,
                       'total_reads_mapped': total_reads_mapped_list,
                       'merged': True,
                       'blank': False,
                       'items_ordered': False,
                       'default_view': 'mean_coverage',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': self.SNVs_profiled,
                       'SCVs_profiled': self.SCVs_profiled,
                       'num_contigs': self.num_contigs,
                       'num_splits': self.num_splits,
                       'total_length': self.total_length,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.contigs_db_hash,
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()

        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('description', 'Found (%d characters)' % len(self.description) if self.description else None)
        self.run.info('profile_db', self.merged_profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('num_runs_processed', len(self.sample_ids_found_in_input_dbs))
        self.run.info('merged_sample_ids', sample_ids_list)
        self.run.info("Common layer additional data keys", ', '.join(self.layer_additional_data_keys))
        self.run.info('total_reads_mapped', total_reads_mapped_list)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('clustering_performed', not self.skip_hierarchical_clustering)

        self.merge_split_coverage_data()

        if self.SNVs_profiled:
            self.progress.new('Merging variable positions tables')
            self.progress.update('...')
            self.merge_variable_nts_tables()
            self.progress.end()
        else:
            self.run.warning("SNVs were not profiled, variable nt positions tables will be empty in the merged profile database.")

        if self.SCVs_profiled:
            self.progress.new('Merging variable codons tables')
            self.progress.update('...')
            self.merge_variable_codons_tables()
            self.progress.end()
        else:
            self.run.warning("Codon frequencies were not profiled, hence, these tables will be empty in the merged profile database.")

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.populate_misc_data_tables()

        self.run.info_single('Happy ☘ ', nl_before=1, nl_after=1)

        self.run.quit()
Example 49
    def run_hmmer(self,
                  source,
                  alphabet,
                  context,
                  kind,
                  domain,
                  num_genes_in_model,
                  hmm,
                  ref,
                  noise_cutoff_terms,
                  desired_output='table',
                  hmmer_output_dir=None):
        """Run the program

        Parameters
        ==========
        source : str
            A name for your HMM effort.

        alphabet : str
            Which alphabet are you using? Choose from {'AA', 'DNA', 'RNA'}

        context : str
            This will determine how your output is processed. FIXME Documentation is lacking. Choose
            from {'GENE', 'CONTIG', 'DOMAIN'}.

        kind : str
            Used for user stdout info. Don't be afraid to pass None

        domain : str
            Used for user stdout info. Don't be afraid to pass None

        num_genes_in_model : int
            Used for user stdout info. Don't be afraid to pass None

        hmm : str
            Path to the input .hmm file

        ref : int
            Used for user stdout info. Don't be afraid to pass None

        noise_cutoff_terms : str
            Filter out hits with built-in flags. e.g. '--cut_ga'

        desired_output : str OR list, 'table'
            HMMER programs have a couple of outputs. For the standard output (specified by the hmmer
            program flag `-o`), pass 'standard'. For the regular tabular output (specified by the hmmer
            program flag `--tblout`), pass 'table'. For the domain tabular output (specified by the hmmer
            program flag `--domtblout`), pass 'domtable'. If you want to use multiple, pass a tuple like
            ('standard', 'table')

        hmmer_output_dir : str
            The path at which to store the HMMER output files, if desired. After all HMMER workers are
            done and their partial output files have been combined into one (for each type), those combined
            output files will be moved to this location.
        """

        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError(
                "You have an unknown target :/ Target, which defines an alphabet and context "
                "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                "doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError(
                "HMMer class does not know about Sequences file for the target %s :/"
                % target)

        if isinstance(desired_output, str):
            desired_output = (desired_output, )

        for output in desired_output:
            if output not in ['standard', 'table', 'domtable']:
                raise ConfigError(
                    "HMMer.run_hmmer :: Unknown desired_output, '%s'" % output)

        if hmmer_output_dir:
            if not os.path.exists(hmmer_output_dir):
                filesnpaths.gen_output_directory(hmmer_output_dir)
            else:
                filesnpaths.is_output_dir_writable(hmmer_output_dir)
                for output in desired_output:
                    file_path = os.path.join(hmmer_output_dir, f"hmm.{output}")
                    if filesnpaths.is_file_exists(file_path, dont_raise=True):
                        raise ConfigError(
                            f"The file {file_path} already exists, and anvi'o does not like "
                            "to overwrite things. Please either remove the file or rename your "
                            "desired output.")

        self.run.warning('',
                         header='HMM Profiling for %s' % source,
                         lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N/A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes in HMM model', num_genes_in_model or 'unknown')
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs to be used for search', self.num_threads_to_use)
        if alphabet in ['DNA', 'RNA']:
            self.run.info('HMMer program used for search', 'nhmmscan')
            if 'domtable' in desired_output:
                raise ConfigError(
                    "Oh, dear. Someone (probably a programmer) has requested domain table output from "
                    f"the run_hmmer() function when the alphabet is {alphabet}. Sadly, this will not "
                    "work because that alphabet requires the use of `nhmmscan`, which does not have "
                    "the --domtblout parameter.")
        else:
            self.run.info('HMMer program used for search', self.program_to_use)

        tmp_dir = os.path.dirname(self.target_files_dict[target][0])
        self.run.info('Temporary work dir', tmp_dir)

        # check if all hmmpress files are in the HMM directory
        self.verify_hmmpress_output(hmm)

        workers = []
        manager = multiprocessing.Manager()  # this dude holds the shared objects that will be modified by workers
        ret_value_queue = manager.Queue(maxsize=self.num_threads_to_use)
        output_queue = manager.Queue()

        # Holds buffer and write lock for each output
        merged_files_dict = {}
        for output in desired_output:
            merged_files_dict[output] = {
                'buffer': io.StringIO(),
                'lock': manager.Lock()
            }
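        # a sketch of what an append function like append_to_main_table_file is assumed
        # to do with these shared objects (the real implementation lives elsewhere in
        # this class): acquire the lock, then copy a worker's partial file into the
        # in-memory buffer, roughly:
        #
        #     def append_to_main_table_file(self, buffer, worker_file, lock):
        #         with lock:  # serialize merges from concurrent workers
        #             with open(worker_file) as f:
        #                 buffer.write(f.read())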

        num_parts = len(self.target_files_dict[target])
        cores_per_process = 1
        original_num_threads_requested = None
        if num_parts < self.num_threads_to_use:
            cores_per_process = self.num_threads_to_use // num_parts

            self.run.warning(
                f"You requested {P('core', self.num_threads_to_use)} but there were only {P('sequence', num_parts)} "
                f"in the FASTA file for the target '{target}'. Anvi'o will use {P('process', num_parts, sfp='es')} "
                f"with {P('core', cores_per_process)} instead. And that's that."
            )

            # if we need to change the number of threads for a SINGLE run, then we need to keep
            # track of the originally requested number of threads and restore it afterwards. not
            # doing that leads to an extremely tricky bug that is described here thanks to help
            # from Daan Speth:
            # https://github.com/merenlab/anvio/issues/1748
            original_num_threads_requested = self.num_threads_to_use
            self.num_threads_to_use = num_parts

        if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch':
            self.run.warning(
                "You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. "
                "We hope that is alright." % (self.program_to_use, alphabet))

        thread_num = 0
        for partial_input_file in self.target_files_dict[target]:
            log_file = partial_input_file + '_log'
            output_file = partial_input_file + '_output'
            table_file = partial_input_file + '_table'
            if 'domtable' in desired_output:
                domtable_file = partial_input_file + '_domtable'
            else:
                domtable_file = None

            self.run.info('Log file for thread %s' % thread_num, log_file)
            thread_num += 1

            # build the command line incrementally: noise cutoff terms are included only if
            # they were passed, and --domtblout only if domain table output was requested
            cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use,
                        '-o', output_file]
            if noise_cutoff_terms:
                cmd_line.extend(noise_cutoff_terms.split())
            cmd_line.extend(['--cpu', cores_per_process, '--tblout', table_file])
            if 'domtable' in desired_output:
                cmd_line.extend(['--domtblout', domtable_file])
            cmd_line.extend([hmm, partial_input_file])
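            # for illustration, with noise_cutoff_terms='--cut_ga' and the default 'table'
            # output, the resulting command looks roughly like (paths are placeholders):
            #   hmmsearch -o <part>_output --cut_ga --cpu 1 --tblout <part>_table <hmm> <part>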

            t = multiprocessing.Process(
                target=self.hmmer_worker,
                args=(partial_input_file, cmd_line, table_file, output_file,
                      desired_output, log_file, output_queue, ret_value_queue,
                      domtable_file))
            t.start()
            workers.append(t)
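            # the contract the worker spawned above is assumed to fulfill: run cmd_line,
            # put 0 (or the Exception it hit) on ret_value_queue, and put a dict mapping
            # each desired output type to its partial output file path on output_queue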

        self.progress.new('Processing')
        self.progress.update(f'Running {self.program_to_use} in {P("thread", self.num_threads_to_use)}...')

        finished_workers = 0
        while finished_workers < self.num_threads_to_use:
            try:
                ret_value = ret_value_queue.get()

                if isinstance(ret_value, Exception):
                    # if a worker returns an exception, we raise it here to kill the main thread.
                    raise ret_value

                finished_workers += 1
                if ret_value == 0:
                    if anvio.DEBUG:
                        self.run.info_single(
                            f"{finished_workers} out of {self.num_threads_to_use} have finished"
                        )
                else:
                    raise ConfigError(
                        f"An HMMER worker thread came back with an unexpected return value of {ret_value}. "
                        "Something is probably wrong, so you should contact a developer for help.")

                # if worker finished successfully we can take its individual output file(s) and append them to the main file(s)
                output_dict = output_queue.get()
                for file_type, file in output_dict.items():
                    main_file_buffer = merged_files_dict[file_type]['buffer']
                    main_file_lock = merged_files_dict[file_type]['lock']
                    worker_file = file
                    if file_type in ('table', 'domtable'):
                        append_function = self.append_to_main_table_file
                    else:  # 'standard'
                        append_function = self.append_to_main_standard_file

                    append_function(main_file_buffer, worker_file,
                                    main_file_lock)

            except KeyboardInterrupt:
                self.run.info_single(
                    "HMMER driver received SIGINT, terminating all threads...",
                    nl_before=2)
                break

            except Exception as worker_error:
                # An exception was thrown in one of the threads so we kill all of them
                self.progress.end()
                self.run.warning(
                    "An exception was thrown in one of the worker threads (see output below for details)."
                )
                for worker in workers:
                    worker.terminate()
                raise worker_error

        for worker in workers:
            worker.terminate()

        self.progress.end()

        if original_num_threads_requested:
            self.num_threads_to_use = original_num_threads_requested
            self.run.info_single(
                f'Done with {source} 🎊 (and num threads requested is set back to {self.num_threads_to_use}).',
                level=0,
                nl_before=1,
                nl_after=1,
                mc="cyan")
        else:
            self.run.info_single(f'Done with {source} 🎊',
                                 level=0,
                                 nl_before=1,
                                 nl_after=1,
                                 mc="cyan")

        output_file_paths = []
        for output in desired_output:
            if hmmer_output_dir:
                output_file_path = os.path.join(hmmer_output_dir,
                                                f"hmm.{output}")
            else:
                output_file_path = os.path.join(tmp_dir, f"hmm.{output}")

            with open(output_file_path, 'w') as out:
                merged_files_dict[output]['buffer'].seek(0)
                out.write(merged_files_dict[output]['buffer'].read())

            if output == 'table' or output == 'domtable':
                num_raw_hits = filesnpaths.get_num_lines_in_file(output_file_path)
                self.run.info(f'Number of raw hits in {output} file',
                              num_raw_hits,
                              progress=self.progress)
                output_file_path = output_file_path if num_raw_hits else None

            output_file_paths.append(output_file_path)

        # return the output path as a string if only one output type was desired; else a tuple of paths
        output = output_file_paths[0] if len(output_file_paths) == 1 else tuple(output_file_paths)

        return output
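A minimal usage sketch for the method above, assuming an `HMMer` instance whose `target_files_dict` was populated elsewhere; the constructor signature, collection name, model path, reference, and cutoff flag below are illustrative assumptions, not taken from this example:

    hmmer = HMMer(target_files_dict, num_threads_to_use=4)  # assumed constructor signature
    hits_table_path = hmmer.run_hmmer(source='Bacteria_71',       # hypothetical HMM collection name
                                      alphabet='AA',
                                      context='GENE',
                                      kind='singlecopy',
                                      domain='bacteria',
                                      num_genes_in_model=71,
                                      hmm='/path/to/genes.hmm',   # placeholder path
                                      ref='Lee et al',            # placeholder reference
                                      noise_cutoff_terms='--cut_ga')
    # with the default desired_output='table', the return value is the path to the merged
    # --tblout file, or None if that file ended up with no hits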