Beispiel #1
0
    def __init__(self, args):
        """Collect refine-mode settings from `args` and inspect the profile database.

        The attributes `bin_ids_file`, `bin_id`, `collection_name`, `contigs_db`,
        `profile_db` and `debug` are read from `args`; any of them that is not
        set on `args` resolves to None.

        Note that `args` is modified in place: its `mode` attribute is set to
        'refine'.

        Args:
            args: a namespace-like object exposing its values through `__dict__`.
        """
        self.progress = terminal.Progress()
        self.run = terminal.Run()
        self.args = args
        self.args.mode = 'refine'

        self.bins = set()

        def A(x):
            # attributes that are missing from `args` resolve to None
            return args.__dict__.get(x)

        self.bin_ids_file_path = A('bin_ids_file')
        self.bin_id = A('bin_id')
        self.collection_name = A('collection_name')
        self.contigs_db_path = A('contigs_db')
        self.profile_db_path = A('profile_db')
        self.debug = A('debug')

        # return values are ignored here -- presumably these helpers raise when
        # the path does not point to a database of the expected kind (TODO confirm)
        dbops.is_contigs_db(self.contigs_db_path)
        dbops.is_profile_db(self.profile_db_path)

        self.database_paths = {'CONTIGS.db': self.contigs_db_path,
                               'PROFILE.db': self.profile_db_path}
        self.is_merged = None
        self.split_names_of_interest = set()

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        try:
            self.is_merged = int(profile_db.meta['merged'])
        finally:
            # release the database handle even when the 'merged' entry is
            # missing or cannot be converted to an integer
            profile_db.disconnect()

        self.clustering_configs = constants.clustering_configs['merged' if self.is_merged else 'single']
Beispiel #2
0
    def get_genome_hash_for_external_genome(self, entry):
        """Return the hash stored in the contigs database of an external genome.

        Args:
            entry: mapping that provides the database location under the
                'contigs_db_path' key.

        Returns:
            The value stored under 'contigs_db_hash' in the `meta` mapping of
            the contigs database.
        """
        contigs_db_path = entry['contigs_db_path']

        # return value is ignored -- presumably this raises when the path is
        # not a contigs database (TODO confirm)
        dbops.is_contigs_db(contigs_db_path)

        contigs_db = dbops.ContigsDatabase(contigs_db_path)
        try:
            return contigs_db.meta['contigs_db_hash']
        finally:
            # release the database handle even when the meta lookup fails
            contigs_db.disconnect()
Beispiel #3
0
    def __init__(self, args):
        """Collect refine-mode settings from `args` and inspect the profile database.

        The attributes `bin_ids_file`, `bin_id`, `collection_name`, `contigs_db`,
        `profile_db` and `debug` are read from `args`; any of them that is not
        set on `args` resolves to None.

        Note that `args` is modified in place: its `mode` attribute is set to
        'refine'.

        Args:
            args: a namespace-like object exposing its values through `__dict__`.
        """
        self.progress = terminal.Progress()
        self.run = terminal.Run()
        self.args = args
        self.args.mode = 'refine'

        self.bins = set()

        def A(x):
            # attributes that are missing from `args` resolve to None
            return args.__dict__.get(x)

        self.bin_ids_file_path = A('bin_ids_file')
        self.bin_id = A('bin_id')
        self.collection_name = A('collection_name')
        self.contigs_db_path = A('contigs_db')
        self.profile_db_path = A('profile_db')
        self.debug = A('debug')

        # return values are ignored here -- presumably these helpers raise when
        # the path does not point to a database of the expected kind (TODO confirm)
        dbops.is_contigs_db(self.contigs_db_path)
        dbops.is_profile_db(self.profile_db_path)

        self.database_paths = {'CONTIGS.db': self.contigs_db_path,
                               'PROFILE.db': self.profile_db_path}
        self.is_merged = None
        self.split_names_of_interest = set()

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        try:
            self.is_merged = int(profile_db.meta['merged'])
        finally:
            # release the database handle even when the 'merged' entry is
            # missing or cannot be converted to an integer
            profile_db.disconnect()

        self.clustering_configs = constants.clustering_configs['merged' if self.is_merged else 'single']
Beispiel #4
0
    def process(self, aa_sequences_file_path=None):
        """Search amino acid sequences with the selected program and store the hits.

        Exactly one source of sequences must be available: either the
        `aa_sequences_file_path` argument, or `self.contigs_db_path` (in which
        case the sequences are first exported into the temporary directory).

        Args:
            aa_sequences_file_path: optional path to a FASTA file with amino
                acid sequences.

        Raises:
            ConfigError: when `self.search_with` is not among
                `self.available_db_search_program_targets`, or when neither or
                both of the two sequence sources are given.
        """
        if self.search_with not in self.available_db_search_program_targets:
            raise ConfigError("Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\
                                database formatted under the COGs data directory for that program :/ You may need to\
                                re-run the COGs setup, UNLESS, you set up your COG data directory somewhere else than what\
                                anvi'o attempts to use at the moment ('%s'). If that is the case, this may be the best\
                                time to point the right directory using the --cog-data-dir parameter." % (self.search_with, self.COG_data_dir))

        has_fasta = bool(aa_sequences_file_path)
        has_contigs_db = bool(self.contigs_db_path)

        if not (has_fasta or has_contigs_db):
            raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA file for AA\
                                sequences")

        if has_fasta and has_contigs_db:
            raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!")

        if has_contigs_db:
            dbops.is_contigs_db(self.contigs_db_path)

        if self.temp_dir_path:
            # a directory chosen by the user is checked, used as is, and never removed
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content\
                              when it is done. But she certainly hopes that you will clean those files later.")

            self.remove_temp_dir_path = False
        else:
            # no directory was given: get a fresh one and flag it for removal
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True

        self.run.info('COG data directory', self.COG_data_dir)
        self.run.info('Directory to store temporary files', self.temp_dir_path)
        self.run.info('Directory will be removed after the run', self.remove_temp_dir_path)

        if not has_fasta:
            exported_fasta_path = J(self.temp_dir_path, 'aa_sequences.fa')
            aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path, exported_fasta_path)

        # run the search with the callable registered for the selected program
        run_search = self.search_factory[self.search_with]
        tabular_output = run_search(aa_sequences_file_path)

        def second_pipe_field(target_id):
            # target ids are reduced to their second '|'-separated field
            return target_id.split('|')[1]

        # turn the tabular search output into a dictionary of hits
        self.hits = utils.get_BLAST_tabular_output_as_dict(tabular_output, target_id_parser_func=second_pipe_field)

        self.store_hits_into_contigs_db()

        # only reached when everything above succeeded; after a failure the
        # temporary directory is left in place
        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
Beispiel #5
0
    def get_genome_hash_for_internal_genome(self, entry):
        """Return a short hash identifying an internal genome.

        The hash is the first 12 hex characters of the SHA-224 digest of the
        concatenated split names of interest and the 'contigs_db_hash' value
        of the contigs database, joined with an underscore.

        Args:
            entry: mapping that provides the database location under the
                'contigs_db_path' key; it is also handed over to
                `get_split_names_of_interest_for_internal_genome`.

        Returns:
            A 12-character hexadecimal string.
        """
        contigs_db_path = entry['contigs_db_path']

        # return value is ignored -- presumably this raises when the path is
        # not a contigs database (TODO confirm)
        dbops.is_contigs_db(contigs_db_path)
        split_names_of_interest = self.get_split_names_of_interest_for_internal_genome(entry)

        contigs_db = dbops.ContigsDatabase(contigs_db_path)
        try:
            contigs_db_hash = contigs_db.meta['contigs_db_hash']
        finally:
            # release the database handle even when the meta lookup fails
            contigs_db.disconnect()

        # NOTE(review): the digest depends on the iteration order of the split
        # names; if the helper returns an unordered collection the hash may not
        # be reproducible across runs -- verify against the helper before
        # changing anything here, since sorting would alter existing hashes.
        hash_input = '_'.join([''.join(split_names_of_interest), contigs_db_hash])

        return hashlib.sha224(hash_input.encode('utf-8')).hexdigest()[0:12]
Beispiel #6
0
class EggNOGMapper:
    """An interface class between eggnog-mapper and anvi'o.

       The default client of this class is `anvi-script-run-eggnog-mapper`. It may have changed already, but
       if it didn't, you should be able to run it this way to run eggnog-mapper on stuff in a contigs database,
       and store results in it:

            $ anvi-script-run-eggnog-mapper -c CONTIGS_DB --num-threads NUM_THREADS

       If you are interested in some ad hoc testing with existing annotation files, you can do stuff like this after
       running `anvi-self-test --suite pangenomics` once (so you have the contigs database to annotate):

            $ ipython
            >>> import argparse
            >>> from anvio.drivers.emapper import EggNOGMapper
            >>> args = argparse.Namespace(contigs_db='tests/sandbox/test-output/pan_test/01.db', num_threads=1)
            >>> e = EggNOGMapper(args)
            >>> e.populate_annotations_dict('tests/sandbox/mock_data_for_pangenomics/emapper_hits/aa_sequences_01.emapper.annotations')
            >>> e.store_annotations_in_db()

        Happy? Good. Not happy? Write to me about it!
       """
    def __init__(self,
                 args,
                 database='bact',
                 executable='emapper.py',
                 usemem=True,
                 use_version=None,
                 progress=progress,
                 run=run):
        """Validate the configuration and set the defaults for an eggnog-mapper run.

        Args:
            args: a namespace-like object; `contigs_db` and `num_threads` are read
                from its `__dict__` (None when missing), and the object is also
                handed over to `cogs.COGsData`.
            database: one of 'euk', 'bact', or 'arch'.
            executable: stored as `self.executable`.
            usemem: stored as `self.usemem`.
            use_version: handed over to `self.check_version`.
            progress: stored as `self.progress`.
            run: stored as `self.run`; also used to display the warning shown
                when the number of threads is not set.

        Raises:
            ConfigError: when the COG data is not initialized, when the number of
                threads cannot be converted to an integer, or when `database` is
                not one of the known types.
        """
        self.executable = executable
        self.progress = progress
        self.run = run

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.contigs_db_path = A('contigs_db')
        self.num_threads = A('num_threads')
        self.usemem = usemem

        self.COGs_data = cogs.COGsData(args)

        # exceptions are raised with the call form (and caught with `as`), which
        # is valid on both Python 2.6+ and Python 3
        if not self.COGs_data.initialized:
            raise ConfigError("It seems you don't have your COG data set up on this system. Unfortunately EggNOGmapper class\
                                depends on it, so this is the end of the road for you. If you set up your COG directory to\
                                a specific path, you can use `--cog-data-dir` parameter to show anvi'o where it is. If you\
                                never set up one, then maybe it is time for you to take a look at the program\
                                `anvi-setup-ncbi-cogs`.")

        try:
            # any falsy value (None, 0, '') means "not set"
            self.num_threads = int(self.num_threads) if self.num_threads else None
        except Exception as e:
            raise ConfigError("Someone didn't like the number of threads, and said [%s]. Shame on you :/" % e)

        if database not in ['euk', 'bact', 'arch']:
            raise ConfigError("Wrong database (%s). eggnog-mapper knows only about euk, bact, or arch db types..." % (database))
        else:
            self.database = database

        if self.contigs_db_path:
            dbops.is_contigs_db(self.contigs_db_path)

        self.parser = None
        self.entry_id = 0
        self.installed_version = None
        self.aa_sequences_file_name = 'aa_sequences.fa'
        self.log_file_path = 'log.txt'
        self.output_file_prefix = 'project'
        self.annotations_file_name = self.output_file_prefix + '.emapper.annotations'
        self.annotations_dict = {}

        # this is a shitty workaround to make sure integers used as gene caller ids by anvi'o will not
        # cause any issues downstream (because they did in the past when silly programs started treating
        # them as numerical data and then converted them to float, and then storing them as 1.0, 2.0, etc).
        self.gene_caller_id_prefix = 'g'

        self.available_parsers = {'0.12.6': self.__parser_1}

        self.check_version(use_version)

        if not self.num_threads:
            try:
                run.warning("You have not set the number of threads, and the default is whatever the default is for eggnog-mapper. You\
                             may really want to change that since if you have a large number of genes to annotate, this may take a very\
                             long time. If you don't want to see this message again, just set the number of threads you want eggnog-mapper\
                             to use explicitly. You can press CTRL + C to cancel this run, or simply do nothing since your operation\
                             will contine in probably like 2 seconds or less ... depending how fast you read.")
                # pause so the user has a chance to read the warning and cancel
                time.sleep(25)
            except KeyboardInterrupt:
                sys.exit()
Beispiel #7
0
    def __init__(self,
                 args,
                 database='bact',
                 executable='emapper.py',
                 usemem=True,
                 use_version=None,
                 progress=progress,
                 run=run):
        """Validate the configuration and set the defaults for an eggnog-mapper run.

        Args:
            args: a namespace-like object; `contigs_db` and `num_threads` are read
                from its `__dict__` (None when missing), and the object is also
                handed over to `cogs.COGsData`.
            database: one of 'euk', 'bact', or 'arch'.
            executable: stored as `self.executable`.
            usemem: stored as `self.usemem`.
            use_version: handed over to `self.check_version`.
            progress: stored as `self.progress`.
            run: stored as `self.run`; also used to display the warning shown
                when the number of threads is not set.

        Raises:
            ConfigError: when the COG data is not initialized, when the number of
                threads cannot be converted to an integer, or when `database` is
                not one of the known types.
        """
        self.run = run
        self.progress = progress
        self.executable = executable

        def arg_or_none(name):
            # attributes that are missing from `args` resolve to None
            return args.__dict__.get(name)

        self.contigs_db_path = arg_or_none('contigs_db')
        self.num_threads = arg_or_none('num_threads')
        self.usemem = usemem

        self.COGs_data = cogs.COGsData(args)

        if not self.COGs_data.initialized:
            raise ConfigError("It seems you don't have your COG data set up on this system. Unfortunately EggNOGmapper class\
                                depends on it, so this is the end of the road for you. If you set up your COG directory to\
                                a specific path, you can use `--cog-data-dir` parameter to show anvi'o where it is. If you\
                                never set up one, then maybe it is time for you to take a look at the program\
                                `anvi-setup-ncbi-cogs`.")

        try:
            # any falsy value (None, 0, '') means "not set"
            if self.num_threads:
                self.num_threads = int(self.num_threads)
            else:
                self.num_threads = None
        except Exception as e:
            raise ConfigError("Someone didn't like the number of threads, and said [%s]. Shame on you :/" % e)

        if database not in ('euk', 'bact', 'arch'):
            raise ConfigError("Wrong database (%s). eggnog-mapper knows only about euk, bact, or arch db types..." % database)

        self.database = database

        if self.contigs_db_path:
            dbops.is_contigs_db(self.contigs_db_path)

        # file names and bookkeeping defaults
        self.output_file_prefix = 'project'
        self.annotations_file_name = self.output_file_prefix + '.emapper.annotations'
        self.aa_sequences_file_name = 'aa_sequences.fa'
        self.log_file_path = 'log.txt'
        self.annotations_dict = {}
        self.installed_version = None
        self.entry_id = 0
        self.parser = None

        # gene caller ids are integers in anvi'o; this prefix is a workaround to keep downstream
        # programs from treating them as numerical data and storing them as floats (1.0, 2.0, etc).
        self.gene_caller_id_prefix = 'g'

        self.available_parsers = {'0.12.6': self.__parser_1}

        self.check_version(use_version)

        if self.num_threads:
            return

        # the number of threads is not set: warn, and pause so the user can cancel
        try:
            run.warning("You have not set the number of threads, and the default is whatever the default is for eggnog-mapper. You\
                             may really want to change that since if you have a large number of genes to annotate, this may take a very\
                             long time. If you don't want to see this message again, just set the number of threads you want eggnog-mapper\
                             to use explicitly. You can press CTRL + C to cancel this run, or simply do nothing since your operation\
                             will contine in probably like 2 seconds or less ... depending how fast you read.")
            time.sleep(25)
        except KeyboardInterrupt:
            sys.exit()