Example #1
    def __init__(self, target_files_dict, num_threads_to_use=1, program_to_use='hmmscan', progress=progress, run=run):
        """A class to streamline HMM runs.

        Notes
        =====
        - HMMER user guide: http://eddylab.org/software/hmmer/Userguide.pdf
        """

        self.num_threads_to_use = num_threads_to_use
        self.program_to_use = program_to_use
        self.progress = progress
        self.run = run

        self.tmp_dirs = []
        self.target_files_dict = {}

        acceptable_programs = ["hmmscan", "hmmsearch"]
        if self.program_to_use not in acceptable_programs:
            raise ConfigError("HMMer class here. You are attempting to use the program %s to run HMMs, but we don't recognize it. The currently "
                              "supported programs are: %s" % (self.program_to_use, ", ".join(acceptable_programs)))

        for source in target_files_dict:
            tmp_dir = filesnpaths.get_temp_directory_path()
            self.tmp_dirs.append(tmp_dir)

            part_file_name = os.path.join(tmp_dir, os.path.basename(target_files_dict[source]))

            # create split FASTA files inside the tmp directory
            self.target_files_dict[source] = utils.split_fasta(target_files_dict[source],
                                                               parts=self.num_threads_to_use,
                                                               prefix=part_file_name)
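A minimal usage sketch for the class above (the input FASTA name is hypothetical; `run_hmmscan` and `clean_tmp_dirs` appear in the Pfam example further below):

    target_files = {'AA:GENE': 'gene_sequences.fa'}  # hypothetical input FASTA
    hmmer = HMMer(target_files, num_threads_to_use=4, program_to_use='hmmsearch')
    # ... run the search, then remove the per-thread temp directories:
    hmmer.clean_tmp_dirs()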
Example #2
    def __init__(self,
                 target_files_dict,
                 num_threads_to_use=1,
                 progress=progress,
                 run=run):
        """A class to streamline HMM runs."""
        self.num_threads_to_use = num_threads_to_use
        self.progress = progress
        self.run = run

        self.tmp_dirs = []
        self.target_files_dict = {}

        for source in target_files_dict:
            tmp_dir = filesnpaths.get_temp_directory_path()
            self.tmp_dirs.append(tmp_dir)

            part_file_name = os.path.join(
                tmp_dir, os.path.basename(target_files_dict[source]))

            # create split FASTA files inside the tmp directory
            self.target_files_dict[source] = utils.split_fasta(
                target_files_dict[source],
                parts=self.num_threads_to_use,
                prefix=part_file_name)
Example #3
    def move_old_COG_data_to_its_new_location(self):
        try:
            filesnpaths.is_output_dir_writable(self.COG_base_dir)
        except:
            raise ConfigError(f"Please read this carefully: The NCBI has made a new release of COGs. To make room for that "
                              f"while maintaining the old COG data from 2014 version, anvi'o needs to move some files around. "
                              f"While anvi'o can do it automatically, your user does not seem to have permission to do that. "
                              f"One alternative is to ask your system administrator to run this program on your behalf. It will "
                              f"solve everything. OR you can ask them to do exactly these steps: (1) go to the directory "
                              f"'{self.COG_base_dir}', (2) create a new directory called `COG14`, and (3) move everything in "
                              f"'{self.COG_base_dir}' (WHICH INCLUDES the files: CATEGORIES.txt, COG.txt, DB_BLAST/ "
                              f"DB_DIAMOND/, MISSING_COG_IDs.cPickle, PID-TO-CID.cPickle, and RAW_DATA_FROM_NCBI/ as well as the "
                              f"hidden file .VERSION) into the new `COG14` directory. Then you will be golden.")

        # we have the write permission, so let's do this.
        tmp_dir = filesnpaths.get_temp_directory_path(just_the_path=True)
        self.run.warning(f"This is a bit important: The NCBI has made a new release of COGs. To make room for that "
                         f"while maintaining the old COG data from 2014 version, anvi'o needs to move some files around. "
                         f"It seems you have the necessary permissions to write into anvi'o misc data directory, so anvi'o "
                         f"will now attempt to do it automatically by first moving things to a temporary directory "
                         f"('{tmp_dir}') and then moving them back into their new target location. If you have not been "
                         f"having an exceptionally bad day, this should go smoothly. But if you see an error below, anvi'o is "
                         f"very sorry for breaking itself on your system :( In which case please find us on our Slack channel "
                         f"and we will try to help you to sort things out.")
        self.progress.new("Moving files around")
        shutil.move(self.COG_base_dir, tmp_dir)
        os.makedirs(self.COG_base_dir)
        shutil.move(tmp_dir, os.path.join(self.COG_base_dir, 'COG14'))
        self.progress.end()

        self.run.info_single("Congratulations! Anvi'o managed to migrate your old data into its new location without breaking "
                             "things. We are all very proud here but let's never do this again.", mc='green', nl_after=1)
Example #4
    def run_prodigal(self, fasta_file_path):
        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.genes_in_contigs = os.path.join(tmp_dir, 'contigs.genes')
        self.proteins_in_contigs = os.path.join(tmp_dir, 'contigs.proteins')

        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Proteins', self.proteins_in_contigs)
        self.run.info('Log file', log_file_path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')
        cmd_line = ('prodigal -i "%s" -o "%s" -a "%s" -p meta >> "%s" 2>&1' %
                    (fasta_file_path, self.genes_in_contigs,
                     self.proteins_in_contigs, log_file_path))
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.proteins_in_contigs):
            raise ConfigError, "Something went wrong with prodigal, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path

        self.progress.end()

        return self.proteins_in_contigs
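For context, a self-contained sketch of the same prodigal invocation using the standard library directly (the function name and paths are hypothetical; assumes `prodigal` is on PATH):

    import subprocess

    def run_prodigal_simple(fasta_path, genes_out, proteins_out, log_path):
        """Minimal sketch: run prodigal in metagenome mode, appending all output to a log."""
        cmd = ['prodigal', '-i', fasta_path, '-o', genes_out, '-a', proteins_out, '-p', 'meta']
        with open(log_path, 'a') as log:
            log.write('CMD: %s\n' % ' '.join(cmd))
            log.flush()  # make sure the CMD line lands in the log before prodigal's output
            subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT, check=True)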
Example #5
    def run_prodigal(self, fasta_file_path):
        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.genes_in_contigs = os.path.join(tmp_dir, 'contigs.genes')
        self.proteins_in_contigs = os.path.join(tmp_dir, 'contigs.proteins')

        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Proteins', self.proteins_in_contigs)
        self.run.info('Log file', log_file_path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')
        cmd_line = ('prodigal -i "%s" -o "%s" -a "%s" -p meta >> "%s" 2>&1' % (fasta_file_path,
                                                                               self.genes_in_contigs,
                                                                               self.proteins_in_contigs,
                                                                               log_file_path))
        with open(log_file_path, "a") as myfile: myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.proteins_in_contigs):
            raise ConfigError, "Something went wrong with prodigal, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path

        self.progress.end()

        return self.proteins_in_contigs
Example #6
    def process(self, aa_sequences_file_path=None):
        if self.search_with not in self.available_db_search_program_targets:
            raise ConfigError("Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\
                                database formatted under the COGs data directory for that program :/ You may need to\
                                re-run the COGs setup, UNLESS, you set up your COG data directory somewhere else than what\
                                anvi'o attempts to use at the moment ('%s'). If that is the case, this may be the best\
                                time to point the right directory using the --cog-data-dir parameter."                                                                                                       % \
                                                                                (self.search_with, self.COG_data_dir))

        if not aa_sequences_file_path and not self.contigs_db_path:
            raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA "
                              "file for AA sequences")

        if aa_sequences_file_path and self.contigs_db_path:
            raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!")

        if self.contigs_db_path:
            dbops.is_contigs_db(self.contigs_db_path)

        if not self.temp_dir_path:
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True
        else:
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content "
                             "when it is done. But she certainly hopes that you will clean those files later.")

            self.remove_temp_dir_path = False

        self.run.info('COG data directory', self.COG_data_dir)
        self.run.info('Directory to store temporary files', self.temp_dir_path)
        self.run.info('Directory will be removed after the run',
                      self.remove_temp_dir_path)

        if not aa_sequences_file_path:
            aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(
                self.contigs_db_path, J(self.temp_dir_path, 'aa_sequences.fa'))

        # do the search
        search_results_tabular = self.search_factory[self.search_with](
            aa_sequences_file_path)

        # convert the output to a hits dict
        self.hits = utils.get_BLAST_tabular_output_as_dict(
            search_results_tabular,
            target_id_parser_func=lambda x: x.split('|')[1])

        # store hits into the contigs database
        self.store_hits_into_contigs_db()

        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
Example #7
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        self.args = args
        self.run = run
        self.progress = progress

        # just to make sure we have what it takes to continue later:
        trnascandriver = trnascan_se.tRNAScanSE(self.args,
                                                skip_sanity_check=True)
        trnascandriver.check_programs(quiet=True)

        self.tmp_directory_path = filesnpaths.get_temp_directory_path()

        P = lambda p: os.path.abspath(os.path.expanduser(p))
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.num_threads = A('num_threads') or 1
        self.hits_file_path = P(
            A('trna_hits_file')
            or os.path.join(self.tmp_directory_path, 'hits_file.txt'))
        self.log_file_path = P(
            A('log_file') or os.path.join(self.tmp_directory_path, 'log.txt'))
        self.cutoff_score = A('trna_cutoff_score') or 20
        self.just_do_it = A('just_do_it')

        self.amino_acids = set([aa for aa in constants.AA_to_codons.keys() if aa != 'STP'])
        self.anticodons = set([acdn for acdn in constants.anticodon_to_AA.keys()
                               if constants.anticodon_to_AA[acdn] in self.amino_acids])

        # the following variable is to meet the requirements of TablesForHMMHits to work with a new HMM
        # source.
        self.source = {'ref': 'Chan and Lowe, https://doi.org/10.1007/978-1-4939-9173-0_1',
                       'kind': 'Transfer_RNAs',
                       'domain': None,
                       'genes': ['%s_%s' % (constants.anticodon_to_AA[acdn], acdn) for acdn in self.anticodons],
                       'target': 'RNA:CONTIG',
                       'noise_cutoff_terms': None,
                       'model': None}

        self.source_name = 'Transfer_RNAs'
        self.kind_of_search = self.source['kind']
        self.domain = self.source['domain']
        self.all_genes_searched_against = self.source['genes']
        self.hmm_model = self.source['model']
        self.reference = self.source['ref']
        self.noise_cutoff_terms = self.source['noise_cutoff_terms']
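A small standalone sketch of how the `genes` list above is derived, using a mock three-entry mapping in place of `anvio.constants.anticodon_to_AA` (the real table covers all anticodons):

    anticodon_to_AA = {'AAT': 'Ile', 'CAT': 'Met', 'TCA': 'STP'}  # mock subset
    amino_acids = set(aa for aa in anticodon_to_AA.values() if aa != 'STP')
    anticodons = set(acdn for acdn, aa in anticodon_to_AA.items() if aa in amino_acids)
    genes = sorted('%s_%s' % (anticodon_to_AA[acdn], acdn) for acdn in anticodons)
    # -> ['Ile_AAT', 'Met_CAT']; anticodons for the stop codon are excluded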
Example #8
    def process(self, aa_sequences_file_path=None):
        if self.search_with not in self.available_search_methods:
            raise ConfigError("Let us start by making it clear that we probably like '%s' as much as you do, but it doesn't\
                               seem to be available on your system OR recognized by the COGs class since anvi'o couldn't\
                               find it among the available search methods. You probably need to try something else :/" \
                                                                                                    % self.search_with)

        if self.search_with not in self.available_db_search_program_targets:
            raise ConfigError("Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\
                               database formatted under the COGs data directory for that program :/ You may need to\
                               re-run the COGs setup, UNLESS, you set up your COG data directory somewhere else than what\
                               anvi'o attempts to use at the moment ('%s'). If that is the case, this may be the best\
                               time to point the right directory using the --cog-data-dir parameter, or the environmental\
                               variable 'ANVIO_COG_DATA_DIR'." % (self.search_with, self.COG_data_dir))

        if not aa_sequences_file_path and not self.contigs_db_path:
            raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA file for AA\
                               sequences")

        if aa_sequences_file_path and self.contigs_db_path:
            raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!")

        if self.contigs_db_path:
            utils.is_contigs_db(self.contigs_db_path)

        if not self.temp_dir_path:
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True
        else:
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content\
                              when it is done. But she certainly hopes that you will clean those files later.")

            self.remove_temp_dir_path = False

        self.run.info('COG data directory', self.COG_data_dir)
        self.run.info('Searching with', self.search_with)
        self.run.info('Directory to store temporary files', self.temp_dir_path)
        self.run.info('Directory will be removed after the run', self.remove_temp_dir_path)

        if not aa_sequences_file_path:
            aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path, J(self.temp_dir_path, 'aa_sequences.fa'))

        # do the search
        search_results_tabular = self.search_methods_factory[self.search_with](aa_sequences_file_path)

        # convert the output to a hits dict
        self.hits = utils.get_BLAST_tabular_output_as_dict(search_results_tabular, target_id_parser_func=lambda x: x.split('|')[1])

        # store hits into the contigs database
        self.store_hits_into_contigs_db()

        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
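The temporary-directory bookkeeping in `process` above follows a reusable pattern: directories the code creates itself are removed at the end, user-supplied ones are kept. A minimal standard-library sketch of that pattern (the class name is illustrative):

    import shutil
    import tempfile

    class TempDirPolicy:
        def __init__(self, temp_dir_path=None):
            if not temp_dir_path:
                self.temp_dir_path = tempfile.mkdtemp()
                self.remove_temp_dir_path = True   # we created it, so we remove it
            else:
                self.temp_dir_path = temp_dir_path
                self.remove_temp_dir_path = False  # user-owned, leave it in place

        def cleanup(self):
            if self.remove_temp_dir_path:
                shutil.rmtree(self.temp_dir_path)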
Example #9
    def is_executable_a_MODELLER_program(self):
        # temp_dir created because log file outputs to wherever fasta_to_pir.py is
        temp_dir = filesnpaths.get_temp_directory_path()
        self.copy_script_to_directory('fasta_to_pir.py',
                                      add_to_scripts_dict=False,
                                      directory=temp_dir)
        test_script = J(temp_dir, 'fasta_to_pir.py')
        test_input = os.path.abspath(
            J(os.path.dirname(anvio.__file__),
              '../tests/sandbox/mock_data_for_structure/proteins.fa'))
        test_output = J(temp_dir, 'test_out')

        command = [self.executable, test_script, test_input, test_output]

        # try and execute the command
        process = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        output, error = process.communicate()

        if process.returncode:
            # modeller has failed
            error = error.decode('utf-8').strip()

            is_licence_key_error = 'Invalid license key' in error
            if is_licence_key_error:
                # it's a valid MODELLER program with no license key
                license_target_file = error.split('\n')[-1]
                raise ConfigError("You're making progress and anvi'o is proud of you! You just need to validate your MODELLER "
                                  "with a license key (it's free). Please go to https://salilab.org/modeller/registration.html "
                                  "to register for a new license. After you receive an e-mail with your key, please open '%s' "
                                  "and replace the characters XXXXX with your own key. Save the file and try again. " % license_target_file)

            else:
                error = "\n" + "\n".join(error.split('\n'))
                print(terminal.c(error, color='red'))
                raise ConfigError("The executable you requested is called `%s`, but anvi'o doesn't agree with you that "
                                  "it is a working MODELLER program. That was determined by running the command `%s`, which raised the "
                                  "error seen above. If you want to specify a specific MODELLER program, you can specify it with "
                                  "`--modeller-executable`." % (self.executable, " ".join(command)))

        # no error was raised. now check if output file exists
        try:
            filesnpaths.is_file_exists(test_output)
        except FilesNPathsError:
            raise ConfigError("The executable you requested is called `%s`, but anvi'o doesn't agree with you that "
                              "it is a working MODELLER program. That was determined by running the command `%s`, which did not "
                              "output the file expected. If you want to specify a specific MODELLER program, you can specify it with "
                              "`--modeller-executable`." % (self.executable, " ".join(command)))
Example #10
    def process(self):
        """
        """
        # will be empty if all sources in self.residue_annotation_sources_info have "skip": True
        residue_annotation_methods = [info["method"] for _, info in self.residue_annotation_sources_info.items() if not info["skip"]]

        # which genes had structures and which did not. this information is added to the structure database self table
        has_structure = {True: [], False: []}

        num_genes_tried = 0
        num_genes_to_try = len(self.genes_of_interest)

        for corresponding_gene_call in self.genes_of_interest:
            # MODELLER outputs a lot of stuff into its working directory. A temporary directory is
            # made for each instance of MODELLER (i.e. each protein), And bits and pieces of this
            # directory are used in the creation of the structure database. If self.full_modeller_output is
            # provided, these directories and their contents are moved into self.full_modeller_output.
            self.args.directory = filesnpaths.get_temp_directory_path()
            self.args.target_fasta_path = filesnpaths.get_temp_file_path()

            # Export sequence
            dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path,
                                                      self.args.target_fasta_path,
                                                      set([corresponding_gene_call]),
                                                      quiet=True)

            # Model structure
            progress_title = 'Modelling gene ID %d; (%d of %d processed)' % (corresponding_gene_call, num_genes_tried, num_genes_to_try)
            modeller_out = self.run_modeller(corresponding_gene_call, progress_title)
            if modeller_out["structure_exists"]:
                self.run.info_single("Gene successfully modelled!", nl_after=1, mc="green")

            has_structure[modeller_out["structure_exists"]].append(str(corresponding_gene_call))

            # Annotate residues
            residue_info_dataframe = None
            if modeller_out["structure_exists"]:
                residue_info_dataframe = self.run_residue_annotation_for_gene(residue_annotation_methods,
                                                                              corresponding_gene_call,
                                                                              modeller_out["best_model_path"])
            # Append info to tables
            self.append_gene_info_to_tables(modeller_out, residue_info_dataframe)

            # Append metadata to self
            self.update_structure_database_meta_table(has_structure)

            if self.full_modeller_output:
                self.dump_results_to_full_output()

            num_genes_tried += 1

        if not has_structure[True]:
            raise ConfigError("Well this is really sad. No structures were modelled, so there is nothing to do. Bye :'(")

        self.structure_db.disconnect()
        self.run.info("Structure database", self.output_db_path)
Example #11
    def process(self):
        output_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(output_dir)
        gene_caller = self.gene_callers[self.gene_caller]()

        gene_calls_dict, protein_sequences_dict = gene_caller.process(self.fasta_file_path, output_dir)

        if not self.debug:
            self.clean_tmp_dirs()

        return gene_calls_dict, protein_sequences_dict
Example #12
    def run_stdin(self, sequences_list, debug=False):
        """Takes a list of tuples for sequences, performs MSA using muscle, returns a dict.

            >>> from anvio.drivers.muscle import Muscle
            >>> m = Muscle()
            >>> m.run_stdin([('seq1', 'ATCATCATCGA'), ('seq2', 'ATCGAGTCGAT')])
            {u'seq1': u'ATCATCATCGA-', u'seq2': u'ATCG-AGTCGAT'}

        """

        tmp_dir = filesnpaths.get_temp_directory_path()
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Running %s' % self.program_name, '%d sequences will be aligned' % len(sequences_list))
        self.run.info('Log file path', log_file_path)

        sequences_data = ''.join(
            ['>%s\n%s\n' % (t[0], t[1]) for t in sequences_list])
        cmd_line = [self.program_name, '-quiet']

        output = utils.run_command_STDIN(cmd_line, log_file_path,
                                         sequences_data)

        if not (len(output) and output[0] == '>'):
            with open(log_file_path, "a") as log_file:
                log_file.write('# THIS IS THE OUTPUT YOU ARE LOOKING FOR:\n\n%s\n' % output)
            raise ConfigError("Drivers::Muscle: Something went wrong with this run :/ The output does not "
                              "look alright. You can find the output in this log file: %s" % log_file_path)

        alignments = {}

        # parse the output, and fill alignments
        defline, seq = None, None
        for line in [o for o in output.split('\n') if len(o)] + ['>']:
            if line.startswith('>'):
                if defline:
                    alignments[defline[1:]] = seq
                defline, seq = line, None
            else:
                if not seq:
                    seq = line
                else:
                    seq += line

        if not debug:
            shutil.rmtree(tmp_dir)

        return alignments
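The parsing loop above relies on a trailing '>' sentinel to flush the final record. A standalone sketch of the same idea (the function name is illustrative):

    def parse_fasta_string(output):
        alignments, defline, seq = {}, None, None
        # the appended '>' sentinel forces the last record to be stored
        for line in [l for l in output.split('\n') if len(l)] + ['>']:
            if line.startswith('>'):
                if defline:
                    alignments[defline[1:]] = seq
                defline, seq = line, None
            else:
                seq = line if not seq else seq + line
        return alignments

    # parse_fasta_string('>seq1\nATCATCATCGA-\n>seq2\nATCG-AGTCGAT\n')
    # -> {'seq1': 'ATCATCATCGA-', 'seq2': 'ATCG-AGTCGAT'}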
Example #13
    def run_stdin(self, sequences_list, debug=False):
        """Takes a list of tuples for sequences, performs MSA using famsa, returns a dict.

            >>> from anvio.drivers.famsa import FAMSA
            >>> f = FAMSA()
            >>> f.run_stdin([('seq1', 'ATCATCATCGA'), ('seq2', 'ATCGAGTCGAT')])
            {u'seq1': u'ATCATCATCGA-', u'seq2': u'ATCG-AGTCGAT'}

        """

        tmp_dir = filesnpaths.get_temp_directory_path()
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Running %s' % self.program_name,
                      '%d sequences will be aligned' % len(sequences_list))
        self.run.info('Log file path', log_file_path)

        sequences_data = ''.join(
            ['>%s\n%s\n' % (t[0], t[1]) for t in sequences_list])
        cmd_line = [self.program_name, 'STDIN', 'STDOUT']

        output = utils.run_command_STDIN(cmd_line, log_file_path,
                                         sequences_data)

        if output[0:5] != 'FAMSA' or output[-6:].strip() != "Done!":
            with open(log_file_path, "a") as log_file:
                log_file.write(
                    '# THIS IS THE OUTPUT YOU ARE LOOKING FOR:\n\n%s\n' %
                    (output))
            raise ConfigError("Drivers::FAMSA: Something is wrong :/ The output does not look like the expected "
                              "output for a proper FAMSA run. You can find the output in this log file: %s" % log_file_path)

        alignments = {}

        # parse the output, and fill alignments
        defline, seq = None, None
        for line in [o for o in output.split('\n')[2:-2] if len(o)] + ['>']:
            if line.startswith('>'):
                if defline:
                    alignments[defline[1:]] = seq
                defline, seq = line, None
            else:
                if not seq:
                    seq = line
                else:
                    seq += line

        if not debug:
            shutil.rmtree(tmp_dir)

        return alignments
Example #14
    def process(self):
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                          len(self.function_catalog), hmm_file,
                                          None, '--cut_ga')

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file,
                                                     alphabet='AA',
                                                     context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database
        functions_dict = {}
        counter = 0
        for hmm_hit in search_results_dict.values():
            functions_dict[counter] = {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function': self.get_function_from_catalog(hmm_hit['gene_hmm_id']),
                'e_value': hmm_hit['e_value'],
            }

            counter += 1

        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)
Example #15
    def process(self):
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args: pass
        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # export AA sequences for genes
        target_files_dict = {'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')}
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['AA:GENE'],
                                                                   simple_headers=True,
                                                                   rna_alphabet=False,
                                                                   report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None, len(self.function_catalog), hmm_file, None, '--cut_ga')

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database
        functions_dict = {}
        counter = 0
        for hmm_hit in search_results_dict.values():
            functions_dict[counter] = {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function': self.get_function_from_catalog(hmm_hit['gene_hmm_id']),
                'e_value': hmm_hit['e_value'],
            }

            counter += 1

        gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if anvio.DEBUG:
            run.warning("The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up\
                         later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug")
        else:
            run.info_single('Cleaning up the temp directory (you can use `--debug` if you would\
                             like to keep it for testing purposes)', nl_before=1, nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
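The manual `counter` in the loop above can also be written with `enumerate`; a minimal, self-contained sketch assuming the same hit-dict shape (the sample hit values are made up):

    search_results = {'hit_0': {'gene_callers_id': 1, 'gene_hmm_id': 'PF00001', 'e_value': 1e-10}}
    functions_dict = {i: {'gene_callers_id': hit['gene_callers_id'],
                          'source': 'Pfam',
                          'accession': hit['gene_hmm_id'],
                          'function': 'hypothetical function text',  # stands in for the catalog lookup
                          'e_value': hit['e_value']}
                      for i, hit in enumerate(search_results.values())}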
Example #16
    def export_sequences_table_in_db_into_FASTA_file(self, table=t.contig_sequences_table_name, output_file_path=None):
        if self.db_type != 'contigs':
            return None

        if output_file_path:
            filesnpaths.is_output_file_writable(output_file_path)
        else:
            output_file_path = os.path.join(
                filesnpaths.get_temp_directory_path(), 'aa_sequences.fa')

        database = db.DB(self.db_path, self.version)

        if table not in database.get_table_names():
            raise ConfigError('Trying to export sequences into a FASTA file, but the table '
                              '"%s" does not seem to be in this database :/' % table)

        if 'sequence' not in database.get_table_structure(table):
            raise ConfigError("You requested to store sequences in table '%s' into a FASTA "
                              "file, however this table does not seem to be a table that "
                              "stores sequence information :(" % table)

        sequences_table = database.get_table_as_dict(table)
        database.disconnect()

        if not len(sequences_table):
            raise ConfigError("There are no sequences to report in table '%s'." % table)

        self.progress.new('Exporting %d sequences into a FASTA file' %
                          len(sequences_table))
        self.progress.update('...')

        sequences_fasta = u.FastaOutput(output_file_path)

        for seq_id in sequences_table:
            sequences_fasta.write_id(seq_id)
            sequences_fasta.write_seq(sequences_table[seq_id]['sequence'],
                                      split=False)

        self.progress.end()
        self.run.info('Sequences',
                      '%d sequences reported.' % (len(sequences_table)))
        self.run.info('FASTA', output_file_path)

        return output_file_path
Example #17
    def is_executable_a_MODELLER_program(self):
        # temp_dir created because log file outputs to wherever fasta_to_pir.py is
        temp_dir = filesnpaths.get_temp_directory_path()
        self.copy_script_to_directory('fasta_to_pir.py', add_to_scripts_dict=False, directory=temp_dir)
        test_script = J(temp_dir, 'fasta_to_pir.py')
        test_input = os.path.abspath(J(os.path.dirname(anvio.__file__), '../tests/sandbox/mock_data_for_structure/proteins.fa'))
        test_output = J(temp_dir, 'test_out')

        command = [self.executable,
                   test_script,
                   test_input,
                   test_output]

        # try and execute the command
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = process.communicate()

        if process.returncode:
            # modeller has failed
            error = error.decode('utf-8').strip()

            is_licence_key_error = True if error.find('Invalid license key') > -1 else False
            if is_licence_key_error:
                # it's a valid MODELLER program with no license key
                license_target_file = error.split('\n')[-1]
                raise ConfigError("You're making progress and anvi'o is proud of you! You just need to validate your MODELLER\
                                   with a license key (it's free). Please go to https://salilab.org/modeller/registration.html\
                                   to register for a new license. After you receive an e-mail with your key, please open '%s'\
                                   and replace the characters XXXXX with your own key. Save the file and try again. " % license_target_file)

            else:
                error = "\n" + "\n".join(error.split('\n'))
                print(terminal.c(error, color='red'))
                raise ConfigError("The executable you requested is called `%s`, but anvi'o doesn't agree with you that\
                                   it is a working MODELLER program. That was determined by running the command `%s`, which raised the\
                                   error seen above. If you want to specify a specific MODELLER program, you can specify it with\
                                   `--modeller-executable`."
                                       % (self.executable, " ".join(command)))

        # no error was raised. now check if output file exists
        try:
            filesnpaths.is_file_exists(test_output)
        except FilesNPathsError:
            raise ConfigError("The executable you requested is called `%s`, but anvi'o doesn't agree with you that\
                               it is a working MODELLER program. That was determined by running the command `%s`, which did not\
                               output the file expected. If you want to specify a specific MODELLER program, you can specify it with\
                               `--modeller-executable`." % (self.executable, " ".join(command)))
Example #18
    def run_stdin(self, sequences_list, debug=False):
        """Takes a list of tuples for sequences, performs MSA using famsa, returns a dict.

            >>> from anvio.drivers.famsa import FAMSA
            >>> f = FAMSA()
            >>> f.run_stdin([('seq1', 'ATCATCATCGA'), ('seq2', 'ATCGAGTCGAT')])
            {u'seq1': u'ATCATCATCGA-', u'seq2': u'ATCG-AGTCGAT'}

        """

        tmp_dir = filesnpaths.get_temp_directory_path()
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Running %s' % self.program_name, '%d sequences will be aligned' % len(sequences_list))
        self.run.info('Log file path', log_file_path)

        sequences_data = ''.join(['>%s\n%s\n' % (t[0], t[1]) for t in sequences_list])
        cmd_line = [self.program_name, 'STDIN', 'STDOUT']

        output = utils.run_command_STDIN(cmd_line, log_file_path, sequences_data)

        if output[0:5] != 'FAMSA' or output[-6:].strip() != "Done!":
            with open(log_file_path, "a") as log_file:
                log_file.write('# THIS IS THE OUTPUT YOU ARE LOOKING FOR:\n\n%s\n' % (output))
            raise ConfigError("Drivers::FAMSA: Something is wrong :/ The output does not look like the expected output\
                               for a proper FAMSA run. You can find the output in this log file: %s" % (log_file_path))

        alignments = {}

        # parse the output, and fill alignments
        defline, seq = None, None
        for line in [o for o in output.split('\n')[2:-2] if len(o)] + ['>']:
            if line.startswith('>'):
                if defline:
                    alignments[defline[1:]] = seq
                defline, seq = line, None
            else:
                if not seq:
                    seq = line
                else:
                    seq += line

        if not debug:
            shutil.rmtree(tmp_dir)

        return alignments
Example #19
    def run_stdin(self, sequences_list, debug=False):
        """Takes a list of tuples for sequences, performs MSA using muscle, returns a dict.

            >>> from anvio.drivers.muscle import Muscle
            >>> m = Muscle()
            >>> m.run_stdin([('seq1', 'ATCATCATCGA'), ('seq2', 'ATCGAGTCGAT')])
            {u'seq1': u'ATCATCATCGA-', u'seq2': u'ATCG-AGTCGAT'}

        """

        tmp_dir = filesnpaths.get_temp_directory_path()
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Running %s' % self.program_name, '%d sequences will be aligned' % len(sequences_list))
        self.run.info('Log file path', log_file_path)

        sequences_data = ''.join(['>%s\n%s\n' % (t[0], t[1]) for t in sequences_list])
        cmd_line = [self.program_name, '-quiet']

        output = utils.run_command_STDIN(cmd_line, log_file_path, sequences_data)

        if not (len(output) and output[0] == '>'):
            with open(log_file_path, "a") as log_file:
                log_file.write('# THIS IS THE OUTPUT YOU ARE LOOKING FOR:\n\n%s\n' % (output))
            raise ConfigError("Drivers::Muscle: Something went wrong with this alignment that was working on %d\
                               sequences :/ You can find the output in this log file: %s" % (len(sequences_list), log_file_path))

        alignments = {}

        # parse the output, and fill alignments
        defline, seq = None, None
        for line in [o for o in output.split('\n') if len(o)] + ['>']:
            if line.startswith('>'):
                if defline:
                    alignments[defline[1:]] = seq
                defline, seq = line, None
            else:
                if not seq:
                    seq = line
                else:
                    seq += line

        if not debug:
            shutil.rmtree(tmp_dir)

        return alignments
Example #20
    def export_contigs_in_db_into_FASTA_file(self):
        if self.db_type != 'annotation':
            return None

        database = db.DB(self.db_path, self.version)
        contig_sequences_table = database.get_table_as_dict(t.contig_sequences_table_name)
        database.disconnect()

        self.progress.new('Exporting contigs into a FASTA file')
        self.progress.update('...')
        contigs_fasta_path = os.path.join(filesnpaths.get_temp_directory_path(), 'contigs.fa')
        contigs_fasta = u.FastaOutput(contigs_fasta_path)
        for contig in contig_sequences_table:
            contigs_fasta.write_id(contig)
            contigs_fasta.write_seq(contig_sequences_table[contig]['sequence'], split=False)

        self.progress.end()
        self.run.info('FASTA for contigs', contigs_fasta_path)

        return contigs_fasta_path
Example #21
    def export_contigs_in_db_into_FASTA_file(self):
        if self.db_type != 'contigs':
            return None

        database = db.DB(self.db_path, self.version)
        contig_sequences_table = database.get_table_as_dict(t.contig_sequences_table_name)
        database.disconnect()

        self.progress.new('Exporting contigs into a FASTA file')
        self.progress.update('...')
        contigs_fasta_path = os.path.join(filesnpaths.get_temp_directory_path(), 'contigs.fa')
        contigs_fasta = u.FastaOutput(contigs_fasta_path)
        for contig in contig_sequences_table:
            contigs_fasta.write_id(contig)
            contigs_fasta.write_seq(contig_sequences_table[contig]['sequence'], split=False)

        self.progress.end()
        self.run.info('FASTA for contigs', contigs_fasta_path)

        return contigs_fasta_path
Example #22
    def export_sequences_table_in_db_into_FASTA_file(self, table=t.contig_sequences_table_name, output_file_path=None):
        if self.db_type != 'contigs':
            return None

        if output_file_path:
            filesnpaths.is_output_file_writable(output_file_path)
        else:
            output_file_path = os.path.join(filesnpaths.get_temp_directory_path(), 'sequences.fa')

        database = db.DB(self.db_path, self.version)

        if table not in database.get_table_names():
            raise ConfigError('Trying to export sequences into a FASTA file, but the table\
                                "%s" does not seem to be in this database :/' % (table))

        if 'sequence' not in database.get_table_structure(table):
            raise ConfigError, "You requested to store sequences in table '%s' into a FASTA\
                                file, however this table does not seem to be a table that\
                                stores sequence information :(" % table

        sequences_table = database.get_table_as_dict(table)
        database.disconnect()

        if not len(sequences_table):
            raise ConfigError("There are no sequences to report in table '%s'." % (table))

        self.progress.new('Exporting %d sequences into a FASTA file' % len(sequences_table))
        self.progress.update('...')

        sequences_fasta = u.FastaOutput(output_file_path)

        for seq_id in sequences_table:
            sequences_fasta.write_id(seq_id)
            sequences_fasta.write_seq(sequences_table[seq_id]['sequence'], split=False)

        self.progress.end()
        self.run.info('Sequences', '%d sequences reported.' % (len(sequences_table)))
        self.run.info('FASTA', output_file_path)

        return output_file_path
Example #23
def check_MODELLER(executable=None):
    """Test if MODELLER is going to work.

    Checks the executable exists, that a license exists, and can produce the expected output of a
    modeller executable. Exists outside of the class MODELLER so it does not have to be checked
    every time the class is initialized.

    Parameters
    ==========
    executable : str, None
        The string representation of a binary MODELLER program, e.g. "mod9.21". If None,
        up_to_date_modeller_exec is chosen and tested.

    Returns
    =======
    executable : str
        Returns the executable that you _should_ use, which is not necessarily what is input
    """

    executable = executable if executable else up_to_date_modeller_exec

    scripts_folder = J(os.path.dirname(anvio.__file__),
                       'data/misc/MODELLER/scripts')
    if utils.filesnpaths.is_dir_empty(scripts_folder):
        raise ConfigError(
            "Anvi'o houses all its MODELLER scripts in %s, but your directory "
            "contains no scripts. Why you did dat?" % scripts_folder)

    try:
        utils.is_program_exists(executable)
    except ConfigError as e:
        *prefix, sub_version = up_to_date_modeller_exec.split('.')
        prefix, sub_version = ''.join(prefix), int(sub_version)
        for alternate_version in reversed(
                range(sub_version - 10, sub_version + 10)):
            alternate_program = prefix + '.' + str(alternate_version)
            if utils.is_program_exists(alternate_program, dont_raise=True):
                executable = alternate_program
                break
        else:
            raise ConfigError(
                "Anvi'o needs a MODELLER program to be installed on your system. You didn't specify one "
                "(which can be done with `--modeller-executable`), so anvi'o tried the most recent version "
                "it knows about: '%s'. If you are certain you have it on your system (for instance you can run it "
                "by typing '%s' in your terminal window), you may want to send a detailed bug report. If you "
                "don't have it on your system, check out these installation instructions on our website: "
                "http://merenlab.org/2016/06/18/installing-third-party-software/#modeller"
                % (executable, executable))

    temp_dir = filesnpaths.get_temp_directory_path()
    shutil.copy2(J(scripts_folder, 'fasta_to_pir.py'), temp_dir)

    test_script = J(temp_dir, 'fasta_to_pir.py')
    test_input = J(os.path.dirname(anvio.__file__),
                   'tests/sandbox/mock_data_for_structure/proteins.fa')
    test_output = J(temp_dir, 'test_out')

    command = [executable, test_script, test_input, test_output]

    # try and execute the command
    process = subprocess.Popen(command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()

    if process.returncode:
        # modeller has failed
        error = error.decode('utf-8').strip()

        is_licence_key_error = 'Invalid license key' in error
        if is_licence_key_error:
            # it's a valid MODELLER program with no license key
            license_target_file = error.split('\n')[-1]
            raise ConfigError(
                "You're making progress and anvi'o is proud of you! You just need to validate your MODELLER "
                "with a license key (it's free). Please go to https://salilab.org/modeller/registration.html "
                "to register for a new license. After you receive an e-mail with your key, please open '%s' "
                "and replace the characters XXXXX with your own key. Save the file and try again. "
                % license_target_file)

        else:
            error = "\n" + "\n".join(error.split('\n'))
            print(terminal.c(error, color='red'))
            raise ConfigError(
                "The executable you requested is called `%s`, but anvi'o doesn't agree with you that "
                "it is a working MODELLER program. That was determined by running the command `%s`, which raised the "
                "error seen above. If you want to specify a specific MODELLER program, you can specify it with "
                "`--modeller-executable`." % (executable, " ".join(command)))

    # no error was raised. now check if output file exists
    try:
        filesnpaths.is_file_exists(test_output)
    except FilesNPathsError:
        raise ConfigError(
            "The executable you requested is called `%s`, but anvi'o doesn't agree with you that "
            "it is a working MODELLER program. That was determined by running the command `%s`, which did not "
            "output the file expected. If you want to specify a specific MODELLER program, you can specify it with "
            "`--modeller-executable`." % (executable, " ".join(command)))

    return executable
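A short usage sketch for `check_MODELLER` (the version string is hypothetical):

    # let the function test up_to_date_modeller_exec and nearby sub-versions:
    executable = check_MODELLER()
    # or insist on a specific binary:
    executable = check_MODELLER(executable='mod9.21')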
Example #24
    def agglomerate(self, max_mismatch_freq=0, priority_function=None):
        """Agglomerate sequences by aligning all to all and then remapping alignments to seed references.

        Sets the attributes, `agglom_aligned_query_dict` and `agglom_aligned_ref_dict`:
        the former maps each input sequence name to a `sequence.AlignedQuery` object
        and the latter maps input sequence names corresponding to cluster seed sequences to `sequence.AlignedReference` objects.
        AlignedQuery and AlignedReference objects each contain a list of Alignment objects which relate queries to references.

        Parameters
        ==========
        max_mismatch_freq : float, 0
            The maximum mismatch frequency, lying in the interval [0, 1), allowed in alignments.
            The higher the value, the larger the agglomerated clusters.

        priority_function : function reference, None
            The priority function maps each input sequence to a sortable rank that determines the order in which sequences seed clusters.
            By default, the priority function ranks in descending order of sequence length,
            then descending order of number of alignments, then in ascending order of sequence name.
            (The longest sequence with the most alignments is the first to seed a cluster.)
        """
        progress = terminal.Progress()
        progress.new("Agglomerating")
        progress.update("Writing FASTA file of sequences")
        temp_dir_path = filesnpaths.get_temp_directory_path()
        fasta_path = os.path.join(temp_dir_path, 'seqs.fa')
        seq_dict = {}
        with open(fasta_path, 'w') as fasta_file:
            for name, seq_string in zip(self.seq_names, self.seq_strings):
                fasta_file.write(f">{name}\n{seq_string}\n")
                seq_dict[name] = seq_string
        progress.end()

        align_df = Vmatch(argparse.Namespace(match_mode='query_substring_with_mismatches',
                                             fasta_db_file=fasta_path,
                                             fasta_query_file=fasta_path,
                                             num_threads=self.num_threads,
                                             max_hamming_dist=math.ceil(max(map(len, self.seq_strings)) * max_mismatch_freq),
                                             min_ident=int(100 - 100 * max_mismatch_freq),
                                             align_output_length=10,
                                             temp_dir=temp_dir_path)).search_queries()

        pid = "Parsing alignments"
        progress.new(pid)
        # The dictionary of aligned queries is named `agglom_aligned_query_dict` to indicate that
        # its AlignedQuery objects are modified during agglomeration.
        agglom_aligned_query_dict = {}
        aligned_ref_dict = {}
        num_processed_aligns = -1
        parsing_progress_interval = 10000
        total_align_count = len(align_df)
        pp_total_align_count = pp(total_align_count)
        for query_name, query_align_df in align_df.groupby('query_name'):
            query_seq_string = seq_dict[query_name]
            query_length = len(query_seq_string)
            aligned_query = AlignedQuery(query_seq_string, query_name)
            agglom_aligned_query_dict[query_name] = aligned_query
            for target_name, query_start_in_target, mismatch_positions in zip(query_align_df['target_name'],
                                                                              query_align_df['query_start_in_target'],
                                                                              query_align_df['mismatch_positions']):
                num_processed_aligns += 1
                if num_processed_aligns % parsing_progress_interval == 0:
                    pp_progress_interval_end = pp(total_align_count if num_processed_aligns + parsing_progress_interval > total_align_count else num_processed_aligns + parsing_progress_interval)
                    progress.update_pid(pid)
                    progress.update(f"{pp(num_processed_aligns + 1)}-{pp_progress_interval_end}/{pp_total_align_count}")

                try:
                    aligned_target = aligned_ref_dict[target_name]
                except KeyError:
                    aligned_target = AlignedTarget(seq_dict[target_name], target_name)
                    aligned_ref_dict[target_name] = aligned_target

                # Convert the positions of mismatches into a cigar tuple for the alignment. The
                # search method ensured that each alignment contains at least one mismatch.
                cigartuples = []
                prev_mismatch_pos = -2
                for mismatch_num, mismatch_pos in enumerate(map(int, mismatch_positions.split(','))):
                    if prev_mismatch_pos == -2:
                        # This is the first mismatch in the alignment.
                        if mismatch_pos > 0:
                            # There is not a mismatch at the first position of the query.
                            cigartuples.append((7, mismatch_pos))
                        cigartuples.append((8, 1))
                    elif mismatch_pos == prev_mismatch_pos + 1:
                        # This mismatch follows another mismatch.
                        cigartuples[-1] = (8, cigartuples[-1][1] + 1)
                    else:
                        cigartuples.append((7, mismatch_pos - prev_mismatch_pos - 1))
                        cigartuples.append((8, 1))
                    prev_mismatch_pos = mismatch_pos
                if query_length - prev_mismatch_pos > 1:
                    cigartuples.append((7, query_length - prev_mismatch_pos - 1))

                alignment = Alignment(0, query_start_in_target, cigartuples, aligned_query, aligned_target)
                # The Alignment doesn't need to be added to the AlignedQuery object, as these are
                # changed later when queries are remapped.
                aligned_target.alignments.append(alignment)
        del seq_dict
        gc.collect()
        progress.end()

        pid = "Agglomerating aligned reference seqs"
        progress.new(pid)
        progress.update("...")

        if priority_function is None:
            priority_function = lambda aligned_ref: (-len(aligned_ref.seq_string),
                                                     -len(aligned_ref.alignments),
                                                     aligned_ref.name)

        for agglom_aligned_query in agglom_aligned_query_dict.values():
            agglom_aligned_query.alignments = []

        # Agglomerated clusters should preferentially be seeded
        # by the longest reference sequences with the most alignments.
        ordered_ref_names = [aligned_ref.name for aligned_ref
                             in sorted(aligned_ref_dict.values(), key=priority_function)]
        ordered_ref_inputs = [(name, i) for i, name in enumerate(ordered_ref_names)]

        # This dict is used to track which sequences have been agglomerated.
        # Keys are sequence names; values are priority rank.
        # When a sequence is agglomerated, either as the reference seed of a cluster or a member,
        # this dict is updated with the priority of its reference seed
        # if the priority is lower (stronger) than the existing priority for the sequence in the dict.
        processed_ref_dict = {name: len(ordered_ref_names) for name in ordered_ref_names}
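        # For example, if reference B is agglomerated into the cluster seeded by a
        # higher-priority reference A, `processed_ref_dict['B']` is lowered to A's
        # priority rank, so B is skipped when its own turn comes up in the loop below.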

        agglom_aligned_refs = []
        agglom_progress_interval = 1000
        total_ref_count = len(ordered_ref_inputs)
        pp_total_ref_count = pp(total_ref_count)
        num_processed_refs = -1
        for agglom_ref_priority, name in enumerate(ordered_ref_names):
            num_processed_refs += 1
            if num_processed_refs % agglom_progress_interval == 0:
                pp_progress_interval_end = pp(total_ref_count if num_processed_refs + agglom_progress_interval > total_ref_count else num_processed_refs + agglom_progress_interval)
                progress.update_pid(pid)
                progress.update(f"{pp(num_processed_refs + 1)}-{pp_progress_interval_end}/{pp_total_ref_count}")

            if agglom_ref_priority >= processed_ref_dict[name]:
                # The reference sequence has already been processed,
                # as it mapped to another reference sequence that had been processed.
                continue

            processed_ref_dict[name] = agglom_ref_priority

            aligned_ref = aligned_ref_dict[name]
            agglom_aligned_ref = AlignedTarget(aligned_ref.seq_string, name=name)

            # Track the references agglomerated with this seed.
            presently_processed_ref_names = [name]

            remapping_stack = deque()
            remapping_stack.append((name, aligned_ref, 0, {}, []))

            while remapping_stack:
                remapping_item = remapping_stack.pop()
                current_ref_name = remapping_item[0]
                current_aligned_ref = remapping_item[1]
                # Record mismatches between query sequences and the agglomerated reference sequence,
                # with the coordinate system being nucleotide positions in the agglomerated reference.
                current_ref_start_in_agglom_ref = remapping_item[2]
                agglom_ref_mismatch_dict = remapping_item[3]
                current_ref_mismatches_to_agglom_ref = remapping_item[4]

                next_remapping_items = []
                for alignment in current_aligned_ref.alignments:
                    alignment_length = alignment.alignment_length
                    aligned_query = alignment.aligned_query
                    query_name = aligned_query.name
                    query_seq_string = aligned_query.seq_string

                    if query_name in presently_processed_ref_names:
                        # The query has already been agglomerated with this seed,
                        # as it mapped to another agglomerated sequence.
                        continue

                    try:
                        prev_ref_priority = processed_ref_dict[query_name]
                    except KeyError:
                        # No sequences aligned to the query sequence.
                        continue

                    # In the next iteration, the current query sequence will be investigated as a reference.
                    presently_processed_ref_names.append(query_name)
                    if agglom_ref_priority < prev_ref_priority:
                        processed_ref_dict[query_name] = agglom_ref_priority

                    # Get the mismatches between the query and the current reference,
                    # with the coordinate system being nucleotide positions in the current reference.
                    query_mismatches_to_current_ref_in_alignment_frame = []
                    current_ref_seq_string = alignment.aligned_target.seq_string
                    alignment_start_in_current_ref = alignment.target_start
                    current_ref_pos = alignment_start_in_current_ref
                    for cigartuple in alignment.cigartuples:
                        if cigartuple[0] == 8:
                            for incremental_pos in range(cigartuple[1]):
                                mismatch_pos = current_ref_pos + incremental_pos
                                current_ref_nt = current_ref_seq_string[mismatch_pos]
                                query_mismatches_to_current_ref_in_alignment_frame.append(
                                    (mismatch_pos - alignment_start_in_current_ref, current_ref_nt)
                                )
                        current_ref_pos += cigartuple[1]

                    # Position of the alignment in the coordinate system of the agglomerated reference sequence
                    alignment_start_in_agglom_ref = current_ref_start_in_agglom_ref + alignment_start_in_current_ref
                    alignment_end_in_agglom_ref = alignment_start_in_agglom_ref + alignment_length
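                    # e.g., if the current reference begins at position 100 of the agglomerated
                    # reference and this alignment begins at position 5 of the current reference,
                    # the alignment occupies agglomerated positions 105 to 105 + alignment_length.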

                    # Record mismatches between the query and current reference
                    # that are also mismatches between the query and agglomerated reference.
                    # When a new mismatch with the agglomerated reference is encountered,
                    # it is added to the dict of all agglomerated reference mismatches.
                    query_mismatches_to_agglom_ref = []
                    query_mismatches_to_agglom_ref_in_alignment_frame = []
                    for alignment_pos, current_ref_nt in query_mismatches_to_current_ref_in_alignment_frame:
                        agglom_ref_pos = alignment_pos + alignment_start_in_agglom_ref
                        agglom_ref_nt = agglom_ref_mismatch_dict.get(agglom_ref_pos)
                        if agglom_ref_nt:
                            query_nt = query_seq_string[alignment_pos]
                            if agglom_ref_nt == query_nt:
                                continue
                        else:
                            # This nucleotide in the agglomerated reference sequence has matched all other aligned sequences thus far.
                            agglom_ref_nt = current_ref_nt
                            agglom_ref_mismatch_dict[agglom_ref_pos] = agglom_ref_nt
                        query_mismatches_to_agglom_ref.append((agglom_ref_pos, agglom_ref_nt))
                        query_mismatches_to_agglom_ref_in_alignment_frame.append((alignment_pos, agglom_ref_nt))

                    # Record mismatches between the query and agglomerated reference
                    # at positions where the query matches the current reference.
                    query_mismatch_to_current_ref_in_agglom_ref_frame_positions = [
                        alignment_pos + alignment_start_in_agglom_ref
                        for alignment_pos, _ in query_mismatches_to_current_ref_in_alignment_frame
                    ]
                    for agglom_ref_pos, agglom_ref_nt in current_ref_mismatches_to_agglom_ref:
                        if agglom_ref_pos in query_mismatch_to_current_ref_in_agglom_ref_frame_positions:
                            # The mismatch position has already been considered,
                            # as there is a mismatch between the query and current reference at this position as well.
                            continue
                        if alignment_start_in_agglom_ref <= agglom_ref_pos < alignment_end_in_agglom_ref:
                            # Only consider mismatches within the bounds of the alignment between the query and current reference.
                            query_mismatches_to_agglom_ref.append((agglom_ref_pos, agglom_ref_nt))
                            query_mismatches_to_agglom_ref_in_alignment_frame.append(
                                (agglom_ref_pos - alignment_start_in_agglom_ref, agglom_ref_nt)
                            )

                    # Change the properties of the alignment to reflect remapping to the agglomerated reference.
                    cigartuples = []
                    # Sort mismatches by position.
                    query_mismatches_to_agglom_ref_in_alignment_frame.sort(key=lambda query_mismatch_item: query_mismatch_item[0])
                    prev_alignment_pos = -1
                    prev_agglom_ref_nt = ''
                    for alignment_pos, agglom_ref_nt in query_mismatches_to_agglom_ref_in_alignment_frame:
                        if alignment_pos > prev_alignment_pos + 1:
                            cigartuples.append((7, alignment_pos - prev_alignment_pos - 1))
                        if cigartuples:
                            if cigartuples[-1][0] == 8:
                                cigartuples[-1] = (8, cigartuples[-1][1] + 1)
                            else:
                                cigartuples.append((8, 1))
                        else:
                            cigartuples.append((8, 1))
                        prev_alignment_pos = alignment_pos
                        prev_agglom_ref_nt = agglom_ref_nt
                    if alignment_length > prev_alignment_pos + 1:
                        cigartuples.append((7, alignment_length - prev_alignment_pos - 1))

                    agglom_aligned_query = agglom_aligned_query_dict[query_name]
                    agglom_alignment = Alignment(alignment.query_start,
                                                 alignment_start_in_agglom_ref,
                                                 cigartuples,
                                                 aligned_query=agglom_aligned_query,
                                                 aligned_target=agglom_aligned_ref)
                    agglom_aligned_query.alignments.append(agglom_alignment)
                    agglom_aligned_ref.alignments.append(agglom_alignment)

                    next_remapping_items.append(
                        (query_name,
                         aligned_ref_dict[query_name],
                         alignment_start_in_agglom_ref,
                         dict(agglom_ref_mismatch_dict),
                         list(query_mismatches_to_agglom_ref))
                    )

                for next_remapping_item in next_remapping_items[::-1]:
                    remapping_stack.append(next_remapping_item)

            agglom_aligned_refs.append(agglom_aligned_ref)
        agglom_aligned_ref_dict = {ref.name: ref for ref in agglom_aligned_refs}

        self.agglom_aligned_query_dict = agglom_aligned_query_dict
        self.agglom_aligned_ref_dict = agglom_aligned_ref_dict
        progress.end()
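
The cigar tuples threaded through the code above appear to follow the SAM/pysam numeric convention, in which operation 7 is a sequence match ('=') and operation 8 is a mismatch ('X'). As a debugging aid, a small, hypothetical helper (not part of the class above) could render them as a CIGAR string:

    def cigartuples_to_string(cigartuples):
        """Render pysam-style cigartuples, e.g. [(7, 4), (8, 2)] -> '4=2X'."""
        op_codes = {7: '=', 8: 'X'}
        return ''.join('%d%s' % (length, op_codes[op]) for op, length in cigartuples)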
Example #26
0
    def process(self, aa_sequences_file_path=None):
        if self.search_with not in self.available_search_methods:
            raise ConfigError("Let us start by making it clear that we probably like '%s' as much as you do, but it doesn't\
                               seem to be available on your system OR recognized by the COGs class since anvi'o couldn't\
                               find it among the available search methods. You probably need to try something else :/" \
                                                                                                    % self.search_with)

        if self.search_with not in self.available_db_search_program_targets:
            raise ConfigError(
                "Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\
                               database formatted under the COGs data directory for that program :/ You may need to\
                               re-run the COGs setup (anvi-setup-ncbi-cogs), UNLESS, you set up your COG data directory \
                               somewhere else than what anvi'o attempts to use at the moment ('%s'). If that is the case, \
                               this may be the best time to point the right directory using the --cog-data-dir parameter, \
                               or the environmental variable 'ANVIO_COG_DATA_DIR'."
                % (self.search_with, self.COG_data_dir))

        if not aa_sequences_file_path and not self.contigs_db_path:
            raise ConfigError(
                "You either need to provide an anvi'o contigs database path, or a FASTA file for AA\
                               sequences")

        if aa_sequences_file_path and self.contigs_db_path:
            raise ConfigError(
                "You can't provide both an AA sequences file and a contigs database. Choose one!"
            )

        if self.contigs_db_path:
            utils.is_contigs_db(self.contigs_db_path)

        if not self.temp_dir_path:
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True
        else:
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning(
                "Because you set the temporary directory path by hand, anvi'o will not remove its content\
                              when it is done. But she certainly hopes that you will clean those files later."
            )

            self.remove_temp_dir_path = False

        self.run.info('COG data directory', self.COG_data_dir)
        self.run.info('Searching with', self.search_with)
        self.run.info('Directory to store temporary files', self.temp_dir_path)
        self.run.info('Directory will be removed after the run',
                      self.remove_temp_dir_path)

        if not aa_sequences_file_path:
            aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(
                self.contigs_db_path, J(self.temp_dir_path, 'aa_sequences.fa'))

        # do the search
        search_results_tabular = self.search_methods_factory[self.search_with](
            aa_sequences_file_path)

        # convert the output to a hits dict
        self.hits = utils.get_BLAST_tabular_output_as_dict(
            search_results_tabular,
            target_id_parser_func=lambda x: x.split('|')[1])

        # store hits into the contigs database
        self.store_hits_into_contigs_db()

        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
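
The `target_id_parser_func` above appears to assume NCBI-style deflines of the form `gi|<number>|ref|<accession>|`, from which it extracts the second pipe-delimited field. A quick sanity check of that parsing logic on an illustrative (made-up) defline:

    parse_target_id = lambda x: x.split('|')[1]
    assert parse_target_id('gi|446057344|ref|WP_000135199.1|') == '446057344'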
Example #27
0
    def run_hmmscan(self,
                    source,
                    genes_in_model,
                    hmm,
                    ref,
                    cut_off_flag="--cut_ga"):
        self.run.warning('',
                         header='HMM Profiling for %s' % source,
                         lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))
        self.run.info('Number of CPUs will be used for search',
                      self.num_threads_to_use)

        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        # the gzipped model is read as bytes, so the destination must be opened in binary mode
        hmm_file = open(hmm_file_path, 'wb')
        hmm_file.write(gzip.open(hmm, 'rb').read())
        hmm_file.close()
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')
        cmd_line = ('hmmpress "%s" >> "%s" 2>&1' %
                    (hmm_file_path, log_file_path))
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        ret_val = utils.run_command(cmd_line)
        if ret_val:
            raise ConfigError, "The last call did not work quite well. Most probably the version of HMMER\
                                you have installed is not up-to-date enough. Just to make sure what went\
                                wrong please take a look at the log file ('%s'). Please visit %s to see what\
                                is the latest version availalbe. You can learn which version of HMMER you have\
                                on your system by typing 'hmmpress -h'"\
                                        % (log_file_path, 'http://hmmer.janelia.org/download.html')
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        cmd_line = ('hmmscan -o "%s" %s --cpu %d --tblout "%s" "%s" "%s" >> "%s" 2>&1' \
                                        % (self.hmm_scan_output,
                                           cut_off_flag,
                                           self.num_threads_to_use,
                                           self.hmm_scan_hits_shitty,
                                           hmm_file_path,
                                           self.protein_sequences_fasta,
                                           log_file_path))

        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            raise ConfigError, "Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
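        # per the HMMER user guide, each --tblout data row has 18 whitespace-delimited
        # fields (target/query names and accessions, full-sequence and best-domain
        # statistics, etc.) followed by a free-text target description; keeping only
        # fields [0:18] drops the description, the one field that may contain spaces.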
        parseable_output = open(self.hmm_scan_hits, 'w')
        for line in open(self.hmm_scan_hits_shitty).readlines():
            if line.startswith('#'):
                continue
            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')
        parseable_output.close()

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Example #28
0
    def export_sequences_table_in_db_into_FASTA_file(
        self,
        table=t.contig_sequences_table_name,
        output_file_path=None,
        item_names=set([])):
        '''Exports a sequence table from the contigs database.

        - t.contig_sequences_table_name: contig sequences (where item_names are contig names)
        - t.gene_amino_acid_sequences_table_name: amino acid sequences for gene calls (item_names are gene caller ids)

        If `item_names` are specified, only those sequences with matching ids to something in this set will be reported.
        '''

        if self.db_type != 'contigs':
            return None

        if not isinstance(item_names, set):
            raise ConfigError("`item_names` must be of type `set`")

        if output_file_path:
            filesnpaths.is_output_file_writable(output_file_path)
        else:
            output_file_path = os.path.join(
                filesnpaths.get_temp_directory_path(), 'aa_sequences.fa')

        database = db.DB(self.db_path, self.version)

        if table not in database.get_table_names():
            raise ConfigError(
                'Trying to export sequences into a FASTA file, but the table\
                                "%s" does not seem to be in this database :/' %
                (table))

        if 'sequence' not in database.get_table_structure(table):
            raise ConfigError(
                "You requested to store sequences in table '%s' into a FASTA\
                                file, however this table does not seem to be a table that\
                                stores sequence information :(" % table)

        sequences_table = database.get_table_as_dict(table)
        database.disconnect()

        if len(item_names):
            total_num_items_in_db = len(sequences_table)
            item_names_to_remove = set(list(
                sequences_table.keys())).difference(item_names)

            for item_name in item_names_to_remove:
                sequences_table.pop(item_name)

            # who does this for their users:
            num_items_to_be_reported = len(sequences_table)
            optional_info = ("It turned out %d of the item ids you requested were actually in the database." \
                                    % len(sequences_table)) if num_items_to_be_reported != len(item_names) else ''

            if not self.quiet:
                self.run.info_single("You asked anvi'o to report only %d items from a database that contained %d. %s" \
                                            % (len(item_names), total_num_items_in_db, optional_info))

        if not len(sequences_table):
            raise ConfigError(
                "There are no sequences to report in table '%s'." % (table))

        self.progress.new('Exporting %d sequences into a FASTA file' %
                          len(sequences_table))
        self.progress.update('...')

        sequences_fasta = u.FastaOutput(output_file_path)

        blank_seq_ids_not_reported = set([])

        for seq_id in sequences_table:
            if len(sequences_table[seq_id]['sequence']):
                sequences_fasta.write_id(seq_id)
                sequences_fasta.write_seq(sequences_table[seq_id]['sequence'],
                                          split=False)
            else:
                blank_seq_ids_not_reported.add(seq_id)

        self.progress.end()

        if len(blank_seq_ids_not_reported):
            self.run.warning(
                "%d entries in the sequences table had blank sequences :/ This is related to the issue\
                             at https://github.com/merenlab/anvio/issues/565. If this is like mid-2018 and you still\
                             get this warning, please find an anvi'o developer and make them feel embarrassed. If it\
                             is earlier than take this as a simple warning that some gene calls in your downstream\
                             analyses may have no sequences, and that's OK. This is a very minor issue due to on-the-fly\
                             addition of Ribosomal RNA gene calls to the contigs database, and will likely will not\
                             affect anything major. This warning will go away when anvi'o can seamlessly work with\
                             multiple gene callers (which we are looking forward to implement in the future)."
                % len(blank_seq_ids_not_reported))

        self.run.info(
            'Sequences', '%d sequences reported.' %
            (len(sequences_table) - len(blank_seq_ids_not_reported)))
        self.run.info('FASTA', output_file_path)

        return output_file_path
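
A minimal usage sketch for the method above (the database object, output path, and gene caller ids here are hypothetical; `t` is the anvi'o tables module this code already relies on):

    fasta_path = contigs_db.export_sequences_table_in_db_into_FASTA_file(
        table=t.gene_amino_acid_sequences_table_name,
        output_file_path='aa_sequences.fa',
        item_names={0, 1, 2})  # report only these gene caller ids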
Example #29
0
    def populate_search_tables(self, sources={}):
        # if we end up generating a temporary file for amino acid sequences:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError("You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an\
                                   HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run\
                                   HMM profiles that do not require gene calls to be present (such as the HMM profile that identifies Ribosomal\
                                   RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter\
                                   '--installed-hmm-profile Ribosomal_RNAs')." % (context, alphabet))

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            class Args: pass
            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                                           simple_headers=True,
                                                                           rna_alphabet=True if alphabet=='RNA' else False,
                                                                           report_aa_sequences=True if alphabet=='AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know.")
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(self.db_path,
                                                           target_files_dict['%s:CONTIG' % alphabet],
                                                           rna_alphabet=True if alphabet=='RNA' else False)

        commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                      alphabet,
                                                      context,
                                                      kind_of_search,
                                                      domain,
                                                      len(all_genes_searched_against),
                                                      hmm_model,
                                                      reference,
                                                      noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)


            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names that contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene callers ids in it. so there are two things we need to
                # do. one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                if source != "Ribosomal_RNAs":
                    self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM\
                                      operation is not directly working with gene calls anvi'o already knows about, the resulting\
                                      hits will need to be added as 'new gene calls' into the contigs database. So far so good.\
                                      But because we are in the contigs realm rather than the genes realm, it is likely that\
                                      resulting hits will not correspond to open reading frames that are supposed to be\
                                      translated (such as ribosomal RNAs), because otherwise you would be working with genes\
                                      instead of defining CONTIGS as your context in that HMM profile you just used, unless you\
                                      are not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the\
                                      new gene calls it will recover through these HMMs. Please take a moment and you be the\
                                      judge of whether this will influence your pangenomic analyses or other things you thought\
                                      you would be doing with the result of this HMM search downstream. If you do not feel like\
                                      being the judge of anything today you can move on yet remember to remember this if things\
                                      look somewhat weird later on.",
                                     header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                     lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                           search_results_dict,
                                                                                                           skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails because "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)
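
For reference, each entry of the `sources` dictionary consumed above must carry the keys unpacked in the loop ('target', 'kind', 'domain', 'genes', 'model', 'ref', 'noise_cutoff_terms'). An illustrative (made-up) entry might look like:

    sources = {
        'Ribosomal_RNAs': {
            'target': 'RNA:CONTIG',
            'kind': 'Ribosomal_RNAs',
            'domain': None,
            'genes': ['16S_rRNA', '23S_rRNA'],
            'model': '/path/to/hmm.txt.gz',
            'ref': 'https://github.com/tseemann/barrnap',
            'noise_cutoff_terms': '--cut_ga',
        },
    }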
Example #30
0
    def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag = "--cut_ga"):
        self.run.warning('', header = 'HMM Profiling for %s' % source, lc = 'green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))

        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        # the gzipped model is read as bytes, so the destination must be opened in binary mode
        hmm_file = open(hmm_file_path, 'wb')
        hmm_file.write(gzip.open(hmm, 'rb').read())
        hmm_file.close()
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')
        cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
        with open(log_file_path, "a") as myfile: myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')
        cmd_line = ('hmmscan -o "%s" %s --tblout "%s" "%s" "%s" >> "%s" 2>&1' % (self.hmm_scan_output,
                                                                              cut_off_flag,
                                                                              self.hmm_scan_hits_shitty,
                                                                              hmm_file_path,
                                                                              self.proteins_in_contigs,
                                                                              log_file_path))
        with open(log_file_path, "a") as myfile: myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            raise ConfigError, "Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        parseable_output = open(self.hmm_scan_hits, 'w')
        for line in open(self.hmm_scan_hits_shitty).readlines():
            if line.startswith('#'):
                continue
            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')
        parseable_output.close()

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Example #31
0
File: hmmer.py Project: meren/anvio
    def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context\
                                to clarify whether the HMM search is supposed to be done using alphabets DNA,\
                                RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it\
                                doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

        self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N/A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes', num_genes_in_model)
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        hmm_file = open(hmm_file_path, 'wb')
        hmm_file.write(gzip.open(hmm, 'rb').read())
        hmm_file.close()
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')

        cmd_line = ['hmmpress', hmm_file_path]
        ret_val = utils.run_command(cmd_line, log_file_path)

        if ret_val:
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have\
                               installed is either not up-to-date enough, or too new :/ Just to make sure what went\
                               wrong please take a look at the log file ('%s'). Please visit %s to see what\
                               is the latest version available if you think updating HMMER can resolve it. You can\
                               learn which version of HMMER you have on your system by typing 'hmmpress -h'."\
                                       % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan',
                    '-o', self.hmm_scan_output, *noise_cutoff_terms.split(),
                    '--cpu', self.num_threads_to_use,
                    '--tblout', self.hmm_scan_hits_shitty,
                    hmm_file_path, self.target_files_dict[target]]

        utils.run_command(cmd_line, log_file_path)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            self.progress.end()
            raise ConfigError("Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        parseable_output = open(self.hmm_scan_hits, 'w')
        
        detected_non_ascii = False
        lines_with_non_ascii = []

        with open(self.hmm_scan_hits_shitty, 'rb') as hmm_hits_file:
            line_counter = 0
            for line_bytes in hmm_hits_file:
                line_counter += 1
                line = line_bytes.decode('ascii', 'ignore')

                if not len(line) == len(line_bytes):
                    lines_with_non_ascii.append(line_counter)
                    detected_non_ascii = True

                if line.startswith('#'):
                    continue
            
                parseable_output.write('\t'.join(line.split()[0:18]) + '\n')
        
        parseable_output.close()

        if detected_non_ascii:
            self.run.warning("Just a heads-up, Anvi'o HMMer parser detected non-ascii charachters while processing \
                the file '%s' and cleared them. Here are the line numbers with non-ascii charachters: %s.\
                You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"." % 
                (self.hmm_scan_hits_shitty, ", ".join(map(str, lines_with_non_ascii))))

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
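
The non-ASCII detection above works because `bytes.decode('ascii', 'ignore')` silently drops any byte outside the ASCII range, so a length mismatch between the decoded string and the raw bytes flags the line. A quick illustration:

    line_bytes = b'RF00001 5S ribosomal RNA \xc2\xb1 noise'  # contains a two-byte UTF-8 character
    line = line_bytes.decode('ascii', 'ignore')
    assert len(line) == len(line_bytes) - 2  # the two bytes of the non-ASCII character were dropped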
Example #32
0
    def export_sequences_table_in_db_into_FASTA_file(
            self, table=t.contig_sequences_table_name, output_file_path=None):
        if self.db_type != 'contigs':
            return None

        if output_file_path:
            filesnpaths.is_output_file_writable(output_file_path)
        else:
            output_file_path = os.path.join(
                filesnpaths.get_temp_directory_path(), 'aa_sequences.fa')

        database = db.DB(self.db_path, self.version)

        if table not in database.get_table_names():
            raise ConfigError(
                'Trying to export sequences into a FASTA file, but the table\
                                "%s" does not seem to be in this database :/' %
                (table))

        if 'sequence' not in database.get_table_structure(table):
            raise ConfigError(
                "You requested to store sequences in table '%s' into a FASTA\
                                file, however this table does not seem to be a table that\
                                stores sequence information :(" % table)

        sequences_table = database.get_table_as_dict(table)
        database.disconnect()

        if not len(sequences_table):
            raise ConfigError(
                "There are no sequences to report in table '%s'." % (table))

        self.progress.new('Exporting %d sequences into a FASTA file' %
                          len(sequences_table))
        self.progress.update('...')

        sequences_fasta = u.FastaOutput(output_file_path)

        seq_ids_not_reported = set([])

        for seq_id in sequences_table:
            if len(sequences_table[seq_id]['sequence']):
                sequences_fasta.write_id(seq_id)
                sequences_fasta.write_seq(sequences_table[seq_id]['sequence'],
                                          split=False)
            else:
                seq_ids_not_reported.add(seq_id)

        self.progress.end()

        if len(seq_ids_not_reported):
            self.run.warning(
                "%d entries in the sequences table had blank sequences :/ This is related to the issue\
                             at https://github.com/merenlab/anvio/issues/565. If this is like mid-2018 and you still\
                             get this warning, please find an anvi'o developer and make them feel embarrassed. If it\
                             is earlier than that, take this as a simple warning that some gene calls in your downstream\
                             analyses may have no sequences, and that's OK. This is a very minor issue due to on-the-fly\
                             addition of Ribosomal RNA gene calls to the contigs database, and will likely not\
                             affect anything major. This warning will go away when anvi'o can seamlessly work with\
                             multiple gene callers (which we are looking forward to implement in the future)."
                % len(seq_ids_not_reported))

        self.run.info(
            'Sequences', '%d sequences reported.' %
            (len(sequences_table) - len(seq_ids_not_reported)))
        self.run.info('FASTA', output_file_path)

        return output_file_path
Example #33
0
    def create_search_databases(self):
        """Creates all the search databases"""

        self.progress.new("Creating search databases")
        self.progress.update(
            "Removing any database that still exists in the output directory..."
        )
        for prefix in ['.nhr', '.nin', '.nsq']:
            for database_path in [s['db'] + prefix for s in self.ctx.anticodons.values()]:
                if os.path.exists(database_path):
                    os.remove(database_path)

        # compressing and decompressing FASTA files changes their hash and makes them look
        # modified in git. to avoid that, we will do the database generation in a temporary directory.
        temp_dir = filesnpaths.get_temp_directory_path()

        self.progress.update("Copying FASTA files to %s ..." % (temp_dir))
        # the following line basically returns a dictionary that shows the new path
        # of the FASTA file under temp_dir for a given anticodon .. apologies for the
        # incomprehensible list comprehension
        new_paths = dict([
            (os.path.basename(fasta_path),
             shutil.copy((fasta_path + '.gz'),
                         os.path.join(temp_dir,
                                      os.path.basename(fasta_path) + '.gz')))
            for fasta_path in [s['db'] for s in self.ctx.anticodons.values()]
        ])

        missing_FASTA_files = [
            anticodon for anticodon in self.ctx.anticodons
            if not os.path.exists(new_paths[anticodon])
        ]
        if len(missing_FASTA_files):
            raise ConfigError(
                "Weird news :( Anvi'o is missing some FASTA files that were supposed to be somewhere. Since this "
                "can't be your fault, it is not easy to advice what could be the solution to this. If you are not "
                "an anvi'o programmer working on this problem this very moment, please get in touch with one."
            )

        self.progress.update("Decompressing FASTA files in %s" % (temp_dir))
        new_paths = {anticodon: utils.gzip_decompress_file(new_paths[anticodon],
                                                           keep_original=False)
                     for anticodon in new_paths}

        for anticodon in self.ctx.anticodons:
            self.progress.update("Working on %s in %d threads" %
                                 (anticodon, self.num_threads))

            FASTA_file_path_for_anticodon = new_paths[anticodon]

            # create a BLAST search database for `FASTA_file_path_for_anticodon`
            blast = BLAST(query_fasta=FASTA_file_path_for_anticodon,
                          run=run_quiet,
                          progress=progress_quiet,
                          num_threads=self.num_threads)
            blast.log_file_path = os.path.join(
                os.path.dirname(FASTA_file_path_for_anticodon),
                '%s.log' % anticodon)
            blast.makedb(dbtype='nucl')

            for prefix in ['.nhr', '.nin', '.nsq']:
                if not os.path.exists(FASTA_file_path_for_anticodon + prefix):
                    raise ConfigError(
                        "Something went wrong and BLAST did not create the database file it was supposed to "
                        "for %s :(" % anticodon)
                else:
                    shutil.move(
                        FASTA_file_path_for_anticodon + prefix,
                        os.path.dirname(self.ctx.anticodons[anticodon]['db']))

        shutil.rmtree(temp_dir)

        self.progress.end()
        self.run.info_single(
            "Every FASTA is now turned into a fancy search database. It means you are now allowed to run "
            "`anvi-run-trna-taxonomy` on anvi'o contigs databases. This workflow is very new, and there are "
            "caveats to it just like every other computational approach you use to make sense of complex 'omics "
            "data. To better understand those caveats you should read our online documentation a bit. If you see "
            "things that concerns you, please let anvi'o developers know. They love bad news. If you get good "
            "results from this workflow, thank to those who contributed to the GTDB.",
            nl_after=1,
            mc="green")
Example #34
0
    def find(self,
             sequence,
             sequence_name="(a sequence does not have a name)",
             display_palindromes=False):
        """Find palindromes in a single sequence, and populate `self.palindromes`

        The member function `process` may be a better one to call with an `args` object. See `anvi-search-palindromes`
        for example usage.
        """

        if sequence_name in self.palindromes:
            raise ConfigError(
                f"The sequence '{sequence_name}' is already in `self.palindromes`."
            )
        else:
            self.palindromes[sequence_name] = []

        sequence = sequence.upper()
        sequence_length = len(sequence)

        if sequence_length < self.min_palindrome_length * 2 + self.min_distance:
            self.progress.reset()
            self.run.warning(
                f"The sequence '{sequence_name}', which is only {sequence_length} nts long, is too short "
                f"to find palindromes that are at least {self.min_palindrome_length} nts, with "
                f"{self.min_distance} nucleoties in between :/ Anvi'o will skip it."
            )

        # setup BLAST job
        BLAST_search_tmp_dir = filesnpaths.get_temp_directory_path()
        fasta_file_path = os.path.join(BLAST_search_tmp_dir, 'sequence.fa')
        log_file_path = os.path.join(BLAST_search_tmp_dir, 'blast-log.txt')
        results_file_path = os.path.join(BLAST_search_tmp_dir, 'hits.xml')
        with open(fasta_file_path, 'w') as fasta_file:
            fasta_file.write(f'>sequence\n{sequence}\n')

        # run blast
        blast = BLAST(fasta_file_path,
                      search_program='blastn',
                      run=run_quiet,
                      progress=progress_quiet)
        blast.evalue = 10
        blast.num_threads = self.num_threads
        blast.min_pct_id = 100 - self.max_num_mismatches
        blast.search_output_path = results_file_path
        blast.log_file_path = log_file_path
        blast.makedb(dbtype='nucl')

        if (self.min_palindrome_length < 20 and len(sequence) > 10000
                and not self.user_is_warned_for_potential_performance_issues):
            self.progress.reset()
            self.run.warning(
                f"Please note, you are searching for palindromes that are as short as {self.min_palindrome_length} "
                f"in a sequence that is {pp(len(sequence))} nts long. If your palindrome search takes a VERY long time "
                f"you may want to go for longer palindromes by setting a different `--min-palindrome-length` parameter "
                f"and by increasing the BLAST word size using `--blast-word-size` parameter (please read the help menu first). "
                f"This part of the code does not know if you have many more seqeunces to search, but anvi'o will not "
                f"continue displaying this warning for additional seqeunces to minimize redundant informatio in your "
                f"log files (because despite the popular belief anvi'o can actually sometimes be like nice and all).",
                header="ONE-TIME PERFORMANCE WARNING")
            self.user_is_warned_for_potential_performance_issues = True

        blast.blast(outputfmt='5',
                    word_size=self.blast_word_size,
                    strand='minus')

        # parse the BLAST XML output
        root = ET.parse(blast.search_output_path).getroot()
        for query_sequence_xml in root.findall(
                'BlastOutput_iterations/Iteration'):
            for hit_xml in query_sequence_xml.findall('Iteration_hits/Hit'):

                for hsp_xml in hit_xml.findall('Hit_hsps/Hsp'):
                    p = Palindrome(run=self.run)

                    p.sequence_name = sequence_name
                    p.first_start = int(
                        hsp_xml.find('Hsp_query-from').text) - 1
                    p.first_end = int(hsp_xml.find('Hsp_query-to').text)
                    p.first_sequence = hsp_xml.find('Hsp_qseq').text
                    p.second_start = int(hsp_xml.find('Hsp_hit-to').text) - 1
                    p.second_end = int(hsp_xml.find('Hsp_hit-from').text)
                    p.second_sequence = hsp_xml.find('Hsp_hseq').text
                    p.distance = p.second_start - p.first_start

                    # for each hit, there will be a copy of its reverse complement.
                    # the first half of the if statement below is to control for that
                    # and make sure we keep only one of them. the other half is to
                    # remove those that do not meet the minimum distance criterion.
                    if p.distance < 0 or p.distance < self.min_distance:
                        continue

                    # before we continue, we will test for a special case: internal palindromes
                    # within larger palindromes of 0 distance. IT DOES HAPPEN I PROM.
                    if p.distance == 0:
                        internal_palindrome = False
                        for _p in self.palindromes[sequence_name]:
                            if p.first_start > _p.first_start and p.first_start < _p.first_end:
                                internal_palindrome = True
                                break

                        if internal_palindrome:
                            continue

                    p.length = int(hsp_xml.find('Hsp_align-len').text)

                    if p.length < self.min_palindrome_length:
                        # buckle your seat belt Dorothy, 'cause Kansas is going bye-bye:
                        continue

                    p.num_gaps = int(hsp_xml.find('Hsp_gaps').text)
                    p.num_mismatches = int(
                        hsp_xml.find('Hsp_align-len').text) - int(
                            hsp_xml.find('Hsp_identity').text)
                    p.midline = ''.join([
                        '|'
                        if p.first_sequence[i] == p.second_sequence[i] else 'x'
                        for i in range(0, len(p.first_sequence))
                    ])

                    if p.num_mismatches > self.max_num_mismatches or p.num_gaps > 0:
                        # this is the crazy part: read the function docstring for `get_split_palindromes`.
                        # briefly, we conclude that there are too many mismatches in this match, we will
                        # try and see if there is anything we can salvage from it.
                        p_list = self.get_split_palindromes(
                            p, display_palindromes=display_palindromes)
                    else:
                        # there aren't too many mismatches, and the length checks out. we will continue
                        # processing this hit as a sole palindrome
                        p_list = [p]

                    for sp in p_list:
                        if anvio.DEBUG or display_palindromes or self.verbose:
                            self.progress.reset()
                            sp.display()

                        self.palindromes[sequence_name].append(sp)

        # clean after yourself
        if anvio.DEBUG:
            self.run.info("BLAST temporary dir kept",
                          BLAST_search_tmp_dir,
                          nl_before=1,
                          mc='red')
        else:
            filesnpaths.shutil.rmtree(BLAST_search_tmp_dir)
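A minimal, self-contained sketch of how the `Hsp` fields consumed in the loop above map onto palindrome coordinates. The XML fragment and its values are fabricated for illustration; only the element names follow the NCBI BLAST XML schema parsed above:

import xml.etree.ElementTree as ET

# a fabricated single-HSP fragment; real input comes from `blastn -outfmt 5`
hsp_fragment = """
<Hsp>
  <Hsp_query-from>11</Hsp_query-from>
  <Hsp_query-to>20</Hsp_query-to>
  <Hsp_hit-from>40</Hsp_hit-from>
  <Hsp_hit-to>31</Hsp_hit-to>
  <Hsp_align-len>10</Hsp_align-len>
  <Hsp_identity>10</Hsp_identity>
  <Hsp_gaps>0</Hsp_gaps>
</Hsp>
"""

hsp = ET.fromstring(hsp_fragment)

# same arithmetic as the loop above: query coordinates become the first half
# (converted to 0-based starts), and the reverse-strand hit coordinates the second
first_start = int(hsp.find('Hsp_query-from').text) - 1   # 10
second_start = int(hsp.find('Hsp_hit-to').text) - 1      # 30
distance = second_start - first_start                    # 20
print(first_start, second_start, distance)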
Example #35
0
    def populate_search_tables(self, sources={}):
        # fall back to the default anvi'o HMM sources if none were provided:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        for target in targets:

            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            self.run.info('Target found', '%s:%s' % (alphabet, context))

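            # `Args` is a bare namespace object standing in for an argparse result;
            # attribute access (here, `contigs_db`) is all ContigsSuperclass needs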
            class Args:
                pass

            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args)

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
                    output_file_path=target_files_dict['%s:GENE' % alphabet],
                    simple_headers=True,
                    rna_alphabet=True if alphabet == 'RNA' else False,
                    report_aa_sequences=True if alphabet == 'AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know."
                    )
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                        tmp_directory_path,
                        '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(
                        self.db_path,
                        target_files_dict['%s:CONTIG' % alphabet],
                        rna_alphabet=True if alphabet == 'RNA' else False)

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(
                source, alphabet, context, kind_of_search, domain,
                all_genes_searched_against, hmm_model, reference,
                noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt,
                                                             alphabet=alphabet,
                                                             context=context)
                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here, because our search results dictionary contains no gene calls, but
                # contig names that contain our hits. on the other hand, the rest of the code outside of this
                # if statement expects a `search_results_dict` with gene caller ids in it. so there are two
                # things we need to do. one is to come up with some new gene calls and add them to the contigs
                # database so things will go smoothly downstream. two, we will need to update our
                # `search_results_dict` so it looks like a dictionary the rest of the code expects with
                # `gene_callers_id` fields. both of these steps are taken care of in the following function. magic.

                self.run.warning(
                    "Alright! You just called an HMM profile that runs on contigs. Because it is not\
                                 working with anvi'o gene calls directly, the resulting hits will need to be added\
                                 as 'new gene calls' into the contigs database. This is a new feature, and if it\
                                 starts screwing things up for you please let us know. Other than that you're pretty\
                                 much golden. Carry on.",
                    header="Psst. Your fancy HMM profile '%s' speaking" %
                    source,
                    lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search, search_results_dict)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails because "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)
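A quick aside on the `'%s:GENE' % alphabet` keys built above: they follow the `ALPHABET:CONTEXT` convention that `utils.anvio_hmm_target_term_to_alphabet_and_context` decodes. A toy sketch of the assumed decomposition (the real helper in `anvio.utils` likely performs more validation than this):

def toy_target_term_to_alphabet_and_context(target_term):
    """Hypothetical simplification of the anvio.utils helper used above."""
    alphabet, context = target_term.split(':')
    if alphabet not in ('AA', 'DNA', 'RNA'):
        raise ValueError("unknown alphabet in target term: %s" % target_term)
    if context not in ('GENE', 'CONTIG', 'DOMAIN'):
        raise ValueError("unknown context in target term: %s" % target_term)
    return alphabet, context

assert toy_target_term_to_alphabet_and_context('AA:GENE') == ('AA', 'GENE')
assert toy_target_term_to_alphabet_and_context('RNA:CONTIG') == ('RNA', 'CONTIG')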
Example #36
0
    def process(self):
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # get an instance of gene functions table
        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
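        # the positional arguments follow the run_hmmscan signature used elsewhere in this
        # collection: (source, alphabet, context, kind, domain, num_genes_in_model, hmm,
        # ref, noise_cutoff_terms)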
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                          len(self.function_catalog), hmm_file,
                                          None, '--cut_ga')

        if not hmm_hits_file:
            run.info_single(
                "The HMM search returned no hits :/ So there is nothing to add to the contigs database. But "
                "now anvi'o will add PFAMs as a functional source with no hits, clean the temporary directories "
                "and gracefully quit.",
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
            gene_function_calls_table.add_empty_sources_to_functional_sources(
                {'Pfam'})
            return

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file,
                                                     alphabet='AA',
                                                     context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database
        functions_dict = {}
        counter = 0
        for hmm_hit in search_results_dict.values():
            functions_dict[counter] = {
                'gene_callers_id':
                hmm_hit['gene_callers_id'],
                'source':
                'Pfam',
                'accession':
                hmm_hit['gene_hmm_id'],
                'function':
                self.get_function_from_catalog(
                    hmm_hit['gene_hmm_id'], ok_if_missing_from_catalog=True),
                'e_value':
                hmm_hit['e_value'],
            }

            counter += 1

        if functions_dict:
            gene_function_calls_table.create(functions_dict)
        else:
            self.run.warning(
                "Pfam class has no hits to process. Returning empty handed, but still adding Pfam as "
                "a functional source.")
            gene_function_calls_table.add_empty_sources_to_functional_sources(
                {'Pfam'})

        if anvio.DEBUG:
            run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up "
                "later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)),
                header="Debug")
        else:
            run.info_single(
                'Cleaning up the temp directory (you can use `--debug` if you would '
                'like to keep it for testing purposes)',
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
Example #37
0
    def populate_search_tables(self, sources={}):
        # fall back to the default anvi'o HMM sources if none were provided:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        for target in targets:

            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            class Args: pass
            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                                           simple_headers=True,
                                                                           rna_alphabet=True if alphabet=='RNA' else False,
                                                                           report_aa_sequences=True if alphabet=='AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know.")
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(self.db_path,
                                                           target_files_dict['%s:CONTIG' % alphabet],
                                                           rna_alphabet=True if alphabet=='RNA' else False)

        commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                      alphabet,
                                                      context,
                                                      kind_of_search,
                                                      domain,
                                                      len(all_genes_searched_against),
                                                      hmm_model,
                                                      reference,
                                                      noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)


            if context == 'CONTIG':
                # we are in trouble here, because our search results dictionary contains no gene calls, but
                # contig names that contain our hits. on the other hand, the rest of the code outside of this
                # if statement expects a `search_results_dict` with gene caller ids in it. so there are two
                # things we need to do. one is to come up with some new gene calls and add them to the contigs
                # database so things will go smoothly downstream. two, we will need to update our
                # `search_results_dict` so it looks like a dictionary the rest of the code expects with
                # `gene_callers_id` fields. both of these steps are taken care of in the following function. magic.

                if source != "Ribosomal_RNAs":
                    self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM\
                                      operation is not directly working with gene calls anvi'o already knows about, the resulting\
                                      hits will need to be added as 'new gene calls' into the contigs database. So far so good.\
                                      But because we are in the contigs realm rather than the genes realm, it is likely that\
                                      resulting hits will not correspond to open reading frames that are supposed to be\
                                      translated (such as ribosomal RNAs), because otherwise you would be working with genes\
                                      instead of defining CONTIGS as your context in that HMM profile you just used, unless you\
                                      are not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the\
                                      new gene calls it will recover through these HMMs. Please take a moment and you be the\
                                      judge of whether this will influence your pangenomic analyses or other things you thought\
                                      you would be doing with the result of this HMM search downstream. If you do not feel like\
                                      being the judge of anything today you can move on yet remember to remember this if things\
                                      look somewhat weird later on.",
                                     header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                     lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                           search_results_dict,
                                                                                                           skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails because "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)
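The 'Pruned' report above implies that `utils.get_pruned_HMM_hits_dict` resolves redundant hits. A rough sketch of that idea, under the assumption that redundancy means overlapping hits on the same contig, of which only the lowest e-value one survives (the real implementation may differ in its details):

def toy_prune_overlapping_hits(hits):
    """Keep, per contig, only the best (lowest e-value) hit among mutually
    overlapping ones. `hits` maps hit ids to dicts with 'contig_name',
    'start', 'stop', and 'e_value' keys -- a guess at the relevant fields."""
    surviving = {}
    for hit_id, hit in sorted(hits.items(), key=lambda kv: kv[1]['e_value']):
        overlaps = any(h['contig_name'] == hit['contig_name']
                       and hit['start'] < h['stop'] and h['start'] < hit['stop']
                       for h in surviving.values())
        if not overlaps:
            surviving[hit_id] = hit
    return surviving

hits = {0: {'contig_name': 'c1', 'start': 100, 'stop': 200, 'e_value': 1e-10},
        1: {'contig_name': 'c1', 'start': 150, 'stop': 250, 'e_value': 1e-5}}
assert list(toy_prune_overlapping_hits(hits)) == [0]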
Example #38
0
    def process(self):
        """Runs InteracDome."""

        tmp_directory_path = filesnpaths.get_temp_directory_path()
        gene_caller_ids = list(self.contigs_db.genes_in_contigs_dict.keys())

        self.run.info("num genes that HMM will be run on",
                      len(gene_caller_ids))

        # export AA sequences for genes
        target_files_dict = {
            'AA:DOMAIN': os.path.join(tmp_directory_path,
                                      'AA_gene_sequences.fa')
        }
        self.contigs_db.get_sequences_for_gene_callers_ids(
            gene_caller_ids_list=gene_caller_ids,
            output_file_path=target_files_dict['AA:DOMAIN'],
            simple_headers=True,
            report_aa_sequences=True)

        # run hmmer
        hmmer = HMMer(target_files_dict,
                      num_threads_to_use=self.num_threads,
                      program_to_use=self.hmm_program)
        hmm_hits_file, domain_hits_file = hmmer.run_hmmer(
            source='InteracDome',
            alphabet='AA',
            context='DOMAIN',
            kind=None,
            domain=None,
            num_genes_in_model=len(self.function_catalog),
            hmm=self.hmm_filepath,
            ref=None,
            noise_cutoff_terms='--cut_ga',
            desired_output=('standard', 'domtable'),
        )
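        # the two return values correspond, in order, to the 'standard' and
        # 'domtable' entries of `desired_output` above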

        self.run.warning("", header='HMMER results', lc='green')
        self.hmm_out = parser_modules['search']['hmmer_std_output'](
            hmm_hits_file, context='interacdome')

        self.run.info('num total domain hits', self.hmm_out.dom_hits.shape[0])
        self.run.info(
            'num unique genes',
            self.hmm_out.dom_hits['corresponding_gene_call'].unique().shape[0])
        self.run.info('num unique HMMs',
                      self.hmm_out.dom_hits['pfam_id'].unique().shape[0])

        if self.hmm_out.dom_hits.shape[0] == 0:
            self.run.info_single(
                "The HMM search returned no hits :/ So there is nothing to do. Anvi'o "
                "will now clean the temporary directories and gracefully quit.",
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
            return

        self.filter_hits()
        self.attribute_binding_frequencies()
        self.filter_positions()

        self.bind_freq = self.bind_freq.sort_values(
            by=['gene_callers_id', 'ligand', 'codon_order_in_gene'])
        self.avg_bind_freq = self.avg_bind_freq.sort_values(
            by=['gene_callers_id', 'ligand', 'codon_order_in_gene'])

        if self.bind_freq.empty:
            self.run.warning(
                "There are 0 HMM hits, so there is nothing to do :( Binding frequencies were not "
                "added to your database",
                header="Oh no...")
        else:
            self.store()

        if anvio.DEBUG:
            self.run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to "
                "clean those up later" %
                (tmp_directory_path, ', '.join(hmmer.tmp_dirs)),
                header="Debug")
        else:
            self.run.info_single(
                "Cleaning up the temp directory (you can use `--debug` if you would "
                "like to keep it for testing purposes)",
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
Example #39
0
    def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag = "--cut_ga"):
        self.run.warning('', header = 'HMM Profiling for %s' % source, lc = 'green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))
        self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        hmm_file = open(hmm_file_path, 'w')
        hmm_file.write(gzip.open(hmm, 'rb').read())
        hmm_file.close()
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')
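        # hmmpress writes binary index files (.h3m, .h3i, .h3f, .h3p) next to the
        # plain-text model so the scan can use them directly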
        cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
        with open(log_file_path, "a") as myfile: myfile.write('CMD: ' + cmd_line + '\n')
        ret_val = utils.run_command(cmd_line)
        if ret_val:
            raise ConfigError, "The last call did not work quite well. Most probably the version of HMMER\
                                you have installed is not up-to-date enough. Just to make sure what went\
                                wrong please take a look at the log file ('%s'). Please visit %s to see what\
                                is the latest version availalbe. You can learn which version of HMMER you have\
                                on your system by typing 'hmmpress -h'"\
                                        % (log_file_path, 'http://hmmer.janelia.org/download.html')
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        cmd_line = ('hmmscan -o "%s" %s --cpu %d --tblout "%s" "%s" "%s" >> "%s" 2>&1' \
                                        % (self.hmm_scan_output,
                                           cut_off_flag,
                                           self.num_threads_to_use,
                                           self.hmm_scan_hits_shitty,
                                           hmm_file_path,
                                           self.protein_sequences_fasta,
                                           log_file_path))

        with open(log_file_path, "a") as myfile: myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            raise ConfigError, "Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
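        # (the tblout format has 18 fixed whitespace-delimited columns followed by a
        # free-text description, hence keeping only the first 18 fields below)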
        parseable_output = open(self.hmm_scan_hits, 'w')
        for line in open(self.hmm_scan_hits_shitty).readlines():
            if line.startswith('#'):
                continue
            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')
        parseable_output.close()

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Example #40
0
    def process(self):
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                          len(self.function_catalog), hmm_file,
                                          None, '--cut_ga')

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file,
                                                     alphabet='AA',
                                                     context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database
        functions_dict = {}
        counter = 0
        for hmm_hit in search_results_dict.values():
            functions_dict[counter] = {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function':
                self.get_function_from_catalog(hmm_hit['gene_hmm_id']),
                'e_value': hmm_hit['e_value'],
            }

            counter += 1

        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if anvio.DEBUG:
            run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up\
                         later" %
                (tmp_directory_path, ', '.join(hmmer.tmp_dirs)),
                header="Debug")
        else:
            run.info_single(
                'Cleaning up the temp directory (you can use `--debug` if you would\
                             like to keep it for testing purposes)',
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
Example #41
0
    def export_sequences_table_in_db_into_FASTA_file(self, table=t.contig_sequences_table_name, output_file_path=None, item_names=set([])):
        """Exports a sequence table from the contigs database.

            - t.contig_sequences_table_name: contig sequences (where item_names are contig names)
            - t.gene_amino_acid_sequences_table_name: amino acid sequences for gene calls (item_names are gene caller ids)

        If `item_names` are specified, only those sequences with matching ids to something in this set will be reported.
        """

        if self.db_type != 'contigs':
            return None

        if not isinstance(item_names, set):
            raise ConfigError("`item_names` must be of type `set`")

        if output_file_path:
            filesnpaths.is_output_file_writable(output_file_path)
        else:
            output_file_path = os.path.join(filesnpaths.get_temp_directory_path(), 'aa_sequences.fa')

        database = db.DB(self.db_path, self.version)

        if table not in database.get_table_names():
            raise ConfigError('Trying to export sequences into a FASTA file, but the table\
                                "%s" does not seem to be in this database :/' % (table))

        if 'sequence' not in database.get_table_structure(table):
            raise ConfigError("You requested to store sequences in table '%s' into a FASTA\
                                file, however this table does not seem to be a table that\
                                stores sequence information :(" % table)

        sequences_table = database.get_table_as_dict(table)
        database.disconnect()

        if len(item_names):
            total_num_items_in_db = len(sequences_table)
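            # anything in the table that was not explicitly requested is dropped below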
            item_names_to_remove = set(list(sequences_table.keys())).difference(item_names)

            for item_name in item_names_to_remove:
                sequences_table.pop(item_name)

            # who does this for their users:
            num_items_to_be_reported = len(sequences_table)
            optional_info = ("It turned out %d of the item ids you requested was actually in the database." \
                                    % len(sequences_table)) if num_items_to_be_reported != len(item_names) else ''

            if not self.quiet:
                self.run.info_single("You asked anvi'o to report only %d items from a database that contained %d. %s" \
                                            % (len(item_names), total_num_items_in_db, optional_info))

        if not len(sequences_table):
            raise ConfigError("There are no sequences to report in table '%s'." % (table))

        self.progress.new('Exporting %d sequences into a FASTA file' % len(sequences_table))
        self.progress.update('...')

        sequences_fasta = u.FastaOutput(output_file_path)

        blank_seq_ids_not_reported = set([])

        for seq_id in sequences_table:
            if len(sequences_table[seq_id]['sequence']):
                sequences_fasta.write_id(seq_id)
                sequences_fasta.write_seq(sequences_table[seq_id]['sequence'], split=False)
            else:
                blank_seq_ids_not_reported.add(seq_id)

        self.progress.end()

        if len(blank_seq_ids_not_reported):
            self.run.warning("%d entries in the sequences table had blank sequences :/ This is related to the issue\
                             at https://github.com/merenlab/anvio/issues/565. If this is like mid-2020 and you still\
                             are getting this warning, please find an anvi'o developer and make them feel embarrassed.\
                             If it is earlier than that, then take this as a simple warning to remember that some gene\
                             calls in your downstream analyses may have no amino acid sequences, and that's actually OK.\
                             This is a very minor issue due to on-the-fly addition of Ribosomal RNA gene calls to the\
                             contigs database, and will unlikely affect anything major. This warning will go away when\
                             anvi'o can seamlessly work with multiple gene callers (which we are looking forward to\
                             implement in the future)." % len(blank_seq_ids_not_reported))

        self.run.info('Sequences', '%d sequences reported.' % (len(sequences_table) - len(blank_seq_ids_not_reported)))
        self.run.info('FASTA', output_file_path)

        return output_file_path
Example #42
0
    def populate_search_tables(self, sources={}):
        # make sure the output file is OK to write.
        filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

        # fall back to the default anvi'o HMM sources if none were provided:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        self.check_sources(sources)

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path)

        self.run.info("Contigs DB", self.db_path)
        self.run.info("HMM sources", ', '.join(sources.keys()))

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        have_hmm_sources_with_non_RNA_contig_context = False
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError(
                    "You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                    "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                    "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                    "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                    "'--installed-hmm-profile PROFILE_NAME_HERE')." %
                    (context, alphabet))

            self.run.info('Alphabet/context target found',
                          '%s:%s' % (alphabet, context))

            if context == 'CONTIG' and alphabet != 'RNA':
                have_hmm_sources_with_non_RNA_contig_context = True

            class Args:
                pass

            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.get_sequences_for_gene_callers_ids(
                    output_file_path=target_files_dict['%s:GENE' % alphabet],
                    simple_headers=True,
                    rna_alphabet=True if alphabet == 'RNA' else False,
                    report_aa_sequences=True if alphabet == 'AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK "
                        "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                        "you think this is dumb, please let us know.")
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                        tmp_directory_path,
                        '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(
                        self.db_path,
                        target_files_dict['%s:CONTIG' % alphabet],
                        rna_alphabet=True if alphabet == 'RNA' else False)

        if have_hmm_sources_with_non_RNA_contig_context:
            # in that case, we should remind people what's up.
            self.run.warning(
                "The HMM profiles that are about to be run includes at least one HMM profile that runs on "
                "contigs and not genes. Thus, this HMM operation will not be working with gene calls anvi'o "
                "already knows about. Which means, the resulting hits will need to be added as 'new gene calls' "
                "into the contigs database. So far so good. But because we are in the realm of contigs rather "
                "than genes, the resulting HMM hits will unlikely correspond to open reading frames that are "
                "supposed to be translated (such as ribosomal RNAs). While anvi'o adds new gene calls to your "
                "contigs database for these hits, it will NOT report amino acid sequences for the "
                "new gene calls that will emerge from these HMMs, expecting you to judge whether this will "
                "influence your pangenomic analyses or other things you thought you would be doing with the "
                "result of this HMM search downstream. If you do not feel like being the judge of anything today "
                "you can move on yet remember to remember this if things look somewhat weird later on.",
                header="THE MORE YOU KNOW 🌈",
                lc="green")

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use,
                          program_to_use=self.hmm_program)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            if alphabet in ['DNA', 'RNA'] and 'domtable' in self.hmmer_desired_output:
                raise ConfigError(
                    "Domain table output was requested (probably with the --get-domtable-output flag, "
                    "does that look familiar?) but unfortunately this option is incompatible with the "
                    f"current source of HMM profiles, {source}, because this source uses a nucleotide "
                    "alphabet.")

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = hmmpressed_files[source]
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmmer_output = commander.run_hmmer(
                source,
                alphabet,
                context,
                kind_of_search,
                domain,
                len(all_genes_searched_against),
                hmm_model,
                reference,
                noise_cutoff_terms,
                desired_output=self.hmmer_desired_output,
                hmmer_output_dir=self.hmmer_output_dir)

            if self.hmmer_output_dir:
                self.run.info("HMMER output directory", self.hmmer_output_dir)

            if not isinstance(hmmer_output, tuple):
                hmm_scan_hits_txt = hmmer_output
            else:
                hmm_scan_hits_txt, domain_hits_txt = hmmer_output
                self.run.info("Domain table output", domain_hits_txt)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                try:
                    parser = parser_modules['search']['hmmer_table_output'](
                        hmm_scan_hits_txt,
                        alphabet=alphabet,
                        context=context,
                        program=self.hmm_program)
                except StupidHMMError as e:
                    raise ConfigError(
                        f"Unfortunately something went wrong while anvi'o was trying to parse some HMM output for your data. "
                        f"This error is typically due to contig names that are long and variable in length, which that "
                        f"confuses HMMER and so it generates output tables that are simply unparseable. Anvi'o does its best, "
                        f"but occasionally fails, which leads to this error. If you are curious why is this happening, you can take a "
                        f"look at this issue where this issue is described: https://github.com/merenlab/anvio/issues/1564. "
                        f"Solution to this is relatively easy: use `anvi-script-reformat-fasta` with `--simplify-names` flag "
                        f"BEFORE generating your contigs database as we advice you to. Sorry you came all this way just to "
                        f"find out about this :/ Here is the origial error message anvi'o produced from the code beneath: {e}."
                    )

                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
                # one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.
                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search,
                    search_results_dict,
                    skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails because "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)

            shutil.rmtree(tmp_directory_path)
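The `isinstance(hmmer_output, tuple)` branching above implies a contract where `run_hmmer` returns a single path when one output type was requested and a tuple of paths when several were. A toy model of that assumed contract (the paths and function name are illustrative):

def toy_run_hmmer(desired_output=('standard',)):
    """Return one path per requested output type: a lone string when a single
    output was requested, a tuple otherwise -- mirroring the branching above."""
    paths = tuple('/tmp/hmm.%s' % kind for kind in desired_output)
    return paths[0] if len(paths) == 1 else paths

assert toy_run_hmmer(('standard',)) == '/tmp/hmm.standard'
assert toy_run_hmmer(('standard', 'domtable')) == ('/tmp/hmm.standard', '/tmp/hmm.domtable')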
Example #43
0
    def run_hmmscan(self,
                    source,
                    target,
                    kind,
                    domain,
                    genes_in_model,
                    hmm,
                    ref,
                    cut_off_flag="--cut_ga"):

        if target not in self.target_files_dict:
            raise ConfigError(
                "You have an unknown target :/ Target, which defines an alphabet and context\
                                to clarify whether the HMM search is supposed to be done using alphabets DNA,\
                                RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it\
                                doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError(
                "HMMer class does not know about Sequences file for the target %s :/"
                % target)

        self.run.warning('',
                         header='HMM Profiling for %s' % source,
                         lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Target', target)
        self.run.info('Domain', domain if domain else 'N/A')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))
        self.run.info('Number of CPUs will be used for search',
                      self.num_threads_to_use)

        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        hmm_file = open(hmm_file_path, 'wb')
        hmm_file.write(gzip.open(hmm, 'rb').read())
        hmm_file.close()
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')

        cmd_line = ['hmmpress', hmm_file_path]
        ret_val = utils.run_command(cmd_line, log_file_path)

        if ret_val:
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER\
                                you have installed is not up-to-date enough. Just to make sure what went\
                                wrong please take a look at the log file ('%s'). Please visit %s to see what\
                                is the latest version available. You can learn which version of HMMER you have\
                                on your system by typing 'hmmpress -h'"\
                                        % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        cmd_line = [
            'hmmscan', '-o', self.hmm_scan_output, cut_off_flag, '--cpu',
            self.num_threads_to_use, '--tblout', self.hmm_scan_hits_shitty,
            hmm_file_path, self.target_files_dict[target]
        ]

        utils.run_command(cmd_line, log_file_path)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            raise ConfigError(
                "Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." %
                log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        parseable_output = open(self.hmm_scan_hits, 'w')
        for line in open(self.hmm_scan_hits_shitty).readlines():
            if line.startswith('#'):
                continue
            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')
        parseable_output.close()

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Example #44
0
    def run_hmmscan(self, source, alphabet, context, kind, domain,
                    num_genes_in_model, hmm, ref, noise_cutoff_terms):
        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError(
                "You have an unknown target :/ Target, which defines an alphabet and context\
                                to clarify whether the HMM search is supposed to be done using alphabets DNA,\
                                RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it\
                                doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError(
                "HMMer class does not know about Sequences file for the target %s :/"
                % target)

        self.run.warning('',
                         header='HMM Profiling for %s' % source,
                         lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N/A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes', num_genes_in_model)
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search',
                      self.num_threads_to_use)

        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        hmm_file = open(hmm_file_path, 'wb')
        hmm_file.write(gzip.open(hmm, 'rb').read())
        hmm_file.close()
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')

        cmd_line = ['hmmpress', hmm_file_path]
        ret_val = utils.run_command(cmd_line, log_file_path)

        if ret_val:
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have\
                               installed is either not up-to-date enough, or too new :/ Just to make sure what went\
                               wrong please take a look at the log file ('%s'). Please visit %s to see what\
                               is the latest version available if you think updating HMMER can resolve it. You can\
                               learn which version of HMMER you have on your system by typing 'hmmpress -h'."\
                                       % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

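        # nucleotide (DNA/RNA) models must go through nhmmscan; hmmscan itself
        # only handles protein (AA) profiles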
        cmd_line = [
            'nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan', '-o',
            self.hmm_scan_output, *noise_cutoff_terms.split(), '--cpu',
            self.num_threads_to_use, '--tblout', self.hmm_scan_hits_shitty,
            hmm_file_path, self.target_files_dict[target]
        ]

        utils.run_command(cmd_line, log_file_path)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            self.progress.end()
            raise ConfigError(
                "Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." %
                log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        parseable_output = open(self.hmm_scan_hits, 'w')

        detected_non_ascii = False
        lines_with_non_ascii = []

        with open(self.hmm_scan_hits_shitty, 'rb') as hmm_hits_file:
            line_counter = 0
            for line_bytes in hmm_hits_file:
                line_counter += 1
                line = line_bytes.decode('ascii', 'ignore')

                if len(line) != len(line_bytes):
                    lines_with_non_ascii.append(line_counter)
                    detected_non_ascii = True

                if line.startswith('#'):
                    continue

                parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        parseable_output.close()

        if detected_non_ascii:
            self.run.warning(
                "Just a heads-up, Anvi'o HMMer parser detected non-ascii charachters while processing \
                the file '%s' and cleared them. Here are the line numbers with non-ascii charachters: %s.\
                You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"."
                % (self.hmm_scan_hits_shitty, ", ".join(
                    map(str, lines_with_non_ascii))))

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
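The byte-scrubbing loop above distills into a small standalone helper; a minimal sketch (the function name is made up):

def scrub_non_ascii_lines(in_path, out_path):
    """Decode each line as ASCII, silently dropping undecodable bytes, and
    report which line numbers were touched -- the same idea as the loop above."""
    dirty_lines = []
    with open(in_path, 'rb') as infile, open(out_path, 'w') as outfile:
        for line_number, line_bytes in enumerate(infile, start=1):
            line = line_bytes.decode('ascii', 'ignore')
            if len(line) != len(line_bytes):
                dirty_lines.append(line_number)
            outfile.write(line)
    return dirty_lines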
Example #45
0
    def process(self):
        """
        """
        # will be empty if all sources in self.annotation_sources_info have "skip": True
        residue_annotation_methods = [
            info["method"] for _, info in self.annotation_sources_info.items()
            if not info["skip"]
        ]

        # which genes had structures and which did not. this information is added to the structure database self table
        has_structure = {True: [], False: []}

        num_genes_tried = 0
        num_genes_to_try = len(self.genes_of_interest)

        for corresponding_gene_call in self.genes_of_interest:
            # MODELLER outputs a lot of stuff into its working directory. A temporary directory is
            # made for each instance of MODELLER (i.e. each protein), And bits and pieces of this
            # directory are used in the creation of the structure database. If self.full_modeller_output is
            # provided, these directories and their contents are moved into self.full_modeller_output.
            self.args.directory = filesnpaths.get_temp_directory_path()
            self.args.target_fasta_path = filesnpaths.get_temp_file_path()

            # Export sequence
            dbops.export_aa_sequences_from_contigs_db(
                self.contigs_db_path,
                self.args.target_fasta_path,
                set([corresponding_gene_call]),
                quiet=True)

            # Model structure
            progress_title = 'Modelling gene ID %d; (%d of %d processed)' % (
                corresponding_gene_call, num_genes_tried, num_genes_to_try)
            modeller_out = self.run_modeller(corresponding_gene_call,
                                             progress_title)
            if modeller_out["structure_exists"]:
                self.run.info_single("Gene successfully modelled!",
                                     nl_after=1,
                                     mc="green")

            has_structure[modeller_out["structure_exists"]].append(
                str(corresponding_gene_call))

            # Annotate residues
            residue_info_dataframe = None
            if modeller_out["structure_exists"]:
                residue_info_dataframe = self.run_residue_annotation_for_gene(
                    residue_annotation_methods, corresponding_gene_call,
                    modeller_out["best_model_path"])
            # Append info to tables
            self.append_gene_info_to_tables(modeller_out,
                                            residue_info_dataframe)

            # Append metadata to self
            self.update_structure_database_meta_table(has_structure)

            if self.full_modeller_output:
                self.dump_results_to_full_output()

            num_genes_tried += 1

        if not has_structure[True]:
            raise ConfigError(
                "Well this is really sad. No structures were modelled, so there is nothing to do. Bye :'("
            )

        self.structure_db.disconnect()