Exemple #1
0
    def hmmscan_worker(self, part_file, cmd_line, shitty_output_file, log_file, merged_file_buffer, buffer_write_lock):
        """Run one hmmscan command and append its hits to the shared buffer.

        `cmd_line` is executed through `utils.run_command` with `log_file`. Every
        non-comment line of `shitty_output_file` is then reduced to its first 18
        whitespace-separated fields, joined with tabs, and written to
        `merged_file_buffer` while holding `buffer_write_lock`.

        `part_file` is not used by this method.

        Raises ConfigError if `shitty_output_file` does not exist after the run.
        """
        utils.run_command(cmd_line, log_file)

        if not os.path.exists(shitty_output_file):
            self.progress.end()
            raise ConfigError("Something went wrong with hmmscan and it failed to generate the expected output :/ Fortunately "
                              "we have this log file which should clarify the problem: '%s'. Please do not forget to include this "
                              "file in your question if you were to seek help from the community." % log_file)

        lines_with_non_ascii = []

        # the file is read as bytes so that the non-ascii bytes can be detected and dropped
        with open(shitty_output_file, 'rb') as hmm_hits_file:
            for line_number, line_bytes in enumerate(hmm_hits_file, start=1):
                line = line_bytes.decode('ascii', 'ignore')

                # 'ignore' drops undecodable bytes, so a shorter result means the line had some
                if len(line) != len(line_bytes):
                    lines_with_non_ascii.append(line_number)

                if line.startswith('#'):
                    continue

                with buffer_write_lock:
                    merged_file_buffer.write('\t'.join(line.split()[0:18]) + '\n')

        if lines_with_non_ascii:
            self.run.warning("Just a heads-up, Anvi'o HMMer parser detected non-ascii characters while processing "
                             "the file '%s' and cleared them. Here are the line numbers with non-ascii characters: %s. "
                             "You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"." %
                                                 (shitty_output_file, ", ".join(map(str, lines_with_non_ascii))))
Exemple #2
0
    def makedb(self, output_db_path=None, dbtype='prot'):
        """Build a BLAST search database with `makeblastdb`.

        The database is written to `output_db_path` when it is given, and to
        `self.target_fasta` otherwise. `dbtype` must be 'prot' or 'nucl'; any
        other value raises ConfigError. After the run, the matching header file
        ('.phr' or '.nhr') is handed to `self.check_output`.
        """
        header_extensions = {'prot': '.phr', 'nucl': '.nhr'}

        if dbtype not in header_extensions:
            raise ConfigError(
                "The `makedb` function in `BLAST` does not know about dbtype '%s' :("
                % dbtype)

        # resolve the destination once so the command, the output check, and the
        # report below all agree on it
        db_path = output_db_path or self.target_fasta

        self.progress.new('BLAST')
        self.progress.update(
            'creating the search database (using %d thread(s)) ...' %
            self.num_threads)

        cmd_line = [
            'makeblastdb', '-in', self.target_fasta, '-dbtype', dbtype, '-out',
            db_path
        ]

        utils.run_command(cmd_line, self.run.log_file_path)

        self.progress.end()

        self.check_output(db_path + header_extensions[dbtype], 'makeblastdb')

        self.run.info('blast makeblast cmd', cmd_line, quiet=True)
        # report where the database actually is, which differs from the target
        # FASTA whenever `output_db_path` was given
        self.run.info('BLAST search db', db_path)
Exemple #3
0
    def run_prodigal(self, fasta_file_path):
        """Run prodigal on `fasta_file_path` inside a fresh temporary directory.

        The temporary directory is recorded in `self.tmp_dirs`. The gene and
        protein output paths are stored in `self.genes_in_contigs` and
        `self.proteins_in_contigs`, and the latter is returned.

        Raises ConfigError if the protein output file does not exist after the run.
        """
        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.genes_in_contigs = os.path.join(tmp_dir, 'contigs.genes')
        self.proteins_in_contigs = os.path.join(tmp_dir, 'contigs.proteins')

        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.warning('', header = 'Finding ORFs in contigs', lc = 'green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Proteins', self.proteins_in_contigs)
        self.run.info('Log file', log_file_path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')
        # the command redirects its own output into the log file
        cmd_line = ('prodigal -i "%s" -o "%s" -a "%s" -p meta >> "%s" 2>&1' % (fasta_file_path,
                                                                               self.genes_in_contigs,
                                                                               self.proteins_in_contigs,
                                                                               log_file_path))
        with open(log_file_path, "a") as log_file:
            log_file.write('CMD: ' + cmd_line + '\n')

        utils.run_command(cmd_line)

        if not os.path.exists(self.proteins_in_contigs):
            # close the progress display before the error is reported
            self.progress.end()
            # call syntax is valid on both Python 2 and 3; adjacent literals keep
            # the source indentation out of the message
            raise ConfigError("Something went wrong with prodigal, and it failed to generate the "
                              "expected output :/ Fortunately, this log file should tell you what "
                              "might be the problem: '%s'. Please do not forget to include this "
                              "file if you were to ask for help." % log_file_path)

        self.progress.end()

        return self.proteins_in_contigs
Exemple #4
0
    def hmmer_worker(self, partial_input_file, cmd_line, table_output_file,
                     standard_output_file, desired_output, log_file,
                     merged_files_dict):
        """Run one HMMER command and append its outputs to the shared main files.

        `cmd_line` is executed through `utils.run_command` with `log_file`. For
        every entry in `desired_output` ('table' or 'standard'), the matching
        worker file is passed to the matching append method together with the
        'buffer' and 'lock' entries found in `merged_files_dict[entry]`.

        `partial_input_file` is not used by this method.

        Raises ConfigError if either worker output file is missing after the
        run, or if `desired_output` contains an unrecognized output type.
        """
        # First we run the command
        utils.run_command(cmd_line, log_file)

        if not os.path.exists(table_output_file) or not os.path.exists(
                standard_output_file):
            self.progress.end()
            raise ConfigError(
                "Something went wrong with %s and it failed to generate the expected output :/ Fortunately "
                "we have this log file which should clarify the problem: '%s'. Please do not forget to include this "
                "file in your question if you were to seek help from the community."
                % (self.program_to_use, log_file))

        # Then we append the results to the main file(s)
        for output in desired_output:
            if output == 'table':
                worker_file = table_output_file
                append_function = self.append_to_main_table_file
            elif output == 'standard':
                worker_file = standard_output_file
                append_function = self.append_to_main_standard_file
            else:
                # without this branch the file and function of the previous
                # iteration would silently be reused for the unknown output
                self.progress.end()
                raise ConfigError(
                    "The HMMER worker does not know how to handle the output type '%s'."
                    % output)

            main_file_buffer = merged_files_dict[output]['buffer']
            main_file_lock = merged_files_dict[output]['lock']

            append_function(main_file_buffer, worker_file, main_file_lock)
Exemple #5
0
    def blastp(self):
        """Assemble and run a `diamond blastp` command.

        The command is built from `self.query_fasta`, `self.target_fasta`,
        `self.search_output_path`, `self.tmp_dir` and `self.num_threads`.
        `--sensitive`, `--max-target-seqs` and `--evalue` are added only when
        the matching attribute is truthy. After the run, the '.daa' path derived
        from `self.search_output_path` is handed to `self.check_output`.
        """
        mode = 'Sensitive' if self.sensitive else 'Fast'
        self.run.info('DIAMOND is set to be', mode)

        cmd_line = ['diamond', 'blastp']
        cmd_line += ['-q', self.query_fasta]
        cmd_line += ['-d', self.target_fasta]
        cmd_line += ['-a', self.search_output_path]
        cmd_line += ['-t', self.tmp_dir]
        cmd_line += ['-p', self.num_threads]

        # optional switches
        if self.sensitive:
            cmd_line.append('--sensitive')

        if self.max_target_seqs:
            cmd_line += ['--max-target-seqs', self.max_target_seqs]

        if self.evalue:
            cmd_line += ['--evalue', self.evalue]

        # the command may hold non-string items, hence the str() conversion
        printable_cmd = ' '.join(map(str, cmd_line))
        self.run.info('DIAMOND blastp cmd', printable_cmd, quiet=(not anvio.DEBUG))

        self.progress.new('DIAMOND')
        self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)
        utils.run_command(cmd_line, self.run.log_file_path)
        self.progress.end()

        daa_path = self.search_output_path + '.daa'
        self.check_output(daa_path, 'blastp')

        self.run.info('Diamond blastp results', daa_path)
Exemple #6
0
    def blastp(self):
        """Assemble and run an NCBI `blastp` command with tabular (format 6) output.

        The search writes to `self.search_output_path`, which is then handed to
        `self.check_output`. When `self.names_dict` is truthy,
        `self.ununique_search_results` is called afterwards.
        """
        self.progress.new('BLASTP')
        status = 'running blastp (using %d thread(s)) ...' % self.num_threads
        self.progress.update(status)

        cmd_line = ['blastp',
                    '-query', self.query_fasta,
                    '-db', self.target_db_path,
                    '-evalue', self.evalue,
                    '-outfmt', '6',
                    '-out', self.search_output_path,
                    '-num_threads', self.num_threads]

        # limit the number of hits only when a limit is set
        if self.max_target_seqs:
            cmd_line.extend(['-max_target_seqs', self.max_target_seqs])

        self.run.info('blast blastp cmd', cmd_line, quiet=True)

        utils.run_command(cmd_line, self.run.log_file_path)
        self.progress.end()

        self.check_output(self.search_output_path, 'blastp')

        if self.names_dict:
            self.ununique_search_results()

        self.run.info('BLASTP results', self.search_output_path)
Exemple #7
0
    def blast(self):
        """Run `self.search_program` as a BLAST search with tabular (format 6) output.

        The search writes to `self.search_output_path`, which is then handed to
        `self.check_output`. When `self.names_dict` is truthy,
        `self.ununique_search_results` is called afterwards.
        """
        program = self.search_program

        cmd_line = [program]
        cmd_line.extend(['-query', self.query_fasta])
        cmd_line.extend(['-db', self.target_fasta])
        cmd_line.extend(['-evalue', self.evalue])
        cmd_line.extend(['-outfmt', '6'])
        cmd_line.extend(['-out', self.search_output_path])
        cmd_line.extend(['-num_threads', self.num_threads])

        # limit the number of hits only when a limit is set
        if self.max_target_seqs:
            cmd_line.extend(['-max_target_seqs', self.max_target_seqs])

        # the command may hold non-string items, hence the str() conversion
        printable_cmd = ' '.join(map(str, cmd_line))
        self.run.info('NCBI %s cmd' % self.search_program,
                      printable_cmd,
                      quiet=(not anvio.DEBUG))

        self.progress.new('BLAST')
        status = 'running search (using %s with %d thread(s)) ...' % (
            self.search_program, self.num_threads)
        self.progress.update(status)

        utils.run_command(cmd_line, self.run.log_file_path)
        self.progress.end()

        self.check_output(self.search_output_path, self.search_program)

        if self.names_dict:
            self.ununique_search_results()

        self.run.info('BLAST results', self.search_output_path)
Exemple #8
0
    def hmmer_worker(self, partial_input_file, cmd_line, table_output_file, standard_output_file, desired_output, log_file,
                     output_queue, ret_value_queue):
        """Run one HMMER command and report its output files through the queues.

        On success, a dict mapping each requested output type ('table' and/or
        'standard') to the matching worker file path is put on `output_queue`,
        followed by `0` on `ret_value_queue`. Any exception raised along the way
        is caught and put on `ret_value_queue` instead of propagating.

        `partial_input_file` is not used by this method.
        """
        worker_files = (('table', table_output_file), ('standard', standard_output_file))

        try:
            # Step 1: run the command
            utils.run_command(cmd_line, log_file)

            outputs_exist = os.path.exists(table_output_file) and os.path.exists(standard_output_file)
            if not outputs_exist:
                self.progress.end()
                raise ConfigError("Something went wrong with %s and it failed to generate the expected output :/ Fortunately "
                                  "we have this log file which should clarify the problem: '%s'. Please do not forget to include this "
                                  "file in your question if you were to seek help from the community." % (self.program_to_use, log_file))

            # Step 2: hand the requested file paths to whoever reads the output queue
            output_dict = {}
            for requested in desired_output:
                for kind, path in worker_files:
                    if requested == kind:
                        output_dict[kind] = path
                        break
            output_queue.put(output_dict)

            # a return value of 0 signals success
            ret_value_queue.put(0)

        except Exception as error:
            # Something failed in this worker; the exception object itself is the
            # return value so the consumer of the queue can act on it.
            ret_value_queue.put(error)
Exemple #9
0
    def blast(self):
        """Run `self.search_program` as a BLAST search with tabular (format 6) output.

        The search writes to `self.search_output_path`, which is then handed to
        `self.check_output`. When `self.names_dict` is truthy,
        `self.ununique_search_results` is called afterwards.
        """
        cmd_line = [
            self.search_program, '-query', self.query_fasta, '-db', self.target_fasta,
            '-evalue', self.evalue, '-outfmt', '6', '-out', self.search_output_path,
            '-num_threads', self.num_threads,
        ]

        # limit the number of hits only when a limit is set
        if self.max_target_seqs:
            cmd_line.append('-max_target_seqs')
            cmd_line.append(self.max_target_seqs)

        # the command may hold non-string items, hence the str() conversion
        cmd_as_text = ' '.join(str(item) for item in cmd_line)
        self.run.info('NCBI %s cmd' % self.search_program, cmd_as_text, quiet=(not anvio.DEBUG))

        self.progress.new('BLAST')
        self.progress.update('running search (using %s with %d thread(s)) ...' % (self.search_program, self.num_threads))
        utils.run_command(cmd_line, self.run.log_file_path)
        self.progress.end()

        self.check_output(self.search_output_path, self.search_program)

        if self.names_dict:
            self.ununique_search_results()

        self.run.info('BLAST results', self.search_output_path)
Exemple #10
0
    def run_prodigal(self, fasta_file_path):
        """Run prodigal on `fasta_file_path` inside a fresh temporary directory.

        The temporary directory is recorded in `self.tmp_dirs`. The gene and
        protein output paths are stored in `self.genes_in_contigs` and
        `self.proteins_in_contigs`, and the latter is returned.

        Raises ConfigError if the protein output file does not exist after the run.
        """
        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.genes_in_contigs = os.path.join(tmp_dir, 'contigs.genes')
        self.proteins_in_contigs = os.path.join(tmp_dir, 'contigs.proteins')

        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Proteins', self.proteins_in_contigs)
        self.run.info('Log file', log_file_path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')
        # the command redirects its own output into the log file
        cmd_line = ('prodigal -i "%s" -o "%s" -a "%s" -p meta >> "%s" 2>&1' %
                    (fasta_file_path, self.genes_in_contigs,
                     self.proteins_in_contigs, log_file_path))
        with open(log_file_path, "a") as log_file:
            log_file.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.proteins_in_contigs):
            # close the progress display before the error is reported
            self.progress.end()
            # call syntax is valid on both Python 2 and 3; adjacent literals keep
            # the source indentation out of the message
            raise ConfigError(
                "Something went wrong with prodigal, and it failed to generate the "
                "expected output :/ Fortunately, this log file should tell you what "
                "might be the problem: '%s'. Please do not forget to include this "
                "file if you were to ask for help." % log_file_path)

        self.progress.end()

        return self.proteins_in_contigs
Exemple #11
0
    def blastp(self):
        """Assemble and run a `diamond blastp` command that writes tabular output.

        Results go to `self.tabular_output_path` using the fields listed in
        `self.outfmt`. `--sensitive`, `--max-target-seqs`, `--id` and `--evalue`
        are added only when the matching attribute is truthy.
        """
        self.run.warning(None, header="DIAMOND BLASTP", lc="green")
        mode = "Sensitive" if self.sensitive else "Fast"
        self.run.info("Mode", mode)

        cmd_line = ['diamond', 'blastp',
                    '-q', self.query_fasta,
                    '-d', self.target_fasta,
                    '-o', self.tabular_output_path,
                    '-t', self.tmp_dir,
                    '-p', self.num_threads,
                    '--outfmt']
        # every whitespace-separated token of the format becomes its own argument
        cmd_line.extend(self.outfmt.split())

        if self.sensitive:
            cmd_line.append('--sensitive')

        if self.max_target_seqs:
            cmd_line += ['--max-target-seqs', self.max_target_seqs]

        if self.min_pct_id:
            cmd_line += ['--id', self.min_pct_id]

        if self.evalue:
            cmd_line += ['--evalue', self.evalue]

        # the command may hold non-string items, hence the str() conversion
        printable_cmd = ' '.join(map(str, cmd_line))
        self.run.info('DIAMOND blastp cmd', printable_cmd, quiet=(not anvio.DEBUG))

        self.progress.new('DIAMOND')
        status = 'Running blastp (using %d thread(s)) ...' % self.num_threads
        self.progress.update(status)
        utils.run_command(cmd_line, self.run.log_file_path)
        self.progress.end()

        self.run.info('Search results', self.tabular_output_path)
Exemple #12
0
    def clusterize(self, parts):
        """Submit one qsub job per part and wait until none of them is reported.

        For every entry of `parts`, a shell script named `<part>.sh` is rendered
        from `QSUB_SCRIPT` and submitted with `qsub`. The method then polls
        `self.get_qstat_info` every five seconds until the counts it returns
        add up to zero. Always returns True.
        """
        # a random identifier of 10 uppercase letters shared by all jobs of this call
        identifier = ''.join(
            random.choice(string.ascii_uppercase) for _ in range(10))

        for part in parts:
            command = self.command % {'binary': self.binary, 'part': part}

            # create sh file; the `with` block guarantees the script is flushed
            # and closed before it is handed to qsub
            shell_script = part + '.sh'
            with open(shell_script, 'w') as script:
                script.write(QSUB_SCRIPT % {
                    'log': part + '.log',
                    'identifier': identifier,
                    'command': command
                })

            # submit script to cluster
            utils.run_command('qsub %s' % shell_script)

        while True:
            qstat_info = self.get_qstat_info(identifier)
            total_processes = sum(qstat_info.values())
            if total_processes == 0:
                break

            self.progress.update(
                'Qstat Info :: Total Jobs: %s, %s' %
                (pp(total_processes), ', '.join(
                    ['%s: %s' % (x, pp(qstat_info[x])) for x in qstat_info])))

            time.sleep(5)

        return True
Exemple #13
0
    def blastp(self):
        """Assemble and run a `diamond blastp` command.

        The optional `--sensitive`, `--max-target-seqs` and `--evalue` switches
        are added only when the matching attribute is truthy. After the run, the
        '.daa' path derived from `self.search_output_path` is handed to
        `self.check_output`.
        """
        if self.sensitive:
            mode = 'Sensitive'
        else:
            mode = 'Fast'
        self.run.info('DIAMOND is set to be', mode)

        cmd_line = ['diamond', 'blastp',
                    '-q', self.query_fasta,
                    '-d', self.target_fasta,
                    '-a', self.search_output_path,
                    '-t', self.tmp_dir,
                    '-p', self.num_threads]

        if self.sensitive:
            cmd_line.append('--sensitive')

        if self.max_target_seqs:
            cmd_line.append('--max-target-seqs')
            cmd_line.append(self.max_target_seqs)

        if self.evalue:
            cmd_line.append('--evalue')
            cmd_line.append(self.evalue)

        # the command may hold non-string items, hence the str() conversion
        cmd_as_text = ' '.join(str(item) for item in cmd_line)
        self.run.info('DIAMOND blastp cmd', cmd_as_text, quiet=(not anvio.DEBUG))

        self.progress.new('DIAMOND')
        status = 'running blastp (using %d thread(s)) ...' % self.num_threads
        self.progress.update(status)
        utils.run_command(cmd_line, self.run.log_file_path)
        self.progress.end()

        expected_output = self.search_output_path + '.daa'
        self.check_output(expected_output, 'blastp')

        self.run.info('Diamond blastp results', expected_output)
Exemple #14
0
    def view(self):
        """Convert the DIAMOND '.daa' search output into a tabular file.

        Runs `diamond view` on `self.search_output_path + '.daa'`, writing to
        `self.tabular_output_path`, and hands that path to `self.check_output`.
        When `self.names_dict` is truthy, the tabular output is expanded with
        `utils.ununique_BLAST_tabular_output`.
        """
        self.progress.new('DIAMOND')
        self.progress.update('generating tabular output (using %d thread(s)) ...' % self.num_threads)
        # the command redirects its own output into the log file
        cmd_line = ('diamond view -a %s -o %s -p %d -k 1000000 >> "%s" 2>&1' % (self.search_output_path + '.daa',
                                                                     self.tabular_output_path,
                                                                     self.num_threads,
                                                                     self.run.log_file_path))

        self.run.info('diamond view cmd', cmd_line, quiet=True)

        utils.run_command(cmd_line)

        self.check_output(self.tabular_output_path, 'view')

        if self.names_dict:
            self.run.info('self.names_dict is found', 'Un-uniqueing the tabular output', quiet=True)
            self.progress.update('Un-uniqueing the tabular output ...')
            # if we are here, this means the self.tabular_output_path contains information only about unique
            # entries. We will expand it here so downstream analyses do not need to pay attention to this
            # detail.
            utils.ununique_BLAST_tabular_output(self.tabular_output_path, self.names_dict)

        self.progress.end()

        # truthiness instead of len() so a `names_dict` of None does not raise; the
        # trailing space keeps 'un-uniqued' from running into 'tabular'
        self.run.info('Diamond %stabular output file' % ('un-uniqued ' if self.names_dict else ''), self.tabular_output_path)
Exemple #15
0
    def clusterize(self, parts):
        """Submit one qsub job per part and wait until none of them is reported.

        For every entry of `parts`, a shell script named `<part>.sh` is rendered
        from `QSUB_SCRIPT` and submitted with `qsub`. The method then polls
        `self.get_qstat_info` every five seconds until the counts it returns
        add up to zero. Always returns True.
        """
        # a random identifier of 10 uppercase letters shared by all jobs of this call
        identifier = ''.join(random.choice(string.ascii_uppercase) for _ in range(10))

        for part in parts:
            command = self.command % {'binary': self.binary, 'part': part}

            # create sh file; the `with` block guarantees the script is flushed
            # and closed before it is handed to qsub
            shell_script = part + '.sh'
            with open(shell_script, 'w') as script:
                script.write(QSUB_SCRIPT % {'log': part + '.log',
                                            'identifier': identifier,
                                            'command': command})

            # submit script to cluster
            utils.run_command('qsub %s' % shell_script)

        while True:
            qstat_info = self.get_qstat_info(identifier)
            total_processes = sum(qstat_info.values())
            if total_processes == 0:
                break

            self.progress.update('Qstat Info :: Total Jobs: %s, %s' % (pp(total_processes),
                       ', '.join(['%s: %s' % (x, pp(qstat_info[x])) for x in qstat_info])))

            time.sleep(5)

        return True
Exemple #16
0
    def cluster(self, input_files, args, work_dir, threads=1):
        """Run the binning program and collect the bins it writes into `work_dir`.

        The program is given `input_files.contigs_fasta` and
        `input_files.contig_coverages`, plus whatever `utils.serialize_args(args)`
        yields. Every file in `work_dir` whose name starts with 'METABAT_' is
        read as one bin.

        `threads` is only used in the progress message here; it is not added to
        the command line by this method.

        Returns a dict mapping each bin name (the file name with '.' replaced by
        '_') to the list of stripped lines of that file.

        Raises ConfigError if no output file is found.
        """
        bin_prefix = os.path.join(work_dir, 'METABAT_')
        log_path = os.path.join(work_dir, 'logs.txt')

        cmd_line = [self.program_name,
            '-i', input_files.contigs_fasta,
            '-a', input_files.contig_coverages,
            '-o', bin_prefix,
            '--cvExt',
            '-l',
            *utils.serialize_args(args)]

        self.progress.new(self.program_name)
        self.progress.update('Running using %d threads...' % threads)
        utils.run_command(cmd_line, log_path)
        self.progress.end()

        # `bin_prefix` already includes `work_dir`; joining it with `work_dir` a
        # second time doubles a relative work directory and then matches nothing
        output_file_paths = glob.glob(bin_prefix + '*')
        if not output_file_paths:
            raise ConfigError("Some critical output files are missing. Please take a look at the "
                              "log file: %s" % (log_path))

        clusters = {}
        for bin_file in output_file_paths:
            pretty_bin_name = os.path.basename(bin_file).replace('.', '_')
            with open(bin_file, 'r') as f:
                clusters[pretty_bin_name] = [line.strip() for line in f]

        return clusters
Exemple #17
0
    def process(self, fasta_file_path, output_dir):
        """Take the fasta file, run prodigal on it, and make sense of the output

           Output and log files are written into `output_dir`.

           Returns a gene calls dict, and amino acid sequences dict. Both are
           keyed by a running integer that starts at 0.

           Raises ConfigError if prodigal does not produce the amino acid
           sequences file.
        """
        # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        gene_calls_dict = {}
        amino_acid_sequences_dict = {}

        self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
        self.amino_acid_sequences_in_contigs = os.path.join(
            output_dir, 'contigs.amino_acid_sequences')

        log_file_path = os.path.join(output_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Amino acid sequences',
                      self.amino_acid_sequences_in_contigs)
        self.run.info('Log file', log_file_path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')

        cmd_line = [
            'prodigal', '-i', fasta_file_path, '-o', self.genes_in_contigs,
            '-a', self.amino_acid_sequences_in_contigs, '-p', 'meta'
        ]
        utils.run_command(cmd_line, log_file_path)

        if not os.path.exists(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            # adjacent literals keep the source indentation out of the message
            raise ConfigError(
                "Something went wrong with prodigal, and it failed to generate the "
                "expected output :/ Fortunately, this log file should tell you what "
                "might be the problem: '%s'. Please do not forget to include this "
                "file if you were to ask for help." % log_file_path)

        self.progress.update('Processing gene calls ...')

        fasta = fastalib.SequenceSource(self.amino_acid_sequences_in_contigs)

        # close the sequence source even if parsing an entry raises
        try:
            hit_id = 0
            while next(fasta):
                gene_calls_dict[hit_id] = self.parser(fasta.id)
                # the '*' characters are removed from the stored sequence
                amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
                hit_id += 1
        finally:
            fasta.close()

        self.progress.end()

        self.run.info('Result',
                      'Prodigal (%s) has identified %d genes.' %
                      (self.installed_version, len(gene_calls_dict)),
                      nl_after=1)

        return gene_calls_dict, amino_acid_sequences_dict
Exemple #18
0
    def cluster(self, input_files, args, work_dir, threads=1, log_file_path=None):
        """Run the binning program and collect the '.fna' bins found in `work_dir`.

        When `log_file_path` is not given, 'logs.txt' inside `work_dir` is used.
        `threads` only appears in the progress message within this method.

        Returns a dict mapping each bin name (the file name without 'sequence_'
        and '.fna', and with '-' replaced by '_') to the list of header lines
        of that file, stripped and with every '>' removed.

        Raises ConfigError if no '.fna' file is found in `work_dir`.
        """
        def in_work_dir(name):
            return os.path.join(work_dir, name)

        if not log_file_path:
            log_file_path = in_work_dir('logs.txt')

        # long argument names -> the single-letter flags passed to `utils.serialize_args`
        translation = {'preference': 'p',
                       'maxiter': 'm',
                       'conviter': 'v',
                       'damp': 'd',
                       'contigsize': 'x'}

        cmd_line = [self.program_name,
                    '-c', input_files.contig_coverages_log_norm,
                    '-f', os.path.dirname(input_files.contigs_fasta),
                    '-l', os.path.basename(input_files.contigs_fasta),
                    '-o', work_dir]
        cmd_line.extend(utils.serialize_args(args, single_dash=True, translate=translation))

        self.progress.new(self.program_name)
        self.progress.update('Running using %d threads...' % threads)
        utils.run_command(cmd_line, log_file_path)
        self.progress.end()

        output_file_paths = glob.glob(in_work_dir('*.fna'))
        if len(output_file_paths) == 0:
            raise ConfigError(
                "Some critical output files are missing. Please take a look at the "
                "log file: %s" % (log_file_path))

        clusters = {}
        for bin_file in output_file_paths:
            with open(bin_file, 'r') as bin_handle:
                file_name = os.path.basename(bin_file)
                pretty_bin_name = file_name.replace('sequence_', '').replace('.fna', '').replace('-', '_')

                # only the header lines are kept
                names = []
                for line in bin_handle:
                    if line.startswith('>'):
                        names.append(line.strip().replace('>', ''))

                clusters[pretty_bin_name] = names

        return clusters
Exemple #19
0
    def process(self, input_path, fasta_files):
        """Compute sourmash signatures for `fasta_files` and compare them.

        The work happens with `input_path` as the working directory; the
        previous working directory is restored on the way out, including when
        an error is raised. The comparison is written as a CSV file into an
        'output' directory, read back with
        `utils.get_TAB_delimited_file_as_dictionary`, and stored in
        `self.results` under a key derived from `self.kmer_size`.

        Returns `self.results`.

        Raises ConfigError if either sourmash command returns a non-zero exit code.
        """
        self.run.info('[sourmash] Kmer size', self.kmer_size, nl_before=1)
        self.run.info('[sourmash] Compression ratio', self.scale)

        report_name = 'kmer_%d_mash_similarity' % self.kmer_size

        def run_sourmash(command):
            # run one sourmash command and complain about the command that actually failed
            exit_code = utils.run_command(command, self.log_file_path, remove_log_file_if_exists=False)
            if int(exit_code):
                self.progress.end()
                raise ConfigError("sourmash returned with non-zero exit code, there may be some errors. "
                                  "Please check the log file `%s` for details. Offending command: "
                                  "`%s` ..." % (self.log_file_path, ' '.join([str(x) for x in command[:7]])))

        # backup the old working directory before changing the directory
        old_wd = os.getcwd()
        os.chdir(input_path)
        try:
            if not os.path.exists('output'):
                os.mkdir('output')

            self.progress.new('Sourmash')
            self.progress.update('Computing fasta signatures for kmer=%d, scale=%d' % (self.kmer_size, self.scale))

            scale = '--scaled=%i' % self.scale
            compute_command = [self.program_name, 'compute',
                               '-k', self.kmer_size,
                               '-f', scale]
            compute_command.extend(fasta_files)
            run_sourmash(compute_command)

            self.progress.update('Computing similarity matrix for kmer=%d, scale=%d' % (self.kmer_size, self.scale))

            report_path = os.path.join('output', report_name + '.txt')
            compare_command = [self.program_name, 'compare',
                               '-k', self.kmer_size,
                               '--csv', report_path]
            # the compare command is given one '<fasta>.sig' path per input file
            compare_command.extend(f + ".sig" for f in fasta_files)
            run_sourmash(compare_command)

            self.results[report_name] = utils.get_TAB_delimited_file_as_dictionary(report_path,
                                                                                   indexing_field=-1,
                                                                                   separator=',')

            self.progress.end()
        finally:
            # restore old working directory, no matter how we leave
            os.chdir(old_wd)

        return self.results
Exemple #20
0
    def process(self, fasta_file_path, output_dir):
        """Take the fasta file, run prodigal on it, and make sense of the output

           Output and log files are written into `output_dir`.

           Returns a gene calls dict, and amino acid sequences dict. Both are
           keyed by a running integer that starts at 0, and both are empty when
           the amino acid sequences file is empty.

           Raises ConfigError if prodigal does not produce the amino acid
           sequences file.
        """
        gene_calls_dict = {} # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        amino_acid_sequences_dict = {}

        self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
        self.amino_acid_sequences_in_contigs = os.path.join(output_dir, 'contigs.amino_acid_sequences')

        log_file_path = os.path.join(output_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Amino acid sequences', self.amino_acid_sequences_in_contigs)
        self.run.info('Log file', log_file_path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')

        cmd_line = ['prodigal', '-i', fasta_file_path, '-o', self.genes_in_contigs, '-a', self.amino_acid_sequences_in_contigs, '-p', 'meta']
        utils.run_command(cmd_line, log_file_path)

        if not os.path.exists(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            # adjacent literals keep the source indentation out of the message
            raise ConfigError("Something went wrong with prodigal, and it failed to generate the "
                              "expected output :/ Fortunately, this log file should tell you what "
                              "might be the problem: '%s'. Please do not forget to include this "
                              "file if you were to ask for help." % log_file_path)

        if filesnpaths.is_file_empty(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            self.run.info('Result', 'Prodigal (%s) has identified no genes :/' % (self.installed_version), nl_after=1, mc="red")
            return gene_calls_dict, amino_acid_sequences_dict

        self.progress.update('Processing gene calls ...')

        fasta = fastalib.SequenceSource(self.amino_acid_sequences_in_contigs)

        # close the sequence source even if parsing an entry raises
        try:
            hit_id = 0
            while next(fasta):
                gene_calls_dict[hit_id] = self.parser(fasta.id)
                # the '*' characters are removed from the stored sequence
                amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
                hit_id += 1
        finally:
            fasta.close()

        self.progress.end()

        self.run.info('Result', 'Prodigal (%s) has identified %d genes.' % (self.installed_version, len(gene_calls_dict)), nl_after=1)

        return gene_calls_dict, amino_acid_sequences_dict
Exemple #21
0
    def check_database(self):
        """
        Make sure a binary (.bin) version of the MODELLER database is available.

        Sets `self.database_path` to the expected .bin path in every case. If the
        .bin file exists nothing else happens. If only the .pir file exists it is
        binarized. If neither exists, pdb_95.pir.gz is downloaded into
        `self.database_dir`, decompressed, and the .pir file is looked for again.

        Raises ConfigError if the extension of `self.modeller_database` is not
        '.bin', '.pir', or empty.
        """
        db_name, db_extension = os.path.splitext(self.modeller_database)
        if db_extension not in (".bin", ".pir", ""):
            raise ConfigError(
                "MODELLER :: The only possible database extensions are .bin and .pir"
            )

        bin_db_path = J(self.database_dir, db_name + ".bin")
        pir_db_path = J(self.database_dir, db_name + ".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

        self.database_path = bin_db_path

        # the binary database is already there, nothing to do
        if bin_exists:
            return

        # from here on the binary database is known to be missing
        if not pir_exists:
            self.progress.clear()
            self.run.warning(
                "Anvi'o looked in {} for a database with the name {} and with an extension \
                              of either .bin or .pir, but didn't find anything matching that \
                              criteria. Anvi'o will try and download the best database it knows of from \
                              https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. \
                              You can checkout https://salilab.org/modeller/ for more info about the pdb_95 \
                              database".format(self.database_dir, self.modeller_database))

            download_url = "https://salilab.org/modeller/downloads/pdb_95.pir.gz"
            db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
            utils.download_file(download_url, db_download_path)
            utils.run_command(['gzip', '-d', db_download_path],
                              log_file_path=filesnpaths.get_temp_file_path())

            # NOTE(review): the download is named pdb_95, while `pir_db_path` is built
            # from `self.modeller_database` -- confirm the two always agree
            pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

        if pir_exists:
            self.progress.clear()
            self.run.warning(
                "Your database is not in binary format. That means accessing its contents is slower \
                              than it could be. Anvi'o is going to make a binary format. Just FYI"
            )
            self.run_binarize_database(pir_db_path, bin_db_path)
            return
Exemple #22
0
    def dry_run(self, workflow_graph_output_file_path_prefix='workflow'):
        """Not your regular dry run.

           The purpose of this function is to make sure there is a way to check for
           workflow program dependencies before the workflow is actually run. this way,
           if there is a `check_workflow_program_dependencies` call at the end of the
           snake file `get_workflow_snake_file_path(self.name)`, it can be called with
           a compiled snakemake `workflow` instance.

           When `self.save_workflow_graph` is set, snakemake is also asked for its DAG,
           which is extracted from the log into `<prefix>.dot`, and rendered to
           `<prefix>.png` if the `dot` program is available.

           Parameters
           ==========
           workflow_graph_output_file_path_prefix : str
               Path prefix (without extension) for the DOT and PNG output files.

           Raises
           ======
           ConfigError
               If a DAG was requested but no line starting with 'digraph' is in the log.
               The log file is deliberately left in place in that case.
        """

        if self.slave_mode:
            return

        self.progress.new('Bleep bloop')
        self.progress.update('Quick dry run for an initial sanity check ...')
        args = ['snakemake', '--snakefile', get_workflow_snake_file_path(self.name),
                '--configfile', self.config_file, '--dryrun', '--quiet']

        if self.save_workflow_graph:
            args.append('--dag')

        log_file_path = filesnpaths.get_temp_file_path()
        u.run_command(args, log_file_path)
        self.progress.end()

        # here we're getting the graph info from the log file like a dirty hacker
        # we are (it still may be better to do it elsewhere more appropriate .. so
        # we can look more decent or whatever):
        if self.save_workflow_graph:
            dot_file_path = workflow_graph_output_file_path_prefix + '.dot'
            png_file_path = workflow_graph_output_file_path_prefix + '.png'

            # plain 'r' already gives universal newlines in Python 3; the old 'rU'
            # mode is rejected by Python 3.11 and later
            with open(log_file_path, 'r') as log_file:
                lines = log_file.readlines()

            line_of_interest = next((line_no for line_no, line in enumerate(lines) if line.startswith('digraph')), None)
            if line_of_interest is None:
                raise ConfigError("Oh no. Anvi'o was trying to generate a DAG output for you, but something must have\
                                   gone wrong in a step prior. Something tells anvi'o that if you take a look at the\
                                   log file here, you may be able to figure it out: '%s'. Sorry!" % log_file_path)

            with open(dot_file_path, 'w') as dot_file:
                dot_file.write(''.join(lines[line_of_interest:]))

            self.run.info('Workflow DOT file', dot_file_path)

            if u.is_program_exists('dot', dont_raise=True):
                dot_log_file = filesnpaths.get_temp_file_path()
                u.run_command(['dot', '-Tpng', dot_file_path, '-o', png_file_path], dot_log_file)
                os.remove(dot_log_file)
                self.run.info('Workflow PNG file', png_file_path)
            else:
                self.run.warning("Well, anvi'o was going to try to save a nice PNG file for your workflow\
                                  graph, but clearly you don't have `dot` installed on your system. That's OK. You\
                                  have your dot file now, and you can Google 'how to view dot file on [your operating\
                                  system goes here]', and install necessary programs (like .. `dot`).")

        os.remove(log_file_path)
Exemple #23
0
    def dry_run(self, workflow_graph_output_file_path_prefix='workflow'):
        """Not your regular dry run.

           The purpose of this function is to make sure there is a way to check for
           workflow program dependencies before the workflow is actually run. this way,
           if there is a `check_workflow_program_dependencies` call at the end of the
           snake file `get_workflow_snake_file_path(self.name)`, it can be called with
           a compiled snakemake `workflow` instance.

           When `self.save_workflow_graph` is set, snakemake is also asked for its DAG,
           which is extracted from the log into `<prefix>.dot`, and rendered to
           `<prefix>.pdf` if the `dot` program is available.

           Parameters
           ==========
           workflow_graph_output_file_path_prefix : str
               Path prefix (without extension) for the DOT and PDF output files.

           Raises
           ======
           ConfigError
               If a DAG was requested but no line starting with 'digraph' is in the log.
               The log file is deliberately left in place in that case.
        """

        if self.this_workflow_is_inherited_by_another:
            return

        self.progress.new('Bleep bloop')
        self.progress.update('Quick dry run for an initial sanity check ...')
        args = ['snakemake', '--snakefile', get_workflow_snake_file_path(self.name),
                '--configfile', self.config_file, '--dryrun', '--quiet']

        if self.save_workflow_graph:
            args.append('--dag')

        log_file_path = filesnpaths.get_temp_file_path()
        u.run_command(args, log_file_path)
        self.progress.end()

        # here we're getting the graph info from the log file like a dirty hacker
        # we are (it still may be better to do it elsewhere more appropriate .. so
        # we can look more decent or whatever):
        if self.save_workflow_graph:
            dot_file_path = workflow_graph_output_file_path_prefix + '.dot'
            pdf_file_path = workflow_graph_output_file_path_prefix + '.pdf'

            # the 'rU' mode no longer exists as of Python 3.11; text mode 'r' already
            # translates newlines the same way
            with open(log_file_path, 'r') as log_file:
                lines = log_file.readlines()

            digraph_line_numbers = [line_no for line_no, line in enumerate(lines) if line.startswith('digraph')]
            if not digraph_line_numbers:
                raise ConfigError("Oh no. Anvi'o was trying to generate a DAG output for you, but something must have "
                                  "gone wrong in a step prior. Something tells anvi'o that if you take a look at the "
                                  "log file here, you may be able to figure it out: '%s'. Sorry!" % log_file_path)

            # everything from the first 'digraph' line onwards is the graph description
            with open(dot_file_path, 'w') as dot_file:
                dot_file.write(''.join(lines[digraph_line_numbers[0]:]))

            self.run.info('Workflow DOT file', dot_file_path)

            if u.is_program_exists('dot', dont_raise=True):
                dot_log_file = filesnpaths.get_temp_file_path()
                u.run_command(['dot', '-Tpdf', dot_file_path, '-o', pdf_file_path], dot_log_file)
                os.remove(dot_log_file)
                self.run.info('Workflow PDF file', pdf_file_path)
            else:
                self.run.warning("Well, anvi'o was going to try to save a nice PDF file for your workflow "
                                 "graph, but clearly you don't have `dot` installed on your system. That's OK. You "
                                 "have your dot file now, and you can Google 'how to view dot file on [your operating "
                                 "system goes here]', and install necessary programs (like .. `dot`).")

        os.remove(log_file_path)
Exemple #24
0
    def process(self, fasta_file_path, output_dir):
        """Take the fasta file, run prodigal on it, and make sense of the output

           Prodigal is run in `-p meta` mode; its gene and protein outputs, and a log
           file ('00_log.txt'), are written into `output_dir`. Each entry of the protein
           FASTA is then turned into a gene call via `self.parser`.

           Parameters
           ==========
           fasta_file_path : str
               Path of the contigs FASTA file handed to prodigal.
           output_dir : str
               Directory where prodigal outputs and the log file are written.

           Returns
           =======
           (gene_calls_dict, protein_sequences_dict) : tuple of dict
               Both are keyed by a running integer id. Protein sequences have any '*'
               characters removed.

           Raises
           ======
           ConfigError
               If the expected protein output file does not exist after the run.
        """
        gene_calls_dict = {} # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        protein_sequences_dict = {}

        self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
        self.proteins_in_contigs = os.path.join(output_dir, 'contigs.proteins')

        log_file_path = os.path.join(output_dir, '00_log.txt')

        self.run.warning('', header = 'Finding ORFs in contigs', lc = 'green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Proteins', self.proteins_in_contigs)
        self.run.info('Log file', log_file_path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')
        cmd_line = ('prodigal -i "%s" -o "%s" -a "%s" -p meta >> "%s" 2>&1' % (fasta_file_path,
                                                                               self.genes_in_contigs,
                                                                               self.proteins_in_contigs,
                                                                               log_file_path))
        # keep a record of the exact command in the log before running it
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.proteins_in_contigs):
            self.progress.end()
            # Python 3 requires the call form of raise; the old `raise X, "msg"` is a syntax error
            raise ConfigError("Something went wrong with prodigal, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path)

        self.progress.update('Processing gene calls ...')

        fasta = fastalib.SequenceSource(self.proteins_in_contigs)

        hit_id = 0
        while fasta.next():
            gene_calls_dict[hit_id] = self.parser(fasta.id)
            protein_sequences_dict[hit_id] = fasta.seq.replace('*', '')
            hit_id += 1

        fasta.close()

        self.progress.end()

        self.run.info('Result', 'Prodigal (%s) has identified %d genes.' % (self.installed_prodigal_version, len(gene_calls_dict)), nl_after = 1)

        return gene_calls_dict, protein_sequences_dict
Exemple #25
0
    def view(self):
        """Generate the tabular output from the DIAMOND '.daa' file with `diamond view`.

        The command logs to `self.run.log_file_path`, and the tabular output path is
        passed to `self.check_output`. When `self.names_dict` is set, the tabular output
        is expanded in place through `utils.ununique_BLAST_tabular_output`.

        Raises
        ======
        ConfigError
            If `self.names_dict` is set while `self.outfmt` is neither a bare integer
            nor a string starting with '6 qseqid sseqid'.
        """
        self.run.warning(None, header="DIAMOND VIEW", lc="green")
        self.progress.new('DIAMOND')
        self.progress.update('generating tabular output (using %d thread(s)) ...' % self.num_threads)

        cmd_line = ['diamond', 'view',
                    '-a', self.search_output_path + '.daa',
                    '-o', self.tabular_output_path,
                    '-p', self.num_threads,
                    '--outfmt', *self.outfmt.split()]

        self.run.info('Command line', ' '.join([str(x) for x in cmd_line]), quiet=True)

        utils.run_command(cmd_line, self.run.log_file_path)

        self.check_output(self.tabular_output_path, 'view')

        if self.names_dict:
            # if we are here, this means the self.tabular_output_path contains information only about unique
            # entries. We will expand it here so downstream analyses do not need to pay attention to this
            # detail.
            self.run.info('self.names_dict is found', 'Un-uniqueing the tabular output', quiet=True)
            self.progress.update('Un-uniqueing the tabular output ...')

            try:
                int(self.outfmt)
            except ValueError:
                # outfmt is not a bare format number, so it spells out its columns; only
                # catch the conversion failure rather than every possible exception
                if not self.outfmt.startswith("6 qseqid sseqid"):
                    self.progress.end()
                    raise ConfigError(
                        "drivers.diamond :: You can't supply a names_dict when running "
                        "view(...) unless your outfmt starts with '6 qseqid sseqid'. Update "
                        "utils.ununique_BLAST_tabular_output to fix this problem. If you're a "
                        "user, please report this on github.")

            utils.ununique_BLAST_tabular_output(self.tabular_output_path, self.names_dict)

        self.progress.end()

        self.run.info('Diamond %s tabular output file' % ('un-uniqued' if self.names_dict else ''),
                      self.tabular_output_path)
Exemple #26
0
    def makedb(self):
        """Build a protein BLAST database from `self.query_fasta` with `makeblastdb`.

        The command is run through the shell with its output appended to
        `self.run.log_file_path`; afterwards the expected '.phr' path is passed to
        `self.check_output`.
        """
        self.progress.new('BLAST')
        status = 'creating the search database (using %d thread(s)) ...' % self.num_threads
        self.progress.update(status)

        shell_template = 'makeblastdb -in %s -dbtype prot -out %s >> "%s" 2>&1'
        shell_cmd = shell_template % (self.query_fasta, self.target_db_path, self.run.log_file_path)

        self.run.info('blast makeblast cmd', shell_cmd, quiet=True)
        utils.run_command(shell_cmd)
        self.progress.end()

        # '.phr' is the file this method expects next to the database prefix
        self.check_output(self.target_db_path + '.phr', 'makeblastdb')
Exemple #27
0
    def makedb(self):
        """Create the protein BLAST search database for `self.query_fasta`.

        Runs `makeblastdb` with `self.run.log_file_path` as the log file, passes the
        expected '.phr' path to `self.check_output`, and then reports the command and
        the database prefix.
        """
        self.progress.new('BLAST')
        self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

        makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -out %s' % (self.query_fasta, self.target_db_path)
        utils.run_command(makeblastdb_cmd, self.run.log_file_path)
        self.progress.end()

        phr_path = self.target_db_path + '.phr'
        self.check_output(phr_path, 'makeblastdb')

        self.run.info('blast makeblast cmd', makeblastdb_cmd, quiet=True)
        self.run.info('BLAST search db', self.target_db_path)
Exemple #28
0
    def cluster(self,
                input_files,
                args,
                work_dir,
                threads=1,
                log_file_path=None):
        """Run the binning program `self.program_name` and collect contig names per bin.

        Output files are written into `work_dir` with the prefix 'MAXBIN_'. Every
        resulting '*.fasta' file becomes one bin, named after the file (without the
        '.fasta' suffix, and with '.' replaced by '_').

        Parameters
        ==========
        input_files : object
            Must provide `contigs_fasta` and `contig_coverages` attributes.
        args : object
            Extra arguments, serialized through `utils.serialize_args`.
        work_dir : str
            Directory for the program outputs (and the default log file).
        threads : int, 1
            Number of threads passed to the program.
        log_file_path : str, None
            Log file path; defaults to 'logs.txt' inside `work_dir`.

        Returns
        =======
        clusters : dict
            Bin name -> list of FASTA deflines (without '>') found in that bin file.

        Raises
        ======
        ConfigError
            If no output FASTA file is found after the run.
        """
        output_file_prefix = os.path.join(work_dir, 'MAXBIN_')

        if not log_file_path:
            log_file_path = os.path.join(work_dir, 'logs.txt')

        cmd_line = [
            self.program_name, '-contig', input_files.contigs_fasta, '-abund',
            input_files.contig_coverages, '-out', output_file_prefix,
            '-thread',
            str(threads),
            *utils.serialize_args(args, single_dash=True, use_underscore=True)
        ]

        self.progress.new(self.program_name)
        self.progress.update('Running using %d threads...' % threads)
        utils.run_command(cmd_line, log_file_path)
        self.progress.end()

        # the prefix already includes work_dir; joining work_dir onto it a second time
        # would look in work_dir/work_dir/... whenever work_dir is a relative path
        output_file_paths = glob.glob(output_file_prefix + '*.fasta')
        if not output_file_paths:
            raise ConfigError(
                "Some critical output files are missing. Please take a look at the "
                "log file: %s" % (log_file_path))

        clusters = {}

        for bin_file in output_file_paths:
            bin_name = os.path.basename(bin_file).replace('.fasta', '').replace('.', '_')

            with open(bin_file, 'r') as f:
                # every defline (minus the leading '>') is one member of this bin
                clusters[bin_name] = [line[1:].strip() for line in f if line.startswith('>')]

        return clusters
Exemple #29
0
    def hmmpress_sources(self, sources, tmp_dir):
        """This function runs hmmpress on the hmm profiles.

        For each source, the gzipped model file is decompressed into
        `<tmp_dir>/<source>.hmm` and `hmmpress` is run on it, logging to
        `<tmp_dir>/hmmpress.log`.

        Parameters
        ==========
        sources : dict
            Keyed by source name; each value must have a 'model' entry holding the
            path of a gzipped HMM file.
        tmp_dir : str
            Directory where decompressed models and the log file are written.

        Returns
        =======
        hmmpressed_file_paths : dict
            The location of each decompressed (and hmmpressed) file, keyed by source.

        Raises
        ======
        ConfigError
            If `utils.run_command` returns a truthy value for any source.
        """
        hmmpressed_file_paths = {}
        log_file_path = os.path.join(tmp_dir, 'hmmpress.log')

        for source in sources:
            model_file = sources[source]['model']
            hmm_file_path = os.path.join(tmp_dir, source + '.hmm')

            # context managers make sure both handles are closed even if the copy fails
            with gzip.open(model_file, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
                hmm_file.write(compressed_model.read())

            ret_val = utils.run_command(['hmmpress', hmm_file_path], log_file_path)

            hmmpressed_file_paths[source] = hmm_file_path

            if ret_val:
                raise ConfigError(
                    "Sadly, anvi'o failed while attempting to compress the HMM model for source %s. You can check out the log file (%s) for "
                    "more detailed information on why this happened." %
                    (source, log_file_path))

        return hmmpressed_file_paths
Exemple #30
0
    def _command_runner(command, log_file_path):
        """Run `command`, writing any logs to `log_file_path`.

        Returns 0 when the command exits with a zero exit code, and a `CommandError`
        instance (returned, NOT raised) in every other case.

        If you override this function, it is suggested that you also return CommandError
        instead of raising it. The reason is that _command_runner is run in its own Thread,
        and an Exception raised in a Thread (or AnviThread) does not bubble up into the
        calling context: the thread would crash while the caller happily carries on with
        its work, which is rarely what you want. Returning something like CommandError
        lets the calling context handle the failure. See the implementation of
        _run_commands for an example of how to do this properly.
        """
        try:
            exit_code = utils.run_command(command, log_file_path)
        except ConfigError as config_error:
            # utils.run_command can raise ConfigError; wrap its message in a CommandError
            # so callers only ever deal with a single error type.
            return CommandError(config_error.e)

        # utils.run_command is expected to raise for negative exit codes already, but
        # both signs are checked here as an extra sanity check.
        if not (exit_code < 0 or exit_code > 0):
            return exit_code

        return CommandError(
            f"Failed to run '{command}'.  Exit code: {exit_code}")
Exemple #31
0
    def view(self):
        """Convert the DIAMOND '.daa' search output into a tabular file with `diamond view`.

        The shell command line is appended to `self.log_file_path` before it runs, and
        the tabular output path is passed to `self.check_output` afterwards.
        """
        self.progress.new('DIAMOND')
        self.progress.update('generating tabular output (using %d thread(s)) ...' % self.num_threads)

        daa_path = self.search_output_path + '.daa'
        view_params = (daa_path, self.tabular_output_path, self.num_threads, self.log_file_path)
        shell_cmd = 'diamond view -a %s -o %s -p %d >> "%s" 2>&1' % view_params

        # leave a trace of the exact command in the log
        with open(self.log_file_path, "a") as log_handle:
            log_handle.write('CMD: %s\n' % shell_cmd)

        utils.run_command(shell_cmd)
        self.progress.end()

        self.check_output(self.tabular_output_path, 'view')
        self.run.info('Diamond tabular output file', self.tabular_output_path)
Exemple #32
0
    def makedb(self, output_db_path=None):
        """Create a protein BLAST database from `self.target_fasta` with `makeblastdb`.

        Parameters
        ==========
        output_db_path : str, None
            Prefix for the database files. When not given, `self.target_fasta` is used
            as the prefix as well.

        The expected '.phr' file for the chosen prefix is passed to `self.check_output`.
        """
        self.progress.new('BLAST')
        self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

        # resolve the database prefix once so the command, the output check, and the
        # reported path can never disagree with each other
        db_path = output_db_path or self.target_fasta

        cmd_line = ['makeblastdb',
                    '-in', self.target_fasta,
                    '-dbtype', 'prot',
                    '-out', db_path]

        utils.run_command(cmd_line, self.run.log_file_path)

        self.progress.end()

        expected_output = db_path + '.phr'
        self.check_output(expected_output, 'makeblastdb')

        self.run.info('blast makeblast cmd', cmd_line, quiet=True)
        # report where the database actually is, which is not the input FASTA when
        # an explicit `output_db_path` was requested
        self.run.info('BLAST search db', db_path)
Exemple #33
0
    def cluster(self):
        """Run `mcl` on `self.mcl_input_file_path`, writing to `self.clusters_file_path`.

        The shell command line is appended to `self.log_file_path` before it runs, and
        the clusters file path is passed to `self.check_output` afterwards.
        """
        self.progress.new('MCL')
        self.progress.update('clustering (using %d thread(s)) ...' % self.num_threads)

        mcl_params = (self.mcl_input_file_path,
                      self.inflation,
                      self.clusters_file_path,
                      self.num_threads,
                      self.log_file_path)
        mcl_cmd = 'mcl %s --abc -I %f -o %s -te %d >> "%s" 2>&1' % mcl_params

        with open(self.log_file_path, "a") as log_handle:
            log_handle.write('MCL CMD: %s\n' % mcl_cmd)

        utils.run_command(mcl_cmd)
        self.progress.end()

        # NOTE(review): the second argument is 'makedb' although this method runs mcl --
        # presumably carried over from a makedb method; confirm against check_output.
        self.check_output(self.clusters_file_path, 'makedb')
        self.run.info('MCL output', self.clusters_file_path)
Exemple #34
0
    def makedb(self):
        """Build a DIAMOND search database from `self.query_fasta` with `diamond makedb`.

        The command is run through the shell with its output appended to
        `self.run.log_file_path`. The expected '.dmnd' file is passed to
        `self.check_output` and then reported.
        """
        self.progress.new('DIAMOND')
        self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

        makedb_params = (self.query_fasta, self.target_db_path, self.num_threads, self.run.log_file_path)
        shell_cmd = 'diamond makedb --in %s -d %s -p %d >> "%s" 2>&1' % makedb_params

        self.run.info('diamond makedb cmd', shell_cmd, quiet=True)
        utils.run_command(shell_cmd)
        self.progress.end()

        dmnd_path = self.target_db_path + '.dmnd'
        self.check_output(dmnd_path, 'makedb')
        self.run.info('Diamond temp search db', dmnd_path)
Exemple #35
0
    def makedb(self, output_file_path=None):
        """Build a DIAMOND search database from `self.query_fasta` with `diamond makedb`.

        Parameters
        ==========
        output_file_path : str, None
            Prefix of the database to create. When not given, `self.target_fasta` is
            used instead.

        The expected '.dmnd' file for the chosen prefix is passed to `self.check_output`.
        """
        self.progress.new('DIAMOND')
        self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

        db_prefix = output_file_path or self.target_fasta
        makedb_args = ['diamond', 'makedb', '--in', self.query_fasta, '-d', db_prefix, '-p', self.num_threads]

        utils.run_command(makedb_args, self.run.log_file_path)
        self.progress.end()

        dmnd_path = db_prefix + '.dmnd'
        self.check_output(dmnd_path, 'makedb')

        # the argument list may hold non-string items (e.g. the thread count)
        self.run.info('diamond makedb cmd', ' '.join(map(str, makedb_args)), quiet=True)
        self.run.info('Diamond search db', dmnd_path)
Exemple #36
0
    def cluster(self):
        """Cluster the contents of `self.mcl_input_file_path` by running `mcl`.

        Results go to `self.clusters_file_path`. The command line is recorded in
        `self.log_file_path`, which also receives the program output.
        """
        self.progress.new('MCL')
        progress_msg = 'clustering (using %d thread(s)) ...' % self.num_threads
        self.progress.update(progress_msg)

        command_template = 'mcl %s --abc -I %f -o %s -te %d >> "%s" 2>&1'
        command = command_template % (self.mcl_input_file_path,
                                      self.inflation,
                                      self.clusters_file_path,
                                      self.num_threads,
                                      self.log_file_path)

        with open(self.log_file_path, "a") as log_file:
            log_file.write(''.join(('MCL CMD: ', command, '\n')))

        utils.run_command(command)
        self.progress.end()

        # NOTE(review): 'makedb' is what the original passes here even though the step
        # is an mcl run -- verify what check_output does with this value.
        self.check_output(self.clusters_file_path, 'makedb')
        self.run.info('MCL output', self.clusters_file_path)
Exemple #37
0
    def view(self):
        """Run `diamond view` to turn the '.daa' search output into a tabular file.

        The command line is recorded in `self.log_file_path`, which also receives the
        program output. The tabular output path is then passed to `self.check_output`.
        """
        self.progress.new('DIAMOND')
        progress_msg = 'generating tabular output (using %d thread(s)) ...' % self.num_threads
        self.progress.update(progress_msg)

        command_template = 'diamond view -a %s -o %s -p %d >> "%s" 2>&1'
        command = command_template % (self.search_output_path + '.daa',
                                      self.tabular_output_path,
                                      self.num_threads,
                                      self.log_file_path)

        with open(self.log_file_path, "a") as log_file:
            log_file.write(''.join(('CMD: ', command, '\n')))

        utils.run_command(command)
        self.progress.end()

        self.check_output(self.tabular_output_path, 'view')
        self.run.info('Diamond tabular output file', self.tabular_output_path)
Exemple #38
0
    def blastp(self):
        """Search `self.query_fasta` against `self.target_db_path` with `diamond blastp`.

        The command line is recorded in `self.log_file_path`, which also receives the
        program output. The expected '.daa' result file is passed to
        `self.check_output` and then reported.
        """
        self.progress.new('DIAMOND')
        progress_msg = 'running blastp (using %d thread(s)) ...' % self.num_threads
        self.progress.update(progress_msg)

        command_template = 'diamond blastp -q %s -d %s -a %s -t %s -p %d >> "%s" 2>&1'
        command = command_template % (self.query_fasta,
                                      self.target_db_path,
                                      self.search_output_path,
                                      self.tmp_dir,
                                      self.num_threads,
                                      self.log_file_path)

        with open(self.log_file_path, "a") as log_file:
            log_file.write(''.join(('CMD: ', command, '\n')))

        utils.run_command(command)
        self.progress.end()

        daa_path = self.search_output_path + '.daa'
        self.check_output(daa_path, 'blastp')
        self.run.info('Diamond blastp results', daa_path)
Exemple #39
0
    def makedb(self, output_file_path=None):
        """Create a DIAMOND database from `self.query_fasta` using `diamond makedb`.

        Parameters
        ==========
        output_file_path : str, None
            Database prefix to write to; `self.target_fasta` is the fallback.

        The command and the resulting '.dmnd' path are reported through `self.run.info`.
        """
        self.progress.new('DIAMOND')
        progress_msg = 'creating the search database (using %d thread(s)) ...' % self.num_threads
        self.progress.update(progress_msg)

        database_prefix = output_file_path or self.target_fasta

        # NOTE Question from Evan. Why is the query_fasta the input to the database?
        command = ['diamond', 'makedb']
        command += ['--in', self.query_fasta]
        command += ['-d', database_prefix]
        command += ['-p', self.num_threads]

        utils.run_command(command, self.run.log_file_path)
        self.progress.end()

        readable_command = ' '.join(str(item) for item in command)
        self.run.info('diamond makedb cmd', readable_command, quiet=True)
        self.run.info('Diamond search db', database_prefix + '.dmnd')
Exemple #40
0
    def blastp(self):
        """Search `self.query_fasta` against `self.target_db_path` with NCBI `blastp`.

        Results are written in tabular format (`-outfmt 6`) to
        `self.search_output_path`; program output is appended to
        `self.run.log_file_path`. The result path is passed to `self.check_output`.
        """
        self.progress.new('BLASTP')
        self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)

        blastp_params = (self.query_fasta, self.target_db_path, self.evalue,
                         self.search_output_path, self.num_threads, self.run.log_file_path)
        shell_cmd = 'blastp -query %s -db %s -evalue %f -outfmt 6 -out %s -num_threads %d >> "%s" 2>&1' % blastp_params

        self.run.info('blast blastp cmd', shell_cmd, quiet=True)
        utils.run_command(shell_cmd)
        self.progress.end()

        self.check_output(self.search_output_path, 'blastp')
        self.run.info('BLASTP results', self.search_output_path)
Exemple #41
0
    def blastp(self):
        """Run NCBI `blastp` with `self.query_fasta` as query and `self.target_db_path` as db.

        Tabular (`-outfmt 6`) hits go to `self.search_output_path`, and anything the
        program prints is appended to `self.run.log_file_path`.
        """
        self.progress.new('BLASTP')
        progress_msg = 'running blastp (using %d thread(s)) ...' % self.num_threads
        self.progress.update(progress_msg)

        command_template = 'blastp -query %s -db %s -evalue %f -outfmt 6 -out %s -num_threads %d >> "%s" 2>&1'
        command = command_template % (self.query_fasta,
                                      self.target_db_path,
                                      self.evalue,
                                      self.search_output_path,
                                      self.num_threads,
                                      self.run.log_file_path)

        self.run.info('blast blastp cmd', command, quiet=True)
        utils.run_command(command)
        self.progress.end()

        results_path = self.search_output_path
        self.check_output(results_path, 'blastp')
        self.run.info('BLASTP results', results_path)
Exemple #42
0
    def blastp(self):
        """Run `diamond blastp` for `self.query_fasta` against `self.target_db_path`.

        The shell command line is appended to `self.log_file_path` before it runs. The
        expected '.daa' file is passed to `self.check_output` and then reported.
        """
        self.progress.new('DIAMOND')
        self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)

        blastp_params = (self.query_fasta, self.target_db_path, self.search_output_path,
                         self.tmp_dir, self.num_threads, self.log_file_path)
        shell_cmd = 'diamond blastp -q %s -d %s -a %s -t %s -p %d >> "%s" 2>&1' % blastp_params

        # leave a trace of the exact command in the log
        with open(self.log_file_path, "a") as log_handle:
            log_handle.write('CMD: %s\n' % shell_cmd)

        utils.run_command(shell_cmd)
        self.progress.end()

        daa_path = self.search_output_path + '.daa'
        self.check_output(daa_path, 'blastp')
        self.run.info('Diamond blastp results', daa_path)
Exemple #43
0
    def run_command(self, query_targets, reference_targets, output_path, run_dir=None, name_conversion_dict=None):
        """ Run the command

        Parameters
        ==========
        query_targets : string or Path-like
            The query set of genomes (--ql). It should be a list of filepaths, one per line
        reference_targets : string or Path-like
            The reference set of genomes (--rl). It should be a list of filepaths, one per line
        output_path : string or Path-like
            Where should the raw fastANI output file be created? Relative to current working
            directory, not `run_dir`
        run_dir : string or Path-like, None
            Where should the command be run? When None (the default), the working directory
            at the time of the call is used
        name_conversion_dict : dict, None
            The keys of `results` are by default the file paths of the genomes, since that's what
            fastANI outputs. Pass an optional dictionary with <path>:<name> to convert the output.
            Note: this affects both the raw output in `output_path` and `results`

        Returns
        =======
        results : dictionary
            results dictionary

        Raises
        ======
        ConfigError
            If the program returns a non-zero exit code.
        """

        # resolve the default at call time; a `run_dir=os.getcwd()` default would be frozen
        # to whatever the working directory was when this module was imported
        if run_dir is None:
            run_dir = os.getcwd()

        self.add_run_info()

        self.query_names, self.reference_names = self.get_all_query_and_reference_names(query_targets, reference_targets)

        command = [self.program_name,
                   '--ql', query_targets,
                   '--rl', reference_targets,
                   '-k', self.kmer_size,
                   '--fragLen', self.fragment_length,
                   '--minFrag', self.min_num_fragments,
                   '-t', self.num_threads,
                   '-o', output_path]

        self.progress.new('fastANI')
        self.progress.update('Many to Many ...')

        with utils.RunInDirectory(run_dir):
            exit_code = utils.run_command(command, self.log_file_path)

        self.progress.end()

        if int(exit_code):
            raise ConfigError("fastANI returned with non-zero exit code, there may be some errors. \
                    please check the log file for details.")

        # the raw output is loaded, optionally renamed, and written back over `output_path`
        self.fastANI_output = self.load_output_as_dataframe(output_path, name_conversion_dict)
        utils.store_dataframe_as_TAB_delimited_file(self.fastANI_output, output_path)

        self.results = self.gen_results_dict()
        return self.results
Exemple #44
0
    def cluster(self):
        """Run `mcl` on `self.mcl_input_file_path` with inflation `self.inflation`.

        Clusters are written to `self.clusters_file_path`; program output is appended
        to `self.run.log_file_path`. The clusters file path is then passed to
        `self.check_output`.
        """
        self.run.info('MCL inflation', self.inflation)

        self.progress.new('MCL')
        self.progress.update('clustering (using %d thread(s)) ...' % self.num_threads)

        mcl_params = (self.mcl_input_file_path, self.inflation, self.clusters_file_path,
                      self.num_threads, self.run.log_file_path)
        shell_cmd = 'mcl %s --abc -I %f -o %s -te %d >> "%s" 2>&1' % mcl_params

        self.run.info('mcl cmd', shell_cmd, quiet = True)
        utils.run_command(shell_cmd)
        self.progress.end()

        # NOTE(review): the second argument is 'makedb' although this method runs mcl --
        # presumably carried over from a makedb method; confirm against check_output.
        self.check_output(self.clusters_file_path, 'makedb')
        self.run.info('MCL output', self.clusters_file_path)
Exemple #45
0
    def cluster(self):
        """Cluster `self.mcl_input_file_path` by running `mcl` through the shell.

        The inflation value and the command line are reported through `self.run.info`.
        Clusters land in `self.clusters_file_path`, and the program output is appended
        to `self.run.log_file_path`.
        """
        self.run.info('MCL inflation', self.inflation)

        self.progress.new('MCL')
        progress_msg = 'clustering (using %d thread(s)) ...' % self.num_threads
        self.progress.update(progress_msg)

        command_template = 'mcl %s --abc -I %f -o %s -te %d >> "%s" 2>&1'
        command = command_template % (self.mcl_input_file_path,
                                      self.inflation,
                                      self.clusters_file_path,
                                      self.num_threads,
                                      self.run.log_file_path)

        self.run.info('mcl cmd', command, quiet=True)
        utils.run_command(command)
        self.progress.end()

        # NOTE(review): 'makedb' is what the original passes here even though the step
        # is an mcl run -- verify what check_output does with this value.
        clusters_path = self.clusters_file_path
        self.check_output(clusters_path, 'makedb')
        self.run.info('MCL output', clusters_path)
Exemple #46
0
    def check_database(self):
        """Make sure a binary (.bin) MODELLER database exists and record its path.

        `self.database_path` is always set to the .bin path inside `self.database_dir`.
        If that file is missing, a .pir file with the same base name is looked for; if
        that is missing too, pdb_95.pir.gz is downloaded and decompressed with gzip.
        A .pir file found at that point is passed to `self.run_binarize_database`.

        Raises
        ======
        ConfigError
            If `self.modeller_database` has an extension other than .bin or .pir.
        """
        base_name, ext = os.path.splitext(self.modeller_database)
        allowed_extensions = (".bin", ".pir", "")
        if ext not in allowed_extensions:
            raise ConfigError("MODELLER :: The only possible database extensions are .bin and .pir")

        bin_db_path = J(self.database_dir, base_name + ".bin")
        pir_db_path = J(self.database_dir, base_name + ".pir")
        bin_found = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
        pir_found = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

        self.database_path = bin_db_path

        if not bin_found:
            if not pir_found:
                # neither flavor of the database is available, so fall back to a download
                self.progress.clear()
                self.run.warning("Anvi'o looked in {} for a database with the name {} and with an extension \
                              of either .bin or .pir, but didn't find anything matching that \
                              criteria. We'll try and download the best database we know of from \
                              https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. \
                              You can checkout https://salilab.org/modeller/ for more info about the pdb_95 \
                              database".format(self.database_dir, self.modeller_database))

                pdb95_url = "https://salilab.org/modeller/downloads/pdb_95.pir.gz"
                db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
                utils.download_file(pdb95_url, db_download_path)
                utils.run_command(['gzip', '-d', db_download_path], log_file_path=filesnpaths.get_temp_file_path())

                # NOTE(review): this re-check uses the configured name, whereas the download
                # is always pdb_95 -- confirm those are meant to be the same file.
                pir_found = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

            if pir_found:
                self.progress.clear()
                self.run.warning("Your database is not in binary format. That means accessing its contents is slower \
                              than it could be. Anvi'o is going to make a binary format. Just FYI")
                self.run_binarize_database(pir_db_path, bin_db_path)
Exemple #47
0
    def blastp(self):
        """Run `diamond blastp` for `self.query_fasta` against `self.target_db_path`.

        Results are written under `self.search_output_path`; the file checked for
        afterwards is that path with a '.daa' suffix. Program output is appended to
        `self.run.log_file_path` through shell redirection. `--sensitive` is added to
        the command only when `self.sensitive` is set.
        """
        self.run.info('DIAMOND is set to be', 'Sensitive' if self.sensitive else 'Fast')

        self.progress.new('DIAMOND')
        self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)
        cmd_line = ('diamond blastp -q %s -d %s -a %s -t %s -p %d %s -k 1000000 >> "%s" 2>&1' % (self.query_fasta,
                                                                                   self.target_db_path,
                                                                                   self.search_output_path,
                                                                                   self.tmp_dir,
                                                                                   self.num_threads,
                                                                                   '--sensitive' if self.sensitive else '',
                                                                                   self.run.log_file_path))

        self.run.info('diamond blastp cmd', cmd_line, quiet=True)

        try:
            utils.run_command(cmd_line)
        finally:
            # never leave the progress display open, even if the command could not be run
            self.progress.end()

        expected_output = self.search_output_path + '.daa'
        self.check_output(expected_output, 'blastp')

        self.run.info('Diamond blastp results', expected_output)
Exemple #48
0
    def run_command(self, input_path):
        """Run the program named by `self.program_name` inside `input_path` and load its matrices.

        `input_path` is passed as `--indir` and also becomes the working directory for
        the duration of the call, so the relative `--outdir output` ends up inside it.
        The previous working directory is restored before returning, whether the call
        succeeds or raises.

        Returns a dict that maps each matrix name whose `<method>_<name>.tab` file was
        found in the output directory to what
        `utils.get_TAB_delimited_file_as_dictionary` returns for it.

        Raises ConfigError if the exit code is non-zero or if none of the expected
        matrix files exist.
        """
        output_matrix_names = ['alignment_coverage', 'alignment_lengths', 'hadamard',
                               'percentage_identity', 'similarity_errors', 'correlations']

        # backup the old working directory before changing the directory
        old_wd = os.getcwd()
        os.chdir(input_path)

        try:
            full_command = [self.program_name,
                            '--outdir', 'output',
                            '--indir', input_path,
                            '--method', self.method,
                            '--workers', self.num_threads]

            self.progress.new('PyANI')
            self.progress.update('Running ...')
            try:
                exit_code = utils.run_command(full_command, self.log_file_path)
            finally:
                self.progress.end()

            if int(exit_code):
                raise ConfigError("PyANI returned with non-zero exit code, there may be some errors. \
                              please check the log file for details.")

            matrices = {}
            for matrix_name in output_matrix_names:
                output_matrix_path = os.path.join(input_path, 'output', self.method + '_' + matrix_name + '.tab')
                if os.path.exists(output_matrix_path):
                    matrices[matrix_name] = utils.get_TAB_delimited_file_as_dictionary(output_matrix_path, empty_header_columns_are_OK=True)

            if not len(matrices):
                raise ConfigError("None of the output matrices pyANI was supposed to generate was found in the\
                               output directory :( You may find some clues in the log file?")

            self.run.info_single("Output matrices for the following items are stored in the output\
                              directory: %s <success kid meme.png>." % \
                                        (', '.join(["'%s'" % m.replace('_', ' ') for m in matrices])), nl_before=1, mc='green')
        finally:
            # restore old working directory, including on the error paths above, so a
            # failed run does not leave the whole process inside `input_path`
            os.chdir(old_wd)

        return matrices
Exemple #49
0
    def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
        """Run an HMM search for one model collection and return a TAB-delimited hits file.

        `alphabet` and `context` are joined as 'alphabet:context' to look up the
        sequences file in `self.target_files_dict`. `hmm` is the path to a
        gzip-compressed model file; it is unpacked into a fresh temporary directory
        (recorded in `self.tmp_dirs`) and run through `hmmpress`. The search itself uses
        `nhmmscan` when `alphabet` is 'DNA' or 'RNA', and `hmmscan` otherwise.
        `noise_cutoff_terms` is split on whitespace and added to the command line. The
        remaining parameters (`source`, `kind`, `domain`, `num_genes_in_model`, `ref`)
        are only reported to the user.

        Returns the path in `self.hmm_scan_hits`, or None if that file has no lines.

        Raises ConfigError if the target is unknown or has no sequences file, if
        `hmmpress` returns a non-zero value, or if the search does not produce its
        `--tblout` file.
        """
        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context\
                                to clarify whether the HMM search is supposed to be done using alphabets DNA,\
                                RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it\
                                doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

        self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N\\A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes', num_genes_in_model)
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        # context managers so that neither handle stays open if reading or writing fails
        with gzip.open(hmm, 'rb') as packed_model, open(hmm_file_path, 'wb') as hmm_file:
            hmm_file.write(packed_model.read())
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')

        cmd_line = ['hmmpress', hmm_file_path]
        ret_val = utils.run_command(cmd_line, log_file_path)

        if ret_val:
            # close the progress display before raising, as the hmmscan failure path
            # further down already does
            self.progress.end()
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have\
                               installed is either not up-to-date enough, or too new :/ Just to make sure what went\
                               wrong please take a look at the log file ('%s'). Please visit %s to see what\
                               is the latest version availalbe if you think updating HMMER can resolve it. You can\
                               learn which version of HMMER you have on your system by typing 'hmmpress -h'."\
                                       % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan',
                    '-o', self.hmm_scan_output, *noise_cutoff_terms.split(),
                    '--cpu', self.num_threads_to_use,
                    '--tblout', self.hmm_scan_hits_shitty,
                    hmm_file_path, self.target_files_dict[target]]

        utils.run_command(cmd_line, log_file_path)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            self.progress.end()
            raise ConfigError("Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path)

        self.progress.end()

        detected_non_ascii = False
        lines_with_non_ascii = []

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        with open(self.hmm_scan_hits_shitty, 'rb') as hmm_hits_file, \
             open(self.hmm_scan_hits, 'w') as parseable_output:
            for line_counter, line_bytes in enumerate(hmm_hits_file, start=1):
                line = line_bytes.decode('ascii', 'ignore')

                # 'ignore' drops undecodable bytes, so a shorter string means the line
                # had non-ascii content
                if not len(line) == len(line_bytes):
                    lines_with_non_ascii.append(line_counter)
                    detected_non_ascii = True

                if line.startswith('#'):
                    continue

                # keep the first 18 whitespace-separated fields, TAB-joined
                parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        if detected_non_ascii:
            self.run.warning("Just a heads-up, Anvi'o HMMer parser detected non-ascii charachters while processing \
                the file '%s' and cleared them. Here are the line numbers with non-ascii charachters: %s.\
                You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"." %
                (self.hmm_scan_hits_shitty, ", ".join(map(str, lines_with_non_ascii))))

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Exemple #50
0
    def process(self, output_dir, drop_previous_annotations=False):
        """Takes an anvi'o contigs database, and does its magic.

        Which involves exporting amino acid sequences for gene calls, running
        `self.executable` on them from inside `output_dir`, parsing the output, and
        storing the results in the contigs database.

        The working directory is switched to `output_dir` while the sequences are
        written and the program runs, and is switched back afterwards whether that part
        succeeds or raises. `drop_previous_annotations` is passed on unchanged to
        `self.store_annotations_in_db`.

        Raises ConfigError if there is no contigs database path, if genes were not
        called for the database, or if the annotations file is missing after the run.
        """

        if not self.contigs_db_path:
            raise ConfigError("EggNOGMapper::process() is speaking: you can't really call this function if you inherited\
                                this class without a contigs database path :/ What are you doing?")

        filesnpaths.is_output_dir_writable(output_dir)

        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        if not contigs_db.meta['genes_are_called']:
            raise ConfigError("It seems genes were not called for this contigs database (%s). This is a\
                                total no-no since we will need them to get amino acid seqeunces for functional\
                                annotationd :/" % self.contigs_db_path)

        aa_sequences_list = contigs_db.db.get_table_as_list_of_tuples(t.gene_amino_acid_sequences_table_name)
        num_aa_sequences = len(aa_sequences_list)
        contigs_db.disconnect()

        # change the current work directory
        work_dir = os.getcwd()
        os.chdir(output_dir)

        try:
            self.run.info('Work directory for temporary files', output_dir)
            self.run.info('Num threads to use', self.num_threads)
            self.run.info('Target database', self.database, mc='red')
            self.run.info('Use memomory', self.usemem)
            self.run.info('Genes found', num_aa_sequences, mc='green')
            self.run.info('AA sequences', self.aa_sequences_file_name)

            self.progress.new('Processing')
            self.progress.update('Storing gene sequences ...')

            with open(self.aa_sequences_file_name, 'w') as aa_sequences_fp:
                for gene_callers_id, aa_sequence in aa_sequences_list:
                    aa_sequences_fp.write('>%s%d\n%s\n' % (self.gene_caller_id_prefix, gene_callers_id, aa_sequence))
            del aa_sequences_list

            cmd_line = [self.executable, '-i', self.aa_sequences_file_name, '--output', self.output_file_prefix]

            # num threads
            if self.num_threads:
                cmd_line.extend(['--cpu', self.num_threads])

            # usemem
            if self.usemem:
                cmd_line.extend(['--usemem'])

            # database
            cmd_line.extend(['--database', self.database])

            self.progress.update('Running eggnog-mapper on %d sequences. This may take a while ...' % num_aa_sequences)
            utils.run_command(cmd_line, self.log_file_path)

            if not os.path.exists(self.annotations_file_name):
                self.progress.end()
                raise ConfigError("Something went wrong with eggnog-mapper :( The annotations file is not where it is supposed to be.\
                                If you are lucky, this log file will have enough output information for you to make sense of\
                                what went wrong: '%s'. Due to this error, the output directory will be kept as is, and you\
                                will have to remove it manually. Sorry about the inconvenience! Anvi'o developers know how much\
                                it sucks when things just don't work." % os.path.join(output_dir, self.log_file_path))

            self.progress.end()

            # we are done, and the annotations file is there.
            self.populate_annotations_dict(os.path.join(output_dir, self.annotations_file_name))
        finally:
            # go back to where we started, including when something above raised, so a
            # failure does not leave the process inside the output directory
            os.chdir(work_dir)

        # alright. store annotations into the database
        self.store_annotations_in_db(drop_previous_annotations=drop_previous_annotations)
Exemple #51
0
    def process(self, fasta_file_path, output_dir):
        """Call genes on a FASTA file with prodigal and parse what it reports.

           Returns two dicts keyed by a running hit number starting at 0: what
           `self.parser` makes of each FASTA id, and the matching amino acid
           sequences with every '*' removed. Both are empty when the amino acid
           output file is empty.
        """
        self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
        self.amino_acid_sequences_in_contigs = os.path.join(output_dir, 'contigs.amino_acid_sequences')
        prodigal_log = os.path.join(output_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Amino acid sequences', self.amino_acid_sequences_in_contigs)
        self.run.info('Log file', prodigal_log)

        command = ['prodigal',
                   '-i', fasta_file_path,
                   '-o', self.genes_in_contigs,
                   '-a', self.amino_acid_sequences_in_contigs]

        if not self.prodigal_translation_table:
            # no explicit translation table: run in metagenomics mode
            command += ['-p', 'meta']
        else:
            command += ['-g', self.prodigal_translation_table]
            self.run.warning("Prodigal translation table is set to '%s' (whatever you did has worked so far, but\
                              keep an eye for errors from prodigal in case it doesn't like your translation table\
                              parameter). This means we will not use prodigal in the metagenomics mode, due to this\
                              issue: https://github.com/hyattpd/Prodigal/issues/19. If that issue is closed, and you\
                              are reading this message, then please contact an anvi'o developer." % str(self.prodigal_translation_table))

        # each gene call entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        calls = {}
        sequences = {}

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')

        utils.run_command(command, prodigal_log)

        aa_path = self.amino_acid_sequences_in_contigs

        if not os.path.exists(aa_path):
            self.progress.end()
            raise ConfigError("Something went wrong with prodigal, and it failed to generate the\
                               expected output :/ Fortunately, this log file should tell you what\
                               might be the problem: '%s'. Please do not forget to include this\
                               file if you were to ask for help." % prodigal_log)

        if filesnpaths.is_file_empty(aa_path):
            self.progress.end()
            self.run.info('Result', 'Prodigal (%s) has identified no genes :/' % self.installed_version, nl_after=1, mc="red")
            return calls, sequences

        self.progress.update('Processing gene calls ...')

        entries = fastalib.SequenceSource(aa_path)

        entry_no = 0
        while next(entries):
            calls[entry_no] = self.parser(entries.id)
            sequences[entry_no] = entries.seq.replace('*', '')
            entry_no += 1

        entries.close()

        self.progress.end()

        summary = 'Prodigal (%s) has identified %d genes.' % (self.installed_version, len(calls))
        self.run.info('Result', summary, nl_after=1)

        return calls, sequences
Exemple #52
0
    def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag = "--cut_ga"):
        """Run hmmscan on `self.proteins_in_contigs` and return a TAB-delimited hits file.

        `hmm` is the path to a gzip-compressed model file. It is unpacked into a fresh
        temporary directory (recorded in `self.tmp_dirs`), run through `hmmpress`, and
        then used by `hmmscan` with `cut_off_flag` placed on its command line. Both
        commands, and their output, are appended to a log file in that directory.
        `source`, `genes_in_model` and `ref` are only reported to the user.

        Returns the path in `self.hmm_scan_hits`, or None if that file has no lines.

        Raises ConfigError if hmmscan does not produce its `--tblout` file.
        """
        self.run.warning('', header = 'HMM Profiling for %s' % source, lc = 'green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))

        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        packed_model = gzip.open(hmm, 'rb')
        try:
            # gzip hands back bytes, so the target has to be opened in binary mode;
            # writing them to a text-mode file is a TypeError on Python 3
            with open(hmm_file_path, 'wb') as hmm_file:
                hmm_file.write(packed_model.read())
        finally:
            packed_model.close()
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')
        cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')
        cmd_line = ('hmmscan -o "%s" %s --tblout "%s" "%s" "%s" >> "%s" 2>&1' % (self.hmm_scan_output,
                                                                              cut_off_flag,
                                                                              self.hmm_scan_hits_shitty,
                                                                              hmm_file_path,
                                                                              self.proteins_in_contigs,
                                                                              log_file_path))
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            # close the progress display before bailing out
            self.progress.end()
            # call form of raise: the `raise Error, "msg"` form does not parse on Python 3
            raise ConfigError("Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        with open(self.hmm_scan_hits_shitty) as raw_hits:
            with open(self.hmm_scan_hits, 'w') as parseable_output:
                for line in raw_hits:
                    if line.startswith('#'):
                        continue
                    # keep the first 18 whitespace-separated fields, TAB-joined
                    parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Exemple #53
0
    def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag = "--cut_ga"):
        """Run hmmscan on `self.protein_sequences_fasta` and return a TAB-delimited hits file.

        `hmm` is the path to a gzip-compressed model file. It is unpacked into a fresh
        temporary directory (recorded in `self.tmp_dirs`), run through `hmmpress`, and
        then used by `hmmscan` with `cut_off_flag` and `--cpu self.num_threads_to_use`
        on its command line. Both commands, and their output, are appended to a log file
        in that directory. `source`, `genes_in_model` and `ref` are only reported to the
        user.

        Returns the path in `self.hmm_scan_hits`, or None if that file has no lines.

        Raises ConfigError if `hmmpress` returns a non-zero value, or if hmmscan does
        not produce its `--tblout` file.
        """
        self.run.warning('', header = 'HMM Profiling for %s' % source, lc = 'green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))
        self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        packed_model = gzip.open(hmm, 'rb')
        try:
            # gzip hands back bytes, so the target has to be opened in binary mode;
            # writing them to a text-mode file is a TypeError on Python 3
            with open(hmm_file_path, 'wb') as hmm_file:
                hmm_file.write(packed_model.read())
        finally:
            packed_model.close()
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')
        cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        ret_val = utils.run_command(cmd_line)
        if ret_val:
            # close the progress display before bailing out
            self.progress.end()
            # call form of raise: the `raise Error, "msg"` form does not parse on Python 3
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER\
                                you have installed is not up-to-date enough. Just to make sure what went\
                                wrong please take a look at the log file ('%s'). Please visit %s to see what\
                                is the latest version availalbe. You can learn which version of HMMER you have\
                                on your system by typing 'hmmpress -h'"
                                        % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        cmd_line = ('hmmscan -o "%s" %s --cpu %d --tblout "%s" "%s" "%s" >> "%s" 2>&1' \
                                        % (self.hmm_scan_output,
                                           cut_off_flag,
                                           self.num_threads_to_use,
                                           self.hmm_scan_hits_shitty,
                                           hmm_file_path,
                                           self.protein_sequences_fasta,
                                           log_file_path))

        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            self.progress.end()
            raise ConfigError("Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        with open(self.hmm_scan_hits_shitty) as raw_hits:
            with open(self.hmm_scan_hits, 'w') as parseable_output:
                for line in raw_hits:
                    if line.startswith('#'):
                        continue
                    # keep the first 18 whitespace-separated fields, TAB-joined
                    parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None