Beispiel #1
0
    def parse_output(self):
        """Parse the tRNAScan-SE output file into a dictionary of hits.

        Parameters
        ==========
        N/A

        Returns
        =======
        results: `dict`
            A dictionary of hits keyed by entry number (starting from 1). Each value is a
            dictionary with the keys `contig`, `trna_no`, `start`, `stop`, `amino_acid`,
            `codon`, and `score`. An empty dictionary is returned when the output file
            has no lines.

        Raises
        ======
        ConfigError
            If a data line does not contain exactly 10 TAB-delimited columns.
        """

        num_lines = filesnpaths.get_num_lines_in_file(self.output_file_path)

        if not num_lines:
            self.run.warning("No tRNA genes found in tRNAScan-SE output.")
            return {}

        d = {}
        entry_no = 0

        self.progress.new("Parsing the output ...")
        try:
            with open(self.output_file_path) as output:
                # first three lines are garbage
                for _ in range(3):
                    output.readline()

                while True:
                    self.progress.update(entry_no)
                    line = output.readline().strip('\n')

                    # an empty string means EOF or a blank line; parsing stops in both cases
                    if not line:
                        break

                    entry_no += 1

                    fields = [f.strip() for f in line.split('\t')]

                    if len(fields) != 10:
                        raise ConfigError("The expected output of tRNAScan-SE includes exactly 10 columns. However, the output\
                                       anvi'o is working contains at least one line with %d columns :/ This doesn't look\
                                       good. Here is the list of columns data of that line for your reference: '%s'." \
                                       % (len(fields), fields))

                    d[entry_no] = {
                        'contig': fields[0],
                        'trna_no': fields[1],
                        'start': int(fields[2]),
                        'stop': int(fields[3]),
                        'amino_acid': fields[4],
                        'codon': fields[5],
                        'score': float(fields[8])
                    }
        finally:
            # always close the progress display, including when a malformed line aborts parsing
            self.progress.end()

        self.run.info("Num tRNA genes parsed", entry_no)

        return d
Beispiel #2
0
    def format_p_id_to_cog_id_cPickle(self, input_file_path, output_file_path):
        """Build a protein id -> COG ids mapping from a comma-separated file and serialize it.

        Parameters
        ==========
        input_file_path : str
            Path to the comma-separated input file. For 'COG14' the protein id is read
            from the first column, for 'COG20' from the third column (with '.' replaced
            by '_'). The COG id is read from the seventh column in both cases.

        output_file_path : str
            Path handed to `dictio.write_serialized_object` to store the mapping.

        Raises
        ======
        ConfigError
            If a line has too few columns, or if `self.COG_version` is neither 'COG14'
            nor 'COG20'.

        Notes
        =====
        Every COG id seen is also added to `self.cogs_found_in_proteins_fasta`.
        """
        num_lines_in_file = filesnpaths.get_num_lines_in_file(input_file_path)

        def raise_error(line_num, line_content, fields, e):
            # `line_content` is accepted for call-site symmetry but is not part of the message
            raise ConfigError(f"Bad news :( While parsing a COG input file, anvi'o encountered an error (which said: [{e}]) "
                              f"while processing the line {line_num} in your file. Where the fields in that file "
                              f"looked like this: {fields}. Sadly, this has been a long-standing and very annoying issue that "
                              f"anvi'o developers were unable to reproduce. But we recently learned that the issue is likely due "
                              f"to your internet speed (https://github.com/merenlab/anvio/issues/1738). Slower connections lead "
                              f"to broken connections with the NCBI servers, and leave you with an unfinished file :/ The only "
                              f"working solution so far is to try again with a faster internet connection.")

        progress.new('Formatting protein ids to COG ids file', progress_total_items=num_lines_in_file)

        p_id_to_cog_id = {}

        # The 'U' mode flag was removed in Python 3.11; default text mode already applies
        # universal newlines. Iterating the handle avoids loading the whole file at once,
        # and the `with` block guarantees the handle is closed.
        with open(input_file_path) as input_file:
            for line_counter, line in enumerate(input_file, start=1):
                if line_counter % 500 == 0:
                    self.progress.increment(line_counter)
                    progress.update(f"{line_counter * 100 / num_lines_in_file:.2f}%")

                fields = line.strip('\n').split(',')

                # `p_id` should look just like the FASTA ids, and its location has changed between
                # 2014 release and 2020 release.
                if self.COG_version == 'COG14':
                    try:
                        p_id = fields[0]
                        COG = fields[6]
                    except IndexError as e:
                        # a truncated line does not have enough columns
                        raise_error(line_counter, line, fields, e)
                elif self.COG_version == 'COG20':
                    try:
                        p_id = fields[2].replace('.', '_')
                        COG = fields[6]
                    except IndexError as e:
                        raise_error(line_counter, line, fields, e)
                else:
                    raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                      "parsing of a new generation of COG files.")

                self.cogs_found_in_proteins_fasta.add(COG)

                # keep each COG id only once per protein, in order of first appearance
                if p_id in p_id_to_cog_id:
                    if COG not in p_id_to_cog_id[p_id]:
                        p_id_to_cog_id[p_id].append(COG)
                else:
                    p_id_to_cog_id[p_id] = [COG]

        progress.update("Serializing the data dictionary for future use (a.k.a, very pro stuff).")
        dictio.write_serialized_object(p_id_to_cog_id, output_file_path)

        progress.end()
Beispiel #3
0
    def format_p_id_to_cog_id_cPickle(self, input_file_path, output_file_path):
        """Build a protein id -> COG ids mapping from a comma-separated file and serialize it.

        Parameters
        ==========
        input_file_path : str
            Path to the comma-separated input file. For 'COG14' the protein id is read
            from the first column, for 'COG20' from the third column (with '.' replaced
            by '_'). The COG id is read from the seventh column in both cases.

        output_file_path : str
            Path handed to `dictio.write_serialized_object` to store the mapping.

        Raises
        ======
        ConfigError
            If `self.COG_version` is neither 'COG14' nor 'COG20'.
        IndexError
            If a line has fewer columns than expected.

        Notes
        =====
        Every COG id seen is also added to `self.cogs_found_in_proteins_fasta`.
        """
        num_lines_in_file = filesnpaths.get_num_lines_in_file(input_file_path)

        progress.new('Formatting protein ids to COG ids file', progress_total_items=num_lines_in_file)

        p_id_to_cog_id = {}

        # The 'U' mode flag was removed in Python 3.11; default text mode already applies
        # universal newlines. Iterating the handle avoids loading the whole file at once,
        # and the `with` block guarantees the handle is closed.
        with open(input_file_path) as input_file:
            for line_counter, line in enumerate(input_file, start=1):
                if line_counter % 500 == 0:
                    self.progress.increment(line_counter)
                    progress.update(f"{line_counter * 100 / num_lines_in_file:.2f}%")

                fields = line.strip('\n').split(',')

                # `p_id` should look just like the FASTA ids, and its location has changed between
                # 2014 release and 2020 release.
                if self.COG_version == 'COG14':
                    p_id = fields[0]
                    COG = fields[6]
                elif self.COG_version == 'COG20':
                    p_id = fields[2].replace('.', '_')
                    COG = fields[6]
                else:
                    raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                      "parsing of a new generation of COG files.")

                self.cogs_found_in_proteins_fasta.add(COG)

                # keep each COG id only once per protein, in order of first appearance
                if p_id in p_id_to_cog_id:
                    if COG not in p_id_to_cog_id[p_id]:
                        p_id_to_cog_id[p_id].append(COG)
                else:
                    p_id_to_cog_id[p_id] = [COG]

        progress.update("Serializing the data dictionary for future use (a.k.a, very pro stuff).")
        dictio.write_serialized_object(p_id_to_cog_id, output_file_path)

        progress.end()
Beispiel #4
0
    def run_hmmscan(self, source, alphabet, context, kind, domain,
                    num_genes_in_model, hmm, ref, noise_cutoff_terms):
        """Run hmmpress and then hmmscan/nhmmscan for a single HMM source.

        The gzipped model at `hmm` is unpacked into a fresh temporary directory, compressed
        with `hmmpress`, and searched against the sequences file registered for the target
        `alphabet:context`. The `--tblout` output is then rewritten as a TAB-delimited file
        that contains the first 18 whitespace-separated columns of every non-comment line.

        Parameters
        ==========
        source : str
            Name of the HMM source; only used in the terminal header.
        alphabet : str
            Joined with `context` to form the target key. 'DNA' and 'RNA' select
            `nhmmscan`, anything else selects `hmmscan`.
        context : str
            Second half of the target key.
        kind, domain, ref, num_genes_in_model
            Only reported to the terminal.
        hmm : str
            Path to the gzipped HMM model.
        noise_cutoff_terms : str
            Whitespace-separated flags inserted into the command line. A falsy value
            adds no flags.

        Returns
        =======
        hits_file_path : str or None
            Path to the TAB-delimited hits file, or None if it has no lines.

        Raises
        ======
        ConfigError
            If the target is unknown or has no sequences file, if `hmmpress` returns a
            non-zero value, or if the search does not produce its table output.
        """
        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError(
                "You have an unknown target :/ Target, which defines an alphabet and context\
                                to clarify whether the HMM search is supposed to be done using alphabets DNA,\
                                RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it\
                                doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError(
                "HMMer class does not know about Sequences file for the target %s :/"
                % target)

        self.run.warning('',
                         header='HMM Profiling for %s' % source,
                         lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N\\A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes', num_genes_in_model)
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search',
                      self.num_threads_to_use)

        tmp_dir = filesnpaths.get_temp_directory_path()
        # remember the directory on the instance (presumably for later clean-up)
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        # context managers close both handles even if decompression fails half way
        with gzip.open(hmm, 'rb') as packed_model, open(hmm_file_path, 'wb') as hmm_file:
            hmm_file.write(packed_model.read())
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')

        cmd_line = ['hmmpress', hmm_file_path]
        ret_val = utils.run_command(cmd_line, log_file_path)

        if ret_val:
            # close the progress display before bailing out, as the hmmscan failure path does
            self.progress.end()
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have\
                               installed is either not up-to-date enough, or too new :/ Just to make sure what went\
                               wrong please take a look at the log file ('%s'). Please visit %s to see what\
                               is the latest version availalbe if you think updating HMMER can resolve it. You can\
                               learn which version of HMMER you have on your system by typing 'hmmpress -h'."\
                                       % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        # tolerate a missing/empty cutoff specification instead of failing on `.split()`
        cutoff_args = noise_cutoff_terms.split() if noise_cutoff_terms else []

        cmd_line = [
            'nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan', '-o',
            self.hmm_scan_output, *cutoff_args, '--cpu',
            self.num_threads_to_use, '--tblout', self.hmm_scan_hits_shitty,
            hmm_file_path, self.target_files_dict[target]
        ]

        utils.run_command(cmd_line, log_file_path)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            self.progress.end()
            raise ConfigError(
                "Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." %
                log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        lines_with_non_ascii = []

        with open(self.hmm_scan_hits_shitty, 'rb') as hmm_hits_file, \
             open(self.hmm_scan_hits, 'w') as parseable_output:
            for line_counter, line_bytes in enumerate(hmm_hits_file, start=1):
                line = line_bytes.decode('ascii', 'ignore')

                # dropped bytes make the decoded line shorter than the raw one
                if len(line) != len(line_bytes):
                    lines_with_non_ascii.append(line_counter)

                if line.startswith('#'):
                    continue

                parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        if lines_with_non_ascii:
            self.run.warning(
                "Just a heads-up, Anvi'o HMMer parser detected non-ascii charachters while processing \
                the file '%s' and cleared them. Here are the line numbers with non-ascii charachters: %s.\
                You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"."
                % (self.hmm_scan_hits_shitty, ", ".join(
                    map(str, lines_with_non_ascii))))

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Beispiel #5
0
    def run_hmmer(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms,
                  desired_output='table', out_fmt='--tblout'):
        """Run the program

        One worker process is started per partial input file of the target. The per-worker
        outputs are merged into in-memory buffers and finally written next to the partial
        input files.

        Parameters
        ==========
        source : str
            A name for your HMM effort.

        alphabet : str
            Which alphabet are you using? Choose from {'AA', 'DNA', 'RNA'}

        context : str
            This will determine how your output is processed. FIXME Documentation is lacking. Choose
            from {'GENE', 'CONTIG', 'DOMAIN'}.

        kind : str
            Used for user stdout info. Don't be afraid to pass None

        domain : str
            Used for user stdout info. Don't be afraid to pass None

        num_genes_in_model : int
            Used for user stdout info. Don't be afraid to pass None

        hmm : str
            Path to the input .hmm file

        ref : int
            Used for user stdout info. Don't be afraid to pass None

        noise_cutoff_terms : str
            Filter out hits with built-in flags. e.g. '--cut_ga'

        desired_output : str OR list, 'table'
            HMMER programs have a couple of outputs. For the standard output (specified by the hmmer
            program flag `-o`), pass 'standard'. For the tabular output (specified by the hmmer
            program flag `--tblout` or `--domtblout`), pass 'table'. If you want to use both, pass
            ('standard', 'table')

        out_fmt : str, '--tblout'
            HMMer programs have different table output formats. For example, choose from --tblout or
            --domtblout.

        Returns
        =======
        output : str, None, OR tuple
            The path of the merged file for each desired output, in the order they were requested.
            The path for 'table' is replaced with None if the merged table has no lines. A single
            value is returned if only one output was requested, a tuple otherwise.

        Raises
        ======
        ConfigError
            If the target, `desired_output`, or `out_fmt` is not recognized, or if a worker reports
            a non-zero return value. Exceptions sent back by workers are re-raised as they are.
        """

        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                               "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                               "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                               "doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

        if isinstance(desired_output, str):
            desired_output = (desired_output, )

        for output in desired_output:
            if output not in ['standard', 'table']:
                raise ConfigError("HMMer.run_hmmer :: Unknown desired_output, '%s'" % output)

        if out_fmt not in ['--tblout', '--domtblout']:
            raise ConfigError("HMMer.run_hmmer :: Unknown out_fmt, '%s'" % out_fmt)

        # nucleotide alphabets are always searched with nhmmscan, regardless of `self.program_to_use`
        program = 'nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use

        self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N/A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes in HMM model', num_genes_in_model or 'unknown')
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)
        self.run.info('HMMer program used for search', program)

        tmp_dir = os.path.dirname(self.target_files_dict[target][0])
        self.run.info('Temporary work dir', tmp_dir)

        # check if all hmmpress files are in the HMM directory
        self.verify_hmmpress_output(hmm)

        workers = []
        manager = multiprocessing.Manager() # this dude holds the shared objects that will be modified by workers
        ret_value_queue = manager.Queue(maxsize=self.num_threads_to_use)
        output_queue = manager.Queue()

        # Holds buffer and write lock for each output
        merged_files_dict = {}
        for output in desired_output:
            merged_files_dict[output] = {'buffer': io.StringIO(), 'lock': manager.Lock()}

        num_parts = len(self.target_files_dict[target])
        cores_per_process = 1
        if num_parts < self.num_threads_to_use:
            # fewer parts than requested cores: give every process several cores instead
            cores_per_process = self.num_threads_to_use // num_parts

            self.run.warning(f"You requested {P('core', self.num_threads_to_use)} but there were only {P('sequence', num_parts)} "
                             f"in the FASTA file for the target '{target}'. Anvi'o will use {P('process', num_parts, sfp='es')} "
                             f"with {P('core', cores_per_process)} instead. And that's that.")
            self.num_threads_to_use = num_parts

        if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch':
            self.run.warning("You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. "
                             "We hope that is alright." % (self.program_to_use, alphabet))

        for thread_num, partial_input_file in enumerate(self.target_files_dict[target]):
            log_file = partial_input_file + '_log'
            output_file = partial_input_file + '_output'
            table_file = partial_input_file + '_table'

            self.run.info('Log file for thread %s' % thread_num, log_file)

            cmd_line = [program, '-o', output_file]
            if noise_cutoff_terms:
                # noise cutoff terms are only included in the command line when they were passed
                cmd_line.extend(noise_cutoff_terms.split())
            cmd_line.extend(['--cpu', cores_per_process,
                             out_fmt, table_file,
                             hmm, partial_input_file])

            t = multiprocessing.Process(target=self.hmmer_worker, args=(partial_input_file,
                                                       cmd_line,
                                                       table_file,
                                                       output_file,
                                                       desired_output,
                                                       log_file,
                                                       output_queue,
                                                       ret_value_queue))
            t.start()
            workers.append(t)

        self.progress.new('Processing')
        self.progress.update(f'Running {self.program_to_use} in {P("thread", self.num_threads_to_use)}...')

        finished_workers = 0
        while finished_workers < self.num_threads_to_use:
            try:
                ret_value = ret_value_queue.get()

                if isinstance(ret_value, Exception):
                    # If thread returns an exception, we raise it and kill the main thread.
                    raise ret_value

                finished_workers += 1
                if ret_value == 0:
                    if anvio.DEBUG:
                        self.run.info_single(f"{finished_workers} out of {self.num_threads_to_use} have finished")
                else:
                    # the `f` prefix is required here, otherwise `{ret_value}` is shown literally
                    raise ConfigError(f"An HMMER worker thread came back with an unexpected return value of {ret_value}. "
                                      f"Something is probably wrong, so you should contact a developer for help.")

                # if worker finished successfully we can take its individual output file(s) and append them to the main file(s)
                output_dict = output_queue.get()
                for file_type, file in output_dict.items():
                    main_file_buffer = merged_files_dict[file_type]['buffer']
                    main_file_lock = merged_files_dict[file_type]['lock']
                    worker_file = file
                    if file_type == 'table':
                        append_function = self.append_to_main_table_file
                    elif file_type == 'standard':
                        append_function = self.append_to_main_standard_file

                    append_function(main_file_buffer, worker_file, main_file_lock)

            except KeyboardInterrupt:
                self.run.info_single("HMMER driver received SIGINT, terminating all threads...", nl_before=2)
                break

            except Exception as worker_error:
                # An exception was thrown in one of the threads so we kill all of them
                self.progress.end()
                self.run.warning("An exception was thrown in one of the worker threads (see output below for details).")
                for worker in workers:
                    worker.terminate()
                raise worker_error

        for worker in workers:
            worker.terminate()

        output_file_paths = []
        for output in desired_output:
            output_file_path = os.path.join(tmp_dir, f"hmm.{output}")

            with open(output_file_path, 'w') as out:
                merged_files_dict[output]['buffer'].seek(0)
                out.write(merged_files_dict[output]['buffer'].read())

            if output == 'table':
                num_raw_hits = filesnpaths.get_num_lines_in_file(output_file_path)
                self.run.info('Number of raw hits', num_raw_hits, progress=self.progress)
                # an empty table is reported to the caller as None rather than as a path
                output_file_path = output_file_path if num_raw_hits else None

            output_file_paths.append(output_file_path)

        self.progress.end()

        # Return output path as string if desired_output is len 1. Else return tuple of output paths
        output = output_file_paths[0] if len(output_file_paths) == 1 else tuple(output_file_paths)

        return output
Beispiel #6
0
    def run_hmmscan(self, source, alphabet, context, kind, domain,
                    num_genes_in_model, hmm, ref, noise_cutoff_terms):
        """Search every part file of a target with HMMER, using one thread per part.

        The target key is `alphabet:context`. A `Thread` running `self.hmmscan_worker` is
        started for each part file registered under that key. Each worker receives a shared
        in-memory buffer and a lock; once all threads have joined, the buffer is written to
        `hmm.hits` in the directory of the first part file.

        Parameters
        ==========
        source : str
            Name of the HMM source; only used in the terminal header.
        alphabet : str
            'DNA' and 'RNA' select `nhmmscan`, anything else selects `self.program_to_use`.
        context : str
            Second half of the target key.
        kind, domain, ref, num_genes_in_model
            Only reported to the terminal.
        hmm : str
            Path to the HMM model; hmmpress output is verified for it before the search.
        noise_cutoff_terms : str
            Whitespace-separated flags for the command line. Skipped when falsy.

        Returns
        =======
        output_file_path : str or None
            Path to the merged hits file, or None if it has no lines.

        Raises
        ======
        ConfigError
            If the target is unknown or has no sequences file registered.
        """
        target = ':'.join((alphabet, context))

        if target not in self.target_files_dict:
            raise ConfigError(
                "You have an unknown target :/ Target, which defines an alphabet and context "
                "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                "doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError(
                "HMMer class does not know about Sequences file for the target %s :/"
                % target)

        is_nucleotide = alphabet in ['DNA', 'RNA']

        # report the settings of this run to the user
        self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
        self.run.info('Reference', ref or 'unknown')
        self.run.info('Kind', kind or 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain or 'N\\A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes in HMM model', num_genes_in_model)
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)
        self.run.info('HMMer program used for search',
                      'nhmmscan' if is_nucleotide else self.program_to_use)

        part_files = self.target_files_dict[target]

        tmp_dir = os.path.dirname(part_files[0])
        self.run.info('Temporary work dir', tmp_dir)

        # check if all hmmpress files are in the HMM directory
        self.verify_hmmpress_output(hmm)

        # shared state handed to every worker thread
        merged_file_buffer = io.StringIO()
        buffer_write_lock = Lock()

        num_parts = len(part_files)
        if num_parts < self.num_threads_to_use:
            # fewer parts than requested cores: spread the cores over the parts
            cores_per_process = self.num_threads_to_use // num_parts

            self.run.warning(
                "You requested %s cores but there were only %s entries in the fasta for the target '%s'. "
                "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. "
                % (str(self.num_threads_to_use), str(num_parts), target,
                   str(num_parts), cores_per_process))
        else:
            cores_per_process = 1

        if is_nucleotide and self.program_to_use == 'hmmsearch':
            self.run.warning(
                "You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. "
                "We hope that is alright." % (self.program_to_use, alphabet))

        workers = []
        for thread_num, part_file in enumerate(part_files):
            log_file = part_file + '_log'
            output_file = part_file + '_output'
            shitty_file = part_file + '_shitty'

            self.run.info('Log file for thread %s' % thread_num, log_file)

            cmd_line = ['nhmmscan' if is_nucleotide else self.program_to_use,
                        '-o', output_file]
            if noise_cutoff_terms:
                # noise cutoff terms only make it into the command line when they were passed
                cmd_line += noise_cutoff_terms.split()
            cmd_line += ['--cpu', cores_per_process, '--tblout', shitty_file, hmm, part_file]

            worker = Thread(target=self.hmmscan_worker,
                            args=(part_file, cmd_line, shitty_file, log_file,
                                  merged_file_buffer, buffer_write_lock))
            worker.start()
            workers.append(worker)

        self.progress.new('Processing')
        self.progress.update('Running HMM scan in %d threads...' % (self.num_threads_to_use))

        # Wait for all workers to finish.
        for worker in workers:
            worker.join()

        output_file_path = os.path.join(tmp_dir, 'hmm.hits')

        # dump everything the workers collected into the final hits file
        with open(output_file_path, 'w') as out:
            out.write(merged_file_buffer.getvalue())

        self.progress.end()

        num_raw_hits = filesnpaths.get_num_lines_in_file(output_file_path)
        self.run.info('Number of raw hits', num_raw_hits)

        if not num_raw_hits:
            return None

        return output_file_path
Beispiel #7
0
    def run_hmmscan(self,
                    source,
                    genes_in_model,
                    hmm,
                    ref,
                    cut_off_flag="--cut_ga"):
        """Run hmmpress and hmmscan for a single HMM source and return the path to the hits.

        The gzipped model at `hmm` is unpacked into a fresh temporary directory, compressed
        with `hmmpress`, and searched against `self.protein_sequences_fasta` with `hmmscan`.
        The `--tblout` output is then rewritten as a TAB-delimited file that contains the
        first 18 whitespace-separated columns of every non-comment line.

        Parameters
        ==========
        source : str
            Name of the HMM source; only used in the terminal header.
        genes_in_model : sized collection
            Only its length is reported to the terminal.
        hmm : str
            Path to the gzipped HMM model.
        ref : str
            Only reported to the terminal ('unknown' is shown when falsy).
        cut_off_flag : str, "--cut_ga"
            Flag inserted verbatim into the hmmscan command line.

        Returns
        =======
        hits_file_path : str or None
            Path to the TAB-delimited hits file, or None if it has no lines.

        Raises
        ======
        ConfigError
            If `hmmpress` returns a non-zero value, or if `hmmscan` does not produce its
            table output.
        """
        self.run.warning('',
                         header='HMM Profiling for %s' % source,
                         lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))
        self.run.info('Number of CPUs will be used for search',
                      self.num_threads_to_use)

        tmp_dir = filesnpaths.get_temp_directory_path()
        # remember the directory on the instance (presumably for later clean-up)
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        # gzip yields bytes, so the destination is opened in binary mode; the explicit
        # try/finally closes the gzip handle on every supported Python version
        packed_model = gzip.open(hmm, 'rb')
        try:
            with open(hmm_file_path, 'wb') as hmm_file:
                hmm_file.write(packed_model.read())
        finally:
            packed_model.close()
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')
        cmd_line = ('hmmpress "%s" >> "%s" 2>&1' %
                    (hmm_file_path, log_file_path))
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        ret_val = utils.run_command(cmd_line)
        if ret_val:
            # close the progress display before bailing out
            self.progress.end()
            # call-style raise is valid on both Python 2 and Python 3
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER\
                                you have installed is not up-to-date enough. Just to make sure what went\
                                wrong please take a look at the log file ('%s'). Please visit %s to see what\
                                is the latest version availalbe. You can learn which version of HMMER you have\
                                on your system by typing 'hmmpress -h'"\
                                        % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        cmd_line = ('hmmscan -o "%s" %s --cpu %d --tblout "%s" "%s" "%s" >> "%s" 2>&1' \
                                        % (self.hmm_scan_output,
                                           cut_off_flag,
                                           self.num_threads_to_use,
                                           self.hmm_scan_hits_shitty,
                                           hmm_file_path,
                                           self.protein_sequences_fasta,
                                           log_file_path))

        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            self.progress.end()
            raise ConfigError("Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        with open(self.hmm_scan_hits, 'w') as parseable_output:
            with open(self.hmm_scan_hits_shitty) as raw_hits:
                for line in raw_hits:
                    if line.startswith('#'):
                        continue
                    parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Beispiel #8
0
    def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag = "--cut_ga"):
        """Press the HMM model, scan the protein sequences with it, and tabulate the hits.

        The gzipped model is unpacked into a fresh temporary directory, indexed with
        `hmmpress`, and searched against `self.protein_sequences_fasta` with `hmmscan`.
        The whitespace-aligned `--tblout` report is then rewritten as a TAB-delimited file.

        Parameters
        ==========
        source : str
            Name of the HMM collection; only used in the header shown to the user.

        genes_in_model : sized collection
            Genes described by the model; only its length is reported.

        hmm : str
            Path to the gzip-compressed HMM model.

        ref : str
            Reference for the model, shown to the user ('unknown' when empty).

        cut_off_flag : str, "--cut_ga"
            Cutoff option inserted verbatim into the `hmmscan` command line.

        Returns
        =======
        hits_file_path : str or None
            Path to the TAB-delimited hits file, or None when that file has no lines.

        Raises
        ======
        ConfigError
            If `hmmpress` reports a failure, or `hmmscan` does not produce its table output.
        """

        self.run.warning('', header = 'HMM Profiling for %s' % source, lc = 'green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))
        self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

        # remember the directory on the instance (self.tmp_dirs) so it can be found again later
        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        # gzip hands back raw bytes, hence the binary write mode; both handles are
        # closed as soon as the copy is done
        with gzip.open(hmm, 'rb') as compressed_model:
            with open(hmm_file_path, 'wb') as hmm_file:
                hmm_file.write(compressed_model.read())
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')
        cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
        # keep a trace of every command in the log file the user is pointed to on failure
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        ret_val = utils.run_command(cmd_line)
        if ret_val:
            # close the progress display so the error is not printed on top of it
            self.progress.end()
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER\
                                you have installed is not up-to-date enough. Just to make sure what went\
                                wrong please take a look at the log file ('%s'). Please visit %s to see what\
                                is the latest version availalbe. You can learn which version of HMMER you have\
                                on your system by typing 'hmmpress -h'"\
                                        % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        cmd_line = ('hmmscan -o "%s" %s --cpu %d --tblout "%s" "%s" "%s" >> "%s" 2>&1' \
                                        % (self.hmm_scan_output,
                                           cut_off_flag,
                                           self.num_threads_to_use,
                                           self.hmm_scan_hits_shitty,
                                           hmm_file_path,
                                           self.protein_sequences_fasta,
                                           log_file_path))

        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        # the return value of hmmscan is not inspected; a missing table is the failure signal
        if not os.path.exists(self.hmm_scan_hits_shitty):
            self.progress.end()
            raise ConfigError("Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        with open(self.hmm_scan_hits_shitty) as raw_hits:
            with open(self.hmm_scan_hits, 'w') as parseable_output:
                for line in raw_hits:
                    if line.startswith('#'):
                        continue
                    # only the first 18 whitespace-separated columns are kept
                    parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Beispiel #9
0
    def run_hmmscan(self,
                    source,
                    target,
                    kind,
                    domain,
                    genes_in_model,
                    hmm,
                    ref,
                    cut_off_flag="--cut_ga"):
        """Press the HMM model, scan the sequences of `target` with it, and tabulate the hits.

        The gzipped model is unpacked into a fresh temporary directory, indexed with
        `hmmpress`, and searched with `hmmscan` against the sequences file registered for
        `target` in `self.target_files_dict`. The whitespace-aligned `--tblout` report is
        then rewritten as a TAB-delimited file.

        Parameters
        ==========
        source : str
            Name of the HMM collection; only used in the header shown to the user.

        target : str
            Key into `self.target_files_dict` selecting the sequences file to scan.

        kind : str
            Shown to the user ('unknown' when empty).

        domain : str
            Shown to the user ('N\\A' when empty).

        genes_in_model : sized collection
            Genes described by the model; only its length is reported.

        hmm : str
            Path to the gzip-compressed HMM model.

        ref : str
            Reference for the model, shown to the user ('unknown' when empty).

        cut_off_flag : str, "--cut_ga"
            Cutoff option passed as a single argument to `hmmscan`.

        Returns
        =======
        hits_file_path : str or None
            Path to the TAB-delimited hits file, or None when that file has no lines.

        Raises
        ======
        ConfigError
            If `target` is unknown or has no sequences file, if `hmmpress` reports a
            failure, or if `hmmscan` does not produce its table output.
        """

        if target not in self.target_files_dict:
            raise ConfigError(
                "You have an unknown target :/ Target, which defines an alphabet and context\
                                to clarify whether the HMM search is supposed to be done using alphabets DNA,\
                                RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it\
                                doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError(
                "HMMer class does not know about Sequences file for the target %s :/"
                % target)

        self.run.warning('',
                         header='HMM Profiling for %s' % source,
                         lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Target', target)
        self.run.info('Domain', domain if domain else 'N\\A')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))
        self.run.info('Number of CPUs will be used for search',
                      self.num_threads_to_use)

        # remember the directory on the instance (self.tmp_dirs) so it can be found again later
        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        # both handles are released as soon as the model has been copied
        with gzip.open(hmm, 'rb') as compressed_model, \
             open(hmm_file_path, 'wb') as hmm_file:
            hmm_file.write(compressed_model.read())
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')

        cmd_line = ['hmmpress', hmm_file_path]
        ret_val = utils.run_command(cmd_line, log_file_path)

        if ret_val:
            # close the progress display so the error is not printed on top of it
            self.progress.end()
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER\
                                you have installed is not up-to-date enough. Just to make sure what went\
                                wrong please take a look at the log file ('%s'). Please visit %s to see what\
                                is the latest version availalbe. You can learn which version of HMMER you have\
                                on your system by typing 'hmmpress -h'"\
                                        % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        cmd_line = [
            'hmmscan', '-o', self.hmm_scan_output, cut_off_flag, '--cpu',
            self.num_threads_to_use, '--tblout', self.hmm_scan_hits_shitty,
            hmm_file_path, self.target_files_dict[target]
        ]

        utils.run_command(cmd_line, log_file_path)

        # the return value of hmmscan is not inspected; a missing table is the failure signal
        if not os.path.exists(self.hmm_scan_hits_shitty):
            self.progress.end()
            raise ConfigError(
                "Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." %
                log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        with open(self.hmm_scan_hits_shitty) as raw_hits, \
             open(self.hmm_scan_hits, 'w') as parseable_output:
            for line in raw_hits:
                if line.startswith('#'):
                    continue
                # only the first 18 whitespace-separated columns are kept
                parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Beispiel #10
0
    def run_hmmer(self,
                  source,
                  alphabet,
                  context,
                  kind,
                  domain,
                  num_genes_in_model,
                  hmm,
                  ref,
                  noise_cutoff_terms,
                  desired_output='table',
                  out_fmt='--tblout'):
        """Run the program

        One worker thread is started per partial input file registered for the target
        (`alphabet:context`) in `self.target_files_dict`; the workers' results are merged
        into one file per requested output.

        Parameters
        ==========
        source : str
            A name for your HMM effort.

        alphabet : str
            Which alphabet are you using? Choose from {'AA', 'DNA', 'RNA'}

        context : str
            This will determine how your output is processed. FIXME Documentation is lacking. Choose
            from {'GENE', 'CONTIG', 'DOMAIN'}.

        kind : str
            Used for user stdout info. Don't be afraid to pass None

        domain : str
            Used for user stdout info. Don't be afraid to pass None

        num_genes_in_model : int
            Used for user stdout info. Don't be afraid to pass None

        hmm : str
            Path to the input .hmm file

        ref : int
            Used for user stdout info. Don't be afraid to pass None

        noise_cutoff_terms : str
            Filter out hits with built-in flags. e.g. '--cut_ga'. When empty or None, no cutoff
            terms are added to the command line.

        desired_output : str OR list, 'table'
            HMMER programs have a couple of outputs. For the standard output (specified by the hmmer
            program flag `-o`), pass 'standard'. For the tabular output (specified by the hmmer
            program flag `--tblout` or `--domtblout`), pass 'table'. If you want to use both, pass
            ('standard', 'table')

        out_fmt : str, '--tblout'
            HMMer programs have different table output formats. For example, choose from --tblout or
            --domtblout.

        Returns
        =======
        output : str, None, OR tuple
            The merged output file path when a single output was requested, otherwise a tuple of
            paths in the order of `desired_output`. The entry for 'table' is None when the merged
            table has no lines.

        Raises
        ======
        ConfigError
            If the target is unknown or has no sequence files, or if `desired_output` or `out_fmt`
            holds an unsupported value.
        """

        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError(
                "You have an unknown target :/ Target, which defines an alphabet and context "
                "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                "doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError(
                "HMMer class does not know about Sequences file for the target %s :/"
                % target)

        if isinstance(desired_output, str):
            desired_output = (desired_output, )

        for output in desired_output:
            if output not in ['standard', 'table']:
                raise ConfigError(
                    "HMMer.run_hmmer :: Unknown desired_output, '%s'" % output)

        if out_fmt not in ['--tblout', '--domtblout']:
            raise ConfigError("HMMer.run_hmmer :: Unknown out_fmt, '%s'" %
                              out_fmt)

        # nucleotide alphabets are always searched with nhmmscan, whatever program was
        # requested; resolve the executable once so messages and commands cannot disagree
        program = 'nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use

        self.run.warning('',
                         header='HMM Profiling for %s' % source,
                         lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N/A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes in HMM model', num_genes_in_model
                      or 'unknown')
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search',
                      self.num_threads_to_use)
        self.run.info('HMMer program used for search', program)

        # merged outputs are written next to the partial input files
        tmp_dir = os.path.dirname(self.target_files_dict[target][0])
        self.run.info('Temporary work dir', tmp_dir)

        # check if all hmmpress files are in the HMM directory
        self.verify_hmmpress_output(hmm)

        # Holds buffer and write lock for each output
        merged_files_dict = {
            output: {'buffer': io.StringIO(), 'lock': Lock()}
            for output in desired_output
        }

        num_parts = len(self.target_files_dict[target])
        cores_per_process = 1
        if num_parts < self.num_threads_to_use:
            # fewer input parts than cores: give each process an equal share of the cores
            cores_per_process = self.num_threads_to_use // num_parts

            self.run.warning(
                "You requested %s cores but there were only %s entries in the fasta for the target '%s'. "
                "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. "
                % (str(self.num_threads_to_use), str(num_parts), target,
                   str(num_parts), cores_per_process))

        if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch':
            self.run.warning(
                "You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. "
                "We hope that is alright." % (self.program_to_use, alphabet))

        # if we didn't pass any noise cutoff terms, we don't include them in the command line
        cutoff_args = noise_cutoff_terms.split() if noise_cutoff_terms else []

        workers = []
        for thread_num, partial_input_file in enumerate(self.target_files_dict[target]):
            log_file = partial_input_file + '_log'
            output_file = partial_input_file + '_output'
            table_file = partial_input_file + '_table'

            self.run.info('Log file for thread %s' % thread_num, log_file)

            cmd_line = [
                program, '-o', output_file, *cutoff_args, '--cpu',
                cores_per_process, out_fmt, table_file, hmm, partial_input_file
            ]

            t = Thread(target=self.hmmer_worker,
                       args=(partial_input_file, cmd_line, table_file,
                             output_file, desired_output, log_file,
                             merged_files_dict))
            t.start()
            workers.append(t)

        self.progress.new('Processing')
        self.progress.update('Running %s in %d threads...' %
                             (program, self.num_threads_to_use))

        # Wait for all workers to finish.
        for worker in workers:
            worker.join()

        output_file_paths = []
        for output in desired_output:
            output_file_path = os.path.join(tmp_dir, f"hmm.{output}")

            with open(output_file_path, 'w') as out:
                out.write(merged_files_dict[output]['buffer'].getvalue())

            if output == 'table':
                num_raw_hits = filesnpaths.get_num_lines_in_file(
                    output_file_path)
                self.run.info('Number of raw hits', num_raw_hits)
                # an empty table is reported to the caller as "no file"
                output_file_path = output_file_path if num_raw_hits else None

            output_file_paths.append(output_file_path)

        self.progress.end()

        # Return output path as string if desired_output is len 1. Else return tuple of output paths
        if len(output_file_paths) == 1:
            return output_file_paths[0]

        return tuple(output_file_paths)
Beispiel #11
0
    def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
        """Press the HMM model, scan the target sequences with it, and tabulate the hits.

        The gzipped model is unpacked into a fresh temporary directory, indexed with
        `hmmpress`, and searched against the sequences registered for the target
        (`alphabet:context`) in `self.target_files_dict`. `nhmmscan` is used for DNA and RNA
        alphabets, `hmmscan` otherwise. The whitespace-aligned `--tblout` report is then
        rewritten as a TAB-delimited file, dropping any non-ascii bytes on the way.

        Parameters
        ==========
        source : str
            Name of the HMM collection; only used in the header shown to the user.

        alphabet : str
            First half of the target key; 'DNA' and 'RNA' select `nhmmscan`.

        context : str
            Second half of the target key.

        kind : str
            Shown to the user ('unknown' when empty).

        domain : str
            Shown to the user ('N\\A' when empty).

        num_genes_in_model : int
            Shown to the user.

        hmm : str
            Path to the gzip-compressed HMM model.

        ref : str
            Reference for the model, shown to the user ('unknown' when empty).

        noise_cutoff_terms : str
            Whitespace-separated cutoff options added to the command line. When empty or
            None, no cutoff options are added.

        Returns
        =======
        hits_file_path : str or None
            Path to the TAB-delimited hits file, or None when that file has no lines.

        Raises
        ======
        ConfigError
            If the target is unknown or has no sequences file, if `hmmpress` reports a
            failure, or if the scan does not produce its table output.
        """

        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context\
                                to clarify whether the HMM search is supposed to be done using alphabets DNA,\
                                RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it\
                                doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

        self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N\\A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes', num_genes_in_model)
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

        # remember the directory on the instance (self.tmp_dirs) so it can be found again later
        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        # both handles are released as soon as the model has been copied
        with gzip.open(hmm, 'rb') as compressed_model, \
             open(hmm_file_path, 'wb') as hmm_file:
            hmm_file.write(compressed_model.read())
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')

        cmd_line = ['hmmpress', hmm_file_path]
        ret_val = utils.run_command(cmd_line, log_file_path)

        if ret_val:
            # close the progress display first, as the hmmscan failure path below does
            self.progress.end()
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have\
                               installed is either not up-to-date enough, or too new :/ Just to make sure what went\
                               wrong please take a look at the log file ('%s'). Please visit %s to see what\
                               is the latest version availalbe if you think updating HMMER can resolve it. You can\
                               learn which version of HMMER you have on your system by typing 'hmmpress -h'."\
                                       % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')

        # no cutoff terms means no extra command line arguments rather than a crash on None
        cutoff_args = noise_cutoff_terms.split() if noise_cutoff_terms else []

        cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan',
                    '-o', self.hmm_scan_output, *cutoff_args,
                    '--cpu', self.num_threads_to_use,
                    '--tblout', self.hmm_scan_hits_shitty,
                    hmm_file_path, self.target_files_dict[target]]

        utils.run_command(cmd_line, log_file_path)

        # the return value of the scan is not inspected; a missing table is the failure signal
        if not os.path.exists(self.hmm_scan_hits_shitty):
            self.progress.end()
            raise ConfigError("Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        lines_with_non_ascii = []

        with open(self.hmm_scan_hits_shitty, 'rb') as hmm_hits_file, \
             open(self.hmm_scan_hits, 'w') as parseable_output:
            for line_number, line_bytes in enumerate(hmm_hits_file, start=1):
                line = line_bytes.decode('ascii', 'ignore')

                # ignored bytes make the decoded text shorter than the raw line, which is
                # how non-ascii content is noticed
                if len(line) != len(line_bytes):
                    lines_with_non_ascii.append(line_number)

                if line.startswith('#'):
                    continue

                # only the first 18 whitespace-separated columns are kept
                parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        if lines_with_non_ascii:
            self.run.warning("Just a heads-up, Anvi'o HMMer parser detected non-ascii charachters while processing \
                the file '%s' and cleared them. Here are the line numbers with non-ascii charachters: %s.\
                You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"." %
                (self.hmm_scan_hits_shitty, ", ".join(map(str, lines_with_non_ascii))))

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None
Beispiel #12
0
    def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
        """Press the HMM model and scan every partial input file of the target in its own thread.

        The gzipped model is unpacked next to the partial input files registered for the
        target (`alphabet:context`) in `self.target_files_dict` and indexed with `hmmpress`.
        One `self.hmmscan_worker` thread is then started per partial file (`nhmmscan` for
        DNA and RNA alphabets, `hmmscan` otherwise), and the shared buffer handed to the
        workers is finally written to a single 'hmm.hits' file.

        Parameters
        ==========
        source : str
            Name of the HMM collection; used in the header shown to the user and as the
            prefix of the unpacked model file name.

        alphabet : str
            First half of the target key; 'DNA' and 'RNA' select `nhmmscan`.

        context : str
            Second half of the target key.

        kind : str
            Shown to the user ('unknown' when empty).

        domain : str
            Shown to the user ('N\\A' when empty).

        num_genes_in_model : int
            Shown to the user.

        hmm : str
            Path to the gzip-compressed HMM model.

        ref : str
            Reference for the model, shown to the user ('unknown' when empty).

        noise_cutoff_terms : str
            Whitespace-separated cutoff options added to each command line. When empty or
            None, no cutoff options are added.

        Returns
        =======
        output_file_path : str or None
            Path to the merged hits file, or None when that file has no lines.

        Raises
        ======
        ConfigError
            If the target is unknown or has no sequence files, or if `hmmpress` reports a
            failure.
        """

        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                               "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                               "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                               "doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

        self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N\\A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes', num_genes_in_model)
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

        # we want to create hmm files in the same directory as the partial input files
        tmp_dir = os.path.dirname(self.target_files_dict[target][0])
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, source + '_hmm.txt')
        # both handles are released as soon as the model has been copied
        with gzip.open(hmm, 'rb') as compressed_model, \
             open(hmm_file_path, 'wb') as hmm_file:
            hmm_file.write(compressed_model.read())
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')

        cmd_line = ['hmmpress', hmm_file_path]
        ret_val = utils.run_command(cmd_line, log_file_path)

        if ret_val:
            # close the progress display so the error is not printed on top of it
            self.progress.end()
            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have "
                              "installed is either not up-to-date enough, or too new :/ Just to make sure what went "
                              "wrong please take a look at the log file ('%s'). Please visit %s to see what "
                              "is the latest version availalbe if you think updating HMMER can resolve it. You can "
                              "learn which version of HMMER you have on your system by typing 'hmmpress -h'."\
                                       % (log_file_path, 'http://hmmer.janelia.org/download.html'))
        self.progress.end()

        # every worker receives the same buffer and the lock that goes with it
        workers = []
        merged_file_buffer = io.StringIO()
        buffer_write_lock = Lock()

        num_parts = len(self.target_files_dict[target])
        cores_per_process = 1
        if num_parts < self.num_threads_to_use:
            # fewer input parts than cores: give each process an equal share of the cores
            cores_per_process = self.num_threads_to_use // num_parts

            self.run.warning("You requested %s cores but there were only %s entries in the fasta for the target '%s'. "
                            "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. " %
                             (str(self.num_threads_to_use), str(num_parts), target, str(num_parts), cores_per_process))

        # no cutoff terms means no extra command line arguments rather than a crash on None
        cutoff_args = noise_cutoff_terms.split() if noise_cutoff_terms else []
        program = 'nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan'

        for part_file in self.target_files_dict[target]:
            log_file = part_file + '_log'
            output_file = part_file + '_output'
            shitty_file = part_file + '_shitty'

            cmd_line = [program,
                        '-o', output_file, *cutoff_args,
                        '--cpu', cores_per_process,
                        '--tblout', shitty_file,
                        hmm_file_path, part_file]

            t = Thread(target=self.hmmscan_worker, args=(part_file,
                                                         cmd_line,
                                                         shitty_file,
                                                         log_file,
                                                         merged_file_buffer,
                                                         buffer_write_lock))
            t.start()
            workers.append(t)

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan in %d threads...' % (self.num_threads_to_use))

        # Wait for all workers to finish.
        for worker in workers:
            worker.join()

        output_file_path = os.path.join(tmp_dir, 'hmm.hits')

        with open(output_file_path, 'w') as out:
            out.write(merged_file_buffer.getvalue())

        self.progress.end()

        num_raw_hits = filesnpaths.get_num_lines_in_file(output_file_path)
        self.run.info('Number of raw hits', num_raw_hits)

        return output_file_path if num_raw_hits else None
Beispiel #13
0
    def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag = "--cut_ga"):
        """Press the HMM model, scan the proteins in contigs with it, and tabulate the hits.

        The gzipped model is unpacked into a fresh temporary directory, indexed with
        `hmmpress`, and searched against `self.proteins_in_contigs` with `hmmscan`.
        The whitespace-aligned `--tblout` report is then rewritten as a TAB-delimited file.

        Parameters
        ==========
        source : str
            Name of the HMM collection; only used in the header shown to the user.

        genes_in_model : sized collection
            Genes described by the model; only its length is reported.

        hmm : str
            Path to the gzip-compressed HMM model.

        ref : str
            Reference for the model, shown to the user ('unknown' when empty).

        cut_off_flag : str, "--cut_ga"
            Cutoff option inserted verbatim into the `hmmscan` command line.

        Returns
        =======
        hits_file_path : str or None
            Path to the TAB-delimited hits file, or None when that file has no lines.

        Raises
        ======
        ConfigError
            If `hmmscan` does not produce its table output.
        """

        self.run.warning('', header = 'HMM Profiling for %s' % source, lc = 'green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Pfam model', hmm)
        self.run.info('Number of genes', len(genes_in_model))

        # remember the directory on the instance (self.tmp_dirs) so it can be found again later
        tmp_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(tmp_dir)

        self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
        self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
        self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
        log_file_path = os.path.join(tmp_dir, '00_log.txt')

        self.run.info('Temporary work dir', tmp_dir)
        self.run.info('HMM scan output', self.hmm_scan_output)
        self.run.info('HMM scan hits', self.hmm_scan_hits)
        self.run.info('Log file', log_file_path)

        self.progress.new('Unpacking the model into temporary work directory')
        self.progress.update('...')
        hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
        # gzip hands back raw bytes, hence the binary write mode; both handles are
        # closed as soon as the copy is done
        with gzip.open(hmm, 'rb') as compressed_model:
            with open(hmm_file_path, 'wb') as hmm_file:
                hmm_file.write(compressed_model.read())
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Compressing the pfam model')
        cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
        # keep a trace of every command in the log file the user is pointed to on failure
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        # the outcome of hmmpress is not checked here; a failure surfaces as a missing
        # hmmscan table below
        utils.run_command(cmd_line)
        self.progress.end()

        self.progress.new('Processing')
        self.progress.update('Performing HMM scan ...')
        cmd_line = ('hmmscan -o "%s" %s --tblout "%s" "%s" "%s" >> "%s" 2>&1' % (self.hmm_scan_output,
                                                                              cut_off_flag,
                                                                              self.hmm_scan_hits_shitty,
                                                                              hmm_file_path,
                                                                              self.proteins_in_contigs,
                                                                              log_file_path))
        with open(log_file_path, "a") as myfile:
            myfile.write('CMD: ' + cmd_line + '\n')
        utils.run_command(cmd_line)

        if not os.path.exists(self.hmm_scan_hits_shitty):
            # close the progress display so the error is not printed on top of it
            self.progress.end()
            raise ConfigError("Something went wrong with hmmscan, and it failed to generate the\
                                expected output :/ Fortunately, this log file should tell you what\
                                might be the problem: '%s'. Please do not forget to include this\
                                file if you were to ask for help." % log_file_path)

        self.progress.end()

        # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
        # love to write little hacks like this into our code:
        with open(self.hmm_scan_hits_shitty) as raw_hits:
            with open(self.hmm_scan_hits, 'w') as parseable_output:
                for line in raw_hits:
                    if line.startswith('#'):
                        continue
                    # only the first 18 whitespace-separated columns are kept
                    parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

        num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
        self.run.info('Number of raw hits', num_raw_hits)

        return self.hmm_scan_hits if num_raw_hits else None