def parse_output(self):
    """Parses the tRNAScan-SE output file

    The first three lines of the file are skipped as a header. Every following
    line must hold exactly 10 TAB-separated columns. Reading stops at the first
    empty line or at the end of the file, whichever comes first.

    Parameters
    ==========
    N/A

    Returns
    =======
    results: `dict`
        A dictionary of hits keyed by the 1-based entry number. Each value is a
        dictionary with the keys 'contig', 'trna_no', 'start', 'stop',
        'amino_acid', 'codon', and 'score'. An empty dictionary is returned when
        the output file has no lines.

    Raises
    ======
    ConfigError
        If a data line does not contain exactly 10 columns.
    """

    num_lines = filesnpaths.get_num_lines_in_file(self.output_file_path)

    if not num_lines:
        self.run.warning("No tRNA genes found in tRNAScan-SE output.")
        return {}

    d = {}

    self.progress.new("Parsing the output ...")

    with open(self.output_file_path) as output:
        # first three lines are garbage
        for _ in range(3):
            output.readline()

        entry_no = 0
        while True:
            self.progress.update(entry_no)

            line = output.readline().strip('\n')
            if not line:
                # either the end of the file or an empty line: nothing more to parse
                break

            entry_no += 1

            fields = [f.strip() for f in line.split('\t')]

            if len(fields) != 10:
                # close the progress bar first, so the error is not printed
                # on top of a dangling progress line
                self.progress.end()
                raise ConfigError("The expected output of tRNAScan-SE includes exactly 10 columns. However, the output "
                                  "anvi'o is working contains at least one line with %d columns :/ This doesn't look "
                                  "good. Here is the list of columns data of that line for your reference: '%s'."
                                  % (len(fields), fields))

            d[entry_no] = {'contig': fields[0],
                           'trna_no': fields[1],
                           'start': int(fields[2]),
                           'stop': int(fields[3]),
                           'amino_acid': fields[4],
                           'codon': fields[5],
                           'score': float(fields[8])}

    self.progress.end()

    self.run.info("Num tRNA genes parsed", entry_no)

    return d
def format_p_id_to_cog_id_cPickle(self, input_file_path, output_file_path):
    """Build a protein id -> COG ids dictionary from a COG CSV file and serialize it.

    Every line of `input_file_path` is split on commas. The column that holds the
    protein id depends on `self.COG_version` ('COG14': column 0; 'COG20': column 2,
    with dots replaced by underscores). The COG id is read from column 6 in both
    versions. Each COG id seen is also added to `self.cogs_found_in_proteins_fasta`.

    Parameters
    ==========
    input_file_path : str
        Path to the comma-separated COG input file.
    output_file_path : str
        Path the resulting dictionary is written to via
        `dictio.write_serialized_object`.

    Raises
    ======
    ConfigError
        If a line has too few columns (which suggests a truncated download), or
        if `self.COG_version` is neither 'COG14' nor 'COG20'.
    """

    num_lines_in_file = filesnpaths.get_num_lines_in_file(input_file_path)

    def raise_error(line_num, fields, e):
        raise ConfigError(f"Bad news :( While parsing a COG input file, anvi'o encountered an error (which said: [{e}]) "
                          f"while processing the line {line_num} in your file. Where the fields in that file "
                          f"looked like this: {fields}. Sadly, this has been a long-standing and very annoying issue that "
                          f"anvi'o developers were unable to reproduce. But we recently learned that the issue is likely due "
                          f"to your internet speed (https://github.com/merenlab/anvio/issues/1738). Slower connections lead "
                          f"to broken connections with the NCBI servers, and leave you with an unfinished file :/ The only "
                          f"working solution so far is to try again with a faster internet connection.") from e

    progress.new('Formatting protein ids to COG ids file', progress_total_items=num_lines_in_file)

    p_id_to_cog_id = {}

    # Default text mode already applies universal newline handling; the old 'rU'
    # mode is rejected by Python 3.11 and later.
    with open(input_file_path) as input_file:
        for line_counter, line in enumerate(input_file, start=1):
            if line_counter % 500 == 0:
                self.progress.increment(line_counter)
                progress.update(f"{line_counter * 100 / num_lines_in_file:.2f}%")

            fields = line.strip('\n').split(',')

            # `p_id` should look just like the FASTA ids, and its location has changed between
            # 2014 release and 2020 release.
            try:
                if self.COG_version == 'COG14':
                    p_id = fields[0]
                    COG = fields[6]
                elif self.COG_version == 'COG20':
                    p_id = fields[2].replace('.', '_')
                    COG = fields[6]
                else:
                    raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                      "parsing of a new generation of COG files.")
            except IndexError as e:
                # a line with too few columns: most likely a truncated file
                raise_error(line_counter, fields, e)

            self.cogs_found_in_proteins_fasta.add(COG)

            # keep the COG ids of each protein unique, in order of first appearance
            if p_id in p_id_to_cog_id:
                if COG not in p_id_to_cog_id[p_id]:
                    p_id_to_cog_id[p_id].append(COG)
            else:
                p_id_to_cog_id[p_id] = [COG]

    progress.update("Serializing the data dictionary for future use (a.k.a, very pro stuff).")
    dictio.write_serialized_object(p_id_to_cog_id, output_file_path)

    progress.end()
def format_p_id_to_cog_id_cPickle(self, input_file_path, output_file_path):
    """Build a protein id -> COG ids dictionary from a COG CSV file and serialize it.

    Every line of `input_file_path` is split on commas. The column that holds the
    protein id depends on `self.COG_version` ('COG14': column 0; 'COG20': column 2,
    with dots replaced by underscores). The COG id is read from column 6 in both
    versions. Each COG id seen is also added to `self.cogs_found_in_proteins_fasta`.

    Parameters
    ==========
    input_file_path : str
        Path to the comma-separated COG input file.
    output_file_path : str
        Path the resulting dictionary is written to via
        `dictio.write_serialized_object`.

    Raises
    ======
    ConfigError
        If `self.COG_version` is neither 'COG14' nor 'COG20'.
    IndexError
        If a line has fewer columns than the selected COG version requires.
    """

    num_lines_in_file = filesnpaths.get_num_lines_in_file(input_file_path)

    progress.new('Formatting protein ids to COG ids file', progress_total_items=num_lines_in_file)

    p_id_to_cog_id = {}

    # Default text mode already applies universal newline handling; the old 'rU'
    # mode is rejected by Python 3.11 and later.
    with open(input_file_path) as input_file:
        for line_counter, line in enumerate(input_file, start=1):
            if line_counter % 500 == 0:
                self.progress.increment(line_counter)
                progress.update(f"{line_counter * 100 / num_lines_in_file:.2f}%")

            fields = line.strip('\n').split(',')

            # `p_id` should look just like the FASTA ids, and its location has changed between
            # 2014 release and 2020 release.
            if self.COG_version == 'COG14':
                p_id = fields[0]
                COG = fields[6]
            elif self.COG_version == 'COG20':
                p_id = fields[2].replace('.', '_')
                COG = fields[6]
            else:
                raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                  "parsing of a new generation of COG files.")

            self.cogs_found_in_proteins_fasta.add(COG)

            # keep the COG ids of each protein unique, in order of first appearance
            if p_id in p_id_to_cog_id:
                if COG not in p_id_to_cog_id[p_id]:
                    p_id_to_cog_id[p_id].append(COG)
            else:
                p_id_to_cog_id[p_id] = [COG]

    progress.update("Serializing the data dictionary for future use (a.k.a, very pro stuff).")
    dictio.write_serialized_object(p_id_to_cog_id, output_file_path)

    progress.end()
def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
    """Run hmmscan (or nhmmscan) for a single gzipped HMM model and tabulate the hits.

    The model at `hmm` is decompressed into a fresh temporary directory, compressed
    with `hmmpress`, and searched against the sequences file registered for the
    target `<alphabet>:<context>` in `self.target_files_dict`. The `--tblout` output
    is then rewritten as a TAB-delimited file holding the first 18 whitespace-separated
    columns of every non-comment line.

    Parameters
    ==========
    source : str
        Name of the HMM source, used in the header printed to the user.
    alphabet : str
        Joined with `context` to form the target. 'DNA' and 'RNA' select `nhmmscan`,
        anything else selects `hmmscan`.
    context : str
        Joined with `alphabet` to form the target.
    kind, domain, ref
        Only reported to the user; falsy values are shown as 'unknown' / 'N\\A'.
    num_genes_in_model : int
        Only reported to the user.
    hmm : str
        Path to the gzip-compressed HMM model.
    noise_cutoff_terms : str
        Whitespace-separated flags added to the search command. A falsy value adds
        no flags.

    Returns
    =======
    str or None
        Path to the TAB-delimited hits file, or None if it contains no lines.

    Raises
    ======
    ConfigError
        If the target is unknown or has no sequences file, if `hmmpress` returns a
        non-zero value, or if the search does not produce its table output.
    """

    target = ':'.join([alphabet, context])

    if target not in self.target_files_dict:
        raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                          "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                          "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                          "doesn't work for anvi'o." % target)

    if not self.target_files_dict[target]:
        raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Kind', kind if kind else 'unknown')
    self.run.info('Alphabet', alphabet)
    self.run.info('Context', context)
    self.run.info('Domain', domain if domain else 'N\\A')
    self.run.info('HMM model path', hmm)
    self.run.info('Number of genes', num_genes_in_model)
    self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
    self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
    self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('HMM scan output', self.hmm_scan_output)
    self.run.info('HMM scan hits', self.hmm_scan_hits)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
    # context managers make sure both handles are closed even if decompression fails
    with gzip.open(hmm, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
        hmm_file.write(compressed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')
    cmd_line = ['hmmpress', hmm_file_path]
    ret_val = utils.run_command(cmd_line, log_file_path)

    if ret_val:
        # end the progress before raising, as is done for the hmmscan failure below
        self.progress.end()
        raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have "
                          "installed is either not up-to-date enough, or too new :/ Just to make sure what went "
                          "wrong please take a look at the log file ('%s'). Please visit %s to see what "
                          "is the latest version availalbe if you think updating HMMER can resolve it. You can "
                          "learn which version of HMMER you have on your system by typing 'hmmpress -h'."
                          % (log_file_path, 'http://hmmer.janelia.org/download.html'))
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan ...')

    # a falsy `noise_cutoff_terms` simply contributes no flags to the command
    cutoff_flags = noise_cutoff_terms.split() if noise_cutoff_terms else []

    cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan',
                '-o', self.hmm_scan_output,
                *cutoff_flags,
                '--cpu', self.num_threads_to_use,
                '--tblout', self.hmm_scan_hits_shitty,
                hmm_file_path, self.target_files_dict[target]]

    utils.run_command(cmd_line, log_file_path)

    if not os.path.exists(self.hmm_scan_hits_shitty):
        self.progress.end()
        raise ConfigError("Something went wrong with hmmscan, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
    # love to write little hacks like this into our code:
    lines_with_non_ascii = []

    with open(self.hmm_scan_hits, 'w') as parseable_output, open(self.hmm_scan_hits_shitty, 'rb') as hmm_hits_file:
        for line_counter, line_bytes in enumerate(hmm_hits_file, start=1):
            line = line_bytes.decode('ascii', 'ignore')

            # dropped bytes make the decoded line shorter than the raw one
            if len(line) != len(line_bytes):
                lines_with_non_ascii.append(line_counter)

            if line.startswith('#'):
                continue

            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

    if lines_with_non_ascii:
        self.run.warning("Just a heads-up, Anvi'o HMMer parser detected non-ascii charachters while processing "
                         "the file '%s' and cleared them. Here are the line numbers with non-ascii charachters: %s. "
                         "You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"."
                         % (self.hmm_scan_hits_shitty, ", ".join(map(str, lines_with_non_ascii))))

    num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
    self.run.info('Number of raw hits', num_raw_hits)

    return self.hmm_scan_hits if num_raw_hits else None
def run_hmmer(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms,
              desired_output='table', out_fmt='--tblout'):
    """Run the program

    One worker process is started per partial input file registered for the target
    `<alphabet>:<context>` in `self.target_files_dict`. As workers report back, their
    individual outputs are appended to one in-memory buffer per desired output, and
    each buffer is finally written to `hmm.<output>` next to the partial input files.

    Parameters
    ==========
    source : str
        A name for your HMM effort.

    alphabet : str
        Which alphabet are you using? Choose from {'AA', 'DNA', 'RNA'}

    context : str
        This will determine how your output is processed. FIXME Documentation is lacking. Choose
        from {'GENE', 'CONTIG', 'DOMAIN'}.

    kind : str
        Used for user stdout info. Don't be afraid to pass None

    domain : str
        Used for user stdout info. Don't be afraid to pass None

    num_genes_in_model : int
        Used for user stdout info. Don't be afraid to pass None

    hmm : str
        Path to the input .hmm file

    ref : int
        Used for user stdout info. Don't be afraid to pass None

    noise_cutoff_terms : str
        Filter out hits with built-in flags. e.g. '--cut_ga'

    desired_output : str OR list, 'table'
        HMMER programs have a couple of outputs. For the standard output (specified by the hmmer
        program flag `-o`), pass 'standard'. For the tabular output (specified by the hmmer
        program flag `--tblout` or `--domtblout`), pass 'table'. If you want to use both, pass
        ('standard', 'table')

    out_fmt : str, '--tblout'
        HMMer programs have different table output formats. For example, choose from --tblout or
        --domtblout.

    Returns
    =======
    output : str, None, OR tuple
        The output file path if a single output was requested, otherwise a tuple of paths in
        the order of `desired_output`. The entry for 'table' is None when the merged table
        file contains no lines.

    Raises
    ======
    ConfigError
        If the target is unknown or has no sequence files, if `desired_output` or `out_fmt`
        holds an unknown value, or if a worker reports a non-zero return value. Exceptions
        sent back by workers are re-raised as they are.
    """

    target = ':'.join([alphabet, context])

    if target not in self.target_files_dict:
        raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                          "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                          "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                          "doesn't work for anvi'o." % target)

    if not self.target_files_dict[target]:
        raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

    if isinstance(desired_output, str):
        desired_output = (desired_output, )

    for output in desired_output:
        if output not in ['standard', 'table']:
            raise ConfigError("HMMer.run_hmmer :: Unknown desired_output, '%s'" % output)

    if out_fmt not in ['--tblout', '--domtblout']:
        raise ConfigError("HMMer.run_hmmer :: Unknown out_fmt, '%s'" % out_fmt)

    # nucleotide alphabets are always searched with nhmmscan, regardless of the requested program
    program = 'nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Kind', kind if kind else 'unknown')
    self.run.info('Alphabet', alphabet)
    self.run.info('Context', context)
    self.run.info('Domain', domain if domain else 'N/A')
    self.run.info('HMM model path', hmm)
    self.run.info('Number of genes in HMM model', num_genes_in_model or 'unknown')
    self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)
    self.run.info('HMMer program used for search', program)

    tmp_dir = os.path.dirname(self.target_files_dict[target][0])
    self.run.info('Temporary work dir', tmp_dir)

    # check if all hmmpress files are in the HMM directory
    self.verify_hmmpress_output(hmm)

    workers = []

    # this dude holds the shared objects that will be modified by workers
    manager = multiprocessing.Manager()
    ret_value_queue = manager.Queue(maxsize=self.num_threads_to_use)
    output_queue = manager.Queue()

    # Holds buffer and write lock for each output
    merged_files_dict = {}
    for output in desired_output:
        merged_files_dict[output] = {'buffer': io.StringIO(), 'lock': manager.Lock()}

    num_parts = len(self.target_files_dict[target])
    cores_per_process = 1
    if num_parts < self.num_threads_to_use:
        # fewer input parts than requested cores: give every process more than one core
        cores_per_process = self.num_threads_to_use // num_parts

        self.run.warning(f"You requested {P('core', self.num_threads_to_use)} but there were only {P('sequence', num_parts)} "
                         f"in the FASTA file for the target '{target}'. Anvi'o will use {P('process', num_parts, sfp='es')} "
                         f"with {P('core', cores_per_process)} instead. And that's that.")
        self.num_threads_to_use = num_parts

    if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch':
        self.run.warning("You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. "
                         "We hope that is alright." % (self.program_to_use, alphabet))

    # if we didn't pass any noise cutoff terms, we don't include them in the command line
    cutoff_flags = noise_cutoff_terms.split() if noise_cutoff_terms else []

    for thread_num, partial_input_file in enumerate(self.target_files_dict[target]):
        log_file = partial_input_file + '_log'
        output_file = partial_input_file + '_output'
        table_file = partial_input_file + '_table'

        self.run.info('Log file for thread %s' % thread_num, log_file)

        cmd_line = [program,
                    '-o', output_file,
                    *cutoff_flags,
                    '--cpu', cores_per_process,
                    out_fmt, table_file,
                    hmm, partial_input_file]

        t = multiprocessing.Process(target=self.hmmer_worker, args=(partial_input_file,
                                                                   cmd_line,
                                                                   table_file,
                                                                   output_file,
                                                                   desired_output,
                                                                   log_file,
                                                                   output_queue,
                                                                   ret_value_queue))
        t.start()
        workers.append(t)

    self.progress.new('Processing')
    self.progress.update(f'Running {self.program_to_use} in {P("thread", self.num_threads_to_use)}...')

    # Wait for every worker that was actually started, so no worker is terminated
    # (and its output lost) while it is still running.
    num_workers = len(workers)
    finished_workers = 0
    while finished_workers < num_workers:
        try:
            ret_value = ret_value_queue.get()

            if isinstance(ret_value, Exception):
                # If thread returns an exception, we raise it and kill the main thread.
                raise ret_value

            finished_workers += 1
            if ret_value == 0:
                if anvio.DEBUG:
                    self.run.info_single(f"{finished_workers} out of {self.num_threads_to_use} have finished")
            else:
                raise ConfigError(f"An HMMER worker thread came back with an unexpected return value of {ret_value}. "
                                  "Something is probably wrong, so you should contact a developer for help.")

            # if worker finished successfully we can take its individual output file(s) and append them to the main file(s)
            output_dict = output_queue.get()
            for file_type, worker_file in output_dict.items():
                main_file_buffer = merged_files_dict[file_type]['buffer']
                main_file_lock = merged_files_dict[file_type]['lock']

                if file_type == 'table':
                    append_function = self.append_to_main_table_file
                elif file_type == 'standard':
                    append_function = self.append_to_main_standard_file

                append_function(main_file_buffer, worker_file, main_file_lock)

        except KeyboardInterrupt:
            self.run.info_single("HMMER driver received SIGINT, terminating all threads...", nl_before=2)
            break

        except Exception:
            # An exception was thrown in one of the threads so we kill all of them
            self.progress.end()
            self.run.warning("An exception was thrown in one of the worker threads (see output below for details).")
            for worker in workers:
                worker.terminate()
            raise

    for worker in workers:
        worker.terminate()

    output_file_paths = []
    for output in desired_output:
        output_file_path = os.path.join(tmp_dir, f"hmm.{output}")

        with open(output_file_path, 'w') as out:
            merged_files_dict[output]['buffer'].seek(0)
            out.write(merged_files_dict[output]['buffer'].read())

        if output == 'table':
            num_raw_hits = filesnpaths.get_num_lines_in_file(output_file_path)
            self.run.info('Number of raw hits', num_raw_hits, progress=self.progress)
            # an empty table is reported to the caller as None
            output_file_path = output_file_path if num_raw_hits else None

        output_file_paths.append(output_file_path)

    self.progress.end()

    # Return output path as string if desired_output is len 1. Else return tuple of output paths
    output = output_file_paths[0] if len(output_file_paths) == 1 else tuple(output_file_paths)

    return output
def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
    """Search an HMM model against every partial sequence file of a target, one thread per file.

    The target is `<alphabet>:<context>` and must be a key of `self.target_files_dict`
    with at least one file. Each thread runs `self.hmmscan_worker` with its own command
    line and shares one in-memory buffer, which is written to `hmm.hits` in the directory
    of the partial files once all threads have finished.

    Returns the path to `hmm.hits`, or None if that file contains no lines. Raises
    ConfigError if the target is unknown or has no sequence files.
    """

    target = ':'.join([alphabet, context])

    if target not in self.target_files_dict:
        raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                          "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                          "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                          "doesn't work for anvi'o." % target)

    part_files = self.target_files_dict[target]
    if not part_files:
        raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

    is_nucleotide = alphabet in ['DNA', 'RNA']

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Kind', kind if kind else 'unknown')
    self.run.info('Alphabet', alphabet)
    self.run.info('Context', context)
    self.run.info('Domain', domain if domain else 'N\\A')
    self.run.info('HMM model path', hmm)
    self.run.info('Number of genes in HMM model', num_genes_in_model)
    self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)
    self.run.info('HMMer program used for search', 'nhmmscan' if is_nucleotide else self.program_to_use)

    tmp_dir = os.path.dirname(part_files[0])
    self.run.info('Temporary work dir', tmp_dir)

    # check if all hmmpress files are in the HMM directory
    self.verify_hmmpress_output(hmm)

    threads = []
    merged_file_buffer = io.StringIO()
    buffer_write_lock = Lock()

    num_parts = len(part_files)
    if num_parts < self.num_threads_to_use:
        # fewer parts than requested cores, so each process gets several cores
        cores_per_process = self.num_threads_to_use // num_parts

        self.run.warning("You requested %s cores but there were only %s entries in the fasta for the target '%s'. "
                         "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. "
                         % (str(self.num_threads_to_use), str(num_parts), target, str(num_parts), cores_per_process))
    else:
        cores_per_process = 1

    if is_nucleotide and self.program_to_use == 'hmmsearch':
        self.run.warning("You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. "
                         "We hope that is alright." % (self.program_to_use, alphabet))

    for thread_num, part_file in enumerate(part_files):
        log_file = part_file + '_log'
        output_file = part_file + '_output'
        shitty_file = part_file + '_shitty'

        self.run.info('Log file for thread %s' % thread_num, log_file)

        cmd_line = ['nhmmscan' if is_nucleotide else self.program_to_use, '-o', output_file]
        if noise_cutoff_terms:
            # noise cutoff terms only make it into the command line when some were passed
            cmd_line.extend(noise_cutoff_terms.split())
        cmd_line.extend(['--cpu', cores_per_process, '--tblout', shitty_file, hmm, part_file])

        thread = Thread(target=self.hmmscan_worker,
                        args=(part_file, cmd_line, shitty_file, log_file, merged_file_buffer, buffer_write_lock))
        thread.start()
        threads.append(thread)

    self.progress.new('Processing')
    self.progress.update('Running HMM scan in %d threads...' % (self.num_threads_to_use))

    # Wait for all workers to finish.
    for thread in threads:
        thread.join()

    output_file_path = os.path.join(tmp_dir, 'hmm.hits')

    merged_file_buffer.seek(0)
    with open(output_file_path, 'w') as out:
        out.write(merged_file_buffer.read())

    self.progress.end()

    num_raw_hits = filesnpaths.get_num_lines_in_file(output_file_path)
    self.run.info('Number of raw hits', num_raw_hits)

    if not num_raw_hits:
        return None

    return output_file_path
def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag="--cut_ga"):
    """Run hmmscan for a gzipped Pfam model against `self.protein_sequences_fasta`.

    The model is decompressed into a fresh temporary directory, compressed with
    `hmmpress`, and scanned with `hmmscan`. Both commands append their output to a
    log file in that directory. The `--tblout` output is then rewritten as a
    TAB-delimited file holding the first 18 whitespace-separated columns of every
    non-comment line.

    Parameters
    ==========
    source : str
        Name of the HMM source, used in the header printed to the user.
    genes_in_model : sized collection
        Only its length is reported to the user.
    hmm : str
        Path to the gzip-compressed HMM model.
    ref
        Only reported to the user; a falsy value is shown as 'unknown'.
    cut_off_flag : str, "--cut_ga"
        Flag inserted into the hmmscan command line.

    Returns
    =======
    str or None
        Path to the TAB-delimited hits file, or None if it contains no lines.

    Raises
    ======
    ConfigError
        If `hmmpress` returns a non-zero value, or if hmmscan does not produce its
        table output.
    """

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Pfam model', hmm)
    self.run.info('Number of genes', len(genes_in_model))
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
    self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
    self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('HMM scan output', self.hmm_scan_output)
    self.run.info('HMM scan hits', self.hmm_scan_hits)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
    # gzip hands back bytes, so the destination has to be opened in binary mode
    with gzip.open(hmm, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
        hmm_file.write(compressed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')
    cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
    with open(log_file_path, "a") as myfile:
        myfile.write('CMD: ' + cmd_line + '\n')
    ret_val = utils.run_command(cmd_line)
    if ret_val:
        # close the progress bar so the error is not printed on top of it
        self.progress.end()
        raise ConfigError("The last call did not work quite well. Most probably the version of HMMER "
                          "you have installed is not up-to-date enough. Just to make sure what went "
                          "wrong please take a look at the log file ('%s'). Please visit %s to see what "
                          "is the latest version availalbe. You can learn which version of HMMER you have "
                          "on your system by typing 'hmmpress -h'"
                          % (log_file_path, 'http://hmmer.janelia.org/download.html'))
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan ...')
    cmd_line = ('hmmscan -o "%s" %s --cpu %d --tblout "%s" "%s" "%s" >> "%s" 2>&1'
                % (self.hmm_scan_output,
                   cut_off_flag,
                   self.num_threads_to_use,
                   self.hmm_scan_hits_shitty,
                   hmm_file_path,
                   self.protein_sequences_fasta,
                   log_file_path))
    with open(log_file_path, "a") as myfile:
        myfile.write('CMD: ' + cmd_line + '\n')
    utils.run_command(cmd_line)

    if not os.path.exists(self.hmm_scan_hits_shitty):
        self.progress.end()
        raise ConfigError("Something went wrong with hmmscan, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
    # love to write little hacks like this into our code:
    with open(self.hmm_scan_hits, 'w') as parseable_output, open(self.hmm_scan_hits_shitty) as raw_hits:
        for line in raw_hits:
            if line.startswith('#'):
                continue

            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

    num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
    self.run.info('Number of raw hits', num_raw_hits)

    return self.hmm_scan_hits if num_raw_hits else None
def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag="--cut_ga"):
    """Run hmmscan for a gzipped Pfam model against `self.protein_sequences_fasta`.

    The model is decompressed into a fresh temporary directory, compressed with
    `hmmpress`, and scanned with `hmmscan`. Both commands append their output to a
    log file in that directory. The `--tblout` output is then rewritten as a
    TAB-delimited file holding the first 18 whitespace-separated columns of every
    non-comment line.

    Parameters
    ==========
    source : str
        Name of the HMM source, used in the header printed to the user.
    genes_in_model : sized collection
        Only its length is reported to the user.
    hmm : str
        Path to the gzip-compressed HMM model.
    ref
        Only reported to the user; a falsy value is shown as 'unknown'.
    cut_off_flag : str, "--cut_ga"
        Flag inserted into the hmmscan command line.

    Returns
    =======
    str or None
        Path to the TAB-delimited hits file, or None if it contains no lines.

    Raises
    ======
    ConfigError
        If `hmmpress` returns a non-zero value, or if hmmscan does not produce its
        table output.
    """

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Pfam model', hmm)
    self.run.info('Number of genes', len(genes_in_model))
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
    self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
    self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('HMM scan output', self.hmm_scan_output)
    self.run.info('HMM scan hits', self.hmm_scan_hits)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
    # gzip hands back bytes, so the destination has to be opened in binary mode
    with gzip.open(hmm, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
        hmm_file.write(compressed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')
    cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
    with open(log_file_path, "a") as myfile:
        myfile.write('CMD: ' + cmd_line + '\n')
    ret_val = utils.run_command(cmd_line)
    if ret_val:
        # close the progress bar so the error is not printed on top of it
        self.progress.end()
        raise ConfigError("The last call did not work quite well. Most probably the version of HMMER "
                          "you have installed is not up-to-date enough. Just to make sure what went "
                          "wrong please take a look at the log file ('%s'). Please visit %s to see what "
                          "is the latest version availalbe. You can learn which version of HMMER you have "
                          "on your system by typing 'hmmpress -h'"
                          % (log_file_path, 'http://hmmer.janelia.org/download.html'))
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan ...')
    cmd_line = ('hmmscan -o "%s" %s --cpu %d --tblout "%s" "%s" "%s" >> "%s" 2>&1'
                % (self.hmm_scan_output,
                   cut_off_flag,
                   self.num_threads_to_use,
                   self.hmm_scan_hits_shitty,
                   hmm_file_path,
                   self.protein_sequences_fasta,
                   log_file_path))
    with open(log_file_path, "a") as myfile:
        myfile.write('CMD: ' + cmd_line + '\n')
    utils.run_command(cmd_line)

    if not os.path.exists(self.hmm_scan_hits_shitty):
        self.progress.end()
        raise ConfigError("Something went wrong with hmmscan, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
    # love to write little hacks like this into our code:
    with open(self.hmm_scan_hits, 'w') as parseable_output, open(self.hmm_scan_hits_shitty) as raw_hits:
        for line in raw_hits:
            if line.startswith('#'):
                continue

            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

    num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
    self.run.info('Number of raw hits', num_raw_hits)

    return self.hmm_scan_hits if num_raw_hits else None
def run_hmmscan(self, source, target, kind, domain, genes_in_model, hmm, ref, cut_off_flag="--cut_ga"):
    """Run hmmscan for a gzipped Pfam model against the sequences file of a target.

    The model is decompressed into a fresh temporary directory, compressed with
    `hmmpress`, and scanned against `self.target_files_dict[target]`. The `--tblout`
    output is then rewritten as a TAB-delimited file holding the first 18
    whitespace-separated columns of every non-comment line.

    Parameters
    ==========
    source : str
        Name of the HMM source, used in the header printed to the user.
    target : str
        Key into `self.target_files_dict` that selects the sequences file to search.
    kind, domain, ref
        Only reported to the user; falsy values are shown as 'unknown' / 'N\\A'.
    genes_in_model : sized collection
        Only its length is reported to the user.
    hmm : str
        Path to the gzip-compressed HMM model.
    cut_off_flag : str, "--cut_ga"
        Flag passed to hmmscan.

    Returns
    =======
    str or None
        Path to the TAB-delimited hits file, or None if it contains no lines.

    Raises
    ======
    ConfigError
        If the target is unknown or has no sequences file, if `hmmpress` returns a
        non-zero value, or if hmmscan does not produce its table output.
    """

    if target not in self.target_files_dict:
        raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                          "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                          "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                          "doesn't work for anvi'o." % target)

    if not self.target_files_dict[target]:
        raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Kind', kind if kind else 'unknown')
    self.run.info('Target', target)
    self.run.info('Domain', domain if domain else 'N\\A')
    self.run.info('Pfam model', hmm)
    self.run.info('Number of genes', len(genes_in_model))
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
    self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
    self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('HMM scan output', self.hmm_scan_output)
    self.run.info('HMM scan hits', self.hmm_scan_hits)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
    # context managers make sure both handles are closed even if decompression fails
    with gzip.open(hmm, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
        hmm_file.write(compressed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')
    cmd_line = ['hmmpress', hmm_file_path]
    ret_val = utils.run_command(cmd_line, log_file_path)

    if ret_val:
        # close the progress bar so the error is not printed on top of it
        self.progress.end()
        raise ConfigError("The last call did not work quite well. Most probably the version of HMMER "
                          "you have installed is not up-to-date enough. Just to make sure what went "
                          "wrong please take a look at the log file ('%s'). Please visit %s to see what "
                          "is the latest version availalbe. You can learn which version of HMMER you have "
                          "on your system by typing 'hmmpress -h'"
                          % (log_file_path, 'http://hmmer.janelia.org/download.html'))
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan ...')

    cmd_line = ['hmmscan',
                '-o', self.hmm_scan_output,
                cut_off_flag,
                '--cpu', self.num_threads_to_use,
                '--tblout', self.hmm_scan_hits_shitty,
                hmm_file_path, self.target_files_dict[target]]

    utils.run_command(cmd_line, log_file_path)

    if not os.path.exists(self.hmm_scan_hits_shitty):
        self.progress.end()
        raise ConfigError("Something went wrong with hmmscan, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
    # love to write little hacks like this into our code:
    with open(self.hmm_scan_hits, 'w') as parseable_output, open(self.hmm_scan_hits_shitty) as raw_hits:
        for line in raw_hits:
            if line.startswith('#'):
                continue

            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

    num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
    self.run.info('Number of raw hits', num_raw_hits)

    return self.hmm_scan_hits if num_raw_hits else None
def run_hmmer(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms,
              desired_output='table', out_fmt='--tblout'):
    """Run the program

    One HMMER process is started per partial input file of the target (each in its own
    thread), and the per-part outputs are merged into one file per desired output.

    Parameters
    ==========
    source : str
        A name for your HMM effort.

    alphabet : str
        Which alphabet are you using? Choose from {'AA', 'DNA', 'RNA'}

    context : str
        This will determine how your output is processed. FIXME Documentation is lacking. Choose from
        {'GENE', 'CONTIG', 'DOMAIN'}.

    kind : str
        Used for user stdout info. Don't be afraid to pass None

    domain : str
        Used for user stdout info. Don't be afraid to pass None

    num_genes_in_model : int
        Used for user stdout info. Don't be afraid to pass None

    hmm : str
        Path to the input .hmm file

    ref : int
        Used for user stdout info. Don't be afraid to pass None

    noise_cutoff_terms : str
        Filter out hits with built-in flags. e.g. '--cut_ga'. A falsy value means no such
        flag is added to the command line.

    desired_output : str OR list, 'table'
        HMMER programs have a couple of outputs. For the standard output (specified by the hmmer
        program flag `-o`), pass 'standard'. For the tabular output (specified by the hmmer
        program flag `--tblout` or `--domtblout`), pass 'table'. If you want to use both, pass
        ('standard', 'table')

    out_fmt : str, '--tblout'
        HMMer programs have different table output formats. For example, choose from --tblout or
        --domtblout.

    Returns
    =======
    output : str, None, OR tuple
        If a single output was requested, the path of the merged file for it. Otherwise a tuple
        of paths in the order of `desired_output`. The entry for 'table' is None when
        `filesnpaths.get_num_lines_in_file` reports nothing for the merged table.

    Raises
    ======
    ConfigError
        If the target built from `alphabet` and `context` is unknown or has no sequence files,
        or if `desired_output` or `out_fmt` contains an unsupported value.
    """

    target = ':'.join([alphabet, context])

    if target not in self.target_files_dict:
        raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                          "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                          "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                          "doesn't work for anvi'o." % target)

    if not self.target_files_dict[target]:
        raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

    if isinstance(desired_output, str):
        desired_output = (desired_output, )

    for output in desired_output:
        if output not in ['standard', 'table']:
            raise ConfigError("HMMer.run_hmmer :: Unknown desired_output, '%s'" % output)

    if out_fmt not in ['--tblout', '--domtblout']:
        raise ConfigError("HMMer.run_hmmer :: Unknown out_fmt, '%s'" % out_fmt)

    # nucleotide alphabets are always searched with nhmmscan, whatever program was requested.
    # resolving it once keeps the command line and every message in agreement.
    program = 'nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Kind', kind if kind else 'unknown')
    self.run.info('Alphabet', alphabet)
    self.run.info('Context', context)
    self.run.info('Domain', domain if domain else 'N/A')
    self.run.info('HMM model path', hmm)
    self.run.info('Number of genes in HMM model', num_genes_in_model or 'unknown')
    self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)
    self.run.info('HMMer program used for search', program)

    # outputs are written next to the partial input files
    tmp_dir = os.path.dirname(self.target_files_dict[target][0])
    self.run.info('Temporary work dir', tmp_dir)

    # check if all hmmpress files are in the HMM directory
    self.verify_hmmpress_output(hmm)

    workers = []

    # Holds buffer and write lock for each output
    merged_files_dict = {output: {'buffer': io.StringIO(), 'lock': Lock()} for output in desired_output}

    num_parts = len(self.target_files_dict[target])
    cores_per_process = 1
    if num_parts < self.num_threads_to_use:
        # fewer parts than requested threads: give each process a share of the cores
        cores_per_process = self.num_threads_to_use // num_parts

        self.run.warning("You requested %s cores but there were only %s entries in the fasta for the target '%s'. "
                         "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. "
                         % (str(self.num_threads_to_use), str(num_parts), target, str(num_parts), cores_per_process))

    if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch':
        self.run.warning("You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. "
                         "We hope that is alright." % (self.program_to_use, alphabet))

    # if we didn't pass any noise cutoff terms, we don't include them in the command line
    noise_cutoff_flags = noise_cutoff_terms.split() if noise_cutoff_terms else []

    for thread_num, partial_input_file in enumerate(self.target_files_dict[target]):
        log_file = partial_input_file + '_log'
        output_file = partial_input_file + '_output'
        table_file = partial_input_file + '_table'

        self.run.info('Log file for thread %s' % thread_num, log_file)

        cmd_line = [program,
                    '-o', output_file,
                    *noise_cutoff_flags,
                    '--cpu', cores_per_process,
                    out_fmt, table_file,
                    hmm,
                    partial_input_file]

        t = Thread(target=self.hmmer_worker,
                   args=(partial_input_file, cmd_line, table_file, output_file, desired_output, log_file, merged_files_dict))
        t.start()
        workers.append(t)

    self.progress.new('Processing')
    self.progress.update('Running %s in %d threads...' % (program, self.num_threads_to_use))

    # Wait for all workers to finish.
    for worker in workers:
        worker.join()

    output_file_paths = []
    for output in desired_output:
        output_file_path = os.path.join(tmp_dir, f"hmm.{output}")

        # dump the merged in-memory buffer of this output to disk
        with open(output_file_path, 'w') as out:
            merged_files_dict[output]['buffer'].seek(0)
            out.write(merged_files_dict[output]['buffer'].read())

        if output == 'table':
            num_raw_hits = filesnpaths.get_num_lines_in_file(output_file_path)
            self.run.info('Number of raw hits', num_raw_hits)
            # callers get None instead of a path when the table has no hits
            output_file_path = output_file_path if num_raw_hits else None

        output_file_paths.append(output_file_path)

    self.progress.end()

    # Return output path as string if desired_output is len 1. Else return tuple of output paths
    if len(output_file_paths) == 1:
        return output_file_paths[0]

    return tuple(output_file_paths)
def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
    """Press the HMM model, run `hmmscan`/`nhmmscan` on the target, and tabularize the hits.

    Parameters
    ==========
    source : str
        Name of the HMM source. Only used in the header shown to the user.
    alphabet : str
        Joined with `context` as 'alphabet:context' to look up the sequences file in
        `self.target_files_dict`. 'DNA' and 'RNA' select `nhmmscan`, anything else `hmmscan`.
    context : str
        Second half of the target key (see `alphabet`).
    kind : str
        Shown to the user. Falsy values are reported as 'unknown'.
    domain : str
        Shown to the user. Falsy values are reported with a placeholder.
    num_genes_in_model : int
        Shown to the user.
    hmm : str
        Path to the gzip-compressed HMM model.
    ref : str
        Shown to the user. Falsy values are reported as 'unknown'.
    noise_cutoff_terms : str
        Whitespace-separated flags added to the search command line (a falsy value adds none).

    Returns
    =======
    hits_file_path : str or None
        Path to the TAB-delimited hits file, or None if
        `filesnpaths.get_num_lines_in_file` reports nothing for it.

    Raises
    ======
    ConfigError
        If the target is unknown or has no sequences file, if the `hmmpress` call returns a
        truthy value, or if the search does not leave a table output behind.
    """

    target = ':'.join([alphabet, context])

    if target not in self.target_files_dict:
        raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                          "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                          "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                          "doesn't work for anvi'o." % target)

    if not self.target_files_dict[target]:
        raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Kind', kind if kind else 'unknown')
    self.run.info('Alphabet', alphabet)
    self.run.info('Context', context)
    self.run.info('Domain', domain if domain else 'N\\A')
    self.run.info('HMM model path', hmm)
    self.run.info('Number of genes', num_genes_in_model)
    self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

    # every file this call produces lives in a fresh temporary directory, which is
    # also recorded in `self.tmp_dirs`
    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
    self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
    self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('HMM scan output', self.hmm_scan_output)
    self.run.info('HMM scan hits', self.hmm_scan_hits)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
    # context managers make sure both handles are closed even if the copy fails
    with gzip.open(hmm, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
        hmm_file.write(compressed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')
    cmd_line = ['hmmpress', hmm_file_path]
    ret_val = utils.run_command(cmd_line, log_file_path)

    if ret_val:
        # close the progress line before reporting the error
        self.progress.end()
        raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have "
                          "installed is either not up-to-date enough, or too new :/ Just to make sure what went "
                          "wrong please take a look at the log file ('%s'). Please visit %s to see what "
                          "is the latest version availalbe if you think updating HMMER can resolve it. You can "
                          "learn which version of HMMER you have on your system by typing 'hmmpress -h'."
                          % (log_file_path, 'http://hmmer.janelia.org/download.html'))
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan ...')

    # a missing noise cutoff simply contributes no flags instead of crashing on `.split()`
    noise_cutoff_flags = noise_cutoff_terms.split() if noise_cutoff_terms else []

    cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan',
                '-o', self.hmm_scan_output,
                *noise_cutoff_flags,
                '--cpu', self.num_threads_to_use,
                '--tblout', self.hmm_scan_hits_shitty,
                hmm_file_path,
                self.target_files_dict[target]]

    utils.run_command(cmd_line, log_file_path)

    # the return value of the search is not inspected; a missing table is the failure signal
    if not os.path.exists(self.hmm_scan_hits_shitty):
        self.progress.end()
        raise ConfigError("Something went wrong with hmmscan, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
    # love to write little hacks like this into our code:
    lines_with_non_ascii = []
    with open(self.hmm_scan_hits_shitty, 'rb') as hmm_hits_file, open(self.hmm_scan_hits, 'w') as parseable_output:
        for line_counter, line_bytes in enumerate(hmm_hits_file, start=1):
            # non-ascii bytes are dropped; a length mismatch tells us some were there
            line = line_bytes.decode('ascii', 'ignore')

            if len(line) != len(line_bytes):
                lines_with_non_ascii.append(line_counter)

            if line.startswith('#'):
                continue

            # keep the first 18 whitespace-separated columns, joined with TABs
            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

    if lines_with_non_ascii:
        self.run.warning("Just a heads-up, Anvi'o HMMer parser detected non-ascii charachters while processing "
                         "the file '%s' and cleared them. Here are the line numbers with non-ascii charachters: %s. "
                         "You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"."
                         % (self.hmm_scan_hits_shitty, ", ".join(map(str, lines_with_non_ascii))))

    num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
    self.run.info('Number of raw hits', num_raw_hits)

    return self.hmm_scan_hits if num_raw_hits else None
def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
    """Press the HMM model and scan every partial target file in its own thread.

    The model is unpacked and pressed once. Then one `hmmscan`/`nhmmscan` command is built per
    partial input file and handed to `self.hmmscan_worker` in a separate thread, together with a
    shared in-memory buffer and its lock. Once all threads are joined, the buffer is written to
    'hmm.hits' next to the partial input files.

    Parameters
    ==========
    source : str
        Name of the HMM source. Used in the header shown to the user and as the prefix of the
        unpacked model file name.
    alphabet : str
        Joined with `context` as 'alphabet:context' to look up the partial input files in
        `self.target_files_dict`. 'DNA' and 'RNA' select `nhmmscan`, anything else `hmmscan`.
    context : str
        Second half of the target key (see `alphabet`).
    kind : str
        Shown to the user. Falsy values are reported as 'unknown'.
    domain : str
        Shown to the user. Falsy values are reported with a placeholder.
    num_genes_in_model : int
        Shown to the user.
    hmm : str
        Path to the gzip-compressed HMM model.
    ref : str
        Shown to the user. Falsy values are reported as 'unknown'.
    noise_cutoff_terms : str
        Whitespace-separated flags added to each search command line (a falsy value adds none).

    Returns
    =======
    output_file_path : str or None
        Path to the merged hits file, or None if `filesnpaths.get_num_lines_in_file` reports
        nothing for it.

    Raises
    ======
    ConfigError
        If the target is unknown or has no sequence files, or if the `hmmpress` call returns a
        truthy value.
    """

    target = ':'.join([alphabet, context])

    if target not in self.target_files_dict:
        raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                          "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                          "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                          "doesn't work for anvi'o." % target)

    if not self.target_files_dict[target]:
        raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Kind', kind if kind else 'unknown')
    self.run.info('Alphabet', alphabet)
    self.run.info('Context', context)
    self.run.info('Domain', domain if domain else 'N\\A')
    self.run.info('HMM model path', hmm)
    self.run.info('Number of genes', num_genes_in_model)
    self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

    # we want to create hmm files in the same directory as the partial input files
    tmp_dir = os.path.dirname(self.target_files_dict[target][0])
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, source + '_hmm.txt')
    # context managers make sure both handles are closed even if the copy fails
    with gzip.open(hmm, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
        hmm_file.write(compressed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')
    cmd_line = ['hmmpress', hmm_file_path]
    ret_val = utils.run_command(cmd_line, log_file_path)

    if ret_val:
        # close the progress line before reporting the error
        self.progress.end()
        raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have "
                          "installed is either not up-to-date enough, or too new :/ Just to make sure what went "
                          "wrong please take a look at the log file ('%s'). Please visit %s to see what "
                          "is the latest version availalbe if you think updating HMMER can resolve it. You can "
                          "learn which version of HMMER you have on your system by typing 'hmmpress -h'."
                          % (log_file_path, 'http://hmmer.janelia.org/download.html'))
    self.progress.end()

    workers = []

    # all workers receive the same buffer and the lock that goes with it
    merged_file_buffer = io.StringIO()
    buffer_write_lock = Lock()

    num_parts = len(self.target_files_dict[target])
    cores_per_process = 1
    if num_parts < self.num_threads_to_use:
        # fewer parts than requested threads: give each process a share of the cores
        cores_per_process = self.num_threads_to_use // num_parts

        self.run.warning("You requested %s cores but there were only %s entries in the fasta for the target '%s'. "
                         "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. "
                         % (str(self.num_threads_to_use), str(num_parts), target, str(num_parts), cores_per_process))

    # a missing noise cutoff simply contributes no flags instead of crashing on `.split()`
    noise_cutoff_flags = noise_cutoff_terms.split() if noise_cutoff_terms else []
    program = 'nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan'

    for part_file in self.target_files_dict[target]:
        log_file = part_file + '_log'
        output_file = part_file + '_output'
        shitty_file = part_file + '_shitty'

        cmd_line = [program,
                    '-o', output_file,
                    *noise_cutoff_flags,
                    '--cpu', cores_per_process,
                    '--tblout', shitty_file,
                    hmm_file_path,
                    part_file]

        t = Thread(target=self.hmmscan_worker,
                   args=(part_file, cmd_line, shitty_file, log_file, merged_file_buffer, buffer_write_lock))
        t.start()
        workers.append(t)

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan in %d threads...' % (self.num_threads_to_use))

    # Wait for all workers to finish.
    for worker in workers:
        worker.join()

    output_file_path = os.path.join(tmp_dir, 'hmm.hits')

    # dump the merged in-memory buffer to disk
    with open(output_file_path, 'w') as out:
        merged_file_buffer.seek(0)
        out.write(merged_file_buffer.read())

    self.progress.end()

    num_raw_hits = filesnpaths.get_num_lines_in_file(output_file_path)
    self.run.info('Number of raw hits', num_raw_hits)

    return output_file_path if num_raw_hits else None
def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag="--cut_ga"):
    """Press the HMM model, run `hmmscan` on `self.proteins_in_contigs`, and tabularize the hits.

    Parameters
    ==========
    source : str
        Name of the HMM source. Only used in the header shown to the user.
    genes_in_model : sized collection
        Only its length is reported to the user.
    hmm : str
        Path to the gzip-compressed HMM model.
    ref : str
        Shown to the user. Falsy values are reported as 'unknown'.
    cut_off_flag : str, "--cut_ga"
        Inserted into the `hmmscan` command line as is.

    Returns
    =======
    hits_file_path : str or None
        Path to the TAB-delimited hits file, or None if
        `filesnpaths.get_num_lines_in_file` reports nothing for it.

    Raises
    ======
    ConfigError
        If `hmmscan` does not leave a table output behind.
    """

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Pfam model', hmm)
    self.run.info('Number of genes', len(genes_in_model))

    # every file this call produces lives in a fresh temporary directory, which is
    # also recorded in `self.tmp_dirs`
    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
    self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
    self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('HMM scan output', self.hmm_scan_output)
    self.run.info('HMM scan hits', self.hmm_scan_hits)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
    # gzip yields bytes, so the destination must be opened in binary mode; context
    # managers make sure both handles are closed even if the copy fails
    with gzip.open(hmm, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
        hmm_file.write(compressed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')
    # NOTE(review): the command is a shell string with the paths interpolated in double
    # quotes; a path containing a double quote would break it.
    cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
    with open(log_file_path, "a") as myfile:
        myfile.write('CMD: ' + cmd_line + '\n')
    utils.run_command(cmd_line)
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan ...')
    cmd_line = ('hmmscan -o "%s" %s --tblout "%s" "%s" "%s" >> "%s" 2>&1' % (self.hmm_scan_output,
                                                                          cut_off_flag,
                                                                          self.hmm_scan_hits_shitty,
                                                                          hmm_file_path,
                                                                          self.proteins_in_contigs,
                                                                          log_file_path))
    with open(log_file_path, "a") as myfile:
        myfile.write('CMD: ' + cmd_line + '\n')
    utils.run_command(cmd_line)

    # the return value of hmmscan is not inspected; a missing table is the failure signal
    if not os.path.exists(self.hmm_scan_hits_shitty):
        self.progress.end()
        # call-style raise works on both Python 2 and Python 3
        raise ConfigError("Something went wrong with hmmscan, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
    # love to write little hacks like this into our code:
    with open(self.hmm_scan_hits_shitty) as raw_hits, open(self.hmm_scan_hits, 'w') as parseable_output:
        for line in raw_hits:
            if line.startswith('#'):
                continue

            # keep the first 18 whitespace-separated columns, joined with TABs
            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

    num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
    self.run.info('Number of raw hits', num_raw_hits)

    return self.hmm_scan_hits if num_raw_hits else None