def query(self, a3m: str) -> str:
    """Queries the database using HHsearch using a given a3m.

    Args:
        a3m: Text of the query alignment. It is written unchanged to a
            temporary `query.a3m` file that is handed to the binary via `-i`.

    Returns:
        The text of the `.hhr` file the binary wrote via `-o`.

    Raises:
        RuntimeError: If the subprocess exits with a non-zero return code.
    """
    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
        input_path = os.path.join(query_tmp_dir, 'query.a3m')
        hhr_path = os.path.join(query_tmp_dir, 'output.hhr')
        with open(input_path, 'w') as f:
            f.write(a3m)

        # Every configured database gets its own `-d <path>` pair.
        db_cmd = []
        for db_path in self.databases:
            db_cmd.extend(('-d', db_path))

        cmd = [
            self.binary_path,
            '-i', input_path,
            '-o', hhr_path,
            '-maxseq', str(self.maxseq),
        ] + db_cmd

        logger.info('Launching subprocess "%s"', ' '.join(cmd))
        process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        with utils.timing('HHsearch query'):
            stdout, stderr = process.communicate()
            retcode = process.wait()

        if retcode:
            # Stderr is truncated to prevent proto size errors in Beam.
            # The cut is made on raw bytes and may land inside a multi-byte
            # UTF-8 sequence, so decode leniently: a strict decode would raise
            # UnicodeDecodeError and hide the actual tool failure.
            raise RuntimeError(
                'HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
                    stdout.decode('utf-8', errors='replace'),
                    stderr[:100_000].decode('utf-8', errors='replace')))

        with open(hhr_path) as f:
            hhr = f.read()
    return hhr
def query(self, input_fasta_path: str) -> Mapping[str, Any]:
    """Queries the database using HHblits.

    Args:
        input_fasta_path: Path of the query file passed to the binary via `-i`.

    Returns:
        A mapping with the keys:
          * `a3m`: text of the A3M file written via `-oa3m`.
          * `output`: raw stdout bytes of the subprocess.
          * `stderr`: raw stderr bytes of the subprocess.
          * `n_iter`: the `self.n_iter` value used for the run.
          * `e_value`: the `self.e_value` value used for the run.

    Raises:
        RuntimeError: If the subprocess exits with a non-zero return code.
    """
    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
        a3m_path = os.path.join(query_tmp_dir, 'output.a3m')

        # Every configured database gets its own `-d <path>` pair.
        db_cmd = []
        for db_path in self.databases:
            db_cmd.extend(('-d', db_path))

        cmd = [
            self.binary_path,
            '-i', input_fasta_path,
            '-cpu', str(self.n_cpu),
            '-oa3m', a3m_path,
            '-o', '/dev/null',
            '-n', str(self.n_iter),
            '-e', str(self.e_value),
            '-maxseq', str(self.maxseq),
            '-realign_max', str(self.realign_max),
            '-maxfilt', str(self.maxfilt),
            '-min_prefilter_hits', str(self.min_prefilter_hits),
        ]
        if self.all_seqs:
            cmd += ['-all']
        if self.alt:
            cmd += ['-alt', str(self.alt)]
        # -p / -Z are only passed when they differ from the module defaults.
        if self.p != _HHBLITS_DEFAULT_P:
            cmd += ['-p', str(self.p)]
        if self.z != _HHBLITS_DEFAULT_Z:
            cmd += ['-Z', str(self.z)]
        cmd += db_cmd

        logger.info('Launching subprocess "%s"', ' '.join(cmd))
        process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        with utils.timing('HHblits query'):
            stdout, stderr = process.communicate()
            retcode = process.wait()

        if retcode:
            # Decode leniently throughout the failure path: a strict decode of
            # non-UTF-8 output, or of stderr cut in the middle of a multi-byte
            # sequence, would raise UnicodeDecodeError and mask the real error.
            # Logs have a 15k character limit, so log HHblits error line by
            # line.
            logger.error('HHblits failed. HHblits stderr begin:')
            for error_line in stderr.decode(
                    'utf-8', errors='replace').splitlines():
                if error_line.strip():
                    logger.error(error_line.strip())
            logger.error('HHblits stderr end')
            raise RuntimeError(
                'HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (
                    stdout.decode('utf-8', errors='replace'),
                    stderr[:500_000].decode('utf-8', errors='replace')))

        with open(a3m_path) as f:
            a3m = f.read()

    raw_output = dict(
        a3m=a3m,
        output=stdout,
        stderr=stderr,
        n_iter=self.n_iter,
        e_value=self.e_value)
    return raw_output
def _build_profile(self, msa: str, model_construction: str = 'fast') -> str:
    """Builds a HMM for the aligned sequences given as an MSA string.

    Args:
        msa: A string with the aligned sequences, in A3M or STO format.
        model_construction: Whether to use reference annotation in the msa to
            determine consensus columns ('hand') or default ('fast').

    Returns:
        A string with the profile in the HMM format.

    Raises:
        RuntimeError: If hmmbuild fails.
        ValueError: If `model_construction` is neither 'hand' nor 'fast'.
    """
    if model_construction not in {'hand', 'fast'}:
        raise ValueError(
            f'Invalid model_construction {model_construction} - only '
            'hand and fast supported.')

    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
        input_query = os.path.join(query_tmp_dir, 'query.msa')
        output_hmm_path = os.path.join(query_tmp_dir, 'output.hmm')

        with open(input_query, 'w') as f:
            f.write(msa)

        cmd = [self.binary_path]
        # If adding flags, we have to do so before the output and input:
        if model_construction == 'hand':
            cmd.append(f'--{model_construction}')
        if self.singlemx:
            cmd.append('--singlemx')
        cmd.extend([
            '--amino',
            output_hmm_path,
            input_query,
        ])

        logger.info('Launching subprocess %s', cmd)
        process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        with utils.timing('hmmbuild query'):
            stdout, stderr = process.communicate()
            retcode = process.wait()

        # Decode once; the same text is logged and, on failure, raised.
        stdout_text = stdout.decode('utf-8')
        stderr_text = stderr.decode('utf-8')
        logger.info('hmmbuild stdout:\n%s\n\nstderr:\n%s\n',
                    stdout_text, stderr_text)

        if retcode:
            raise RuntimeError(
                'hmmbuild failed\nstdout:\n%s\n\nstderr:\n%s\n'
                % (stdout_text, stderr_text))

        with open(output_hmm_path, encoding='utf-8') as f:
            hmm = f.read()

    return hmm
def align(self, sequences: Sequence[str]) -> str:
    """Runs Kalign on the given sequences and returns the A3M alignment.

    Args:
        sequences: Query sequence strings. Kalign needs every sequence to be
            at least 6 residues long. The order of the sequences may slightly
            change the output, since a different alignment tree might get
            constructed.

    Returns:
        The alignment, in a3m format, read back from Kalign's output file.

    Raises:
        RuntimeError: If Kalign exits with a non-zero return code.
        ValueError: If any sequence is shorter than 6 residues.
    """
    logger.info('Aligning %d sequences', len(sequences))

    # Reject the first too-short sequence before any files are created.
    for candidate in sequences:
        n_residues = len(candidate)
        if n_residues >= 6:
            continue
        raise ValueError('Kalign requires all sequences to be at least 6 '
                         'residues long. Got %s (%d residues).'
                         % (candidate, n_residues))

    with utils.tmpdir_manager(base_dir='/tmp') as work_dir:
        fasta_in = os.path.join(work_dir, 'input.fasta')
        a3m_out = os.path.join(work_dir, 'output.a3m')

        with open(fasta_in, 'w') as handle:
            handle.write(_to_a3m(sequences))

        kalign_cmd = [self.binary_path]
        kalign_cmd += ['-i', fasta_in]
        kalign_cmd += ['-o', a3m_out]
        kalign_cmd += ['-format', 'fasta']

        logger.info('Launching subprocess "%s"', ' '.join(kalign_cmd))
        proc = subprocess.Popen(
            kalign_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        with utils.timing('Kalign query'):
            raw_out, raw_err = proc.communicate()
            exit_code = proc.wait()

        # The decoded streams are logged on every run and reused on failure.
        out_text = raw_out.decode('utf-8')
        err_text = raw_err.decode('utf-8')
        logger.info('Kalign stdout:\n%s\n\nstderr:\n%s\n', out_text, err_text)

        if exit_code:
            raise RuntimeError(
                'Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n'
                % (out_text, err_text))

        with open(a3m_out) as handle:
            return handle.read()
def query(self, hmm: str) -> str:
    """Runs hmmsearch with the given profile against `self.database_path`.

    Args:
        hmm: Text of the profile; written unchanged to a temporary
            `query.hmm` file that is passed to the binary.

    Returns:
        The text of the file the binary wrote through its `-A` option.

    Raises:
        RuntimeError: If the subprocess exits with a non-zero return code.
    """
    with utils.tmpdir_manager(base_dir='/tmp') as work_dir:
        profile_path = os.path.join(work_dir, 'query.hmm')
        alignment_path = os.path.join(work_dir, 'output.a3m')

        with open(profile_path, 'w') as handle:
            handle.write(hmm)

        # '--noali' keeps the alignment out of stdout.
        search_cmd = [self.binary_path, '--noali', '--cpu', '8']
        # Extra flags must precede the output and input arguments.
        if self.flags:
            search_cmd.extend(self.flags)
        search_cmd += ['-A', alignment_path, profile_path, self.database_path]

        logger.info('Launching sub-process %s', search_cmd)
        proc = subprocess.Popen(
            search_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        timer_label = (
            f'hmmsearch ({os.path.basename(self.database_path)}) query')
        with utils.timing(timer_label):
            raw_out, raw_err = proc.communicate()
            exit_code = proc.wait()

        if exit_code:
            failure_streams = (
                raw_out.decode('utf-8'), raw_err.decode('utf-8'))
            raise RuntimeError(
                'hmmsearch failed:\nstdout:\n%s\n\nstderr:\n%s\n'
                % failure_streams)

        with open(alignment_path) as handle:
            return handle.read()
def _query_chunk(self, input_fasta_path: str,
                 database_path: str) -> Mapping[str, Any]:
    """Runs Jackhmmer for one query file against one database chunk.

    Args:
        input_fasta_path: Path of the query file given to the binary.
        database_path: Path of the database (chunk) to search.

    Returns:
        A mapping with the keys:
          * `sto`: text of the alignment file written via `-A`.
          * `tbl`: text of the `--tblout` file, or '' when
            `self.get_tblout` is falsy.
          * `stderr`: raw stderr bytes of the subprocess.
          * `n_iter`: the `self.n_iter` value used for the run.
          * `e_value`: the `self.e_value` value used for the run.

    Raises:
        RuntimeError: If the subprocess exits with a non-zero return code.
    """
    with utils.tmpdir_manager(base_dir='/tmp') as work_dir:
        alignment_path = os.path.join(work_dir, 'output.sto')

        # F1/F2/F3 are the expected proportions that pass each of the
        # progressively more expensive filtering stages. Reducing them speeds
        # up the pipeline at the expense of sensitivity. They are currently
        # set very low so that querying Mgnify runs in a reasonable amount of
        # time.
        options = ['-o', '/dev/null']  # Keep Jackhmmer output off stdout.
        options += ['-A', alignment_path]
        options += ['--noali']
        options += ['--F1', str(self.filter_f1)]
        options += ['--F2', str(self.filter_f2)]
        options += ['--F3', str(self.filter_f3)]
        options += ['--incE', str(self.e_value)]
        # Report only sequences with E-values <= x in per-sequence output.
        options += ['-E', str(self.e_value)]
        options += ['--cpu', str(self.n_cpu)]
        options += ['-N', str(self.n_iter)]

        if self.get_tblout:
            table_path = os.path.join(work_dir, 'tblout.txt')
            options += ['--tblout', table_path]
        if self.z_value:
            options += ['-Z', str(self.z_value)]
        if self.dom_e is not None:
            options += ['--domE', str(self.dom_e)]
        if self.incdom_e is not None:
            options += ['--incdomE', str(self.incdom_e)]

        full_cmd = [self.binary_path, *options, input_fasta_path,
                    database_path]

        logger.info('Launching subprocess "%s"', ' '.join(full_cmd))
        proc = subprocess.Popen(
            full_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        timer_label = f'Jackhmmer ({os.path.basename(database_path)}) query'
        with utils.timing(timer_label):
            _, stderr = proc.communicate()
            exit_code = proc.wait()

        if exit_code:
            raise RuntimeError(
                'Jackhmmer failed\nstderr:\n%s\n' % stderr.decode('utf-8'))

        # The table output (e-values per target name) is optional.
        table_text = ''
        if self.get_tblout:
            with open(table_path) as handle:
                table_text = handle.read()

        with open(alignment_path) as handle:
            alignment_text = handle.read()

    return {
        'sto': alignment_text,
        'tbl': table_text,
        'stderr': stderr,
        'n_iter': self.n_iter,
        'e_value': self.e_value,
    }