Ejemplo n.º 1
0
    def query(self, a3m: str) -> str:
        """Queries the database using HHsearch using a given a3m.

        The a3m is written to a file inside a temporary directory, HHsearch is
        launched with one `-d` argument per entry in `self.databases`, and the
        .hhr file it produces is read back and returned.

        Args:
            a3m: The query alignment text; written verbatim to the input file.

        Returns:
            The contents of the .hhr output file written by HHsearch.

        Raises:
            RuntimeError: If HHsearch exits with a non-zero return code.
        """
        with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
            input_path = os.path.join(query_tmp_dir, 'query.a3m')
            hhr_path = os.path.join(query_tmp_dir, 'output.hhr')
            with open(input_path, 'w') as f:
                f.write(a3m)

            # Each database is passed through its own '-d <path>' pair.
            db_cmd = []
            for db_path in self.databases:
                db_cmd.extend(['-d', db_path])
            cmd = [
                self.binary_path, '-i', input_path, '-o', hhr_path, '-maxseq',
                str(self.maxseq)
            ] + db_cmd

            logger.info('Launching subprocess "%s"', ' '.join(cmd))
            process = subprocess.Popen(cmd,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            with utils.timing('HHsearch query'):
                stdout, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                # Stderr is truncated to prevent proto size errors in Beam.
                # The truncation happens on raw bytes and may cut a multi-byte
                # UTF-8 sequence in half, and the tool output is not guaranteed
                # to be valid UTF-8 at all, so decode leniently: a strict
                # decode could raise UnicodeDecodeError here and hide the
                # actual HHsearch failure.
                raise RuntimeError(
                    'HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' %
                    (stdout.decode('utf-8', errors='replace'),
                     stderr[:100_000].decode('utf-8', errors='replace')))

            with open(hhr_path) as f:
                hhr = f.read()
        return hhr
Ejemplo n.º 2
0
  def query(self, input_fasta_path: str) -> Mapping[str, Any]:
    """Queries the database using HHblits.

    Args:
      input_fasta_path: Path handed to HHblits as its `-i` input.

    Returns:
      A dict with the following keys:
        a3m: Contents of the A3M file HHblits wrote via `-oa3m`.
        output: Raw bytes captured from the HHblits stdout.
        stderr: Raw bytes captured from the HHblits stderr.
        n_iter: The value of `self.n_iter` used for this run.
        e_value: The value of `self.e_value` used for this run.

    Raises:
      RuntimeError: If HHblits exits with a non-zero return code.
    """
    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
      a3m_path = os.path.join(query_tmp_dir, 'output.a3m')

      # Each database is passed through its own '-d <path>' pair.
      db_cmd = []
      for db_path in self.databases:
        db_cmd.extend(['-d', db_path])
      cmd = [
          self.binary_path,
          '-i', input_fasta_path,
          '-cpu', str(self.n_cpu),
          '-oa3m', a3m_path,
          '-o', '/dev/null',
          '-n', str(self.n_iter),
          '-e', str(self.e_value),
          '-maxseq', str(self.maxseq),
          '-realign_max', str(self.realign_max),
          '-maxfilt', str(self.maxfilt),
          '-min_prefilter_hits', str(self.min_prefilter_hits)]
      # Optional flags are only added when they are enabled or differ from the
      # module-level defaults.
      if self.all_seqs:
        cmd += ['-all']
      if self.alt:
        cmd += ['-alt', str(self.alt)]
      if self.p != _HHBLITS_DEFAULT_P:
        cmd += ['-p', str(self.p)]
      if self.z != _HHBLITS_DEFAULT_Z:
        cmd += ['-Z', str(self.z)]
      cmd += db_cmd

      logger.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(
          cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

      with utils.timing('HHblits query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()

      if retcode:
        # Decode leniently everywhere on the failure path: the tool output is
        # not guaranteed to be valid UTF-8, and the byte-level truncation below
        # may cut a multi-byte sequence in half. A strict decode could raise
        # UnicodeDecodeError and hide the actual HHblits failure.
        # Logs have a 15k character limit, so log HHblits error line by line.
        logger.error('HHblits failed. HHblits stderr begin:')
        for error_line in stderr.decode('utf-8', errors='replace').splitlines():
          if error_line.strip():
            logger.error(error_line.strip())
        logger.error('HHblits stderr end')
        # NOTE(review): stderr is capped at 500k bytes, presumably to bound the
        # size of the exception message - confirm.
        raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (
            stdout.decode('utf-8', errors='replace'),
            stderr[:500_000].decode('utf-8', errors='replace')))

      with open(a3m_path) as f:
        a3m = f.read()

    raw_output = dict(
        a3m=a3m,
        output=stdout,
        stderr=stderr,
        n_iter=self.n_iter,
        e_value=self.e_value)
    return raw_output
Ejemplo n.º 3
0
  def _build_profile(self, msa: str, model_construction: str = 'fast') -> str:
    """Builds a HMM for the aligned sequences given as an MSA string.

    Args:
      msa: A string with the aligned sequences, in A3M or STO format.
      model_construction: Whether to use reference annotation in the msa to
        determine consensus columns ('hand') or default ('fast').

    Returns:
      A string with the profile in the HMM format.

    Raises:
      RuntimeError: If hmmbuild fails.
      ValueError: If model_construction is neither 'hand' nor 'fast'.
    """
    if model_construction not in {'hand', 'fast'}:
      # The trailing space after 'only' matters: the two literals are joined
      # by implicit concatenation.
      raise ValueError(f'Invalid model_construction {model_construction} - only '
                       'hand and fast supported.')

    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
      input_query = os.path.join(query_tmp_dir, 'query.msa')
      output_hmm_path = os.path.join(query_tmp_dir, 'output.hmm')

      with open(input_query, 'w') as f:
        f.write(msa)

      cmd = [self.binary_path]
      # If adding flags, we have to do so before the output and input:

      # Only 'hand' maps to an explicit flag; 'fast' adds nothing to the
      # command line.
      if model_construction == 'hand':
        cmd.append(f'--{model_construction}')
      if self.singlemx:
        cmd.append('--singlemx')
      cmd.extend([
          '--amino',
          output_hmm_path,
          input_query,
      ])

      logger.info('Launching subprocess %s', cmd)
      process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

      with utils.timing('hmmbuild query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()
        logger.info('hmmbuild stdout:\n%s\n\nstderr:\n%s\n',
                    stdout.decode('utf-8'), stderr.decode('utf-8'))

      if retcode:
        raise RuntimeError('hmmbuild failed\nstdout:\n%s\n\nstderr:\n%s\n'
                           % (stdout.decode('utf-8'), stderr.decode('utf-8')))

      with open(output_hmm_path, encoding='utf-8') as f:
        hmm = f.read()

    return hmm
Ejemplo n.º 4
0
  def align(self, sequences: Sequence[str]) -> str:
    """Runs Kalign on the given sequences and returns the A3M alignment.

    Args:
      sequences: Query sequence strings. Each one must be at least 6 residues
        long (a Kalign requirement). The order of the sequences may slightly
        change the output, since a different alignment tree might get
        constructed.

    Returns:
      The alignment as a string in a3m format.

    Raises:
      RuntimeError: If Kalign exits with a non-zero return code.
      ValueError: If any of the sequences has fewer than 6 residues.
    """
    logger.info('Aligning %d sequences', len(sequences))

    # Reject too-short inputs up front, before any file or process is created.
    for seq in sequences:
      residue_count = len(seq)
      if residue_count < 6:
        raise ValueError('Kalign requires all sequences to be at least 6 '
                         'residues long. Got %s (%d residues).'
                         % (seq, residue_count))

    with utils.tmpdir_manager(base_dir='/tmp') as work_dir:
      fasta_in = os.path.join(work_dir, 'input.fasta')
      a3m_out = os.path.join(work_dir, 'output.a3m')

      with open(fasta_in, 'w') as fasta_file:
        fasta_file.write(_to_a3m(sequences))

      kalign_cmd = [self.binary_path]
      kalign_cmd += ['-i', fasta_in]
      kalign_cmd += ['-o', a3m_out]
      kalign_cmd += ['-format', 'fasta']

      logger.info('Launching subprocess "%s"', ' '.join(kalign_cmd))
      proc = subprocess.Popen(
          kalign_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

      with utils.timing('Kalign query'):
        out_bytes, err_bytes = proc.communicate()
        exit_code = proc.wait()
        # Decoded once here and reused for the error message below.
        out_text = out_bytes.decode('utf-8')
        err_text = err_bytes.decode('utf-8')
        logger.info('Kalign stdout:\n%s\n\nstderr:\n%s\n', out_text, err_text)

      if exit_code:
        raise RuntimeError('Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n'
                           % (out_text, err_text))

      with open(a3m_out) as result_file:
        return result_file.read()
Ejemplo n.º 5
0
    def query(self, hmm: str) -> str:
        """Runs hmmsearch with the given HMM against `self.database_path`.

        Args:
            hmm: Profile text; written verbatim to a temporary query file.

        Returns:
            The contents of the file hmmsearch writes through its `-A` option.

        Raises:
            RuntimeError: If hmmsearch exits with a non-zero return code.
        """
        with utils.tmpdir_manager(base_dir='/tmp') as work_dir:
            query_hmm_file = os.path.join(work_dir, 'query.hmm')
            alignment_file = os.path.join(work_dir, 'output.a3m')
            with open(query_hmm_file, 'w') as handle:
                handle.write(hmm)

            # '--noali' keeps the alignment out of stdout.
            cmd = [self.binary_path, '--noali', '--cpu', '8']
            # Extra flags have to come before the output and input arguments.
            cmd.extend(self.flags or ())
            cmd += [
                '-A',
                alignment_file,
                query_hmm_file,
                self.database_path,
            ]

            logger.info('Launching sub-process %s', cmd)
            proc = subprocess.Popen(cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            timing_label = (
                f'hmmsearch ({os.path.basename(self.database_path)}) query')
            with utils.timing(timing_label):
                out_bytes, err_bytes = proc.communicate()
                exit_code = proc.wait()

            if exit_code:
                failure_details = (out_bytes.decode('utf-8'),
                                   err_bytes.decode('utf-8'))
                raise RuntimeError(
                    'hmmsearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' %
                    failure_details)

            with open(alignment_file) as result_file:
                alignment_text = result_file.read()

        return alignment_text
Ejemplo n.º 6
0
    def _query_chunk(self, input_fasta_path: str,
                     database_path: str) -> Mapping[str, Any]:
        """Runs Jackhmmer for one query against one database chunk.

        Args:
            input_fasta_path: Path of the query file handed to Jackhmmer.
            database_path: Path of the database (chunk) to search.

        Returns:
            A dict with the keys `sto` (contents of the `-A` output file),
            `tbl` (contents of the `--tblout` file, or '' when
            `self.get_tblout` is falsy), `stderr` (raw stderr bytes),
            `n_iter` and `e_value` (copied from the instance).

        Raises:
            RuntimeError: If Jackhmmer exits with a non-zero return code.
        """
        with utils.tmpdir_manager(base_dir='/tmp') as work_dir:
            alignment_path = os.path.join(work_dir, 'output.sto')

            # F1/F2/F3 are the expected proportions of hits passing each of
            # the (progressively more expensive) filtering stages. Lowering
            # them speeds up the pipeline at the expense of sensitivity. The
            # values come from the instance; per the original note they are
            # kept very low so that querying Mgnify runs in a reasonable
            # amount of time.
            flags = [
                # Keep Jackhmmer's own report out of stdout.
                '-o', '/dev/null',
                '-A', alignment_path,
                '--noali',
                '--F1', str(self.filter_f1),
                '--F2', str(self.filter_f2),
                '--F3', str(self.filter_f3),
                '--incE', str(self.e_value),
                # Report only sequences with E-values <= x in per-sequence
                # output.
                '-E', str(self.e_value),
                '--cpu', str(self.n_cpu),
                '-N', str(self.n_iter),
            ]
            if self.get_tblout:
                tbl_path = os.path.join(work_dir, 'tblout.txt')
                flags += ['--tblout', tbl_path]

            if self.z_value:
                flags += ['-Z', str(self.z_value)]

            if self.dom_e is not None:
                flags += ['--domE', str(self.dom_e)]

            if self.incdom_e is not None:
                flags += ['--incdomE', str(self.incdom_e)]

            # Flags first, then the two positional arguments.
            cmd = [self.binary_path, *flags, input_fasta_path, database_path]

            logger.info('Launching subprocess "%s"', ' '.join(cmd))
            proc = subprocess.Popen(cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            timing_label = (
                f'Jackhmmer ({os.path.basename(database_path)}) query')
            with utils.timing(timing_label):
                _unused_stdout, err_bytes = proc.communicate()
                exit_code = proc.wait()

            if exit_code:
                raise RuntimeError('Jackhmmer failed\nstderr:\n%s\n' %
                                   err_bytes.decode('utf-8'))

            # The per-target table only exists when it was requested above.
            if self.get_tblout:
                with open(tbl_path) as tbl_file:
                    tbl_text = tbl_file.read()
            else:
                tbl_text = ''

            with open(alignment_path) as sto_file:
                sto_text = sto_file.read()

        return {
            'sto': sto_text,
            'tbl': tbl_text,
            'stderr': err_bytes,
            'n_iter': self.n_iter,
            'e_value': self.e_value,
        }