Example #1
0
    def _parse_qresult(self):
        # initial qresult value
        qresult = None
        hit_rows = []
        # state values
        state_QRES_NEW = 1
        state_QRES_HITTAB = 3
        state_QRES_CONTENT = 5
        state_QRES_END = 7

        while True:

            # one line before the hit table
            if self.line.startswith('The best scores are:'):
                qres_state = state_QRES_HITTAB
            # the end of a query or the file altogether
            elif self.line.strip() == '>>>///' or not self.line:
                qres_state = state_QRES_END
            # the beginning of a new query
            elif not self.line.startswith('>>>') and '>>>' in self.line:
                qres_state = state_QRES_NEW
            # the beginning of the query info and its hits + hsps
            elif self.line.startswith('>>>') and not \
                    self.line.strip() == '>>><<<':
                qres_state = state_QRES_CONTENT
            # default qres mark
            else:
                qres_state = None

            if qres_state is not None:
                if qres_state == state_QRES_HITTAB:
                    # parse hit table if flag is set
                    hit_rows = self.__parse_hit_table()

                elif qres_state == state_QRES_END:
                    yield _set_qresult_hits(qresult, hit_rows)
                    break

                elif qres_state == state_QRES_NEW:
                    # if qresult is filled, yield it first
                    if qresult is not None:
                        yield _set_qresult_hits(qresult, hit_rows)
                    regx = re.search(_RE_ID_DESC_SEQLEN, self.line)
                    query_id = regx.group(1)
                    seq_len = regx.group(3)
                    desc = regx.group(2)
                    qresult = QueryResult(id=query_id)
                    qresult.seq_len = int(seq_len)
                    # get target from the next line
                    self.line = self.handle.readline()
                    qresult.target = [x for x in self.line.split(' ')
                                      if x][1].strip()
                    if desc is not None:
                        qresult.description = desc
                    # set values from preamble
                    for key, value in self._preamble.items():
                        setattr(qresult, key, value)

                elif qres_state == state_QRES_CONTENT:
                    assert self.line[3:].startswith(qresult.id), self.line
                    for hit, strand in self._parse_hit(query_id):
                        # HACK: re-set desc, for hsp hit and query description
                        hit.description = hit.description
                        hit.query_description = qresult.description
                        # if hit is not in qresult, append it
                        if hit.id not in qresult:
                            qresult.append(hit)
                        # otherwise, it might be the same hit with a different strand
                        else:
                            # make sure strand is different and then append hsp to
                            # existing hit
                            for hsp in hit.hsps:
                                assert strand != hsp.query_strand
                                qresult[hit.id].append(hsp)

            self.line = self.handle.readline()
Example #2
0
    def _parse_qresult(self):
        # initial qresult value
        qresult = None
        hit_rows = []
        # state values
        state_QRES_NEW = 1
        state_QRES_HITTAB = 3
        state_QRES_CONTENT = 5
        state_QRES_END = 7

        while True:

            # one line before the hit table
            if self.line.startswith('The best scores are:'):
                qres_state = state_QRES_HITTAB
            # the end of a query or the file altogether
            elif self.line.strip() == '>>>///' or not self.line:
                qres_state = state_QRES_END
            # the beginning of a new query
            elif not self.line.startswith('>>>') and '>>>' in self.line:
                qres_state = state_QRES_NEW
            # the beginning of the query info and its hits + hsps
            elif self.line.startswith('>>>') and not \
                    self.line.strip() == '>>><<<':
                qres_state = state_QRES_CONTENT
            # default qres mark
            else:
                qres_state = None

            if qres_state is not None:
                if qres_state == state_QRES_HITTAB:
                    # parse hit table if flag is set
                    hit_rows = self.__parse_hit_table()

                elif qres_state == state_QRES_END:
                    yield _set_qresult_hits(qresult, hit_rows)
                    break

                elif qres_state == state_QRES_NEW:
                    # if qresult is filled, yield it first
                    if qresult is not None:
                        yield _set_qresult_hits(qresult, hit_rows)
                    regx = re.search(_RE_ID_DESC_SEQLEN, self.line)
                    query_id = regx.group(1)
                    seq_len = regx.group(3)
                    desc = regx.group(2)
                    qresult = QueryResult(id=query_id)
                    qresult.seq_len = int(seq_len)
                    # get target from the next line
                    self.line = self.handle.readline()
                    qresult.target = [x for x in self.line.split(' ') if x][1].strip()
                    if desc is not None:
                        qresult.description = desc
                    # set values from preamble
                    for key, value in self._preamble.items():
                        setattr(qresult, key, value)

                elif qres_state == state_QRES_CONTENT:
                    assert self.line[3:].startswith(qresult.id), self.line
                    for hit, strand in self._parse_hit(query_id):
                        # HACK: re-set desc, for hsp hit and query description
                        hit.description = hit.description
                        hit.query_description = qresult.description
                        # if hit is not in qresult, append it
                        if hit.id not in qresult:
                            qresult.append(hit)
                        # otherwise, it might be the same hit with a different strand
                        else:
                            # make sure strand is different and then append hsp to
                            # existing hit
                            for hsp in hit.hsps:
                                assert strand != hsp.query_strand
                                qresult[hit.id].append(hsp)

            self.line = self.handle.readline()
Example #3
0
    def __iter__(self):
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith('>'):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(' ', 1)
            except ValueError:
                qid, qdesc = rec.query, ''
            qdesc = qdesc.replace('\n', '').replace('\r', '')

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine alphabet based on program
            if qresult.program == 'blastn':
                alphabet = generic_dna
            elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
                alphabet = generic_protein

            # iterate over the 'alignments' (hits) and the hit table
            for idx, aln in enumerate(rec.alignments):
                # get id and desc
                if aln.title.startswith('> '):
                    aln.title = aln.title[2:]
                elif aln.title.startswith('>'):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(' ', 1)
                except ValueError:
                    hid, hdesc = aln.title, ''
                hdesc = hdesc.replace('\n', '').replace('\r', '')

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    frag.alphabet = alphabet
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates
                    frag.query_start = min(bhsp.query_start,
                                           bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation
                    qseq = ''
                    hseq = ''
                    midline = ''
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == ' ' or hchar == ' ':
                            assert all(' ' == x for x in seqtrio)
                        else:
                            qseq += qchar
                            hseq += hchar
                            midline += mchar
                    frag.query, frag.hit = qseq, hseq
                    frag.aln_annotation['similarity'] = midline

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            qresult.description = qdesc
            yield qresult
Example #4
0
    def __iter__(self):
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith('>'):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(' ', 1)
            except ValueError:
                qid, qdesc = rec.query, ''
            qdesc = qdesc.replace('\n', '').replace('\r', '')

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine alphabet based on program
            if qresult.program == 'blastn':
                alphabet = generic_dna
            elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
                alphabet = generic_protein

            # iterate over the 'alignments' (hits) and the hit table
            for idx, aln in enumerate(rec.alignments):
                # get id and desc
                if aln.title.startswith('> '):
                    aln.title = aln.title[2:]
                elif aln.title.startswith('>'):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(' ', 1)
                except ValueError:
                    hid, hdesc = aln.title, ''
                hdesc = hdesc.replace('\n', '').replace('\r', '')

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    frag.alphabet = alphabet
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates
                    frag.query_start = min(bhsp.query_start,
                            bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start,
                            bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation
                    qseq = ''
                    hseq = ''
                    midline = ''
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == ' ' or hchar == ' ':
                            assert all(' ' == x for x in seqtrio)
                        else:
                            qseq += qchar
                            hseq += hchar
                            midline += mchar
                    frag.query, frag.hit = qseq, hseq
                    frag.aln_annotation['similarity'] = midline

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            qresult.description = qdesc
            yield qresult
Example #5
0
class Hmmer2TextParser(object):
    """Iterator for the HMMER 2.0 text output."""

    def __init__(self, handle):
        self.handle = handle
        self.buf = []
        self._meta = self.parse_preamble()

    def __iter__(self):
        for qresult in self.parse_qresult():
            qresult.program = self._meta.get('program')
            qresult.target = self._meta.get('target')
            qresult.version = self._meta.get('version')
            yield qresult

    def read_next(self, rstrip=True):
        """Return the next non-empty line, trailing whitespace removed"""
        if len(self.buf) > 0:
            return self.buf.pop()
        self.line = self.handle.readline()
        while self.line and rstrip and not self.line.strip():
            self.line = self.handle.readline()
        if self.line:
            if rstrip:
                self.line = self.line.rstrip()
        return self.line

    def push_back(self, line):
        """Un-read a line that should not be parsed yet"""
        self.buf.append(line)

    def parse_key_value(self):
        """Parse key-value pair separated by colon (:)"""
        key, value = self.line.split(':', 1)
        return key.strip(), value.strip()

    def parse_preamble(self):
        """Parse HMMER2 preamble."""
        meta = {}
        state = "GENERIC"
        while self.read_next():
            if state == "GENERIC":
                if self.line.startswith('hmm'):
                    meta['program'] = self.line.split('-')[0].strip()
                elif self.line.startswith('HMMER is'):
                    continue
                elif self.line.startswith('HMMER'):
                    meta['version'] = self.line.split()[1]
                elif self.line.count('-') == 36:
                    state = "OPTIONS"
                continue

            assert state == "OPTIONS"
            assert 'program' in meta

            if self.line.count('-') == 32:
                break

            key, value = self.parse_key_value()
            if meta['program'] == 'hmmsearch':
                if key == 'Sequence database':
                    meta['target'] = value
                    continue
            elif meta['program'] == 'hmmpfam':
                if key == 'HMM file':
                    meta['target'] = value
                    continue
            meta[key] = value

        return meta

    def parse_qresult(self):
        """Parse a HMMER2 query block."""
        while self.read_next():
            if not self.line.startswith('Query'):
                raise StopIteration()
            _, id_ = self.parse_key_value()
            self.qresult = QueryResult(id=id_)

            description = None

            while self.read_next() and not self.line.startswith('Scores'):
                if self.line.startswith('Accession'):
                    self.qresult.accession = self.parse_key_value()[1]
                if self.line.startswith('Description'):
                    description = self.parse_key_value()[1]

            hit_placeholders = self.parse_hits()
            if len(hit_placeholders) > 0:
                self.parse_hsps(hit_placeholders)
                self.parse_hsp_alignments()

            while not self.line.startswith('Query'):
                self.read_next()
                if not self.line:
                    break
            self.buf.append(self.line)

            if description is not None:
                self.qresult.description = description
            yield self.qresult

    def parse_hits(self):
        """Parse a HMMER2 hit block, beginning with the hit table."""

        hit_placeholders = []
        while self.read_next():
            if self.line.startswith('Parsed'):
                break
            if self.line.find('no hits') > -1:
                break

            if self.line.startswith('Sequence') or \
               self.line.startswith('Model') or \
               self.line.startswith('-------- '):
                continue

            fields = self.line.split()
            id_ = fields.pop(0)
            domain_obs_num = int(fields.pop())
            evalue = float(fields.pop())
            bitscore = float(fields.pop())
            description = ' '.join(fields).strip()


            hit = _HitPlaceholder()
            hit.id_ = id_
            hit.evalue = evalue
            hit.bitscore = bitscore
            hit.description = description
            hit.domain_obs_num = domain_obs_num
            hit_placeholders.append(hit)

        return hit_placeholders

    def parse_hsps(self, hit_placeholders):
        """Parse a HMMER2 hsp block, beginning with the hsp table."""
        # HSPs may occur in different order than the hits
        # so store Hit objects separately first
        unordered_hits = {}
        while self.read_next():
            if self.line.startswith('Alignments') or \
               self.line.startswith('Histogram') or \
               self.line == '//':
                break
            if self.line.startswith('Model') or \
               self.line.startswith('Sequence') or \
               self.line.startswith('--------'):
                continue

            id_, domain, seq_f, seq_t, seq_compl, hmm_f, hmm_t, hmm_compl, \
            score, evalue = self.line.split()

            frag = HSPFragment(id_, self.qresult.id)
            frag.alphabet = generic_protein
            if self._meta['program'] == 'hmmpfam':
                frag.hit_start = int(hmm_f) - 1
                frag.hit_end = int(hmm_t)
                frag.query_start = int(seq_f) - 1
                frag.query_end = int(seq_t)
            elif self._meta['program'] == 'hmmsearch':
                frag.query_start = int(hmm_f) - 1
                frag.query_end = int(hmm_t)
                frag.hit_start = int(seq_f) - 1
                frag.hit_end = int(seq_t)

            hsp = HSP([frag])
            hsp.evalue = float(evalue)
            hsp.bitscore = float(score)
            hsp.domain_index = int(domain.split('/')[0])
            if self._meta['program'] == 'hmmpfam':
                hsp.hit_endtype = hmm_compl
                hsp.query_endtype = seq_compl
            elif self._meta['program'] == 'hmmsearch':
                hsp.query_endtype = hmm_compl
                hsp.hit_endtype = seq_compl

            if id_ not in unordered_hits:
                placeholder = [ p for p in hit_placeholders if p.id_ == id_][0]
                hit = placeholder.createHit([hsp])
                unordered_hits[id_] = hit
            else:
                hit = unordered_hits[id_]
                hsp.hit_description = hit.description
                hit.append(hsp)

        # The placeholder list is in the correct order, so use that order for
        # the Hit objects in the qresult
        for p in hit_placeholders:
            self.qresult.append(unordered_hits[p.id_])

    def parse_hsp_alignments(self):
        """Parse a HMMER2 HSP alignment block."""
        if not self.line.startswith('Alignments'):
            return

        while self.read_next():
            if self.line == '//' or self.line.startswith('Histogram'):
                break

            match = re.search(_HSP_ALIGN_LINE, self.line)
            if match is None:
                continue

            id_ = match.group(1)
            idx = int(match.group(2))
            num = int(match.group(3))

            hit = self.qresult[id_]
            if hit.domain_obs_num != num:
                continue

            frag = hit[idx-1][0]

            hmmseq = ''
            consensus = ''
            otherseq = ''
            structureseq = ''
            pad = 0
            while self.read_next() and self.line.startswith(' '):
                # if there's structure information, parse that
                if self.line[16:18] == 'CS':
                    structureseq += self.line[19:].strip()

                    if not self.read_next():
                        break

                # skip the *-> start marker if it exists
                if self.line[19] == '*':
                    seq = self.line[22:]
                    pad = 3
                else:
                    seq = self.line[19:]
                    pad = 0

                # get rid of the end marker
                if seq.endswith('<-*'):
                    seq = seq[:-3]

                hmmseq += seq
                line_len = len(seq)
                if not self.read_next(rstrip=False):
                    break
                consensus += self.line[19+pad:19+pad+line_len]
                # If there's no consensus sequence, hmmer2 doesn't
                # bother to put spaces here, so add extra padding
                extra_padding = len(hmmseq) - len(consensus)
                consensus += ' ' * extra_padding

                if not self.read_next():
                    break
                otherseq += self.line[19:].split()[0].strip()

            self.push_back(self.line)

            # add similarity sequence to annotation
            frag.aln_annotation['similarity'] = consensus

            # if there's structure information, add it to the fragment
            if structureseq:
                frag.aln_annotation['CS'] = structureseq

            if self._meta['program'] == 'hmmpfam':
                frag.hit = hmmseq
                frag.query = otherseq
            else:
                frag.hit = otherseq
                frag.query = hmmseq