Beispiel #1
0
    def parse_qresult(self):
        """Yield a QueryResult for every HMMER2 query block.

        Iteration ends when no further line can be read, or when the line
        found where a block should begin does not start with 'Query'.
        """
        while self.read_next():
            # anything other than a 'Query' line here ends the parsing
            if not self.line.startswith('Query'):
                return
            _key, query_id = self.parse_key_value()
            self.qresult = QueryResult(id=query_id)

            # collected from the header, assigned only after the hits are parsed
            description = None

            # walk the header lines up to the 'Scores' line (or end of input)
            while True:
                if not self.read_next() or self.line.startswith('Scores'):
                    break
                if self.line.startswith('Accession'):
                    self.qresult.accession = self.parse_key_value()[1]
                if self.line.startswith('Description'):
                    description = self.parse_key_value()[1]

            placeholders = self.parse_hits()
            if len(placeholders) > 0:
                self.parse_hsps(placeholders)
                self.parse_hsp_alignments()

            # advance until the current line starts with 'Query' or is empty
            while True:
                if self.line.startswith('Query'):
                    break
                self.read_next()
                if not self.line:
                    break
            # put the line back so the outer loop reads it again
            self.buf.append(self.line)

            if description is not None:
                self.qresult.description = description
            yield self.qresult
Beispiel #2
0
    def _parse_qresult(self):
        """Parses a HMMER3 query block."""

        self._read_until(lambda line: line.startswith('Query:'))

        while self.line:

            # get query id and length
            regx = re.search(_QRE_ID_LEN, self.line)
            qid = regx.group(1).strip()
            # store qresult attributes
            qresult_attrs = {
                'seq_len': int(regx.group(2)),
                'program': self._meta.get('program'),
                'version': self._meta.get('version'),
                'target': self._meta.get('target'),
            }

            # get description and accession, if they exist
            desc = '' # placeholder
            while not self.line.startswith('Scores for '):
                self.line = read_forward(self.handle)

                if self.line.startswith('Accession:'):
                    acc = self.line.strip().split(' ', 1)[1]
                    qresult_attrs['accession'] = acc.strip()
                elif self.line.startswith('Description:'):
                    desc = self.line.strip().split(' ', 1)[1]
                    qresult_attrs['description'] = desc.strip()

            # parse the query hits
            while self.line and '//' not in self.line:
                hit_list = self._parse_hit(qid)
                # read through the statistics summary
                # TODO: parse and store this information?
                if self.line.startswith('Internal pipeline'):
                    while self.line and '//' not in self.line:
                        self.line = read_forward(self.handle)

            # create qresult, set its attributes and yield
            # not initializing hit_list directly to handle empty hits
            # (i.e. need to set its query description manually)
            qresult = QueryResult(id=qid)
            for hit in hit_list:
                if not hit:
                    hit.query_description = qresult.description
                qresult.append(hit)
            for attr, value in qresult_attrs.items():
                setattr(qresult, attr, value)
            yield qresult
            self.line = read_forward(self.handle)
Beispiel #3
0
    def _parse_qresult(self):
        """Parses a HMMER3 query block."""

        self._read_until(lambda line: line.startswith('Query:'))

        while self.line:

            # get query id and length
            regx = re.search(_QRE_ID_LEN, self.line)
            qid = regx.group(1).strip()
            # store qresult attributes
            qresult_attrs = {
                'seq_len': int(regx.group(2)),
                'program': self._meta.get('program'),
                'version': self._meta.get('version'),
                'target': self._meta.get('target'),
            }

            # get description and accession, if they exist
            desc = ''  # placeholder
            while not self.line.startswith('Scores for '):
                self.line = read_forward(self.handle)

                if self.line.startswith('Accession:'):
                    acc = self.line.strip().split(' ', 1)[1]
                    qresult_attrs['accession'] = acc.strip()
                elif self.line.startswith('Description:'):
                    desc = self.line.strip().split(' ', 1)[1]
                    qresult_attrs['description'] = desc.strip()

            # parse the query hits
            while self.line and '//' not in self.line:
                hit_list = self._parse_hit(qid)
                # read through the statistics summary
                # TODO: parse and store this information?
                if self.line.startswith('Internal pipeline'):
                    while self.line and '//' not in self.line:
                        self.line = read_forward(self.handle)

            # create qresult, set its attributes and yield
            # not initializing hit_list directly to handle empty hits
            # (i.e. need to set its query description manually)
            qresult = QueryResult(id=qid)
            for hit in hit_list:
                if not hit:
                    hit.query_description = qresult.description
                qresult.append(hit)
            for attr, value in qresult_attrs.items():
                setattr(qresult, attr, value)
            yield qresult
            self.line = read_forward(self.handle)
Beispiel #4
0
    def _create_qresult(self, hit_blocks):
        """Turn parsed hit blocks into a one-element list of QueryResult (PRIVATE).

        Every block becomes one HSP wrapping a single HSPFragment; HSPs that
        share a hit ID are gathered under the same Hit, in order of first
        appearance.
        """
        query_id = self.query_id
        hits_by_id = OrderedDict()

        for position, blk in enumerate(hit_blocks):
            hit_id = blk['hit_id']

            fragment = HSPFragment(hit_id, query_id)
            # NOTE: no alphabet is assigned to the fragment here
            # a falsy start (e.g. None or 0) is stored unchanged, any other
            # start is stored minus one
            q_start = blk['query_start']
            fragment.query_start = q_start - 1 if q_start else q_start
            fragment.query_end = blk['query_end']
            h_start = blk['hit_start']
            fragment.hit_start = h_start - 1 if h_start else h_start
            fragment.hit_end = blk['hit_end']
            fragment.hit = blk['hit_seq']
            fragment.query = blk['query_seq']

            hsp = HSP([fragment])
            hsp.hit_id = hit_id
            hsp.output_index = position
            hsp.query_id = query_id
            hsp.hit_description = blk['description']
            hsp.is_included = True  # every block is flagged as included
            for field in ('evalue', 'score', 'prob', 'hit_seq_len', 'text'):
                setattr(hsp, field, blk[field])

            # an already known hit just receives the additional HSP
            if hit_id in hits_by_id:
                hits_by_id[hit_id].append(hsp)
                continue

            new_hit = Hit([hsp], hit_id)
            new_hit.description = blk['description']
            new_hit.is_included = True
            for field in ('evalue', 'score'):
                setattr(new_hit, field, blk[field])
            hits_by_id[hit_id] = new_hit

        result = QueryResult(hits_by_id.values(), query_id)
        result.program = _PROGRAM
        result.seq_len = self.seq_len
        return [result]
Beispiel #5
0
    def parse_qresult(self):
        """Parse HMMER2 query blocks, yielding one QueryResult per block.

        Iteration ends when no further line can be read, or when the line
        found where a block should begin does not start with 'Query'.
        """
        while self.read_next():
            if not self.line.startswith('Query'):
                # A plain return ends the generator. Raising StopIteration
                # here would surface as RuntimeError under PEP 479.
                return
            _, id_ = self.parse_key_value()
            self.qresult = QueryResult(id=id_)

            # collected from the header, assigned only after the hits are parsed
            description = None

            # header lines run up to the 'Scores' line (or end of input)
            while self.read_next() and not self.line.startswith('Scores'):
                if self.line.startswith('Accession'):
                    self.qresult.accession = self.parse_key_value()[1]
                if self.line.startswith('Description'):
                    description = self.parse_key_value()[1]

            hit_placeholders = self.parse_hits()
            if len(hit_placeholders) > 0:
                self.parse_hsps(hit_placeholders)
                self.parse_hsp_alignments()

            # advance until the current line starts with 'Query' or is empty
            while not self.line.startswith('Query'):
                self.read_next()
                if not self.line:
                    break
            # put the line back so the outer loop reads it again
            self.buf.append(self.line)

            if description is not None:
                self.qresult.description = description
            yield self.qresult
Beispiel #6
0
    def _parse_qresult(self):
        """Parse query results (PRIVATE)."""
        for event, elem in self.xml_iter:
            if event == "end" and elem.tag == self.NS + "protein":
                # store the query sequence
                seq = elem.find(self.NS + "sequence")
                query_seq = seq.text

                # store the query id and description
                xref = elem.find(self.NS + "xref")
                query_id = xref.attrib["id"]
                query_desc = xref.attrib["name"]

                # parse each hit
                hit_list = []
                for hit_new in self._parse_hit(
                        elem.find(self.NS + "matches"), query_id, query_seq):
                    # interproscan results contain duplicate hits rather than
                    # a single hit with multiple hsps. In this case the hsps
                    # of a duplicate hit will be appended to the already
                    # existing hit
                    for hit in hit_list:
                        if hit.id == hit_new.id:
                            for hsp in hit_new.hsps:
                                hit.hsps.append(hsp)
                            break
                    else:
                        hit_list.append(hit_new)

                # create qresult and assing attributes
                qresult = QueryResult(hit_list, query_id)
                setattr(qresult, "description", query_desc)
                for key, value in self._meta.items():
                    setattr(qresult, key, value)
                yield qresult
Beispiel #7
0
    def _parse_qresult(self):
        """Parses a HMMER3 query block."""

        self._read_until(lambda line: line.startswith('Query:'))

        while self.line:

            # get query id and length
            regx = re.search(_QRE_ID_LEN, self.line)
            qid = regx.group(1).strip()
            # store qresult attributes
            qresult_attrs = {
                'seq_len': int(regx.group(2)),
                'program': self._meta.get('program'),
                'version': self._meta.get('version'),
                'target': self._meta.get('target'),
            }

            # get description and accession, if they exist
            qdesc = '<unknown description>'  # placeholder
            while not self.line.startswith('Scores for '):
                self.line = read_forward(self.handle)

                if self.line.startswith('Accession:'):
                    acc = self.line.strip().split(' ', 1)[1]
                    qresult_attrs['accession'] = acc.strip()
                elif self.line.startswith('Description:'):
                    qdesc = self.line.strip().split(' ', 1)[1].strip()
                    qresult_attrs['description'] = qdesc

            # parse the query hits
            while self.line and '//' not in self.line:
                hit_list = self._parse_hit(qid, qdesc)
                # read through the statistics summary
                # TODO: parse and store this information?
                if self.line.startswith('Internal pipeline'):
                    while self.line and '//' not in self.line:
                        self.line = read_forward(self.handle)

            # create qresult, set its attributes and yield
            # not initializing hit_list directly to handle empty hits
            # (i.e. need to set its query description manually)
            qresult = QueryResult(id=qid, hits=hit_list)
            for attr, value in qresult_attrs.items():
                setattr(qresult, attr, value)
            yield qresult
            self.line = read_forward(self.handle)

            # Skip line beginning with '# Alignment of', which are output
            # when running phmmer with the '-A' flag.
            if self.line.startswith('# Alignment of'):
                self.line = self.handle.readline()

            # HMMER >= 3.1 outputs '[ok]' at the end of all results file,
            # which means we can break the main loop when we see the line
            if '[ok]' in self.line:
                break
Beispiel #8
0
    def _create_qresult(self, hit_blocks):
        """Turn parsed hit blocks into a one-element list of QueryResult (PRIVATE).

        Every block becomes one HSP wrapping a single HSPFragment; HSPs that
        share a hit ID are gathered under the same Hit, in order of first
        appearance.
        """
        query_id = self.query_id
        hits_by_id = OrderedDict()

        for position, blk in enumerate(hit_blocks):
            hit_id = blk['hit_id']

            fragment = HSPFragment(hit_id, query_id)
            fragment.alphabet = generic_protein
            # start coordinates are stored minus one, end coordinates as given
            # (presumably 1-based inclusive to 0-based half-open; confirm)
            fragment.query_start = blk['query_start'] - 1
            fragment.query_end = blk['query_end']
            fragment.hit_start = blk['hit_start'] - 1
            fragment.hit_end = blk['hit_end']
            fragment.hit = blk['hit_seq']
            fragment.query = blk['query_seq']

            hsp = HSP([fragment])
            hsp.hit_id = hit_id
            hsp.output_index = position
            hsp.query_id = query_id
            hsp.hit_description = blk['description']
            hsp.is_included = True  # every block is flagged as included
            for field in ('evalue', 'score', 'prob'):
                setattr(hsp, field, blk[field])

            # an already known hit just receives the additional HSP
            if hit_id in hits_by_id:
                hits_by_id[hit_id].append(hsp)
                continue

            new_hit = Hit([hsp], hit_id)
            new_hit.description = blk['description']
            new_hit.is_included = True
            for field in ('evalue', 'score'):
                setattr(new_hit, field, blk[field])
            hits_by_id[hit_id] = new_hit

        result = QueryResult(hits_by_id.values(), query_id)
        result.program = _PROGRAM
        result.seq_len = self.seq_len
        return [result]
Beispiel #9
0
    def _create_qresult(self, hit_blocks):
        """Turn parsed hit blocks into a one-element list of QueryResult (PRIVATE).

        Every block becomes one HSP wrapping a single HSPFragment; HSPs that
        share a hit ID are gathered under the same Hit, in order of first
        appearance.
        """
        query_id = self.query_id
        hits_by_id = OrderedDict()

        for position, blk in enumerate(hit_blocks):
            hit_id = blk["hit_id"]

            fragment = HSPFragment(hit_id, query_id)
            fragment.molecule_type = "protein"
            # start coordinates are stored minus one, end coordinates as given
            # (presumably 1-based inclusive to 0-based half-open; confirm)
            fragment.query_start = blk["query_start"] - 1
            fragment.query_end = blk["query_end"]
            fragment.hit_start = blk["hit_start"] - 1
            fragment.hit_end = blk["hit_end"]
            fragment.hit = blk["hit_seq"]
            fragment.query = blk["query_seq"]

            hsp = HSP([fragment])
            hsp.hit_id = hit_id
            hsp.output_index = position
            hsp.query_id = query_id
            hsp.hit_description = blk["description"]
            hsp.is_included = True  # every block is flagged as included
            for field in ("evalue", "score", "prob"):
                setattr(hsp, field, blk[field])

            # an already known hit just receives the additional HSP
            if hit_id in hits_by_id:
                hits_by_id[hit_id].append(hsp)
                continue

            new_hit = Hit([hsp], hit_id)
            new_hit.description = blk["description"]
            new_hit.is_included = True
            for field in ("evalue", "score"):
                setattr(new_hit, field, blk[field])
            hits_by_id[hit_id] = new_hit

        result = QueryResult(hits_by_id.values(), query_id)
        result.program = _PROGRAM
        result.seq_len = self.seq_len
        return [result]
Beispiel #10
0
    def _parse_commented_qresult(self):
        """Iterator returning `QueryResult` objects from a commented file."""
        while True:
            comments = self._parse_comments()
            if comments:
                try:
                    self.fields = comments['fields']
                    # iterator for the query results
                    qres_iter = self._parse_qresult()
                except KeyError:
                    # no fields means the query has no results
                    assert 'fields' not in comments
                    # create an iterator returning one empty qresult
                    # if the query has no results
                    qres_iter = iter([QueryResult('')])

                for qresult in qres_iter:
                    for key, value in comments.items():
                        setattr(qresult, key, value)
                    yield qresult

            else: break
Beispiel #11
0
    def _parse_qresult(self):
        """Generator function that returns QueryResult objects."""
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # initial dummy values
        qres_state = None
        file_state = None
        prev_qid, prev_hid = None, None
        cur, prev = None, None
        hit_list, hsp_list = [], []

        while True:
            # store previous line's parsed values for all lines after the first
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the result row if it's not EOF
            if self.line:
                cur = self._parse_row()
                cur_qid = cur['qname']
                cur_hid = cur['tname']
            else:
                file_state = state_EOF
                # mock values, since we have nothing to parse
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            if prev is not None:
                # create fragment and HSP and set their attributes
                hsp = _create_hsp(prev_hid, prev_qid, prev)
                hsp_list.append(hsp)

                if hit_state == state_HIT_NEW:
                    # create Hit and set its attributes
                    hit = Hit(hsp_list)
                    hit.seq_len = prev['tsize']
                    hit_list.append(hit)
                    hsp_list = []

                # create qresult and yield if we're at a new qresult or at EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(prev_qid)
                    for hit in hit_list:
                        qresult.absorb(hit)
                    qresult.seq_len = prev['qsize']
                    yield qresult
                    # if we're at EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline()
Beispiel #12
0
    def _parse_qresult(self):
        """Yield QueryResult objects, one per query found in the output.

        Each line is classified by its '>>>' markers and handled in place:
        the line before the hit table, the end of a query (or of the input),
        the start of a new query, or the start of a query's hits and hsps.
        Lines matching none of these are skipped.
        """
        # query currently being filled and the rows of its hit table
        qresult = None
        hit_rows = []

        while True:

            if self.line.startswith('The best scores are:'):
                # one line before the hit table: parse the table
                hit_rows = self.__parse_hit_table()

            elif self.line.strip() == '>>>///' or not self.line:
                # the end of a query or the file altogether
                yield _set_qresult_hits(qresult, hit_rows)
                break

            elif not self.line.startswith('>>>') and '>>>' in self.line:
                # the beginning of a new query; a query that is already
                # filled is yielded first
                if qresult is not None:
                    yield _set_qresult_hits(qresult, hit_rows)
                match = re.search(_RE_ID_DESC_SEQLEN, self.line)
                query_id = match.group(1)
                seq_len = match.group(3)
                desc = match.group(2)
                qresult = QueryResult(id=query_id)
                qresult.seq_len = int(seq_len)
                # the target is the second space-separated token of the
                # next line
                self.line = self.handle.readline()
                tokens = [tok for tok in self.line.split(' ') if tok]
                qresult.target = tokens[1].strip()
                if desc is not None:
                    qresult.description = desc
                # copy the preamble values onto the new qresult
                for key, value in self._preamble.items():
                    setattr(qresult, key, value)

            elif self.line.startswith('>>>') and \
                    self.line.strip() != '>>><<<':
                # the beginning of the query info and its hits + hsps
                assert self.line[3:].startswith(qresult.id), self.line
                for hit, strand in self._parse_hit(query_id):
                    # HACK: re-set desc, for hsp hit and query description
                    hit.description = hit.description
                    hit.query_description = qresult.description
                    if hit.id in qresult:
                        # possibly the same hit on a different strand: check
                        # the strand differs, then add the hsps to the
                        # existing hit
                        for hsp in hit.hsps:
                            assert strand != hsp.query_strand
                            qresult[hit.id].append(hsp)
                    else:
                        qresult.append(hit)

            self.line = self.handle.readline()
Beispiel #13
0
    def _parse_qresult(self):
        # state values
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # initial dummies
        qres_state, hit_state = None, None
        file_state = None
        cur_qid, cur_hid = None, None
        prev_qid, prev_hid = None, None
        cur, prev = None, None
        hit_list, hsp_list = [], []
        # if the file has c4 alignments, use that as the alignment mark
        if self.has_c4_alignment:
            self._ALN_MARK = 'C4 Alignment:'

        while True:
            self.read_until(lambda line: line.startswith(self._ALN_MARK))
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the result row if it's not EOF
            if self.line:
                assert self.line.startswith(self._ALN_MARK), self.line
                # create temp dicts for storing parsed values
                header = {'qresult': {}, 'hit': {}, 'hsp': {}}
                # if the file has c4 alignments, try to parse the header
                if self.has_c4_alignment:
                    self.read_until(lambda line:
                                    line.strip().startswith('Query:'))
                    header = self._parse_alignment_header()
                # parse the block contents
                cur = self.parse_alignment_block(header)
                cur_qid = cur['qresult']['id']
                cur_hid = cur['hit']['id']
            elif not self.line or self.line.startswith('-- completed '):
                file_state = state_EOF
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new query
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            if prev is not None:
                hsp = _create_hsp(prev_hid, prev_qid, prev['hsp'])
                hsp_list.append(hsp)

                if hit_state == state_HIT_NEW:
                    hit = Hit(hsp_list)
                    for attr, value in prev['hit'].items():
                        setattr(hit, attr, value)
                    hit_list.append(hit)
                    hsp_list = []

                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(id=prev_qid)
                    for hit in hit_list:
                        # not using append since Exonerate may separate the
                        # same hit if it has different strands
                        qresult.absorb(hit)
                    for attr, value in prev['qresult'].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    if file_state == state_EOF:
                        break
                    hit_list = []

            # only readline() here if we're not parsing C4 alignments
            # C4 alignments readline() is handled by its parse_alignment_block
            # function
            if not self.has_c4_alignment:
                self.line = self.handle.readline()
Beispiel #14
0
class Hmmer2TextParser(object):
    """Iterator for the HMMER 2.0 text output.

    The preamble is parsed as soon as the parser is created; iterating over
    the parser then yields one QueryResult per ``Query`` block read from the
    handle. All reading goes through ``read_next``, which keeps the most
    recently returned line in ``self.line``.
    """

    def __init__(self, handle):
        """Store the handle and parse the preamble into ``self._meta``."""
        self.handle = handle
        # stack of pushed-back lines; read_next() drains it before the handle
        self.buf = []
        self._meta = self.parse_preamble()

    def __iter__(self):
        """Yield query results, tagged with program/target/version."""
        for qresult in self.parse_qresult():
            qresult.program = self._meta.get('program')
            qresult.target = self._meta.get('target')
            qresult.version = self._meta.get('version')
            yield qresult

    def read_next(self, rstrip=True):
        """Return the next line and store it in ``self.line``.

        A pushed-back line, if any, is returned first and unchanged.
        Otherwise a line is read from the handle; when ``rstrip`` is true,
        whitespace-only lines are skipped and trailing whitespace is removed.
        With ``rstrip`` false the raw line is returned, blank or not.
        An empty string signals end of file.
        """
        if self.buf:
            # keep self.line in step with the line handed back, so callers
            # that inspect self.line after push_back() see the same text
            self.line = self.buf.pop()
            return self.line
        self.line = self.handle.readline()
        while self.line and rstrip and not self.line.strip():
            self.line = self.handle.readline()
        if self.line and rstrip:
            self.line = self.line.rstrip()
        return self.line

    def push_back(self, line):
        """Un-read a line that should not be parsed yet."""
        self.buf.append(line)

    def parse_key_value(self):
        """Split ``self.line`` at the first colon into a stripped (key, value)."""
        key, value = self.line.split(':', 1)
        return key.strip(), value.strip()

    def parse_preamble(self):
        """Parse HMMER2 preamble.

        Returns a dict holding ``program`` and ``version`` (when the matching
        lines are seen), ``target`` (the sequence database for hmmsearch, the
        HMM file for hmmpfam) and every other ``key: value`` option line.
        """
        meta = {}
        state = "GENERIC"
        while self.read_next():
            if state == "GENERIC":
                if self.line.startswith('hmm'):
                    # program name is the text before the first dash
                    meta['program'] = self.line.split('-')[0].strip()
                elif self.line.startswith('HMMER is'):
                    # must be tested before the plain 'HMMER' version line
                    continue
                elif self.line.startswith('HMMER'):
                    meta['version'] = self.line.split()[1]
                elif self.line.count('-') == 36:
                    # a line with 36 dashes switches to the option section
                    state = "OPTIONS"
                continue

            assert state == "OPTIONS"
            assert 'program' in meta

            # a line with 32 dashes closes the option section
            if self.line.count('-') == 32:
                break

            key, value = self.parse_key_value()
            if meta['program'] == 'hmmsearch':
                if key == 'Sequence database':
                    meta['target'] = value
                    continue
            elif meta['program'] == 'hmmpfam':
                if key == 'HMM file':
                    meta['target'] = value
                    continue
            meta[key] = value

        return meta

    def parse_qresult(self):
        """Parse a HMMER2 query block.

        Generator yielding one QueryResult per block. Iteration ends at end
        of file or at the first block-leading line that does not start with
        ``Query``.
        """
        while self.read_next():
            if not self.line.startswith('Query'):
                # end the generator with a plain return: raising
                # StopIteration inside a generator is turned into a
                # RuntimeError by PEP 479
                return
            _, id_ = self.parse_key_value()
            self.qresult = QueryResult(id=id_)

            description = None

            while self.read_next() and not self.line.startswith('Scores'):
                if self.line.startswith('Accession'):
                    self.qresult.accession = self.parse_key_value()[1]
                if self.line.startswith('Description'):
                    description = self.parse_key_value()[1]

            hit_placeholders = self.parse_hits()
            if hit_placeholders:
                self.parse_hsps(hit_placeholders)
                self.parse_hsp_alignments()

            # skip ahead to the next query block (or end of file) and leave
            # that line in the buffer for the next loop iteration
            while not self.line.startswith('Query'):
                self.read_next()
                if not self.line:
                    break
            self.buf.append(self.line)

            # the description is applied last, after the hits were added
            if description is not None:
                self.qresult.description = description
            yield self.qresult

    def parse_hits(self):
        """Parse a HMMER2 hit block, beginning with the hit table.

        Returns a list of _HitPlaceholder objects in table order. Reading
        stops at a line starting with ``Parsed``, a line containing
        ``no hits``, or end of file.
        """
        hit_placeholders = []
        while self.read_next():
            if self.line.startswith('Parsed'):
                break
            if self.line.find('no hits') > -1:
                break

            # table header and separator rows
            if self.line.startswith(('Sequence', 'Model', '-------- ')):
                continue

            # row layout: id, free-text description, score, e-value, count;
            # the numeric columns are taken from the right-hand side
            fields = self.line.split()
            id_ = fields.pop(0)
            domain_obs_num = int(fields.pop())
            evalue = float(fields.pop())
            bitscore = float(fields.pop())
            description = ' '.join(fields).strip()

            hit = _HitPlaceholder()
            hit.id_ = id_
            hit.evalue = evalue
            hit.bitscore = bitscore
            hit.description = description
            hit.domain_obs_num = domain_obs_num
            hit_placeholders.append(hit)

        return hit_placeholders

    def parse_hsps(self, hit_placeholders):
        """Parse a HMMER2 hsp block, beginning with the hsp table.

        Builds one HSP per table row, groups them into Hit objects and
        appends the hits to ``self.qresult`` in the order given by
        ``hit_placeholders``.
        """
        # HSPs may occur in different order than the hits
        # so store Hit objects separately first
        unordered_hits = {}
        while self.read_next():
            if self.line.startswith(('Alignments', 'Histogram')) or \
               self.line == '//':
                break
            if self.line.startswith(('Model', 'Sequence', '--------')):
                continue

            id_, domain, seq_f, seq_t, seq_compl, hmm_f, hmm_t, hmm_compl, \
                score, evalue = self.line.split()

            frag = HSPFragment(id_, self.qresult.id)
            frag.alphabet = generic_protein
            # start coordinates are shifted down by one; which side is the
            # query and which the hit depends on the program
            if self._meta['program'] == 'hmmpfam':
                frag.hit_start = int(hmm_f) - 1
                frag.hit_end = int(hmm_t)
                frag.query_start = int(seq_f) - 1
                frag.query_end = int(seq_t)
            elif self._meta['program'] == 'hmmsearch':
                frag.query_start = int(hmm_f) - 1
                frag.query_end = int(hmm_t)
                frag.hit_start = int(seq_f) - 1
                frag.hit_end = int(seq_t)

            hsp = HSP([frag])
            hsp.evalue = float(evalue)
            hsp.bitscore = float(score)
            # the domain column looks like 'index/total'; keep the index
            hsp.domain_index = int(domain.split('/')[0])
            if self._meta['program'] == 'hmmpfam':
                hsp.hit_endtype = hmm_compl
                hsp.query_endtype = seq_compl
            elif self._meta['program'] == 'hmmsearch':
                hsp.query_endtype = hmm_compl
                hsp.hit_endtype = seq_compl

            if id_ not in unordered_hits:
                placeholder = [p for p in hit_placeholders if p.id_ == id_][0]
                hit = placeholder.createHit([hsp])
                unordered_hits[id_] = hit
            else:
                hit = unordered_hits[id_]
                hsp.hit_description = hit.description
                hit.append(hsp)

        # The placeholder list is in the correct order, so use that order for
        # the Hit objects in the qresult
        for p in hit_placeholders:
            self.qresult.append(unordered_hits[p.id_])

    def parse_hsp_alignments(self):
        """Parse a HMMER2 HSP alignment block.

        Does nothing unless ``self.line`` starts with ``Alignments``.
        For every alignment header matching _HSP_ALIGN_LINE, the model
        sequence, consensus line and other sequence are collected and stored
        on the corresponding fragment of ``self.qresult``.
        """
        if not self.line.startswith('Alignments'):
            return

        while self.read_next():
            if self.line == '//' or self.line.startswith('Histogram'):
                break

            match = re.search(_HSP_ALIGN_LINE, self.line)
            if match is None:
                continue

            id_ = match.group(1)
            idx = int(match.group(2))
            num = int(match.group(3))

            hit = self.qresult[id_]
            # skip alignments whose domain total disagrees with the hit table
            if hit.domain_obs_num != num:
                continue

            frag = hit[idx - 1][0]

            hmmseq = ''
            consensus = ''
            otherseq = ''
            structureseq = ''
            pad = 0
            # NOTE(review): the fixed offsets below (16:18, 19, 22) presumably
            # follow the fixed-width layout of HMMER2 alignment rows
            while self.read_next() and self.line.startswith(' '):
                # if there's structure information, parse that
                if self.line[16:18] == 'CS':
                    structureseq += self.line[19:].strip()

                    if not self.read_next():
                        break

                # skip the *-> start marker if it exists
                if self.line[19] == '*':
                    seq = self.line[22:]
                    pad = 3
                else:
                    seq = self.line[19:]
                    pad = 0

                # get rid of the end marker
                if seq.endswith('<-*'):
                    seq = seq[:-3]

                hmmseq += seq
                line_len = len(seq)
                # the consensus row is read unstripped so its spacing is kept
                if not self.read_next(rstrip=False):
                    break
                consensus += self.line[19 + pad:19 + pad + line_len]
                # If there's no consensus sequence, hmmer2 doesn't
                # bother to put spaces here, so add extra padding
                extra_padding = len(hmmseq) - len(consensus)
                consensus += ' ' * extra_padding

                if not self.read_next():
                    break
                otherseq += self.line[19:].split()[0].strip()

            # the line that ended the inner loop belongs to the outer loop
            self.push_back(self.line)

            # add homology sequence to annotation
            frag.aln_annotation['homology'] = consensus

            # if there's structure information, add it to the fragment
            if structureseq:
                frag.aln_annotation['CS'] = structureseq

            if self._meta['program'] == 'hmmpfam':
                frag.hit = hmmseq
                frag.query = otherseq
            else:
                frag.hit = otherseq
                frag.query = hmmseq
Beispiel #15
0
    def _parse_qresult(self):
        """Parse query result (PRIVATE).

        Generator that classifies ``self.line``, acts on it, then reads the
        next line from the handle. A query result is yielded when the next
        query header is met and when the end marker or end of file is reached.
        """
        # meaning of the current line for the query being built
        state_QRES_NEW = 1
        state_QRES_HITTAB = 3
        state_QRES_CONTENT = 5
        state_QRES_END = 7

        hit_rows = []
        qresult = None

        while True:
            current = self.line
            trimmed = current.strip()

            if current.startswith("The best scores are:"):
                # the line right before the hit table
                qres_state = state_QRES_HITTAB
            elif not current or trimmed == ">>>///":
                # end of the last query, or of the file altogether
                qres_state = state_QRES_END
            elif ">>>" in current and not current.startswith(">>>"):
                # header of a new query
                qres_state = state_QRES_NEW
            elif current.startswith(">>>") and trimmed != ">>><<<":
                # start of the query info with its hits and hsps
                qres_state = state_QRES_CONTENT
            else:
                # nothing of interest on this line
                qres_state = None

            if qres_state == state_QRES_END:
                yield _set_qresult_hits(qresult, hit_rows)
                break

            if qres_state == state_QRES_HITTAB:
                hit_rows = self.__parse_hit_table()

            elif qres_state == state_QRES_NEW:
                # flush the query collected so far before starting a new one
                if qresult is not None:
                    yield _set_qresult_hits(qresult, hit_rows)
                header = re.search(_RE_ID_DESC_SEQLEN, self.line)
                query_id, desc, seq_len = header.group(1, 2, 3)
                qresult = QueryResult(id=query_id)
                qresult.seq_len = int(seq_len)
                # the target is the second non-empty field of the next line
                self.line = self.handle.readline()
                fields = [x for x in self.line.split(" ") if x]
                qresult.target = fields[1].strip()
                if desc is not None:
                    qresult.description = desc
                # copy over the values collected from the preamble
                for key, value in self._preamble.items():
                    setattr(qresult, key, value)

            elif qres_state == state_QRES_CONTENT:
                assert self.line[3:].startswith(qresult.id), self.line
                for hit, strand in self._parse_hit(query_id):
                    # HACK: re-set desc, for hsp hit and query description
                    hit.description = hit.description
                    hit.query_description = qresult.description
                    if hit.id in qresult:
                        # same hit seen again: it must be on a different
                        # strand, so merge its hsps into the existing hit
                        for hsp in hit.hsps:
                            assert strand != hsp.query_strand
                            qresult[hit.id].append(hsp)
                    else:
                        qresult.append(hit)

            self.line = self.handle.readline()
Beispiel #16
0
    def _parse_qresult(self):
        """Return QueryResult objects (PRIVATE).

        Generator working one row behind the reader: each row is turned into
        a Hit only once the following row (or the end of input) is known, so
        that a change of query id can trigger the yield.
        """
        # markers used to decide what happens after each row
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3

        def assign(obj, attrs):
            # copy every parsed value onto the object as an attribute
            for name, value in attrs.items():
                setattr(obj, name, value)

        file_state = None
        row, last_row = None, None
        row_qid, last_qid = None, None
        # hits collected for the query currently being assembled
        hits = []

        while True:
            # remember the previously parsed row, if there was one
            if row is not None:
                last_row, last_qid = row, row_qid

            # NOTE: we are not parsing the extra '#' lines appended to the end
            # of hmmer31b1 tabular results since storing them in qresult
            # objects means we can not do a single-pass parsing
            if self.line and not self.line.startswith('#'):
                row = self._parse_row()
                row_qid = row['qresult']['id']
            else:
                file_state = state_EOF
                # nothing to parse, so there is no current query id
                row_qid = None

            qres_state = (
                state_QRES_SAME if last_qid == row_qid else state_QRES_NEW
            )

            if last_row is not None:
                # each row holds exactly one hit, so a fragment, an HSP and
                # a Hit are created for every row
                last_hid = last_row['hit']['id']

                frag = HSPFragment(last_hid, last_qid)
                assign(frag, last_row['frag'])
                hsp = HSP([frag])
                assign(hsp, last_row['hsp'])
                hit = Hit([hsp])
                assign(hit, last_row['hit'])
                hits.append(hit)

                # the query is complete when the id changes or input ends
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(hits, last_qid)
                    assign(qresult, last_row['qresult'])
                    yield qresult
                    if file_state == state_EOF:
                        break
                    hits = []

            self.line = self.handle.readline()
Beispiel #17
0
    def _parse_qresult(self):
        """Yield QueryResult objects assembled from alignment blocks.

        Generator working one block behind the reader: a parsed block is
        turned into an HSP once the next block (or end of input) is known, so
        that changes of hit id and query id can close the current hit and
        query.
        """
        # markers used to decide what happens after each block
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4

        file_state = None
        block, last_block = None, None
        block_qid, block_hid = None, None
        last_qid, last_hid = None, None
        hits, hsps = [], []

        # files with C4 alignments use a different alignment marker
        if self.has_c4_alignment:
            self._ALN_MARK = 'C4 Alignment:'

        while True:
            self.read_until(lambda line: line.startswith(self._ALN_MARK))
            if block is not None:
                last_block = block
                last_qid, last_hid = block_qid, block_hid

            if self.line:
                assert self.line.startswith(self._ALN_MARK), self.line
                # empty containers, replaced by the parsed header when the
                # file carries C4 alignments
                header = {'qresult': {}, 'hit': {}, 'hsp': {}}
                if self.has_c4_alignment:
                    self.read_until(
                        lambda line: line.strip().startswith('Query:'))
                    header = self._parse_alignment_header()
                block = self.parse_alignment_block(header)
                block_qid = block['qresult']['id']
                block_hid = block['hit']['id']
            else:
                # no line left: end of input
                file_state = state_EOF
                block_qid, block_hid = None, None

            qres_state = (
                state_QRES_NEW if last_qid != block_qid else state_QRES_SAME
            )
            # a hit is new when its id changes or when the query changes
            if qres_state == state_QRES_NEW or last_hid != block_hid:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            if last_block is not None:
                hsps.append(_create_hsp(last_hid, last_qid, last_block['hsp']))

                if hit_state == state_HIT_NEW:
                    hit = Hit(hsps)
                    for name, value in last_block['hit'].items():
                        setattr(hit, name, value)
                    hits.append(hit)
                    hsps = []

                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(id=last_qid)
                    for hit in hits:
                        # not using append since Exonerate may separate the
                        # same hit if it has different strands
                        qresult.absorb(hit)
                    for name, value in last_block['qresult'].items():
                        setattr(qresult, name, value)
                    yield qresult
                    if file_state == state_EOF:
                        break
                    hits = []

            # with C4 alignments the next line is read by
            # parse_alignment_block itself, so only read here otherwise
            if not self.has_c4_alignment:
                self.line = self.handle.readline()
Beispiel #18
0
    def _parse_qresult(self):
        """Yield QueryResult objects (PRIVATE).

        Generator working one row behind the reader: a parsed row becomes an
        HSP once the next row (or end of input) is known, so that changes of
        target name and query name can close the current hit and query.
        """
        # markers used to decide what happens after each row
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4

        file_state = None
        row, last_row = None, None
        row_qid, row_hid = None, None
        last_qid, last_hid = None, None
        hits, hsps = [], []

        while True:
            # remember the previously parsed row, if there was one
            if row is not None:
                last_row = row
                last_qid, last_hid = row_qid, row_hid

            if self.line:
                row = self._parse_row()
                row_qid = row["qname"]
                row_hid = row["tname"]
            else:
                # nothing left to parse, so there are no current ids
                file_state = state_EOF
                row_qid, row_hid = None, None

            qres_state = (
                state_QRES_NEW if last_qid != row_qid else state_QRES_SAME
            )
            # a hit is new when its id changes or when the query changes
            if qres_state == state_QRES_NEW or last_hid != row_hid:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            if last_row is not None:
                hsps.append(_create_hsp(last_hid, last_qid, last_row))

                if hit_state == state_HIT_NEW:
                    # close the hit with the hsps gathered so far
                    hit = Hit(hsps)
                    hit.seq_len = last_row["tsize"]
                    hits.append(hit)
                    hsps = []

                # the query is complete when the id changes or input ends
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(id=last_qid)
                    for hit in hits:
                        qresult.absorb(hit)
                    qresult.seq_len = last_row["qsize"]
                    yield qresult
                    if file_state == state_EOF:
                        break
                    hits = []

            self.line = self.handle.readline()
Beispiel #19
0
    def _parse_qresult(self):
        """Parse a HMMER3 query block (PRIVATE).

        Generator yielding one QueryResult per query block, starting at the
        first line that begins with ``Query:``.
        """
        self._read_until(lambda line: line.startswith("Query:"))

        while self.line:

            # move forward until a line carrying the query id and length
            header = re.search(_QRE_ID_LEN, self.line)
            while header is None:
                self.line = read_forward(self.handle)
                header = re.search(_QRE_ID_LEN, self.line)

            qid = header.group(1).strip()
            # attributes applied to the qresult once it has been created
            qresult_attrs = {
                "seq_len": int(header.group(2)),
                "program": self._meta.get("program"),
                "version": self._meta.get("version"),
                "target": self._meta.get("target"),
            }

            # pick up the optional accession and description lines that
            # precede the score table
            qdesc = "<unknown description>"  # placeholder
            while not self.line.startswith("Scores for "):
                self.line = read_forward(self.handle)

                if self.line.startswith("Accession:"):
                    accession = self.line.strip().split(" ", 1)[1]
                    qresult_attrs["accession"] = accession.strip()
                elif self.line.startswith("Description:"):
                    qdesc = self.line.strip().split(" ", 1)[1].strip()
                    qresult_attrs["description"] = qdesc

            # parse the hits until the block terminator is reached
            while self.line and "//" not in self.line:
                hit_list = self._parse_hit(qid, qdesc)
                # the statistics summary is read through, not stored
                # TODO: parse and store this information?
                if self.line.startswith("Internal pipeline"):
                    while self.line and "//" not in self.line:
                        self.line = read_forward(self.handle)

            # attributes are set after creation rather than through the
            # constructor to handle empty hits (i.e. the query description
            # needs to be set manually)
            qresult = QueryResult(id=qid, hits=hit_list)
            for name, value in qresult_attrs.items():
                setattr(qresult, name, value)
            yield qresult
            self.line = read_forward(self.handle)

            # lines beginning with '# Alignment of' are output when running
            # phmmer with the '-A' flag; step over one such line
            if self.line.startswith("#"):
                self.line = self.handle.readline()

            # HMMER >= 3.1 outputs '[ok]' at the end of all results file,
            # so the main loop can stop when that line is seen
            if "[ok]" in self.line:
                break
Beispiel #20
0
    def __iter__(self):
        """Iterate over BlastTextParser, yields query results.

        Each record from ``self.blast_iter`` is converted into a QueryResult
        holding one Hit per alignment and one HSP (with a single fragment)
        per BLAST hsp.
        """
        for rec in self.blast_iter:
            # split the query line into id and description
            if rec.query.startswith(">"):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(" ", 1)
            except ValueError:
                qid, qdesc = rec.query, ""
            qdesc = qdesc.replace("\n", "").replace("\r", "")

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine molecule_type based on program; it is reset for every
            # record so that an unlisted program neither raises
            # UnboundLocalError nor inherits the previous record's value
            molecule_type = None
            if qresult.program == "blastn":
                molecule_type = "DNA"
            elif qresult.program in ["blastp", "blastx", "tblastn", "tblastx"]:
                molecule_type = "protein"

            # frame used when the hsp carries no frame information
            if qresult.program in ("blastp", "tblastn"):
                default_frame = 0
            else:
                default_frame = 1

            # iterate over the 'alignments' (hits) and the hit table
            for idx, aln in enumerate(rec.alignments):
                # split the title into id and description
                if aln.title.startswith("> "):
                    aln.title = aln.title[2:]
                elif aln.title.startswith(">"):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(" ", 1)
                except ValueError:
                    hid, hdesc = aln.title, ""
                hdesc = hdesc.replace("\n", "").replace("\r", "")

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    # leave the fragment untouched when the type is unknown
                    if molecule_type is not None:
                        frag.molecule_type = molecule_type
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        frag.query_frame = default_frame
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        frag.hit_frame = default_frame
                    # set query coordinates (start shifted down by one)
                    frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates (start shifted down by one)
                    frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation; columns
                    # where query or hit is a blank are dropped from all three
                    qchars, hchars, mchars = [], [], []
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == " " or hchar == " ":
                            assert all(" " == x for x in seqtrio)
                        else:
                            qchars.append(qchar)
                            hchars.append(hchar)
                            mchars.append(mchar)
                    frag.query, frag.hit = "".join(qchars), "".join(hchars)
                    frag.aln_annotation["similarity"] = "".join(mchars)

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    # fall back to the alignment span when no positives
                    # count was reported
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            # the description is applied last, after the hits were added
            qresult.description = qdesc
            yield qresult
Beispiel #21
0
    def _parse_qresult(self):
        """Generator function that returns QueryResult objects.

        Works one row behind the reader: each row is turned into a Hit only
        once the following row (or the end of input) is known, so that a
        change of query id can trigger the yield.
        """
        # markers used to decide what happens after each row
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3

        def assign(obj, attrs):
            # copy every parsed value onto the object as an attribute
            for name, value in attrs.items():
                setattr(obj, name, value)

        file_state = None
        last_qid = None
        row, last_row = None, None
        # hits collected for the query currently being assembled
        hits = []

        while True:
            # remember the previously parsed row, if there was one
            if row is not None:
                last_row = row
                last_qid = row_qid

            if self.line:
                row = self._parse_row()
                row_qid = row['qresult']['id']
            else:
                file_state = state_EOF
                # nothing to parse, so there is no current query id
                row_qid = None

            qres_state = (
                state_QRES_SAME if last_qid == row_qid else state_QRES_NEW
            )

            if last_row is not None:
                # each row holds exactly one hit, so a fragment, an HSP and
                # a Hit are created for every row
                last_hid = last_row['hit']['id']

                frag = HSPFragment(last_hid, last_qid)
                assign(frag, last_row['frag'])
                hsp = HSP([frag])
                assign(hsp, last_row['hsp'])
                hit = Hit([hsp])
                assign(hit, last_row['hit'])
                hits.append(hit)

                # the query is complete when the id changes or input ends
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(last_qid, hits=hits)
                    assign(qresult, last_row['qresult'])
                    yield qresult
                    if file_state == state_EOF:
                        break
                    hits = []

            self.line = self.handle.readline()
Beispiel #22
0
    def _parse_qresult(self):
        """Parses query results.

        Consumes ``self.xml_iter`` and builds one QueryResult every time an
        ``Iteration`` element is closed. Query ID, description and length
        are read from the element and fall back to the matching entries of
        ``self._fallback`` when the child element is absent.

        Yields QueryResult objects, one per completed ``<Iteration>``.
        """
        for event, qresult_elem in self.xml_iter:
            # </Iteration> marks the end of a single query, which means we
            # can process it; any other event is skipped
            if event != 'end' or qresult_elem.tag != 'Iteration':
                continue

            # we'll use the following schema
            # <!ELEMENT Iteration (
            #        Iteration_iter-num,
            #        Iteration_query-ID?,
            #        Iteration_query-def?,
            #        Iteration_query-len?,
            #        Iteration_hits?,
            #        Iteration_stat?,
            #        Iteration_message?)>

            # assign query attributes with fallbacks
            query_id = qresult_elem.findtext('Iteration_query-ID')
            if query_id is None:
                query_id = self._fallback['id']

            query_desc = qresult_elem.findtext('Iteration_query-def')
            if query_desc is None:
                query_desc = self._fallback['description']

            query_len = qresult_elem.findtext('Iteration_query-len')
            if query_len is None:
                query_len = self._fallback['len']

            # handle blast searches against databases with Blast's IDs
            # 'Query_' marks the beginning of a BLAST+-generated ID,
            # 'lcl|' marks the beginning of a BLAST legacy-generated ID
            if query_id.startswith('Query_') or query_id.startswith('lcl|'):
                # store the Blast-generated query ID and take the real ID
                # from the first word of the description instead
                blast_query_id = query_id
                id_desc = query_desc.split(' ', 1)
                query_id = id_desc[0]
                try:
                    query_desc = id_desc[1]
                except IndexError:
                    query_desc = ''
            else:
                blast_query_id = ''

            hit_list = []
            # IDs already used in this query; a set keeps the duplicate
            # check constant-time however many hits the query has
            seen_ids = set()
            for hit in self._parse_hit(qresult_elem.find('Iteration_hits'),
                                       query_id):
                if not hit:
                    continue
                # need to keep track of hit IDs, since there could be
                # duplicates
                if hit.id in seen_ids:
                    warnings.warn("Adding hit with BLAST-generated ID "
                                  "%r since hit ID %r is already present "
                                  "in query %r. Your BLAST database may contain "
                                  "duplicate entries." %
                                  (hit._blast_id, hit.id, query_id),
                                  BiopythonParserWarning)
                    # fallback to Blast-generated IDs, if the ID is already
                    # present and restore the desc, too
                    hit.description = '%s %s' % (hit.id, hit.description)
                    hit.id = hit._blast_id
                    # and change the hit_id of the HSPs contained
                    for hsp in hit:
                        hsp.hit_id = hit._blast_id
                else:
                    seen_ids.add(hit.id)

                hit_list.append(hit)

            # create qresult and assign its attributes
            qresult = QueryResult(hit_list, query_id)
            qresult.description = query_desc
            qresult.seq_len = int(query_len)
            qresult._blast_id = blast_query_id
            for key, value in self._meta.items():
                setattr(qresult, key, value)

            # statistics are stored in Iteration_stat's 'grandchildren' with
            # the following DTD
            # <!ELEMENT Statistics (
            #        Statistics_db-num,
            #        Statistics_db-len,
            #        Statistics_hsp-len,
            #        Statistics_eff-space,
            #        Statistics_kappa,
            #        Statistics_lambda,
            #        Statistics_entropy)>

            stat_iter_elem = qresult_elem.find('Iteration_stat')
            stat_elem = None
            if stat_iter_elem is not None:
                stat_elem = stat_iter_elem.find('Statistics')
            # find() returns None when the child is missing; the statistics
            # are optional, so skip them instead of failing on None
            if stat_elem is not None:
                for key, val_info in _ELEM_QRESULT_OPT.items():
                    value = stat_elem.findtext(key)
                    if value is None:
                        continue
                    caster = val_info[1]
                    # recast only if value is not intended to be str
                    if caster is not str:
                        value = caster(value)
                    setattr(qresult, val_info[0], value)

            # delete element after we finish parsing it
            qresult_elem.clear()
            yield qresult
Beispiel #23
0
class Hmmer2TextParser(object):
    """Iterator for the HMMER 2.0 text output.

    The parser reads ``handle`` line by line. The most recently read line
    is always available as ``self.line``; ``self.buf`` holds lines that were
    pushed back and must be returned again by the next ``read_next()`` call.
    """

    def __init__(self, handle):
        """Initialize the class.

        The preamble is parsed right away and stored in ``self._meta``, so
        ``handle`` is already being read from during construction.
        """
        self.handle = handle
        self.buf = []
        self._meta = self.parse_preamble()

    def __iter__(self):
        """Iterate over Hmmer2TextParser, yields query results."""
        for qresult in self.parse_qresult():
            # copy the file-level metadata onto every query result
            qresult.program = self._meta.get('program')
            qresult.target = self._meta.get('target')
            qresult.version = self._meta.get('version')
            yield qresult

    def read_next(self, rstrip=True):
        """Return the next non-empty line, trailing whitespace removed.

        A line that was pushed back is returned first. Otherwise a line is
        read from the handle; with ``rstrip`` true, whitespace-only lines
        are skipped and trailing whitespace is stripped, with ``rstrip``
        false the raw line is returned as read. An empty string signals
        the end of the file. The returned value is also stored in
        ``self.line``.
        """
        if self.buf:
            # keep self.line in sync with the returned value, since all
            # callers inspect self.line after calling read_next()
            self.line = self.buf.pop()
            return self.line
        self.line = self.handle.readline()
        while self.line and rstrip and not self.line.strip():
            self.line = self.handle.readline()
        if self.line and rstrip:
            self.line = self.line.rstrip()
        return self.line

    def push_back(self, line):
        """Un-read a line that should not be parsed yet."""
        self.buf.append(line)

    def parse_key_value(self):
        """Parse key-value pair separated by colon.

        Splits ``self.line`` at the first colon and returns the stripped
        ``(key, value)`` tuple. Raises ValueError if the line has no colon.
        """
        key, value = self.line.split(':', 1)
        return key.strip(), value.strip()

    def parse_preamble(self):
        """Parse HMMER2 preamble.

        Returns a dict with the program name, the version and the option
        lines of the preamble. The searched database (hmmsearch) or HMM
        file (hmmpfam) is stored under the ``'target'`` key; all other
        options are stored under their own name.
        """
        meta = {}
        state = "GENERIC"
        while self.read_next():
            if state == "GENERIC":
                if self.line.startswith('hmm'):
                    meta['program'] = self.line.split('-')[0].strip()
                elif self.line.startswith('HMMER is'):
                    continue
                elif self.line.startswith('HMMER'):
                    meta['version'] = self.line.split()[1]
                elif self.line.count('-') == 36:
                    # a line with 36 dashes starts the options section
                    state = "OPTIONS"
                continue

            assert state == "OPTIONS"
            assert 'program' in meta

            # a line with 32 dashes closes the preamble
            if self.line.count('-') == 32:
                break

            key, value = self.parse_key_value()
            if meta['program'] == 'hmmsearch':
                if key == 'Sequence database':
                    meta['target'] = value
                    continue
            elif meta['program'] == 'hmmpfam':
                if key == 'HMM file':
                    meta['target'] = value
                    continue
            meta[key] = value

        return meta

    def parse_qresult(self):
        """Parse a HMMER2 query block.

        Generator yielding one QueryResult per ``Query`` block. Parsing
        stops at the end of the file or at the first block that does not
        start with ``Query``.
        """
        while self.read_next():
            if not self.line.startswith('Query'):
                return
            _, id_ = self.parse_key_value()
            self.qresult = QueryResult(id=id_)

            description = None

            # header lines of the query, up to the 'Scores' table
            while self.read_next() and not self.line.startswith('Scores'):
                if self.line.startswith('Accession'):
                    self.qresult.accession = self.parse_key_value()[1]
                if self.line.startswith('Description'):
                    description = self.parse_key_value()[1]

            hit_placeholders = self.parse_hits()
            if hit_placeholders:
                self.parse_hsps(hit_placeholders)
                self.parse_hsp_alignments()

            # skip whatever is left of this block, then un-read the header
            # of the next query (or the empty EOF marker) for the outer loop
            while not self.line.startswith('Query'):
                self.read_next()
                if not self.line:
                    break
            self.push_back(self.line)

            # the description is assigned only after the hits were added
            if description is not None:
                self.qresult.description = description
            yield self.qresult

    def parse_hits(self):
        """Parse a HMMER2 hit block, beginning with the hit table.

        Returns a list of _HitPlaceholder objects in table order; the list
        is empty when the table reports no hits.
        """
        hit_placeholders = []
        while self.read_next():
            if self.line.startswith('Parsed'):
                break
            if 'no hits' in self.line:
                break

            # table header and separator lines
            if self.line.startswith(('Sequence', 'Model', '-------- ')):
                continue

            # the ID is the first column, the numeric columns are taken
            # from the right and whatever remains is the description
            fields = self.line.split()
            id_ = fields.pop(0)
            domain_obs_num = int(fields.pop())
            evalue = float(fields.pop())
            bitscore = float(fields.pop())
            description = ' '.join(fields).strip()

            hit = _HitPlaceholder()
            hit.id_ = id_
            hit.evalue = evalue
            hit.bitscore = bitscore
            hit.description = description
            hit.domain_obs_num = domain_obs_num
            hit_placeholders.append(hit)

        return hit_placeholders

    def parse_hsps(self, hit_placeholders):
        """Parse a HMMER2 hsp block, beginning with the hsp table.

        Creates one HSP per table row, groups the HSPs into Hit objects
        built from ``hit_placeholders`` and appends the hits to
        ``self.qresult`` in placeholder order.
        """
        # HSPs may occur in different order than the hits
        # so store Hit objects separately first
        unordered_hits = {}
        while self.read_next():
            if self.line.startswith(('Alignments', 'Histogram')) or \
                    self.line == '//':
                break
            # table header and separator lines
            if self.line.startswith(('Model', 'Sequence', '--------')):
                continue

            id_, domain, seq_f, seq_t, seq_compl, hmm_f, hmm_t, hmm_compl, \
                score, evalue = self.line.split()

            # for hmmpfam the HMM columns describe the hit and the sequence
            # columns the query; for hmmsearch it is the other way around
            frag = HSPFragment(id_, self.qresult.id)
            frag.alphabet = generic_protein
            if self._meta['program'] == 'hmmpfam':
                frag.hit_start = int(hmm_f) - 1
                frag.hit_end = int(hmm_t)
                frag.query_start = int(seq_f) - 1
                frag.query_end = int(seq_t)
            elif self._meta['program'] == 'hmmsearch':
                frag.query_start = int(hmm_f) - 1
                frag.query_end = int(hmm_t)
                frag.hit_start = int(seq_f) - 1
                frag.hit_end = int(seq_t)

            hsp = HSP([frag])
            hsp.evalue = float(evalue)
            hsp.bitscore = float(score)
            # the domain column looks like 'index/total'
            hsp.domain_index = int(domain.split('/')[0])
            if self._meta['program'] == 'hmmpfam':
                hsp.hit_endtype = hmm_compl
                hsp.query_endtype = seq_compl
            elif self._meta['program'] == 'hmmsearch':
                hsp.query_endtype = hmm_compl
                hsp.hit_endtype = seq_compl

            if id_ not in unordered_hits:
                placeholder = [p for p in hit_placeholders if p.id_ == id_][0]
                hit = placeholder.createHit([hsp])
                unordered_hits[id_] = hit
            else:
                hit = unordered_hits[id_]
                hsp.hit_description = hit.description
                hit.append(hsp)

        # The placeholder list is in the correct order, so use that order for
        # the Hit objects in the qresult
        for p in hit_placeholders:
            self.qresult.append(unordered_hits[p.id_])

    def parse_hsp_alignments(self):
        """Parse a HMMER2 HSP alignment block.

        Does nothing unless the current line starts the ``Alignments``
        section. For every alignment header matched by _HSP_ALIGN_LINE the
        aligned sequences and their annotation are stored on the matching
        fragment of ``self.qresult``.
        """
        if not self.line.startswith('Alignments'):
            return

        while self.read_next():
            if self.line == '//' or self.line.startswith('Histogram'):
                break

            match = re.search(_HSP_ALIGN_LINE, self.line)
            if match is None:
                continue

            id_ = match.group(1)
            idx = int(match.group(2))
            num = int(match.group(3))

            hit = self.qresult[id_]
            if hit.domain_obs_num != num:
                continue

            frag = hit[idx - 1][0]

            hmmseq = ''
            consensus = ''
            otherseq = ''
            structureseq = ''
            pad = 0
            # alignment rows are indented; the first unindented line ends
            # the alignment of this domain
            while self.read_next() and self.line.startswith(' '):
                # if there's structure information, parse that
                if self.line[16:18] == 'CS':
                    structureseq += self.line[19:].strip()

                    if not self.read_next():
                        break

                # skip the *-> start marker if it exists
                if self.line[19:22] == '*->':
                    seq = self.line[22:]
                    pad = 3
                else:
                    seq = self.line[19:]
                    pad = 0

                hmmseq += seq
                line_len = len(seq)
                # the consensus line is read unstripped, since its
                # whitespace is part of the annotation
                if not self.read_next(rstrip=False):
                    break
                consensus += self.line[19 + pad:19 + pad + line_len]
                # If there's no consensus sequence, hmmer2 doesn't
                # bother to put spaces here, so add extra padding
                extra_padding = len(hmmseq) - len(consensus)
                consensus += ' ' * extra_padding

                if not self.read_next():
                    break

                # if we have a line break in the end marker, we get a
                # whitespace-only otherseq line, making split()[0] return
                # the end coordinate. That'll be a -, which is a valid character
                # in the sequence, meaning we can't just strip it.
                parts = self.line[19:].split()
                if len(parts) == 2:
                    otherseq += parts[0]

            # the line that ended the alignment still has to be examined by
            # the outer loop
            self.push_back(self.line)

            # get rid of the end marker
            if hmmseq.endswith('<-*'):
                hmmseq = hmmseq[:-3]
                consensus = consensus[:-3]

            # add similarity sequence to annotation
            frag.aln_annotation['similarity'] = consensus

            # if there's structure information, add it to the fragment
            if structureseq:
                frag.aln_annotation['CS'] = structureseq

            if self._meta['program'] == 'hmmpfam':
                frag.hit = hmmseq
                frag.query = otherseq
            else:
                frag.hit = otherseq
                frag.query = hmmseq
Beispiel #24
0
    def _parse_qresult(self):
        """Generator function that returns QueryResult objects.

        Every data line is parsed with ``self._parse_row()``. Objects are
        always built for the *previous* line, once the current line shows
        whether the hit and the query of the previous line are complete.
        An empty line or a line starting with '#' is treated as the end of
        the data. Nothing is yielded if there is no data row at all.
        """
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # dummies for initial states
        qres_state = None
        hit_state = None
        file_state = None
        # dummies for initial id caches
        prev_qid = None
        prev_hid = None
        # dummies for initial parsed value containers
        cur, prev = None, None
        hit_list, hsp_list = [], []
        cur_qid = None
        cur_hid = None
        while True:
            # store previous line's parsed values, for every line after the 1st
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the line if it's not EOF
            if self.line and not self.line.startswith('#'):
                cur = self._parse_row()
                cur_qid = cur['qresult']['id']
                cur_hid = cur['hit']['id']
            else:
                file_state = state_EOF
                # mock ID values since the line is empty
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            # start creating objects after the first line (i.e. prev is filled)
            if prev is not None:
                # each line is basically an HSP with one HSPFragment
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev['frag'].items():
                    setattr(frag, attr, value)
                hsp = HSP([frag])
                for attr, value in prev['hsp'].items():
                    setattr(hsp, attr, value)
                hsp_list.append(hsp)

                # create hit object when we've finished parsing all its hsps
                # i.e. when hit state is state_HIT_NEW
                if hit_state == state_HIT_NEW:
                    hit = Hit(hsp_list)
                    for attr, value in prev['hit'].items():
                        setattr(hit, attr, value)
                    hit_list.append(hit)
                    hsp_list = []

                # create qresult and yield if we're at a new qresult or EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(hit_list, prev_qid)
                    for attr, value in prev['qresult'].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    hit_list = []

            # stop at EOF even if no row was ever parsed; otherwise input
            # without any data row would keep this loop running forever
            if file_state == state_EOF:
                break

            self.line = self.handle.readline()
Beispiel #25
0
    def __iter__(self):
        """Iterate over the BLAST records, yielding QueryResult objects.

        Each record of ``self.blast_iter`` is converted into one
        QueryResult; its alignments become Hit objects and their HSPs
        become HSP objects holding a single HSPFragment each.
        """
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith('>'):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(' ', 1)
            except ValueError:
                qid, qdesc = rec.query, ''
            qdesc = qdesc.replace('\n', '').replace('\r', '')

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine alphabet based on program; decided anew for every
            # record so that an unrecognised program neither fails on an
            # unbound name nor reuses the previous record's alphabet
            if qresult.program == 'blastn':
                alphabet = generic_dna
            elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
                alphabet = generic_protein
            else:
                alphabet = None

            # iterate over the 'alignments' (hits) and the hit table
            for aln in rec.alignments:
                # get id and desc
                if aln.title.startswith('> '):
                    aln.title = aln.title[2:]
                elif aln.title.startswith('>'):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(' ', 1)
                except ValueError:
                    hid, hdesc = aln.title, ''
                hdesc = hdesc.replace('\n', '').replace('\r', '')

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    # NOTE(review): for unrecognised programs the fragment
                    # keeps whatever alphabet HSPFragment starts with --
                    # confirm that default is acceptable
                    if alphabet is not None:
                        frag.alphabet = alphabet
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates: the smaller value minus one is
                    # the start, the larger value is the end
                    frag.query_start = min(bhsp.query_start,
                            bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start,
                            bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation; columns
                    # in which the query or the hit is blank are dropped
                    # from all three strings
                    qchars, hchars, mchars = [], [], []
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == ' ' or hchar == ' ':
                            assert all(' ' == x for x in seqtrio)
                        else:
                            qchars.append(qchar)
                            hchars.append(hchar)
                            mchars.append(mchar)
                    frag.query, frag.hit = ''.join(qchars), ''.join(hchars)
                    frag.aln_annotation['similarity'] = ''.join(mchars)

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    # fall back to the alignment span when the record has
                    # no positives count
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            # the description is assigned only after all hits were added
            qresult.description = qdesc
            yield qresult
Beispiel #26
0
    def _parse_qresult(self):
        """Parse query results (PRIVATE).

        Consumes ``self.xml_iter`` and builds one QueryResult every time an
        ``Iteration`` element is closed. Query ID, description and length
        are read from the element and fall back to the matching entries of
        ``self._fallback`` when the child element is absent.

        Yields QueryResult objects, one per completed ``<Iteration>``.
        """
        for event, qresult_elem in self.xml_iter:
            # </Iteration> marks the end of a single query, which means we
            # can process it; any other event is skipped
            if event != 'end' or qresult_elem.tag != 'Iteration':
                continue

            # we'll use the following schema
            # <!ELEMENT Iteration (
            #        Iteration_iter-num,
            #        Iteration_query-ID?,
            #        Iteration_query-def?,
            #        Iteration_query-len?,
            #        Iteration_hits?,
            #        Iteration_stat?,
            #        Iteration_message?)>

            # assign query attributes with fallbacks
            query_id = qresult_elem.findtext('Iteration_query-ID')
            if query_id is None:
                query_id = self._fallback['id']

            query_desc = qresult_elem.findtext('Iteration_query-def')
            if query_desc is None:
                query_desc = self._fallback['description']

            query_len = qresult_elem.findtext('Iteration_query-len')
            if query_len is None:
                query_len = self._fallback['len']

            # the ID as found in the file is always kept as the BLAST ID
            blast_query_id = query_id
            # handle blast searches against databases with Blast's IDs
            # 'Query_' marks the beginning of a BLAST+-generated ID,
            # 'lcl|' marks the beginning of a BLAST legacy-generated ID
            if not self._use_raw_query_ids and \
                    (query_id.startswith('Query_') or query_id.startswith('lcl|')):
                # take the real ID from the first word of the description
                id_desc = query_desc.split(' ', 1)
                query_id = id_desc[0]
                try:
                    query_desc = id_desc[1]
                except IndexError:
                    query_desc = ''

            hit_list = []
            # IDs already used in this query; a set keeps the duplicate
            # check constant-time however many hits the query has
            seen_ids = set()
            for hit in self._parse_hit(qresult_elem.find('Iteration_hits'),
                                       query_id):
                if not hit:
                    continue
                # need to keep track of hit IDs, since there could be
                # duplicates
                if hit.id in seen_ids:
                    warnings.warn(
                        "Renaming hit ID %r to a BLAST-generated ID "
                        "%r since the ID was already matched "
                        "by your query %r. Your BLAST database "
                        "may contain duplicate entries." %
                        (hit.id, hit.blast_id, query_id),
                        BiopythonParserWarning)
                    # fallback to Blast-generated IDs, if the ID is already
                    # present and restore the desc, too
                    hit.description = '%s %s' % (hit.id,
                                                 hit.description)
                    hit.id = hit.blast_id
                    # and change the hit_id of the HSPs contained
                    for hsp in hit:
                        hsp.hit_id = hit.blast_id
                else:
                    seen_ids.add(hit.id)

                hit_list.append(hit)

            # create qresult and assign its attributes
            qresult = QueryResult(hit_list, query_id)
            qresult.description = query_desc
            qresult.seq_len = int(query_len)
            qresult.blast_id = blast_query_id
            for key, value in self._meta.items():
                setattr(qresult, key, value)

            # statistics are stored in Iteration_stat's 'grandchildren' with
            # the following DTD
            # <!ELEMENT Statistics (
            #        Statistics_db-num,
            #        Statistics_db-len,
            #        Statistics_hsp-len,
            #        Statistics_eff-space,
            #        Statistics_kappa,
            #        Statistics_lambda,
            #        Statistics_entropy)>

            stat_iter_elem = qresult_elem.find('Iteration_stat')
            stat_elem = None
            if stat_iter_elem is not None:
                stat_elem = stat_iter_elem.find('Statistics')
            # find() returns None when the child is missing; the statistics
            # are optional, so skip them instead of failing on None
            if stat_elem is not None:
                for key, val_info in _ELEM_QRESULT_OPT.items():
                    value = stat_elem.findtext(key)
                    if value is None:
                        continue
                    caster = val_info[1]
                    # recast only if value is not intended to be str
                    if caster is not str:
                        value = caster(value)
                    setattr(qresult, val_info[0], value)

            # delete element after we finish parsing it
            qresult_elem.clear()
            yield qresult
Beispiel #27
0
    def __iter__(self):
        """Iterate over the BLAST records, yielding QueryResult objects.

        Each record of ``self.blast_iter`` is converted into one
        QueryResult; its alignments become Hit objects and their HSPs
        become HSP objects holding a single HSPFragment each.
        """
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith('>'):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(' ', 1)
            except ValueError:
                qid, qdesc = rec.query, ''
            qdesc = qdesc.replace('\n', '').replace('\r', '')

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine alphabet based on program; decided anew for every
            # record so that an unrecognised program neither fails on an
            # unbound name nor reuses the previous record's alphabet
            if qresult.program == 'blastn':
                alphabet = generic_dna
            elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
                alphabet = generic_protein
            else:
                alphabet = None

            # iterate over the 'alignments' (hits) and the hit table
            for aln in rec.alignments:
                # get id and desc
                if aln.title.startswith('> '):
                    aln.title = aln.title[2:]
                elif aln.title.startswith('>'):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(' ', 1)
                except ValueError:
                    hid, hdesc = aln.title, ''
                hdesc = hdesc.replace('\n', '').replace('\r', '')

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    # NOTE(review): for unrecognised programs the fragment
                    # keeps whatever alphabet HSPFragment starts with --
                    # confirm that default is acceptable
                    if alphabet is not None:
                        frag.alphabet = alphabet
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates: the smaller value minus one is
                    # the start, the larger value is the end
                    frag.query_start = min(bhsp.query_start,
                                           bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation; columns
                    # in which the query or the hit is blank are dropped
                    # from all three strings
                    qchars, hchars, mchars = [], [], []
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == ' ' or hchar == ' ':
                            assert all(' ' == x for x in seqtrio)
                        else:
                            qchars.append(qchar)
                            hchars.append(hchar)
                            mchars.append(mchar)
                    frag.query, frag.hit = ''.join(qchars), ''.join(hchars)
                    frag.aln_annotation['similarity'] = ''.join(mchars)

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    # fall back to the alignment span when the record has
                    # no positives count
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            # the description is assigned only after all hits were added
            qresult.description = qdesc
            yield qresult
Beispiel #28
0
    def _parse_qresult(self):
        """Generator function that returns QueryResult objects.

        Rows are consumed one at a time, starting from the line currently
        held in ``self.line`` and advancing with ``self.handle.readline()``.
        Every row becomes one HSP holding a single HSPFragment. Consecutive
        rows sharing a hit ID are grouped into one Hit, and consecutive rows
        sharing a query ID are grouped into one QueryResult.

        Parsing stops at the first empty line (EOF) or line starting with
        ``#``. If that happens before any row has been parsed, the generator
        finishes without yielding anything.

        Objects are always built for the *previous* row, because only after
        looking at the current row do we know whether the previous one closed
        its hit and / or its query result.
        """
        # state values, used to determine what to do with each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # dummies for initial states
        qres_state = None
        hit_state = None
        file_state = None
        cur_qid = None
        cur_hid = None
        # dummies for initial id caches
        prev_qid = None
        prev_hid = None
        # dummies for initial parsed value containers
        cur, prev = None, None
        hit_list, hsp_list = [], []

        while True:
            # store previous line's parsed values if we've past the first line
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the line if it's not EOF or not a comment line
            if self.line and not self.line.startswith('#'):
                cur = self._parse_result_row()
                cur_qid = self._get_id(cur['qresult'])
                cur_hid = self._get_id(cur['hit'])
            else:
                file_state = state_EOF
                # mock values for cur_qid and cur_hid since the line is empty
                cur_qid, cur_hid = None, None
                # No row has been parsed at all, so there is nothing to
                # flush. Without this guard the loop below never reaches its
                # ``break`` (it lives inside the ``prev is not None`` block)
                # and would spin forever on an exhausted handle.
                if cur is None:
                    return

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different id or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            # we're creating objects for the previously parsed line(s),
            # so nothing is done in the first parsed line (prev == None)
            if prev is not None:
                # every line is essentially an HSP with one fragment, so we
                # create both of these for every line
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev['frag'].items():
                    # adjust coordinates to Python range: start becomes
                    # min(start, end) - 1 and end becomes max(start, end)
                    # NOTE: this requires both start and end coords to be
                    # present, otherwise a KeyError will be raised.
                    # Without this limitation, we might misleadingly set the
                    # start / end coords
                    for seq_type in ('query', 'hit'):
                        if attr == seq_type + '_start':
                            value = min(value,
                                        prev['frag'][seq_type + '_end']) - 1
                        elif attr == seq_type + '_end':
                            value = max(value,
                                        prev['frag'][seq_type + '_start'])
                    setattr(frag, attr, value)
                # strand and frame setattr require the full parsed values
                # to be set first
                for seq_type in ('hit', 'query'):
                    # try to set hit and query frame
                    frame = self._get_frag_frame(frag, seq_type, prev['frag'])
                    setattr(frag, '%s_frame' % seq_type, frame)
                    # try to set hit and query strand
                    strand = self._get_frag_strand(frag, seq_type,
                                                   prev['frag'])
                    setattr(frag, '%s_strand' % seq_type, strand)

                hsp = HSP([frag])
                for attr, value in prev['hsp'].items():
                    setattr(hsp, attr, value)
                hsp_list.append(hsp)

                # create hit and append to temp hit container if hit_state
                # says we're not at the same hit or at a new query
                if hit_state == state_HIT_NEW:
                    hit = Hit(hsp_list)
                    for attr, value in prev['hit'].items():
                        if attr != 'id_all':
                            setattr(hit, attr, value)
                        else:
                            # not setting hit ID since it's already set from the
                            # prev_hid above; only the remaining IDs are kept
                            setattr(hit, '_id_alt', value[1:])
                    hit_list.append(hit)
                    # start collecting HSPs for the next hit
                    hsp_list = []
                # create qresult and yield if we're at a new qresult or EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(hit_list, prev_qid)
                    for attr, value in prev['qresult'].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    # if current line is EOF, break
                    if file_state == state_EOF:
                        break
                    # start collecting hits for the next query result
                    hit_list = []

            self.line = self.handle.readline().strip()
    def test_store_bio_searchio_blast_record(self):
        """Run Tests - __init__ and store_searchio_blast_record.

        The test builds the database rows the loader needs (ontology terms,
        organisms, polypeptide / protein_match / mRNA features and their
        relationships), hand-assembles two QueryResult objects, stores them
        through SimilarityLoader and checks the resulting Analysis and
        Analysisfeature rows. It finishes by removing the analysis with the
        ``remove_analysis`` management command.
        """
        # placeholder "null" db / cv / dbxref / cvterm / pub rows
        null_db, created = Db.objects.get_or_create(name="null")
        null_cv, created = Cv.objects.get_or_create(name="null")
        null_dbxref, created = Dbxref.objects.get_or_create(accession="null",
                                                            db=null_db)
        null_cvterm, created = Cvterm.objects.get_or_create(
            name="null",
            cv=null_cv,
            dbxref=null_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        null_pub, created = Pub.objects.get_or_create(uniquename="null",
                                                      type=null_cvterm,
                                                      is_obsolete=False)

        # the two organisms later passed to the loader as org_query
        # ("Mus musculus") and org_subject ("multispecies multispecies")
        test_organism = Organism.objects.create(genus="Mus",
                                                species="musculus")
        test_organism2, created = Organism.objects.get_or_create(
            abbreviation="multispecies",
            genus="multispecies",
            species="multispecies",
            common_name="multispecies",
        )
        # creating test SO term
        test_db = Db.objects.create(name="SO")
        test_cv = Cv.objects.create(name="sequence")
        test_db2 = Db.objects.create(name="RO")
        test_cv2 = Cv.objects.create(name="relationship")
        test_dbxref = Dbxref.objects.create(accession="123456", db=test_db)
        test_dbxref2 = Dbxref.objects.create(accession="7890", db=test_db)
        # feature types used below: polypeptide and protein_match
        test_aa_term = Cvterm.objects.create(
            name="polypeptide",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        test_aa_term2 = Cvterm.objects.create(
            name="protein_match",
            cv=test_cv,
            dbxref=test_dbxref2,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        # terms that are created but not referenced again in this test;
        # presumably looked up by name inside the loader - TODO confirm
        test_dbxref3 = Dbxref.objects.create(accession="1234567", db=test_db)
        Cvterm.objects.create(
            name="match_part",
            cv=test_cv,
            dbxref=test_dbxref3,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        test_dbxref4 = Dbxref.objects.create(accession="12345678", db=test_db2)
        Cvterm.objects.create(
            name="contained in",
            cv=test_cv2,
            dbxref=test_dbxref4,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        test_dbxref5 = Dbxref.objects.create(accession="12345679", db=test_db2)
        Cvterm.objects.create(
            name="in similarity relationship with",
            cv=test_cv2,
            dbxref=test_dbxref5,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        # relationship type linking each mRNA feature to its protein feature
        test_dbxref6 = Dbxref.objects.create(accession="22345679", db=test_db2)
        cvterm_translation = Cvterm.objects.create(
            name="translation_of",
            cv=test_cv,
            dbxref=test_dbxref6,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        test_dbxref7 = Dbxref.objects.create(accession="223456", db=test_db)
        test_mrna_term = Cvterm.objects.create(
            name="mRNA",
            cv=test_cv,
            dbxref=test_dbxref7,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        # PFAM term that is attached to feat3 further down
        test_db_pfam = Db.objects.create(name="PFAM")
        test_cv_pfam = Cv.objects.create(name="PFAM")
        test_dbxref_pfam_term = Dbxref.objects.create(accession="123",
                                                      db=test_db_pfam)
        test_cvterm_pfam_term = Cvterm.objects.create(
            name="kinase",
            cv=test_cv_pfam,
            dbxref=test_dbxref_pfam_term,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        # creating test features
        # one dbxref per feature: feat1..feat5 and their mRNAs feat1m..feat5m
        feature_db = Db.objects.create(name="FASTA_SOURCE")
        feature_dbxref1 = Dbxref.objects.create(db=feature_db,
                                                accession="feat1")
        feature_dbxref2 = Dbxref.objects.create(db=feature_db,
                                                accession="feat2")
        feature_dbxref3 = Dbxref.objects.create(db=feature_db,
                                                accession="feat3")
        feature_dbxref4 = Dbxref.objects.create(db=feature_db,
                                                accession="feat4")
        feature_dbxref5 = Dbxref.objects.create(db=feature_db,
                                                accession="feat5")
        feature_dbxref1m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat1m")
        feature_dbxref2m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat2m")
        feature_dbxref3m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat3m")
        feature_dbxref4m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat4m")
        feature_dbxref5m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat5m")
        # feat1 and feat4: polypeptides of test_organism (Mus musculus);
        # feat2, feat3 and feat5: protein_match features of the multispecies
        # organism
        f1 = Feature.objects.create(
            organism=test_organism,
            uniquename="feat1",
            is_analysis=False,
            type_id=test_aa_term.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref1,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f2 = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat2",
            is_analysis=False,
            type_id=test_aa_term2.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref2,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f3 = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat3",
            is_analysis=False,
            type_id=test_aa_term2.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref3,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f4 = Feature.objects.create(
            organism=test_organism,
            uniquename="feat4",
            is_analysis=False,
            type_id=test_aa_term.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref4,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f5 = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat5",
            is_analysis=False,
            type_id=test_aa_term2.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref5,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        # matching mRNA features, one per protein feature above
        f1m = Feature.objects.create(
            organism=test_organism,
            uniquename="feat1m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref1m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f2m = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat2m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref2m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f3m = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat3m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref3m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f4m = Feature.objects.create(
            organism=test_organism,
            uniquename="feat4m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref4m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f5m = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat5m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref5m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        # relate every mRNA (subject) to its protein feature (object) with
        # the translation_of term
        FeatureRelationship.objects.create(subject=f1m,
                                           object=f1,
                                           type=cvterm_translation,
                                           rank=0)
        FeatureRelationship.objects.create(subject=f2m,
                                           object=f2,
                                           type=cvterm_translation,
                                           rank=0)
        FeatureRelationship.objects.create(subject=f3m,
                                           object=f3,
                                           type=cvterm_translation,
                                           rank=0)
        FeatureRelationship.objects.create(subject=f4m,
                                           object=f4,
                                           type=cvterm_translation,
                                           rank=0)
        FeatureRelationship.objects.create(subject=f5m,
                                           object=f5,
                                           type=cvterm_translation,
                                           rank=0)
        # annotate feat3 with the PFAM "kinase" term
        FeatureCvterm.objects.create(feature=f3,
                                     cvterm=test_cvterm_pfam_term,
                                     pub=null_pub,
                                     is_not=False,
                                     rank=0)

        # first query result (test_result1): query feat1 with two hits.
        # Hit 1 is labelled query_id feat1 / hit_id feat2 on its HSP.
        test_HSPFragment1 = HSPFragment("feat1", "feat2")
        setattr(test_HSPFragment1, "query_start", 110)
        setattr(test_HSPFragment1, "query_end", 1100)
        setattr(test_HSPFragment1, "aln_span", 990)
        setattr(test_HSPFragment1, "hit_start", 100)
        setattr(test_HSPFragment1, "hit_end", 1000)

        test_HSP1 = HSP([test_HSPFragment1])
        setattr(test_HSP1, "query_id", "feat1")
        setattr(test_HSP1, "hit_id", "feat2")
        setattr(test_HSP1, "bitscore", 1234.0)
        setattr(test_HSP1, "bitscore_raw", 1234)
        setattr(test_HSP1, "evalue", 0.0)
        setattr(test_HSP1, "ident_num", 82)

        test_HIT1 = Hit([test_HSP1])
        setattr(test_HIT1, "accession", "5050")
        setattr(test_HIT1, "seq_len", 2000)

        # Hit 2 is labelled query_id feat1 / hit_id feat3 on its HSP
        test_HSPFragment2 = HSPFragment("feat1", "feat3")
        setattr(test_HSPFragment2, "query_start", 210)
        setattr(test_HSPFragment2, "query_end", 2100)
        setattr(test_HSPFragment2, "aln_span", 1890)
        setattr(test_HSPFragment2, "hit_start", 200)
        setattr(test_HSPFragment2, "hit_end", 2000)

        test_HSP2 = HSP([test_HSPFragment2])
        setattr(test_HSP2, "query_id", "feat1")
        setattr(test_HSP2, "hit_id", "feat3")
        setattr(test_HSP2, "bitscore", 234.0)
        setattr(test_HSP2, "bitscore_raw", 234)
        setattr(test_HSP2, "evalue", 0.0)
        setattr(test_HSP2, "ident_num", 72)

        test_HIT2 = Hit([test_HSP2])
        setattr(test_HIT2, "accession", "500")
        setattr(test_HIT2, "seq_len", 4000)

        test_result1 = QueryResult([test_HIT1, test_HIT2], "feat1")
        setattr(test_result1, "seq_len", 3000)
        setattr(test_result1, "blast_id", "feat1")

        # test retrieve_query_from_hsp and retrieve_subject_from_hsp
        # test hsp with no bitscore, bitscore_raw, evalue, and ident_num
        # NOTE(review): the IDs here ("feat4_desc" / "feat5_desc") match no
        # feature created above; the real names only appear as "id=..." in
        # the descriptions - presumably the loader falls back to those,
        # confirm against the loader
        test_HSPFragment3 = HSPFragment("feat4_desc", "feat5_desc")
        setattr(test_HSPFragment3, "query_start", 210)
        setattr(test_HSPFragment3, "query_end", 2100)
        setattr(test_HSPFragment3, "aln_span", 1890)
        setattr(test_HSPFragment3, "hit_start", 200)
        setattr(test_HSPFragment3, "hit_end", 2000)

        test_HSP3 = HSP([test_HSPFragment3])
        setattr(test_HSP3, "query_id", "feat4_desc")
        setattr(test_HSP3, "query_description", "test id=feat4")
        setattr(test_HSP3, "hit_id", "feat5_desc")
        setattr(test_HSP3, "hit_description", "test id=feat5")

        test_HIT3 = Hit([test_HSP3])
        setattr(test_HIT3, "seq_len", 4000)

        test_result2 = QueryResult([test_HIT3], "feat4_desc")
        setattr(test_result2, "seq_len", 3000)
        setattr(test_result2, "blast_id", "feat4_desc")

        # test SimilarityLoader fail
        # NOTE(review): org_query below reads "H**o sapiens", which looks
        # like a mangled "Homo sapiens". No Organism with either name is
        # created in this test, which is presumably what triggers the
        # ImportingError - confirm and restore the intended literal
        with self.assertRaises(ImportingError):
            SimilarityLoader(
                filename="similarity.file",
                algorithm="smith-waterman",
                description="command-line example",
                program="blastp",
                input_format="blast-xml",
                programversion="2.2.31+",
                so_query="polypeptide",
                so_subject="protein_match",
                org_query="H**o sapiens",
                org_subject="multispecies multispecies",
            )

        # loader built with organisms and SO terms that were created above
        test_blast_file = SimilarityLoader(
            filename="similarity.file",
            algorithm="smith-waterman",
            description="command-line example",
            program="interproscan",
            input_format="interproscan-xml",
            programversion="5",
            so_query="polypeptide",
            so_subject="protein_match",
            org_query="Mus musculus",
            org_subject="multispecies multispecies",
        )

        # store both hand-built query results
        test_blast_file.store_bio_searchio_query_result(test_result1)
        test_blast_file.store_bio_searchio_query_result(test_result2)

        # the analysis row is found by the loader's filename and carries the
        # program name given to the loader
        test_analysis = Analysis.objects.get(sourcename="similarity.file")
        self.assertEqual("interproscan", test_analysis.program)

        # exactly one featureloc must point at feat3 (.get() raises
        # otherwise)
        test_featureloc = Featureloc.objects.get(srcfeature=f3)

        # 234 is the score set on test_HSP2, the HSP labelled with hit feat3
        test_analysisfeature = Analysisfeature.objects.get(
            analysis=test_analysis, feature_id=test_featureloc.feature_id)
        self.assertEqual(234.0, test_analysisfeature.rawscore)
        # test the remove_analysis management command: the analysis exists
        # before the call and is gone afterwards
        self.assertTrue(
            Analysis.objects.filter(sourcename="similarity.file").exists())
        call_command("remove_analysis", "--name=similarity.file",
                     "--verbosity=0")
        self.assertFalse(
            Analysis.objects.filter(sourcename="similarity.file").exists())