def parse_hsps(self, hit_placeholders):
        """Parse a HMMER2 hsp block, beginning with the hsp table."""
        # HSPs may occur in different order than the hits
        # so store Hit objects separately first
        unordered_hits = {}
        while self.read_next():
            if (
                self.line.startswith("Alignments")
                or self.line.startswith("Histogram")
                or self.line == "//"
            ):
                break
            if (
                self.line.startswith("Model")
                or self.line.startswith("Sequence")
                or self.line.startswith("--------")
            ):
                continue

            id_, domain, seq_f, seq_t, seq_compl, hmm_f, hmm_t, hmm_compl, score, evalue = (
                self.line.split()
            )

            frag = HSPFragment(id_, self.qresult.id)
            frag.alphabet = generic_protein
            if self._meta["program"] == "hmmpfam":
                frag.hit_start = int(hmm_f) - 1
                frag.hit_end = int(hmm_t)
                frag.query_start = int(seq_f) - 1
                frag.query_end = int(seq_t)
            elif self._meta["program"] == "hmmsearch":
                frag.query_start = int(hmm_f) - 1
                frag.query_end = int(hmm_t)
                frag.hit_start = int(seq_f) - 1
                frag.hit_end = int(seq_t)

            hsp = HSP([frag])
            hsp.evalue = float(evalue)
            hsp.bitscore = float(score)
            hsp.domain_index = int(domain.split("/")[0])
            if self._meta["program"] == "hmmpfam":
                hsp.hit_endtype = hmm_compl
                hsp.query_endtype = seq_compl
            elif self._meta["program"] == "hmmsearch":
                hsp.query_endtype = hmm_compl
                hsp.hit_endtype = seq_compl

            if id_ not in unordered_hits:
                placeholder = [p for p in hit_placeholders if p.id_ == id_][0]
                hit = placeholder.createHit([hsp])
                unordered_hits[id_] = hit
            else:
                hit = unordered_hits[id_]
                hsp.hit_description = hit.description
                hit.append(hsp)

        # The placeholder list is in the correct order, so use that order for
        # the Hit objects in the qresult
        for p in hit_placeholders:
            self.qresult.append(unordered_hits[p.id_])
Example #2
0
    def parse_hsps(self, hit_placeholders):
        """Parse a HMMER2 hsp block, beginning with the hsp table."""
        # HSPs may occur in different order than the hits
        # so store Hit objects separately first
        unordered_hits = {}
        while self.read_next():
            if self.line.startswith('Alignments') or \
               self.line.startswith('Histogram') or \
               self.line == '//':
                break
            if self.line.startswith('Model') or \
               self.line.startswith('Sequence') or \
               self.line.startswith('--------'):
                continue

            id_, domain, seq_f, seq_t, seq_compl, hmm_f, hmm_t, hmm_compl, \
            score, evalue = self.line.split()

            frag = HSPFragment(id_, self.qresult.id)
            frag.alphabet = generic_protein
            if self._meta['program'] == 'hmmpfam':
                frag.hit_start = int(hmm_f) - 1
                frag.hit_end = int(hmm_t)
                frag.query_start = int(seq_f) - 1
                frag.query_end = int(seq_t)
            elif self._meta['program'] == 'hmmsearch':
                frag.query_start = int(hmm_f) - 1
                frag.query_end = int(hmm_t)
                frag.hit_start = int(seq_f) - 1
                frag.hit_end = int(seq_t)

            hsp = HSP([frag])
            hsp.evalue = float(evalue)
            hsp.bitscore = float(score)
            hsp.domain_index = int(domain.split('/')[0])
            if self._meta['program'] == 'hmmpfam':
                hsp.hit_endtype = hmm_compl
                hsp.query_endtype = seq_compl
            elif self._meta['program'] == 'hmmsearch':
                hsp.query_endtype = hmm_compl
                hsp.hit_endtype = seq_compl

            if id_ not in unordered_hits:
                placeholder = [ p for p in hit_placeholders if p.id_ == id_][0]
                hit = placeholder.createHit([hsp])
                unordered_hits[id_] = hit
            else:
                hit = unordered_hits[id_]
                hsp.hit_description = hit.description
                hit.append(hsp)

        # The placeholder list is in the correct order, so use that order for
        # the Hit objects in the qresult
        for p in hit_placeholders:
            self.qresult.append(unordered_hits[p.id_])
Example #3
0
    def _create_qresult(self, hit_blocks):
        """Create the Biopython data structures from the parsed data (PRIVATE)."""
        query_id = self.query_id
        hit_dict = OrderedDict()

        for output_index, block in enumerate(hit_blocks):
            hit_id = block['hit_id']

            frag = HSPFragment(hit_id, query_id)
            # frag.alphabet = generic_protein
            if block['query_start']:
                frag.query_start = block['query_start'] - 1
            else:
                frag.query_start = block['query_start']
            frag.query_end = block['query_end']
            if block['hit_start']:
                frag.hit_start = block['hit_start'] - 1
            else:
                frag.hit_start = block['hit_start']
            frag.hit_end = block['hit_end']
            frag.hit = block['hit_seq']
            frag.query = block['query_seq']

            hsp = HSP([frag])
            hsp.hit_id = hit_id
            hsp.output_index = output_index
            hsp.query_id = query_id
            hsp.hit_description = block['description']
            is_included = True  # Should everything should be included?
            hsp.is_included = is_included
            hsp.evalue = block['evalue']
            hsp.score = block['score']
            hsp.prob = block['prob']
            hsp.hit_seq_len = block['hit_seq_len']
            hsp.text = block['text']

            if hit_id not in hit_dict:
                hit = Hit([hsp], hit_id)
                hit.description = block['description']
                hit.is_included = is_included
                hit.evalue = block['evalue']
                hit.score = block['score']
                hit_dict[hit_id] = hit
            else:
                hit_dict[hit_id].append(hsp)

        qresult = QueryResult(hit_dict.values(), query_id)
        qresult.program = _PROGRAM
        qresult.seq_len = self.seq_len
        return [qresult]
Example #4
0
    def _create_qresult(self, hit_blocks):
        """Create the Biopython data structures from the parsed data (PRIVATE)."""
        query_id = self.query_id
        hit_dict = OrderedDict()

        for output_index, block in enumerate(hit_blocks):
            hit_id = block["hit_id"]

            frag = HSPFragment(hit_id, query_id)
            frag.molecule_type = "protein"
            frag.query_start = block["query_start"] - 1
            frag.query_end = block["query_end"]
            frag.hit_start = block["hit_start"] - 1
            frag.hit_end = block["hit_end"]
            frag.hit = block["hit_seq"]
            frag.query = block["query_seq"]

            hsp = HSP([frag])
            hsp.hit_id = hit_id
            hsp.output_index = output_index
            hsp.query_id = query_id
            hsp.hit_description = block["description"]
            is_included = True  # Should everything should be included?
            hsp.is_included = is_included
            hsp.evalue = block["evalue"]
            hsp.score = block["score"]
            hsp.prob = block["prob"]

            if hit_id not in hit_dict:
                hit = Hit([hsp], hit_id)
                hit.description = block["description"]
                hit.is_included = is_included
                hit.evalue = block["evalue"]
                hit.score = block["score"]
                hit_dict[hit_id] = hit
            else:
                hit_dict[hit_id].append(hsp)

        qresult = QueryResult(hit_dict.values(), query_id)
        qresult.program = _PROGRAM
        qresult.seq_len = self.seq_len
        return [qresult]
Example #5
0
    def _create_qresult(self, hit_blocks):
        """Create the Biopython data structures from the parsed data (PRIVATE)."""
        query_id = self.query_id
        hit_dict = OrderedDict()

        for output_index, block in enumerate(hit_blocks):
            hit_id = block['hit_id']

            frag = HSPFragment(hit_id, query_id)
            frag.alphabet = generic_protein
            frag.query_start = block['query_start'] - 1
            frag.query_end = block['query_end']
            frag.hit_start = block['hit_start'] - 1
            frag.hit_end = block['hit_end']
            frag.hit = block['hit_seq']
            frag.query = block['query_seq']

            hsp = HSP([frag])
            hsp.hit_id = hit_id
            hsp.output_index = output_index
            hsp.query_id = query_id
            hsp.hit_description = block['description']
            is_included = True  # Should everything should be included?
            hsp.is_included = is_included
            hsp.evalue = block['evalue']
            hsp.score = block['score']
            hsp.prob = block['prob']

            if hit_id not in hit_dict:
                hit = Hit([hsp], hit_id)
                hit.description = block['description']
                hit.is_included = is_included
                hit.evalue = block['evalue']
                hit.score = block['score']
                hit_dict[hit_id] = hit
            else:
                hit_dict[hit_id].append(hsp)

        qresult = QueryResult(hit_dict.values(), query_id)
        qresult.program = _PROGRAM
        qresult.seq_len = self.seq_len
        return [qresult]
Example #6
0
    def _create_hits(self, hit_attrs, qid, qdesc):
        """Parses a HMMER3 hsp block, beginning with the hsp table."""
        # read through until the beginning of the hsp block
        self._read_until(lambda line: line.startswith('Internal pipeline')
                or line.startswith('>>'))

        # start parsing the hsp block
        hit_list = []
        while True:
            if self.line.startswith('Internal pipeline'):
                # by this time we should've emptied the hit attr list
                assert len(hit_attrs) == 0
                return hit_list
            assert self.line.startswith('>>')
            hid, hdesc = self.line[len('>> '):].split('  ', 1)

            # read through the hsp table header and move one more line
            self._read_until(lambda line:
                    line.startswith(' ---   ------ ----- --------') or
                    line.startswith('   [No individual domains'))
            self.line = read_forward(self.handle)

            # parse the hsp table for the current hit
            hsp_list = []
            while True:
                # break out of hsp parsing if there are no hits, it's the last hsp
                # or it's the start of a new hit
                if self.line.startswith('   [No targets detected that satisfy') or \
                   self.line.startswith('   [No individual domains') or \
                   self.line.startswith('Internal pipeline statistics summary:') or \
                   self.line.startswith('  Alignments for each domain:') or \
                   self.line.startswith('>>'):

                    hit_attr = hit_attrs.pop(0)
                    hit = Hit(hsp_list)
                    for attr, value in hit_attr.items():
                        setattr(hit, attr, value)
                    if not hit:
                        hit.query_description = qdesc
                    hit_list.append(hit)
                    break

                parsed = [x for x in self.line.strip().split(' ') if x]
                assert len(parsed) == 16
                # parsed column order:
                # index, is_included, bitscore, bias, evalue_cond, evalue
                # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito,
                # envfrom, envto, acc_avg
                frag = HSPFragment(hid, qid)
                # HMMER3 alphabets are always protein alphabets
                frag.alphabet = generic_protein
                # depending on whether the program is hmmsearch, hmmscan, or phmmer
                # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to}
                # for hmmscan, hit is the hmm profile, query is the sequence
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[6]) - 1
                    frag.hit_end = int(parsed[7])
                    frag.query_start = int(parsed[9]) - 1
                    frag.query_end = int(parsed[10])
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[9]) - 1
                    frag.hit_end = int(parsed[10])
                    frag.query_start = int(parsed[6]) - 1
                    frag.query_end = int(parsed[7])
                # strand is always 0, since HMMER now only handles protein
                frag.hit_strand = frag.query_strand = 0

                hsp = HSP([frag])
                hsp.domain_index = int(parsed[0])
                hsp.is_included = parsed[1] == '!'
                hsp.bitscore = float(parsed[2])
                hsp.bias = float(parsed[3])
                hsp.evalue_cond = float(parsed[4])
                hsp.evalue = float(parsed[5])
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[8]
                    hsp.query_endtype = parsed[11]
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[11]
                    hsp.query_endtype = parsed[8]
                # adjust 'from' and 'to' coordinates to 0-based ones
                hsp.env_start = int(parsed[12]) - 1
                hsp.env_end = int(parsed[13])
                hsp.env_endtype = parsed[14]
                hsp.acc_avg = float(parsed[15])

                hsp_list.append(hsp)
                self.line = read_forward(self.handle)

            # parse the hsp alignments
            if self.line.startswith('  Alignments for each domain:'):
                self._parse_aln_block(hid, hit.hsps)
Example #7
0
    def _create_hits(self, hit_attrs, qid, qdesc):
        """Parses a HMMER3 hsp block, beginning with the hsp table."""
        # read through until the beginning of the hsp block
        self._read_until(lambda line: line.startswith('Internal pipeline')
                         or line.startswith('>>'))

        # start parsing the hsp block
        hit_list = []
        while True:
            if self.line.startswith('Internal pipeline'):
                # by this time we should've emptied the hit attr list
                assert len(hit_attrs) == 0
                return hit_list
            assert self.line.startswith('>>')
            hid, hdesc = self.line[len('>> '):].split('  ', 1)
            hdesc = hdesc.strip()

            # read through the hsp table header and move one more line
            self._read_until(lambda line:
                    line.startswith(' ---   ------ ----- --------') or
                    line.startswith('   [No individual domains'))
            self.line = read_forward(self.handle)

            # parse the hsp table for the current hit
            hsp_list = []
            while True:
                # break out of hsp parsing if there are no hits, it's the last hsp
                # or it's the start of a new hit
                if self.line.startswith('   [No targets detected that satisfy') or \
                   self.line.startswith('   [No individual domains') or \
                   self.line.startswith('Internal pipeline statistics summary:') or \
                   self.line.startswith('  Alignments for each domain:') or \
                   self.line.startswith('>>'):

                    hit_attr = hit_attrs.pop(0)
                    hit = Hit(hsp_list)
                    for attr, value in hit_attr.items():
                        if attr == "description":
                            cur_val = getattr(hit, attr)
                            if cur_val and value and cur_val.startswith(value):
                                continue
                        setattr(hit, attr, value)
                    if not hit:
                        hit.query_description = qdesc
                    hit_list.append(hit)
                    break

                parsed = [x for x in self.line.strip().split(' ') if x]
                assert len(parsed) == 16
                # parsed column order:
                # index, is_included, bitscore, bias, evalue_cond, evalue
                # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito,
                # envfrom, envto, acc_avg
                frag = HSPFragment(hid, qid)
                # set query and hit descriptions if they are defined / nonempty string
                if qdesc:
                    frag.query_description = qdesc
                if hdesc:
                    frag.hit_description = hdesc
                # HMMER3 alphabets are always protein alphabets
                frag.alphabet = generic_protein
                # depending on whether the program is hmmsearch, hmmscan, or phmmer
                # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to}
                # for hmmscan, hit is the hmm profile, query is the sequence
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[6]) - 1
                    frag.hit_end = int(parsed[7])
                    frag.query_start = int(parsed[9]) - 1
                    frag.query_end = int(parsed[10])
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[9]) - 1
                    frag.hit_end = int(parsed[10])
                    frag.query_start = int(parsed[6]) - 1
                    frag.query_end = int(parsed[7])
                # strand is always 0, since HMMER now only handles protein
                frag.hit_strand = frag.query_strand = 0

                hsp = HSP([frag])
                hsp.domain_index = int(parsed[0])
                hsp.is_included = parsed[1] == '!'
                hsp.bitscore = float(parsed[2])
                hsp.bias = float(parsed[3])
                hsp.evalue_cond = float(parsed[4])
                hsp.evalue = float(parsed[5])
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[8]
                    hsp.query_endtype = parsed[11]
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[11]
                    hsp.query_endtype = parsed[8]
                # adjust 'from' and 'to' coordinates to 0-based ones
                hsp.env_start = int(parsed[12]) - 1
                hsp.env_end = int(parsed[13])
                hsp.env_endtype = parsed[14]
                hsp.acc_avg = float(parsed[15])

                hsp_list.append(hsp)
                self.line = read_forward(self.handle)

            # parse the hsp alignments
            if self.line.startswith('  Alignments for each domain:'):
                self._parse_aln_block(hid, hit.hsps)
Example #8
0
    def __iter__(self):
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith('>'):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(' ', 1)
            except ValueError:
                qid, qdesc = rec.query, ''
            qdesc = qdesc.replace('\n', '').replace('\r', '')

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine alphabet based on program
            if qresult.program == 'blastn':
                alphabet = generic_dna
            elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
                alphabet = generic_protein

            # iterate over the 'alignments' (hits) and the hit table
            for idx, aln in enumerate(rec.alignments):
                # get id and desc
                if aln.title.startswith('> '):
                    aln.title = aln.title[2:]
                elif aln.title.startswith('>'):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(' ', 1)
                except ValueError:
                    hid, hdesc = aln.title, ''
                hdesc = hdesc.replace('\n', '').replace('\r', '')

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    frag.alphabet = alphabet
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates
                    frag.query_start = min(bhsp.query_start,
                                           bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation
                    qseq = ''
                    hseq = ''
                    midline = ''
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == ' ' or hchar == ' ':
                            assert all(' ' == x for x in seqtrio)
                        else:
                            qseq += qchar
                            hseq += hchar
                            midline += mchar
                    frag.query, frag.hit = qseq, hseq
                    frag.aln_annotation['similarity'] = midline

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            qresult.description = qdesc
            yield qresult
Example #9
0
    def __iter__(self):
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith('>'):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(' ', 1)
            except ValueError:
                qid, qdesc = rec.query, ''
            qdesc = qdesc.replace('\n', '').replace('\r', '')

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine alphabet based on program
            if qresult.program == 'blastn':
                alphabet = generic_dna
            elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
                alphabet = generic_protein

            # iterate over the 'alignments' (hits) and the hit table
            for idx, aln in enumerate(rec.alignments):
                # get id and desc
                if aln.title.startswith('> '):
                    aln.title = aln.title[2:]
                elif aln.title.startswith('>'):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(' ', 1)
                except ValueError:
                    hid, hdesc = aln.title, ''
                hdesc = hdesc.replace('\n', '').replace('\r', '')

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    frag.alphabet = alphabet
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates
                    frag.query_start = min(bhsp.query_start,
                            bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start,
                            bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation
                    qseq = ''
                    hseq = ''
                    midline = ''
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == ' ' or hchar == ' ':
                            assert all(' ' == x for x in seqtrio)
                        else:
                            qseq += qchar
                            hseq += hchar
                            midline += mchar
                    frag.query, frag.hit = qseq, hseq
                    frag.aln_annotation['similarity'] = midline

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            qresult.description = qdesc
            yield qresult
Example #10
0
    def _create_hits(self, hit_attrs, qid, qdesc):
        """Parse a HMMER3 hsp block, beginning with the hsp table (PRIVATE)."""
        # read through until the beginning of the hsp block
        self._read_until(lambda line: line.startswith("Internal pipeline") or
                         line.startswith(">>"))

        # start parsing the hsp block
        hit_list = []
        while True:
            if self.line.startswith("Internal pipeline"):
                # by this time we should've emptied the hit attr list
                assert len(hit_attrs) == 0
                return hit_list
            assert self.line.startswith(">>")
            hid, hdesc = self.line[len(">> "):].split("  ", 1)
            hdesc = hdesc.strip()

            # read through the hsp table header and move one more line
            self._read_until(
                lambda line: line.startswith(" ---   ------ ----- --------") or
                line.startswith("   [No individual domains"))
            self.line = read_forward(self.handle)

            # parse the hsp table for the current hit
            hsp_list = []
            while True:
                # break out of hsp parsing if there are no hits, it's the last hsp
                # or it's the start of a new hit
                if (self.line.startswith(
                        "   [No targets detected that satisfy")
                        or self.line.startswith("   [No individual domains")
                        or self.line.startswith(
                            "Internal pipeline statistics summary:") or
                        self.line.startswith("  Alignments for each domain:")
                        or self.line.startswith(">>")):

                    hit_attr = hit_attrs.pop(0)
                    hit = Hit(hsp_list)
                    for attr, value in hit_attr.items():
                        if attr == "description":
                            cur_val = getattr(hit, attr)
                            if cur_val and value and cur_val.startswith(value):
                                continue
                        setattr(hit, attr, value)
                    if not hit:
                        hit.query_description = qdesc
                    hit_list.append(hit)
                    break

                parsed = [x for x in self.line.strip().split(" ") if x]
                assert len(parsed) == 16
                # parsed column order:
                # index, is_included, bitscore, bias, evalue_cond, evalue
                # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito,
                # envfrom, envto, acc_avg
                frag = HSPFragment(hid, qid)
                # set query and hit descriptions if they are defined / nonempty string
                if qdesc:
                    frag.query_description = qdesc
                if hdesc:
                    frag.hit_description = hdesc
                # HMMER3 results are always protein
                frag.molecule_type = "protein"
                # depending on whether the program is hmmsearch, hmmscan, or phmmer
                # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to}
                # for hmmscan, hit is the hmm profile, query is the sequence
                if self._meta.get("program") == "hmmscan":
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[6]) - 1
                    frag.hit_end = int(parsed[7])
                    frag.query_start = int(parsed[9]) - 1
                    frag.query_end = int(parsed[10])
                elif self._meta.get("program") in ["hmmsearch", "phmmer"]:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[9]) - 1
                    frag.hit_end = int(parsed[10])
                    frag.query_start = int(parsed[6]) - 1
                    frag.query_end = int(parsed[7])
                # strand is always 0, since HMMER now only handles protein
                frag.hit_strand = frag.query_strand = 0

                hsp = HSP([frag])
                hsp.domain_index = int(parsed[0])
                hsp.is_included = parsed[1] == "!"
                hsp.bitscore = float(parsed[2])
                hsp.bias = float(parsed[3])
                hsp.evalue_cond = float(parsed[4])
                hsp.evalue = float(parsed[5])
                if self._meta.get("program") == "hmmscan":
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[8]
                    hsp.query_endtype = parsed[11]
                elif self._meta.get("program") in ["hmmsearch", "phmmer"]:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[11]
                    hsp.query_endtype = parsed[8]
                # adjust 'from' and 'to' coordinates to 0-based ones
                hsp.env_start = int(parsed[12]) - 1
                hsp.env_end = int(parsed[13])
                hsp.env_endtype = parsed[14]
                hsp.acc_avg = float(parsed[15])

                hsp_list.append(hsp)
                self.line = read_forward(self.handle)

            # parse the hsp alignments
            if self.line.startswith("  Alignments for each domain:"):
                self._parse_aln_block(hid, hit.hsps)
Example #11
0
    def __iter__(self):
        """Iterate over BlastTextParser, yields query results."""
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith(">"):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(" ", 1)
            except ValueError:
                qid, qdesc = rec.query, ""
            qdesc = qdesc.replace("\n", "").replace("\r", "")

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine molecule_type based on program
            if qresult.program == "blastn":
                molecule_type = "DNA"
            elif qresult.program in ["blastp", "blastx", "tblastn", "tblastx"]:
                molecule_type = "protein"

            # iterate over the 'alignments' (hits) and the hit table
            for idx, aln in enumerate(rec.alignments):
                # get id and desc
                if aln.title.startswith("> "):
                    aln.title = aln.title[2:]
                elif aln.title.startswith(">"):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(" ", 1)
                except ValueError:
                    hid, hdesc = aln.title, ""
                hdesc = hdesc.replace("\n", "").replace("\r", "")

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    frag.molecule_type = molecule_type
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ("blastp", "tblastn"):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ("blastp", "tblastn"):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates
                    frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation
                    qseq = ""
                    hseq = ""
                    midline = ""
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == " " or hchar == " ":
                            assert all(" " == x for x in seqtrio)
                        else:
                            qseq += qchar
                            hseq += hchar
                            midline += mchar
                    frag.query, frag.hit = qseq, hseq
                    frag.aln_annotation["similarity"] = midline

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            qresult.description = qdesc
            yield qresult