def __iter__(self):
    """Scans the handle from the top and yields one index entry per query.

    Each yielded item is a 3-tuple ``(key, start_offset, 0)``: the query
    ID converted with ``_bytes_to_string``, the offset at which the
    record starts, and a constant 0 (presumably a length placeholder --
    confirm against the consumer of this iterator).
    """
    fh = self._handle
    fh.seek(0)
    record_start = fh.tell()
    id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

    while True:
        line = read_forward(fh)
        # offset just past the line that was read
        line_end = fh.tell()

        if line.startswith(self.qresult_start):
            key = id_pattern.search(line).group(1).strip()
            # the record begins where this start-marker line begins
            record_start = line_end - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(key), record_start, 0
            # anything after the end marker belongs to the next record
            record_start = line_end
        elif not line:
            # an empty line from read_forward ends the scan
            break
def __init__(self, handle):
    """Stores the handle, reads the first line, and parses the preamble.

    :param handle: handle on the HMMER3 text output to be parsed
    """
    self.handle = handle
    # the line cursor is filled before the preamble is parsed; keep this
    # order (presumably _parse_preamble starts from self.line -- confirm)
    self.line = read_forward(handle)
    self._meta = self._parse_preamble()
def _parse_aln_block(self, hid, hsp_list):
    """Parses a HMMER3 HSP alignment block.

    Walks the per-domain alignment sections of one hit and fills in the
    ``hit``, ``query`` and ``aln_annotation`` attributes of the first
    fragment of each HSP in ``hsp_list``. The fragments are modified in
    place; no copies are made.

    :param hid: hit ID; accepted for interface compatibility, not used here
    :param hsp_list: HSPs of the current hit, in domain order
    :returns: the same ``hsp_list`` object, once a line starting with
        '>>' or 'Internal pipeline' is reached
    :raises ValueError: if the file ends inside a domain alignment
    :raises AssertionError: if a domain header is not where it is expected
    """
    # NOTE(review): the literal prefixes below are whitespace-sensitive;
    # confirm they match the exact leading spacing HMMER emits
    block_end = ('>>', 'Internal pipeline')
    domain_end = (' == domain',) + block_end

    self.line = read_forward(self.handle)
    dom_counter = 0
    while True:
        if self.line.startswith(block_end):
            return hsp_list
        assert self.line.startswith(' == domain %i' % (dom_counter + 1))
        # alias the fragment to a local name; attribute changes still land
        # on the object held by hsp_list since no copy is created
        frag = hsp_list[dom_counter][0]
        # XXX: the score and E-value on the domain header line are not
        # re-validated against the hsp table (an extra regex per domain)
        hmmseq = ''
        aliseq = ''
        annot = {}
        self.line = self.handle.readline()

        # parse all the alignment chunks of this domain
        while True:
            # readline() returns '' only at end of file; without this
            # guard a truncated file would loop here forever because an
            # empty line matches none of the branches below
            if not self.line:
                raise ValueError(
                    "Unexpected end of file while parsing the alignment "
                    "of domain %i" % (dom_counter + 1))

            # check for a hit or query line; the ID itself is not compared
            # so that the case query id == hit id is handled as well
            regx = re.search(_HRE_ID_LINE, self.line)

            if regx:
                # the first hit/query line of a chunk is the hmm sequence
                if len(hmmseq) == len(aliseq):
                    hmmseq += regx.group(2)
                # the next one is the aligned sequence, which catches up
                elif len(hmmseq) > len(aliseq):
                    aliseq += regx.group(2)
                assert len(hmmseq) >= len(aliseq)
            # start of the next domain or end of the whole block:
            # store what was collected and leave the inner loop
            elif self.line.startswith(domain_end):
                frag.aln_annotation = annot
                if self._meta.get('program') == 'hmmscan':
                    frag.hit = hmmseq
                    frag.query = aliseq
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    frag.hit = aliseq
                    frag.query = hmmseq
                dom_counter += 1
                hmmseq = ''
                aliseq = ''
                annot = {}
                break
            # otherwise it may be an annotation line. The two lengths
            # differ only while the cursor is on the similarity line,
            # which is not parsed, so annotations are looked for only
            # when the lengths are equal
            elif len(hmmseq) == len(aliseq):
                regx = re.search(_HRE_ANNOT_LINE, self.line)
                if regx:
                    annot_name = regx.group(3)
                    if annot_name in annot:
                        annot[annot_name] += regx.group(2)
                    else:
                        annot[annot_name] = regx.group(2)

            self.line = self.handle.readline()
def _create_hits(self, hit_attrs, qid, qdesc):
    """Parses a HMMER3 hsp block, beginning with the hsp table.

    :param hit_attrs: list of per-hit attribute mappings; one entry is
        popped from the front for every hit parsed, so the list is
        expected to be empty when this method returns
    :param qid: query ID given to every HSPFragment created
    :param qdesc: query description; set on fragments when non-empty
    :returns: list of Hit objects, in file order
    """
    # NOTE(review): the literal prefixes used in this method are
    # whitespace-sensitive; confirm they match the exact leading spacing
    # HMMER emits
    table_stops = (
        ' [No targets detected that satisfy',
        ' [No individual domains',
        'Internal pipeline statistics summary:',
        ' Alignments for each domain:',
        '>>',
    )

    # skip forward to the first hit header or to the closing statistics
    self._read_until(
        lambda line: line.startswith(('Internal pipeline', '>>')))

    hits = []
    while not self.line.startswith('Internal pipeline'):
        assert self.line.startswith('>>')
        hid, hdesc = self.line[len('>> '):].split(' ', 1)
        hdesc = hdesc.strip()

        # read through the hsp table header and move one more line
        self._read_until(
            lambda line: line.startswith((' --- ------ ----- --------',
                                          ' [No individual domains')))
        self.line = read_forward(self.handle)

        # one table row per domain, until there are no hits, the table
        # ends, or a new hit starts
        hsps = []
        while not self.line.startswith(table_stops):
            cols = [tok for tok in self.line.strip().split(' ') if tok]
            assert len(cols) == 16
            # columns as used below:
            #  0 domain index     1 inclusion flag    2 bitscore
            #  3 bias             4 conditional evalue 5 evalue
            #  6 hmm from         7 hmm to            8 hmm end type
            #  9 ali from        10 ali to           11 ali end type
            # 12 env from        13 env to           14 env end type
            # 15 average accuracy
            frag = HSPFragment(hid, qid)
            # descriptions are only set when they are non-empty
            if qdesc:
                frag.query_description = qdesc
            if hdesc:
                frag.hit_description = hdesc
            # HMMER3 alphabets are always protein alphabets
            frag.alphabet = generic_protein

            # for hmmscan the hit is the hmm profile and the query is the
            # sequence; for hmmsearch and phmmer it is the other way round.
            # Each triple is (from column, to column, end type column)
            program = self._meta.get('program')
            if program == 'hmmscan':
                hit_cols, query_cols = (6, 7, 8), (9, 10, 11)
            elif program in ('hmmsearch', 'phmmer'):
                hit_cols, query_cols = (9, 10, 11), (6, 7, 8)
            else:
                hit_cols = query_cols = None

            if hit_cols is not None:
                # shift the 'from' coordinates to 0-based ones
                frag.hit_start = int(cols[hit_cols[0]]) - 1
                frag.hit_end = int(cols[hit_cols[1]])
                frag.query_start = int(cols[query_cols[0]]) - 1
                frag.query_end = int(cols[query_cols[1]])

            # strand is always 0, since HMMER now only handles protein
            frag.hit_strand = 0
            frag.query_strand = 0

            hsp = HSP([frag])
            hsp.domain_index = int(cols[0])
            hsp.is_included = cols[1] == '!'
            hsp.bitscore = float(cols[2])
            hsp.bias = float(cols[3])
            hsp.evalue_cond = float(cols[4])
            hsp.evalue = float(cols[5])
            if hit_cols is not None:
                # end types follow the same hit/query mapping as above
                hsp.hit_endtype = cols[hit_cols[2]]
                hsp.query_endtype = cols[query_cols[2]]
            # shift the envelope 'from' coordinate to a 0-based one
            hsp.env_start = int(cols[12]) - 1
            hsp.env_end = int(cols[13])
            hsp.env_endtype = cols[14]
            hsp.acc_avg = float(cols[15])

            hsps.append(hsp)
            self.line = read_forward(self.handle)

        # the table is done: build the hit from the next queued attributes
        attrs = hit_attrs.pop(0)
        hit = Hit(hsps)
        for name, value in attrs.items():
            if name == "description":
                current = getattr(hit, name)
                # keep the current description when it already starts
                # with the queued value
                if current and value and current.startswith(value):
                    continue
            setattr(hit, name, value)
        if not hit:
            hit.query_description = qdesc
        hits.append(hit)

        # parse the hsp alignments
        if self.line.startswith(' Alignments for each domain:'):
            self._parse_aln_block(hid, hit.hsps)

    # by this time we should've emptied the hit attr list
    assert len(hit_attrs) == 0
    return hits