def parse_qresult(self): """Parse a HMMER2 query block.""" while self.read_next(): if not self.line.startswith('Query'): raise StopIteration() _, id_ = self.parse_key_value() self.qresult = QueryResult(id=id_) description = None while self.read_next() and not self.line.startswith('Scores'): if self.line.startswith('Accession'): self.qresult.accession = self.parse_key_value()[1] if self.line.startswith('Description'): description = self.parse_key_value()[1] hit_placeholders = self.parse_hits() if len(hit_placeholders) > 0: self.parse_hsps(hit_placeholders) self.parse_hsp_alignments() while not self.line.startswith('Query'): self.read_next() if not self.line: break self.buf.append(self.line) if description is not None: self.qresult.description = description yield self.qresult
def _parse_qresult(self): """Parses a HMMER3 query block.""" self._read_until(lambda line: line.startswith('Query:')) while self.line: # get query id and length regx = re.search(_QRE_ID_LEN, self.line) qid = regx.group(1).strip() # store qresult attributes qresult_attrs = { 'seq_len': int(regx.group(2)), 'program': self._meta.get('program'), 'version': self._meta.get('version'), 'target': self._meta.get('target'), } # get description and accession, if they exist qdesc = '<unknown description>' # placeholder while not self.line.startswith('Scores for '): self.line = read_forward(self.handle) if self.line.startswith('Accession:'): acc = self.line.strip().split(' ', 1)[1] qresult_attrs['accession'] = acc.strip() elif self.line.startswith('Description:'): qdesc = self.line.strip().split(' ', 1)[1].strip() qresult_attrs['description'] = qdesc # parse the query hits while self.line and '//' not in self.line: hit_list = self._parse_hit(qid, qdesc) # read through the statistics summary # TODO: parse and store this information? if self.line.startswith('Internal pipeline'): while self.line and '//' not in self.line: self.line = read_forward(self.handle) # create qresult, set its attributes and yield # not initializing hit_list directly to handle empty hits # (i.e. need to set its query description manually) qresult = QueryResult(id=qid, hits=hit_list) for attr, value in qresult_attrs.items(): setattr(qresult, attr, value) yield qresult self.line = read_forward(self.handle) # HMMER >= 3.1 outputs '[ok]' at the end of all results file, # which means we can break the main loop when we see the line if '[ok]' in self.line: break
def _parse_commented_qresult(self):
    """Iterator returning `QueryResult` objects from a commented file."""
    while True:
        comments = self._parse_comments()
        if comments:
            try:
                self.fields = comments['fields']
                # iterator for the query results
                qres_iter = self._parse_qresult()
            except KeyError:
                # no fields means the query has no results
                assert 'fields' not in comments
                # create an iterator returning one empty qresult
                # if the query has no results
                qres_iter = iter([QueryResult()])

            for qresult in qres_iter:
                for key, value in comments.items():
                    setattr(qresult, key, value)
                yield qresult
        else:
            break
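# Hedged usage sketch for the commented-tabular path above, assuming the
# Bio.SearchIO 'blast-tab' format and its `comments` keyword argument.
# Queries with no hits are still yielded as (empty) QueryResult objects.
# The file name is hypothetical.
from Bio import SearchIO

for qresult in SearchIO.parse("blastn_output.tab", "blast-tab", comments=True):
    print(qresult.id, len(qresult))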
def _parse_qresult(self): """Generator function that returns QueryResult objects.""" # state values, used to determine what to do with each line state_EOF = 0 state_QRES_NEW = 1 state_QRES_SAME = 3 state_HIT_NEW = 2 state_HIT_SAME = 4 # dummies for initial states qres_state = None hit_state = None file_state = None # dummies for initial id caches prev_qid = None prev_hid = None # dummies for initial parsed value containers cur, prev = None, None hit_list, hsp_list = [], [] while True: # store previous line's parsed values if we've past the first line if cur is not None: prev = cur prev_qid = cur_qid prev_hid = cur_hid # only parse the line if it's not EOF or not a comment line if self.line and not self.line.startswith('#'): cur = self._parse_result_row() cur_qid = self._get_id(cur['qresult']) cur_hid = self._get_id(cur['hit']) else: file_state = state_EOF # mock values for cur_qid and cur_hid since the line is empty cur_qid, cur_hid = None, None # get the state of hit and qresult if prev_qid != cur_qid: qres_state = state_QRES_NEW else: qres_state = state_QRES_SAME # new hits are hits with different id or hits in a new qresult if prev_hid != cur_hid or qres_state == state_QRES_NEW: hit_state = state_HIT_NEW else: hit_state = state_HIT_SAME # we're creating objects for the previously parsed line(s), # so nothing is done in the first parsed line (prev == None) if prev is not None: # every line is essentially an HSP with one fragment, so we # create both of these for every line frag = HSPFragment(prev_hid, prev_qid) for attr, value in prev['frag'].items(): # adjust coordinates to Python range # NOTE: this requires both start and end coords to be # present, otherwise a KeyError will be raised. # Without this limitation, we might misleadingly set the # start / end coords for seq_type in ('query', 'hit'): if attr == seq_type + '_start': value = min(value, prev['frag'][seq_type + '_end']) - 1 elif attr == seq_type + '_end': value = max(value, prev['frag'][seq_type + '_start']) setattr(frag, attr, value) # strand and frame setattr require the full parsed values # to be set first for seq_type in ('hit', 'query'): # try to set hit and query frame frame = self._get_frag_frame(frag, seq_type, prev['frag']) setattr(frag, '%s_frame' % seq_type, frame) # try to set hit and query strand strand = self._get_frag_strand(frag, seq_type, prev['frag']) setattr(frag, '%s_strand' % seq_type, strand) hsp = HSP([frag]) for attr, value in prev['hsp'].items(): setattr(hsp, attr, value) hsp_list.append(hsp) # create hit and append to temp hit container if hit_state # says we're not at the same hit or at a new query if hit_state == state_HIT_NEW: hit = Hit(hsp_list) for attr, value in prev['hit'].items(): setattr(hit, attr, value) hit_list.append(hit) hsp_list = [] # create qresult and yield if we're at a new qresult or EOF if qres_state == state_QRES_NEW or file_state == state_EOF: qresult = QueryResult(hit_list, prev_qid) for attr, value in prev['qresult'].items(): setattr(qresult, attr, value) yield qresult # if current line is EOF, break if file_state == state_EOF: break hit_list = [] self.line = self.handle.readline().strip()
def _parse_qresult(self): """Parses query results.""" # parse the queries for event, qresult_elem in self.xml_iter: # </Iteration> marks the end of a single query # which means we can process it if event == 'end' and qresult_elem.tag == 'Iteration': # we'll use the following schema # <!ELEMENT Iteration ( # Iteration_iter-num, # Iteration_query-ID?, # Iteration_query-def?, # Iteration_query-len?, # Iteration_hits?, # Iteration_stat?, # Iteration_message?)> # assign query attributes with fallbacks query_id = qresult_elem.findtext('Iteration_query-ID') if query_id is None: query_id = self._fallback['id'] query_desc = qresult_elem.findtext('Iteration_query-def') if query_desc is None: query_desc = self._fallback['description'] query_len = qresult_elem.findtext('Iteration_query-len') if query_len is None: query_len = self._fallback['len'] # handle blast searches against databases with Blast's IDs # 'Query_' marks the beginning of a BLAST+-generated ID, # 'lcl|' marks the beginning of a BLAST legacy-generated ID if query_id.startswith('Query_') or query_id.startswith('lcl|'): # store the Blast-generated query ID blast_query_id = query_id id_desc = query_desc.split(' ', 1) query_id = id_desc[0] try: query_desc = id_desc[1] except IndexError: query_desc = '' else: blast_query_id = '' hit_list, key_list = [], [] for hit in self._parse_hit(qresult_elem.find('Iteration_hits'), query_id): if hit: # need to keep track of hit IDs, since there could be duplicates, if hit.id in key_list: warnings.warn("Adding hit with BLAST-generated ID " "%r since hit ID %r is already present " "in query %r. Your BLAST database may contain " "duplicate entries." % (hit._blast_id, hit.id, query_id), BiopythonParserWarning) # fallback to Blast-generated IDs, if the ID is already present # and restore the desc, too hit.description = '%s %s' % (hit.id, hit.description) hit.id = hit._blast_id # and change the hit_id of the HSPs contained for hsp in hit: hsp.hit_id = hit._blast_id else: key_list.append(hit.id) hit_list.append(hit) # create qresult and assign its attributes qresult = QueryResult(hit_list, query_id) qresult.description = query_desc qresult.seq_len = int(query_len) qresult._blast_id = blast_query_id for key, value in self._meta.items(): setattr(qresult, key, value) # statistics are stored in Iteration_stat's 'grandchildren' with the # following DTD # <!ELEMENT Statistics ( # Statistics_db-num, # Statistics_db-len, # Statistics_hsp-len, # Statistics_eff-space, # Statistics_kappa, # Statistics_lambda, # Statistics_entropy)> stat_iter_elem = qresult_elem.find('Iteration_stat') if stat_iter_elem is not None: stat_elem = stat_iter_elem.find('Statistics') for key, val_info in _ELEM_QRESULT_OPT.items(): value = stat_elem.findtext(key) if value is not None: caster = val_info[1] # recast only if value is not intended to be str if value is not None and caster is not str: value = caster(value) setattr(qresult, val_info[0], value) # delete element after we finish parsing it qresult_elem.clear() yield qresult
def _parse_qresult(self): """Generator function that returns QueryResult objects.""" # state values, determines what to do for each line state_EOF = 0 state_QRES_NEW = 1 state_QRES_SAME = 3 # initial value dummies qres_state = None file_state = None prev_qid = None cur, prev = None, None # container for Hit objects, used to create QueryResult hit_list = [] while True: # store previous line's parsed values for all lines after the first if cur is not None: prev = cur prev_qid = cur_qid # only parse the result row if it's not EOF # NOTE: we are not parsing the extra '#' lines appended to the end # of hmmer31b1 tabular results since storing them in qresult # objects means we can not do a single-pass parsing if self.line and not self.line.startswith('#'): cur = self._parse_row() cur_qid = cur['qresult']['id'] else: file_state = state_EOF # mock value for cur_qid, since we have nothing to parse cur_qid = None if prev_qid != cur_qid: qres_state = state_QRES_NEW else: qres_state = state_QRES_SAME if prev is not None: # since domain tab formats only have 1 Hit per line # we always create HSPFragment, HSP, and Hit per line prev_hid = prev['hit']['id'] # create fragment and HSP and set their attributes frag = HSPFragment(prev_hid, prev_qid) for attr, value in prev['frag'].items(): setattr(frag, attr, value) hsp = HSP([frag]) for attr, value in prev['hsp'].items(): setattr(hsp, attr, value) # create Hit and set its attributes hit = Hit([hsp]) for attr, value in prev['hit'].items(): setattr(hit, attr, value) hit_list.append(hit) # create qresult and yield if we're at a new qresult or at EOF if qres_state == state_QRES_NEW or file_state == state_EOF: qresult = QueryResult(hit_list, prev_qid) for attr, value in prev['qresult'].items(): setattr(qresult, attr, value) yield qresult # if we're at EOF, break if file_state == state_EOF: break hit_list = [] self.line = self.handle.readline()
def __iter__(self):
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        try:
            qid, qdesc = rec.query.split(' ', 1)
        except ValueError:
            qid, qdesc = rec.query, ''
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # determine alphabet based on program
        if qresult.program == 'blastn':
            alphabet = generic_dna
        elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
            alphabet = generic_protein

        # iterate over the 'alignments' (hits) and the hit table
        for idx, aln in enumerate(rec.alignments):
            # get id and desc
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            try:
                hid, hdesc = aln.title.split(' ', 1)
            except ValueError:
                hid, hdesc = aln.title, ''
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                frag.alphabet = alphabet
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    if qresult.program in ('blastp', 'tblastn'):
                        frag.query_frame = 0
                    else:
                        frag.query_frame = 1
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    if qresult.program in ('blastp', 'tblastn'):
                        frag.hit_frame = 0
                    else:
                        frag.hit_frame = 1
                # set query coordinates
                frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates
                frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                # set query, hit sequences and their annotation
                qseq = ''
                hseq = ''
                midline = ''
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == ' ' or hchar == ' ':
                        assert all(' ' == x for x in seqtrio)
                    else:
                        qseq += qchar
                        hseq += hchar
                        midline += mchar
                frag.query, frag.hit = qseq, hseq
                frag.aln_annotation['similarity'] = midline

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
class Hmmer2TextParser(object):
    """Iterator for the HMMER 2.0 text output."""

    def __init__(self, handle):
        self.handle = handle
        self.buf = []
        self._meta = self.parse_preamble()

    def __iter__(self):
        for qresult in self.parse_qresult():
            qresult.program = self._meta.get('program')
            qresult.target = self._meta.get('target')
            qresult.version = self._meta.get('version')
            yield qresult

    def read_next(self, rstrip=True):
        """Return the next non-empty line, trailing whitespace removed."""
        if len(self.buf) > 0:
            return self.buf.pop()
        self.line = self.handle.readline()
        while self.line and rstrip and not self.line.strip():
            self.line = self.handle.readline()
        if self.line:
            if rstrip:
                self.line = self.line.rstrip()
        return self.line

    def push_back(self, line):
        """Un-read a line that should not be parsed yet."""
        self.buf.append(line)

    def parse_key_value(self):
        """Parse a key-value pair separated by a colon (:)."""
        key, value = self.line.split(':', 1)
        return key.strip(), value.strip()

    def parse_preamble(self):
        """Parse HMMER2 preamble."""
        meta = {}
        state = "GENERIC"
        while self.read_next():
            if state == "GENERIC":
                if self.line.startswith('hmm'):
                    meta['program'] = self.line.split('-')[0].strip()
                elif self.line.startswith('HMMER is'):
                    continue
                elif self.line.startswith('HMMER'):
                    meta['version'] = self.line.split()[1]
                elif self.line.count('-') == 36:
                    state = "OPTIONS"
                continue

            assert state == "OPTIONS"
            assert 'program' in meta

            if self.line.count('-') == 32:
                break

            key, value = self.parse_key_value()
            if meta['program'] == 'hmmsearch':
                if key == 'Sequence database':
                    meta['target'] = value
                    continue
            elif meta['program'] == 'hmmpfam':
                if key == 'HMM file':
                    meta['target'] = value
                    continue
            meta[key] = value

        return meta

    def parse_qresult(self):
        """Parse a HMMER2 query block."""
        while self.read_next():
            if not self.line.startswith('Query'):
                # plain return instead of raising StopIteration inside a
                # generator (PEP 479)
                return
            _, id_ = self.parse_key_value()
            self.qresult = QueryResult(id=id_)

            description = None
            while self.read_next() and not self.line.startswith('Scores'):
                if self.line.startswith('Accession'):
                    self.qresult.accession = self.parse_key_value()[1]
                if self.line.startswith('Description'):
                    description = self.parse_key_value()[1]

            hit_placeholders = self.parse_hits()
            if len(hit_placeholders) > 0:
                self.parse_hsps(hit_placeholders)
                self.parse_hsp_alignments()

            while not self.line.startswith('Query'):
                self.read_next()
                if not self.line:
                    break
            self.buf.append(self.line)

            if description is not None:
                self.qresult.description = description
            yield self.qresult

    def parse_hits(self):
        """Parse a HMMER2 hit block, beginning with the hit table."""
        hit_placeholders = []
        while self.read_next():
            if self.line.startswith('Parsed'):
                break
            if self.line.find('no hits') > -1:
                break

            if self.line.startswith('Sequence') or \
                    self.line.startswith('Model') or \
                    self.line.startswith('-------- '):
                continue

            fields = self.line.split()
            id_ = fields.pop(0)
            domain_obs_num = int(fields.pop())
            evalue = float(fields.pop())
            bitscore = float(fields.pop())
            description = ' '.join(fields).strip()

            hit = _HitPlaceholder()
            hit.id_ = id_
            hit.evalue = evalue
            hit.bitscore = bitscore
            hit.description = description
            hit.domain_obs_num = domain_obs_num
            hit_placeholders.append(hit)

        return hit_placeholders

    def parse_hsps(self, hit_placeholders):
        """Parse a HMMER2 hsp block, beginning with the hsp table."""
        # HSPs may occur in different order than the hits
        # so store Hit objects separately first
        unordered_hits = {}
        while self.read_next():
            if self.line.startswith('Alignments') or \
                    self.line.startswith('Histogram') or \
                    self.line == '//':
                break
            if self.line.startswith('Model') or \
                    self.line.startswith('Sequence') or \
                    self.line.startswith('--------'):
                continue

            id_, domain, seq_f, seq_t, seq_compl, hmm_f, hmm_t, hmm_compl, \
                score, evalue = self.line.split()

            frag = HSPFragment(id_, self.qresult.id)
            frag.alphabet = generic_protein
            if self._meta['program'] == 'hmmpfam':
                frag.hit_start = int(hmm_f) - 1
                frag.hit_end = int(hmm_t)
                frag.query_start = int(seq_f) - 1
                frag.query_end = int(seq_t)
            elif self._meta['program'] == 'hmmsearch':
                frag.query_start = int(hmm_f) - 1
                frag.query_end = int(hmm_t)
                frag.hit_start = int(seq_f) - 1
                frag.hit_end = int(seq_t)

            hsp = HSP([frag])
            hsp.evalue = float(evalue)
            hsp.bitscore = float(score)
            hsp.domain_index = int(domain.split('/')[0])
            if self._meta['program'] == 'hmmpfam':
                hsp.hit_endtype = hmm_compl
                hsp.query_endtype = seq_compl
            elif self._meta['program'] == 'hmmsearch':
                hsp.query_endtype = hmm_compl
                hsp.hit_endtype = seq_compl

            if id_ not in unordered_hits:
                placeholder = [p for p in hit_placeholders
                               if p.id_ == id_][0]
                hit = placeholder.createHit([hsp])
                unordered_hits[id_] = hit
            else:
                hit = unordered_hits[id_]
                hsp.hit_description = hit.description
                hit.append(hsp)

        # The placeholder list is in the correct order, so use that order
        # for the Hit objects in the qresult
        for p in hit_placeholders:
            self.qresult.append(unordered_hits[p.id_])

    def parse_hsp_alignments(self):
        """Parse a HMMER2 HSP alignment block."""
        if not self.line.startswith('Alignments'):
            return

        while self.read_next():
            if self.line == '//' or self.line.startswith('Histogram'):
                break

            match = re.search(_HSP_ALIGN_LINE, self.line)
            if match is None:
                continue

            id_ = match.group(1)
            idx = int(match.group(2))
            num = int(match.group(3))

            hit = self.qresult[id_]
            if hit.domain_obs_num != num:
                continue

            frag = hit[idx - 1][0]

            hmmseq = ''
            consensus = ''
            otherseq = ''
            structureseq = ''
            pad = 0
            while self.read_next() and self.line.startswith(' '):
                # if there's structure information, parse that
                if self.line[16:18] == 'CS':
                    structureseq += self.line[19:].strip()
                    if not self.read_next():
                        break

                # skip the *-> start marker if it exists
                if self.line[19] == '*':
                    seq = self.line[22:]
                    pad = 3
                else:
                    seq = self.line[19:]
                    pad = 0

                # get rid of the end marker
                if seq.endswith('<-*'):
                    seq = seq[:-3]

                hmmseq += seq
                line_len = len(seq)
                if not self.read_next(rstrip=False):
                    break
                consensus += self.line[19 + pad:19 + pad + line_len]
                # If there's no consensus sequence, hmmer2 doesn't
                # bother to put spaces here, so add extra padding
                extra_padding = len(hmmseq) - len(consensus)
                consensus += ' ' * extra_padding

                if not self.read_next():
                    break
                otherseq += self.line[19:].split()[0].strip()

            self.push_back(self.line)

            # add similarity sequence to annotation
            frag.aln_annotation['similarity'] = consensus

            # if there's structure information, add it to the fragment
            if structureseq:
                frag.aln_annotation['CS'] = structureseq

            if self._meta['program'] == 'hmmpfam':
                frag.hit = hmmseq
                frag.query = otherseq
            else:
                frag.hit = otherseq
                frag.query = hmmseq
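# Hedged usage sketch: the class above is normally reached through
# Bio.SearchIO, assuming the standard 'hmmer2-text' format name (the file
# name is hypothetical). The program/version/target metadata parsed from
# the preamble end up as QueryResult attributes via __iter__.
from Bio import SearchIO

for qresult in SearchIO.parse("hmmpfam_output.txt", "hmmer2-text"):
    print(qresult.id, qresult.program, qresult.version, qresult.target)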
def _parse_qresult(self): """Parses query results.""" # parse the queries for event, qresult_elem in self.xml_iter: # </Iteration> marks the end of a single query # which means we can process it if event == 'end' and qresult_elem.tag == 'Iteration': # we'll use the following schema # <!ELEMENT Iteration ( # Iteration_iter-num, # Iteration_query-ID?, # Iteration_query-def?, # Iteration_query-len?, # Iteration_hits?, # Iteration_stat?, # Iteration_message?)> # assign query attributes with fallbacks query_id = qresult_elem.findtext('Iteration_query-ID') if query_id is None: query_id = self._fallback['id'] query_desc = qresult_elem.findtext('Iteration_query-def') if query_desc is None: query_desc = self._fallback['description'] query_len = qresult_elem.findtext('Iteration_query-len') if query_len is None: query_len = self._fallback['len'] # handle blast searches against databases with Blast's IDs # 'Query_' marks the beginning of a BLAST+-generated ID, # 'lcl|' marks the beginning of a BLAST legacy-generated ID if query_id.startswith('Query_') or query_id.startswith( 'lcl|'): # store the Blast-generated query ID blast_query_id = query_id id_desc = query_desc.split(' ', 1) query_id = id_desc[0] try: query_desc = id_desc[1] except IndexError: query_desc = '' else: blast_query_id = '' hit_list, key_list = [], [] for hit in self._parse_hit(qresult_elem.find('Iteration_hits'), query_id): if hit: # need to keep track of hit IDs, since there could be duplicates, if hit.id in key_list: warnings.warn( "Adding hit with BLAST-generated ID " "%r since hit ID %r is already present " "in query %r. Your BLAST database may contain " "duplicate entries." % (hit._blast_id, hit.id, query_id), BiopythonParserWarning) # fallback to Blast-generated IDs, if the ID is already present # and restore the desc, too hit.description = '%s %s' % (hit.id, hit.description) hit.id = hit._blast_id # and change the hit_id of the HSPs contained for hsp in hit: hsp.hit_id = hit._blast_id else: key_list.append(hit.id) hit_list.append(hit) # create qresult and assign its attributes qresult = QueryResult(hit_list, query_id) qresult.description = query_desc qresult.seq_len = int(query_len) qresult._blast_id = blast_query_id for key, value in self._meta.items(): setattr(qresult, key, value) # statistics are stored in Iteration_stat's 'grandchildren' with the # following DTD # <!ELEMENT Statistics ( # Statistics_db-num, # Statistics_db-len, # Statistics_hsp-len, # Statistics_eff-space, # Statistics_kappa, # Statistics_lambda, # Statistics_entropy)> stat_iter_elem = qresult_elem.find('Iteration_stat') if stat_iter_elem is not None: stat_elem = stat_iter_elem.find('Statistics') for key, val_info in _ELEM_QRESULT_OPT.items(): value = stat_elem.findtext(key) if value is not None: caster = val_info[1] # recast only if value is not intended to be str if value is not None and caster is not str: value = caster(value) setattr(qresult, val_info[0], value) # delete element after we finish parsing it qresult_elem.clear() yield qresult
def _parse_qresult(self): """Generator function that returns QueryResult objects.""" # state values, determines what to do for each line state_EOF = 0 state_QRES_NEW = 1 state_QRES_SAME = 3 state_HIT_NEW = 2 state_HIT_SAME = 4 # dummies for initial states qres_state = None hit_state = None file_state = None # dummies for initial id caches prev_qid = None prev_hid = None # dummies for initial parsed value containers cur, prev = None, None hit_list, hsp_list = [], [] while True: # store previous line's parsed values, for every line after the 1st if cur is not None: prev = cur prev_qid = cur_qid prev_hid = cur_hid # only parse the line if it's not EOF if self.line and not self.line.startswith('#'): cur = self._parse_row() cur_qid = cur['qresult']['id'] cur_hid = cur['hit']['id'] else: file_state = state_EOF # mock ID values since the line is empty cur_qid, cur_hid = None, None # get the state of hit and qresult if prev_qid != cur_qid: qres_state = state_QRES_NEW else: qres_state = state_QRES_SAME # new hits are hits with different ids or hits in a new qresult if prev_hid != cur_hid or qres_state == state_QRES_NEW: hit_state = state_HIT_NEW else: hit_state = state_HIT_SAME # start creating objects after the first line (i.e. prev is filled) if prev is not None: # each line is basically an HSP with one HSPFragment frag = HSPFragment(prev_hid, prev_qid) for attr, value in prev['frag'].items(): setattr(frag, attr, value) hsp = HSP([frag]) for attr, value in prev['hsp'].items(): setattr(hsp, attr, value) hsp_list.append(hsp) # create hit object when we've finished parsing all its hsps # i.e. when hit state is state_HIT_NEW if hit_state == state_HIT_NEW: hit = Hit(hsp_list) for attr, value in prev['hit'].items(): setattr(hit, attr, value) hit_list.append(hit) hsp_list = [] # create qresult and yield if we're at a new qresult or EOF if qres_state == state_QRES_NEW or file_state == state_EOF: qresult = QueryResult(hit_list, prev_qid) for attr, value in prev['qresult'].items(): setattr(qresult, attr, value) yield qresult # if current line is EOF, break if file_state == state_EOF: break hit_list = [] self.line = self.handle.readline()
def _parse_qresult(self):
    """Generator function that returns QueryResult objects."""
    # state values
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    state_HIT_NEW = 2
    state_HIT_SAME = 4
    # initial dummies
    qres_state, hit_state = None, None
    file_state = None
    prev_qid, prev_hid = None, None
    cur, prev = None, None
    hit_list, hsp_list = [], []
    # if the file has c4 alignments, use that as the alignment mark
    if self.has_c4_alignment:
        self._ALN_MARK = 'C4 Alignment:'

    while True:
        self.read_until(lambda line: line.startswith(self._ALN_MARK))
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
            prev_hid = cur_hid
        # only parse the result row if it's not EOF
        if self.line:
            assert self.line.startswith(self._ALN_MARK), self.line
            # create temp dicts for storing parsed values
            header = {'qresult': {}, 'hit': {}, 'hsp': {}}
            # if the file has c4 alignments, try to parse the header
            if self.has_c4_alignment:
                self.read_until(
                    lambda line: line.strip().startswith('Query:'))
                header = self._parse_alignment_header()
            # parse the block contents
            cur = self.parse_alignment_block(header)
            cur_qid = cur['qresult']['id']
            cur_hid = cur['hit']['id']
        elif not self.line or self.line.startswith('-- completed '):
            file_state = state_EOF
            cur_qid, cur_hid = None, None

        # get the state of hit and qresult
        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME
        # new hits are hits with different ids or hits in a new query
        if prev_hid != cur_hid or qres_state == state_QRES_NEW:
            hit_state = state_HIT_NEW
        else:
            hit_state = state_HIT_SAME

        if prev is not None:
            hsp = _create_hsp(prev_hid, prev_qid, prev['hsp'])
            hsp_list.append(hsp)

            if hit_state == state_HIT_NEW:
                hit = Hit(hsp_list)
                for attr, value in prev['hit'].items():
                    setattr(hit, attr, value)
                hit_list.append(hit)
                hsp_list = []

            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(id=prev_qid)
                for hit in hit_list:
                    # not using append since Exonerate may separate the
                    # same hit if it has different strands
                    qresult.absorb(hit)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                if file_state == state_EOF:
                    break
                hit_list = []

        # only readline() here if we're not parsing C4 alignments
        # C4 alignments readline() is handled by its parse_alignment_block
        # function
        if not self.has_c4_alignment:
            self.line = self.handle.readline()
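# Hedged usage sketch for the Exonerate parser above, assuming the
# Bio.SearchIO 'exonerate-text' format name (file name hypothetical).
# As the comment above notes, QueryResult.absorb is used instead of append
# so that HSPs from the same hit reported on different strands are merged
# under one Hit.
from Bio import SearchIO

for qresult in SearchIO.parse("exonerate_output.txt", "exonerate-text"):
    for hit in qresult:
        print(hit.id, [hsp.query_strand for hsp in hit.hsps])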
def _parse_qresult(self): """Generator function that returns QueryResult objects.""" # state values, determines what to do for each line state_EOF = 0 state_QRES_NEW = 1 state_QRES_SAME = 3 state_HIT_NEW = 2 state_HIT_SAME = 4 # initial dummy values qres_state = None file_state = None prev_qid, prev_hid = None, None cur, prev = None, None hit_list, hsp_list = [], [] while True: # store previous line's parsed values for all lines after the first if cur is not None: prev = cur prev_qid = cur_qid prev_hid = cur_hid # only parse the result row if it's not EOF if self.line: cur = self._parse_row() cur_qid = cur['qname'] cur_hid = cur['tname'] else: file_state = state_EOF # mock values, since we have nothing to parse cur_qid, cur_hid = None, None # get the state of hit and qresult if prev_qid != cur_qid: qres_state = state_QRES_NEW else: qres_state = state_QRES_SAME # new hits are hits with different ids or hits in a new qresult if prev_hid != cur_hid or qres_state == state_QRES_NEW: hit_state = state_HIT_NEW else: hit_state = state_HIT_SAME if prev is not None: # create fragment and HSP and set their attributes hsp = _create_hsp(prev_hid, prev_qid, prev) hsp_list.append(hsp) if hit_state == state_HIT_NEW: # create Hit and set its attributes hit = Hit(hsp_list) hit.seq_len = prev['tsize'] hit_list.append(hit) hsp_list = [] # create qresult and yield if we're at a new qresult or at EOF if qres_state == state_QRES_NEW or file_state == state_EOF: qresult = QueryResult(id=prev_qid) for hit in hit_list: qresult.absorb(hit) qresult.seq_len = prev['qsize'] yield qresult # if we're at EOF, break if file_state == state_EOF: break hit_list = [] self.line = self.handle.readline()
def _parse_qresult(self):
    """Generator function that returns QueryResult objects."""
    # initial qresult value
    qresult = None
    hit_rows = []
    # state values
    state_QRES_NEW = 1
    state_QRES_HITTAB = 3
    state_QRES_CONTENT = 5
    state_QRES_END = 7

    while True:
        # one line before the hit table
        if self.line.startswith('The best scores are:'):
            qres_state = state_QRES_HITTAB
        # the end of a query or the file altogether
        elif self.line.strip() == '>>>///' or not self.line:
            qres_state = state_QRES_END
        # the beginning of a new query
        elif not self.line.startswith('>>>') and '>>>' in self.line:
            qres_state = state_QRES_NEW
        # the beginning of the query info and its hits + hsps
        elif self.line.startswith('>>>') and not \
                self.line.strip() == '>>><<<':
            qres_state = state_QRES_CONTENT
        # default qres mark
        else:
            qres_state = None

        if qres_state is not None:
            if qres_state == state_QRES_HITTAB:
                # parse hit table if flag is set
                hit_rows = self.__parse_hit_table()

            elif qres_state == state_QRES_END:
                yield _set_qresult_hits(qresult, hit_rows)
                break

            elif qres_state == state_QRES_NEW:
                # if qresult is filled, yield it first
                if qresult is not None:
                    yield _set_qresult_hits(qresult, hit_rows)
                regx = re.search(_RE_ID_DESC_SEQLEN, self.line)
                query_id = regx.group(1)
                seq_len = regx.group(3)
                desc = regx.group(2)
                qresult = QueryResult(id=query_id)
                qresult.seq_len = int(seq_len)
                # get target from the next line
                self.line = self.handle.readline()
                qresult.target = [x for x in self.line.split(' ') if x][1].strip()
                if desc is not None:
                    qresult.description = desc
                # set values from preamble
                for key, value in self._preamble.items():
                    setattr(qresult, key, value)

            elif qres_state == state_QRES_CONTENT:
                assert self.line[3:].startswith(qresult.id), self.line
                for hit, strand in self._parse_hit(query_id):
                    # HACK: re-set desc, for hsp hit and query description
                    hit.description = hit.description
                    hit.query_description = qresult.description
                    # if hit is not in qresult, append it
                    if hit.id not in qresult:
                        qresult.append(hit)
                    # otherwise, it might be the same hit with a different
                    # strand
                    else:
                        # make sure strand is different and then append hsp
                        # to existing hit
                        for hsp in hit.hsps:
                            assert strand != hsp.query_strand
                            qresult[hit.id].append(hsp)

        self.line = self.handle.readline()
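# Hedged usage sketch for the FASTA -m 10 parser above, assuming the
# Bio.SearchIO 'fasta-m10' format name (file name hypothetical).
from Bio import SearchIO

for qresult in SearchIO.parse("fasta_output.m10", "fasta-m10"):
    print(qresult.id, qresult.description, len(qresult))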