def _parse_hit(self, root_hit_elem, query_id):
    """Generate Hit objects from the children of an Iteration_hits element (PRIVATE).

    :param root_hit_elem: root element of the Iteration_hits tag.
    :type root_hit_elem: XML element tag
    :param query_id: QueryResult ID of this Hit
    :type query_id: string
    """
    # Each child of Iteration_hits follows this DTD:
    #   <!ELEMENT Hit (
    #       Hit_num, Hit_id, Hit_def, Hit_accession, Hit_len, Hit_hsps?)>

    # a missing Iteration_hits tag simply leaves nothing to iterate over
    hit_elems = [] if root_hit_elem is None else root_hit_elem

    for hit_elem in hit_elems:
        # BLAST sometimes mangles sequence IDs and descriptions, so unless
        # raw IDs were requested the real values are extracted first
        raw_hit_id = hit_elem.findtext('Hit_id')
        raw_hit_desc = hit_elem.findtext('Hit_def')
        if self._use_raw_hit_ids:
            ids, descs, blast_hit_id = [raw_hit_id], [raw_hit_desc], raw_hit_id
        else:
            ids, descs, blast_hit_id = _extract_ids_and_descs(
                raw_hit_id, raw_hit_desc)

        # the first pair is the primary one, the rest are alternatives
        hit_id, alt_hit_ids = ids[0], ids[1:]
        hit_desc, alt_hit_descs = descs[0], descs[1:]

        hsp_elems = hit_elem.find('Hit_hsps')
        hit = Hit(list(self._parse_hsp(hsp_elems, query_id, hit_id)))
        hit.description = hit_desc
        hit._id_alt = alt_hit_ids
        hit._description_alt = alt_hit_descs
        hit.blast_id = blast_hit_id

        # copy over the remaining Hit-level fields that are present
        for tag, val_info in _ELEM_HIT.items():
            text = hit_elem.findtext(tag)
            if text is None:
                continue
            caster = val_info[1]
            # values meant to stay strings are stored as read
            if caster is not str:
                text = caster(text)
            setattr(hit, val_info[0], text)

        # release the element's contents once it has been parsed
        hit_elem.clear()
        yield hit
def _parse_hit(self, root_hit_elem, query_id):
    """Generate Hit objects from the children of an Iteration_hits element (PRIVATE).

    :param root_hit_elem: root element of the Iteration_hits tag.
    :type root_hit_elem: XML element tag
    :param query_id: QueryResult ID of this Hit
    :type query_id: string
    """
    # Each child of Iteration_hits follows this DTD:
    #   <!ELEMENT Hit (
    #       Hit_num, Hit_id, Hit_def, Hit_accession, Hit_len, Hit_hsps?)>

    # a missing Iteration_hits tag simply leaves nothing to iterate over
    hit_elems = [] if root_hit_elem is None else root_hit_elem

    for hit_elem in hit_elems:
        # BLAST sometimes mangles sequence IDs and descriptions, so unless
        # raw IDs were requested the real values are extracted first
        raw_hit_id = hit_elem.findtext('Hit_id')
        raw_hit_desc = hit_elem.findtext('Hit_def')
        if self._use_raw_hit_ids:
            ids, descs, blast_hit_id = [raw_hit_id], [raw_hit_desc], raw_hit_id
        else:
            ids, descs, blast_hit_id = _extract_ids_and_descs(
                raw_hit_id, raw_hit_desc)

        # the first pair is the primary one, the rest are alternatives
        hit_id, alt_hit_ids = ids[0], ids[1:]
        hit_desc, alt_hit_descs = descs[0], descs[1:]

        hsp_elems = hit_elem.find('Hit_hsps')
        hit = Hit(list(self._parse_hsp(hsp_elems, query_id, hit_id)))
        hit.description = hit_desc
        hit._id_alt = alt_hit_ids
        hit._description_alt = alt_hit_descs
        hit.blast_id = blast_hit_id

        # copy over the remaining Hit-level fields that are present
        for tag, val_info in _ELEM_HIT.items():
            text = hit_elem.findtext(tag)
            if text is None:
                continue
            caster = val_info[1]
            # values meant to stay strings are stored as read
            if caster is not str:
                text = caster(text)
            setattr(hit, val_info[0], text)

        # release the element's contents once it has been parsed
        hit_elem.clear()
        yield hit
def createHit(self, hsp_list):
    """Build a Hit from ``hsp_list`` and copy this object's hit-level fields onto it.

    ``id_``, ``evalue``, ``bitscore`` and ``domain_obs_num`` are always
    copied; ``description`` is copied only when it is truthy.
    """
    new_hit = Hit(hsp_list)
    for attr in ("id_", "evalue", "bitscore"):
        setattr(new_hit, attr, getattr(self, attr))
    # an empty description leaves whatever Hit set up by itself
    if self.description:
        new_hit.description = self.description
    new_hit.domain_obs_num = self.domain_obs_num
    return new_hit
def createHit(self, hsp_list):
    """Build a Hit from ``hsp_list`` and copy this object's hit-level fields onto it.

    ``id_``, ``evalue``, ``bitscore`` and ``domain_obs_num`` are always
    copied; ``description`` is copied only when it is truthy.
    """
    new_hit = Hit(hsp_list)
    for attr in ("id_", "evalue", "bitscore"):
        setattr(new_hit, attr, getattr(self, attr))
    # an empty description leaves whatever Hit set up by itself
    if self.description:
        new_hit.description = self.description
    new_hit.domain_obs_num = self.domain_obs_num
    return new_hit
def _create_qresult(self, hit_blocks):
    """Turn parsed hit blocks into a one-element list holding a QueryResult (PRIVATE).

    Every block becomes one single-fragment HSP. HSPs whose blocks share a
    ``hit_id`` are collected into the same Hit, and Hits keep the order in
    which their IDs were first seen.
    """
    query_id = self.query_id
    hits_by_id = OrderedDict()

    def shifted(start):
        # truthy starts are moved down by one (presumably 1-based to
        # 0-based -- confirm); falsy ones are passed through unchanged
        return start - 1 if start else start

    for output_index, block in enumerate(hit_blocks):
        hit_id = block['hit_id']

        fragment = HSPFragment(hit_id, query_id)
        # fragment.alphabet = generic_protein  (left disabled, as before)
        fragment.query_start = shifted(block['query_start'])
        fragment.query_end = block['query_end']
        fragment.hit_start = shifted(block['hit_start'])
        fragment.hit_end = block['hit_end']
        fragment.hit = block['hit_seq']
        fragment.query = block['query_seq']

        hsp = HSP([fragment])
        hsp.hit_id = hit_id
        hsp.output_index = output_index
        hsp.query_id = query_id
        hsp.hit_description = block['description']
        # Open question kept from the original: should everything be included?
        is_included = True
        hsp.is_included = is_included
        for field in ('evalue', 'score', 'prob', 'hit_seq_len', 'text'):
            setattr(hsp, field, block[field])

        # later blocks for a known hit only contribute their HSP
        if hit_id in hits_by_id:
            hits_by_id[hit_id].append(hsp)
            continue

        hit = Hit([hsp], hit_id)
        hit.description = block['description']
        hit.is_included = is_included
        hit.evalue = block['evalue']
        hit.score = block['score']
        hits_by_id[hit_id] = hit

    qresult = QueryResult(hits_by_id.values(), query_id)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def _create_qresult(self, hit_blocks):
    """Turn parsed hit blocks into a one-element list holding a QueryResult (PRIVATE).

    Every block becomes one single-fragment protein HSP. HSPs whose blocks
    share a ``hit_id`` are collected into the same Hit, and Hits keep the
    order in which their IDs were first seen.
    """
    query_id = self.query_id
    hits_by_id = OrderedDict()

    for output_index, block in enumerate(hit_blocks):
        hit_id = block["hit_id"]

        fragment = HSPFragment(hit_id, query_id)
        fragment.molecule_type = "protein"
        # start coordinates are stored one lower than parsed
        # (presumably 1-based to 0-based -- confirm)
        fragment.query_start = block["query_start"] - 1
        fragment.query_end = block["query_end"]
        fragment.hit_start = block["hit_start"] - 1
        fragment.hit_end = block["hit_end"]
        fragment.hit = block["hit_seq"]
        fragment.query = block["query_seq"]

        hsp = HSP([fragment])
        hsp.hit_id = hit_id
        hsp.output_index = output_index
        hsp.query_id = query_id
        hsp.hit_description = block["description"]
        # Open question kept from the original: should everything be included?
        is_included = True
        hsp.is_included = is_included
        for field in ("evalue", "score", "prob"):
            setattr(hsp, field, block[field])

        # later blocks for a known hit only contribute their HSP
        if hit_id in hits_by_id:
            hits_by_id[hit_id].append(hsp)
            continue

        hit = Hit([hsp], hit_id)
        hit.description = block["description"]
        hit.is_included = is_included
        hit.evalue = block["evalue"]
        hit.score = block["score"]
        hits_by_id[hit_id] = hit

    qresult = QueryResult(hits_by_id.values(), query_id)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def _create_qresult(self, hit_blocks):
    """Turn parsed hit blocks into a one-element list holding a QueryResult (PRIVATE).

    Every block becomes one single-fragment HSP carrying the generic
    protein alphabet. HSPs whose blocks share a ``hit_id`` are collected
    into the same Hit, and Hits keep the order in which their IDs were
    first seen.
    """
    query_id = self.query_id
    hits_by_id = OrderedDict()

    for output_index, block in enumerate(hit_blocks):
        hit_id = block['hit_id']

        fragment = HSPFragment(hit_id, query_id)
        fragment.alphabet = generic_protein
        # start coordinates are stored one lower than parsed
        # (presumably 1-based to 0-based -- confirm)
        fragment.query_start = block['query_start'] - 1
        fragment.query_end = block['query_end']
        fragment.hit_start = block['hit_start'] - 1
        fragment.hit_end = block['hit_end']
        fragment.hit = block['hit_seq']
        fragment.query = block['query_seq']

        hsp = HSP([fragment])
        hsp.hit_id = hit_id
        hsp.output_index = output_index
        hsp.query_id = query_id
        hsp.hit_description = block['description']
        # Open question kept from the original: should everything be included?
        is_included = True
        hsp.is_included = is_included
        for field in ('evalue', 'score', 'prob'):
            setattr(hsp, field, block[field])

        # later blocks for a known hit only contribute their HSP
        if hit_id in hits_by_id:
            hits_by_id[hit_id].append(hsp)
            continue

        hit = Hit([hsp], hit_id)
        hit.description = block['description']
        hit.is_included = is_included
        hit.evalue = block['evalue']
        hit.score = block['score']
        hits_by_id[hit_id] = hit

    qresult = QueryResult(hits_by_id.values(), query_id)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def _parse_hit(self, query_id):
    """Parse hit on query identifier (PRIVATE).

    Generator over the hits that belong to one query. It reads lines
    through ``self.handle`` (using ``readline()`` and ``peekline()``),
    keeps the current line in ``self.line`` and yields ``(hit, strand)``
    tuples. ``strand`` is initialised to None and never reassigned in this
    method, so the second tuple item is always None here.

    :param query_id: ID of the query whose hits are parsed; it is passed
        to every HSPFragment and checked against the '>' query line.
    :raises ValueError: if the '>' query line does not match ``query_id``
        or a line is seen in a state that cannot accept it.
    """
    # skip ahead to the first hit header ('>>...')
    # NOTE(review): if readline() returns '' at end of input this loop never
    # exits -- presumably callers only get here when a '>>' line is known
    # to follow; confirm.
    while True:
        self.line = self.handle.readline()
        if self.line.startswith(">>"):
            break

    state = _STATE_NONE
    strand = None
    hsp_list = []
    # placeholders so the names exist before the first '>>' line is handled
    hsp = None
    parsed_hsp = None
    hit_desc = None
    seq_len = None
    while True:
        peekline = self.handle.peekline()
        # yield hit if we've reached the start of a new query or
        # the end of the search
        if peekline.strip() in [">>><<<", ">>>///"] or \
                (not peekline.startswith(">>>") and ">>>" in peekline):
            # the current line has not been consumed by the branches below
            # yet, so append it to the last hit sequence / similarity string
            if state == _STATE_HIT_BLOCK:
                parsed_hsp["hit"]["seq"] += self.line.strip()
            elif state == _STATE_CONS_BLOCK:
                hsp.aln_annotation["similarity"] += \
                    self.line.strip("\r\n")
            # process HSP alignment and coordinates
            _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"])
            hit = Hit(hsp_list)
            hit.description = hit_desc
            hit.seq_len = seq_len
            yield hit, strand
            hsp_list = []
            break
        # yield hit and create a new one if we're still in the same query
        elif self.line.startswith(">>"):
            # try yielding, if we have hsps
            if hsp_list:
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"])
                hit = Hit(hsp_list)
                hit.description = hit_desc
                hit.seq_len = seq_len
                yield hit, strand
                hsp_list = []
            # try to get the hit id and desc, and handle cases without descs
            try:
                hit_id, hit_desc = self.line[2:].strip().split(" ", 1)
            except ValueError:
                hit_id = self.line[2:].strip().split(" ", 1)[0]
                hit_desc = ""
            # create the HSP object for Hit
            frag = HSPFragment(hit_id, query_id)
            hsp = HSP([frag])
            hsp_list.append(hsp)
            # set or reset the state to none
            state = _STATE_NONE
            parsed_hsp = {"query": {}, "hit": {}}
        # create and append a new HSP if line starts with '>--'
        elif self.line.startswith(">--"):
            # set seq attributes of previous hsp
            _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"])
            # and create a new one
            frag = HSPFragment(hit_id, query_id)
            hsp = HSP([frag])
            hsp_list.append(hsp)
            # set the state ~ none yet
            state = _STATE_NONE
            parsed_hsp = {"query": {}, "hit": {}}
        # this is either query or hit data in the HSP, depending on the state
        elif self.line.startswith(">"):
            if state == _STATE_NONE:
                # make sure it's the correct query
                if not query_id.startswith(self.line[1:].split(" ")[0]):
                    raise ValueError("%r vs %r" % (query_id, self.line))
                state = _STATE_QUERY_BLOCK
                parsed_hsp["query"]["seq"] = ""
            elif state == _STATE_QUERY_BLOCK:
                # make sure it's the correct hit
                # NOTE(review): unlike the query check above this is an
                # assert, so it is skipped when Python runs with -O
                assert hit_id.startswith(self.line[1:].split(" ")[0])
                state = _STATE_HIT_BLOCK
                parsed_hsp["hit"]["seq"] = ""
        # check for conservation block
        elif self.line.startswith("; al_cons"):
            state = _STATE_CONS_BLOCK
            hsp.fragment.aln_annotation["similarity"] = ""
        elif self.line.startswith(";"):
            # Fasta outputs do not make a clear distinction between Hit
            # and HSPs, so we check the attribute names to determine
            # whether it belongs to a Hit or HSP
            # NOTE(review): a ';' line that _RE_ATTR does not match makes
            # regx None and the group() calls fail -- confirm this cannot
            # happen for valid input
            regx = re.search(_RE_ATTR, self.line.strip())
            name = regx.group(1)
            value = regx.group(2)
            # for values before the '>...' query block
            if state == _STATE_NONE:
                if name in _HSP_ATTR_MAP:
                    attr_name, caster = _HSP_ATTR_MAP[name]
                    if caster is not str:
                        value = caster(value)
                    # these two values are scaled by 100 before being stored
                    if name in ["_ident", "_sim"]:
                        value *= 100
                    setattr(hsp, attr_name, value)
            # otherwise, pool the values for processing later
            elif state == _STATE_QUERY_BLOCK:
                parsed_hsp["query"][name] = value
            # for values in the hit block
            elif state == _STATE_HIT_BLOCK:
                # '_len' is kept aside for the Hit; the rest stays with the HSP
                if name == "_len":
                    seq_len = int(value)
                else:
                    parsed_hsp["hit"][name] = value
            # attribute lines are not expected in any other state
            else:
                raise ValueError("Unexpected line: %r" % self.line)
        # otherwise, it must be lines containing the sequences
        else:
            assert ">" not in self.line
            # if we're in hit, parse into hsp.hit
            if state == _STATE_HIT_BLOCK:
                parsed_hsp["hit"]["seq"] += self.line.strip()
            elif state == _STATE_QUERY_BLOCK:
                parsed_hsp["query"]["seq"] += self.line.strip()
            elif state == _STATE_CONS_BLOCK:
                # only line endings are stripped here, so leading and
                # trailing spaces of the similarity line are kept
                hsp.fragment.aln_annotation["similarity"] += \
                    self.line.strip("\r\n")
            # we should not get here!
            else:
                raise ValueError("Unexpected line: %r" % self.line)
        self.line = self.handle.readline()
def _parse_hit(self, root_hit_elem, query_id):
    """Generate Hit objects from the children of an Iteration_hits element.

    Arguments:
    root_hit_elem -- Element object of the Iteration_hits tag.
    query_id -- String of QueryResult ID of this Hit

    """
    # Each child of Iteration_hits follows this DTD:
    #   <!ELEMENT Hit (
    #       Hit_num, Hit_id, Hit_def, Hit_accession, Hit_len, Hit_hsps?)>

    # a missing Iteration_hits tag simply leaves nothing to iterate over
    hit_elems = [] if root_hit_elem is None else root_hit_elem

    for hit_elem in hit_elems:
        hit_id = hit_elem.findtext('Hit_id')
        hit_desc = hit_elem.findtext('Hit_def')

        # For BLAST-generated IDs the real ID is the first word of the
        # definition line and the rest of that line is the description.
        # blast_hit_id stays empty unless the ID is BLAST-generated.
        blast_hit_id = ''
        if hit_id.startswith('gnl|BL_ORD_ID|'):
            blast_hit_id = hit_id
            hit_id, _, hit_desc = hit_desc.partition(' ')

        hsp_elems = hit_elem.find('Hit_hsps')
        hit = Hit(list(self._parse_hsp(hsp_elems, query_id, hit_id)))
        hit.description = hit_desc
        hit._blast_id = blast_hit_id

        # copy over the remaining Hit-level fields that are present
        for tag, val_info in _ELEM_HIT.items():
            text = hit_elem.findtext(tag)
            if text is None:
                continue
            caster = val_info[1]
            # values meant to stay strings are stored as read
            if caster is not str:
                text = caster(text)
            setattr(hit, val_info[0], text)

        # release the element's contents once it has been parsed
        hit_elem.clear()
        yield hit
def _parse_hit(self, root_hit_elem, query_id):
    """Generator that transforms Iteration_hits XML elements into Hit objects.

    For every Hit element the primary ID and description are resolved, any
    alternative ID/description pairs packed into the definition line
    (separated by ' >') are split off, the HSPs are parsed with
    ``self._parse_hsp`` and the fields listed in ``_ELEM_HIT`` are copied
    onto the Hit.

    :param root_hit_elem: root element of the Iteration_hits tag.
    :type root_hit_elem: XML element tag
    :param query_id: QueryResult ID of this Hit
    :type query_id: string
    """
    # Hit level processing
    # Hits are stored in the Iteration_hits tag, with the following
    # DTD
    # <!ELEMENT Hit (
    #        Hit_num,
    #        Hit_id,
    #        Hit_def,
    #        Hit_accession,
    #        Hit_len,
    #        Hit_hsps?)>

    # feed the loop below an empty list so iteration still works
    if root_hit_elem is None:
        root_hit_elem = []

    for hit_elem in root_hit_elem:
        hit_id = hit_elem.findtext('Hit_id')
        hit_desc = hit_elem.findtext('Hit_def')

        # handle blast searches against databases with Blast's IDs: the
        # real ID is the first word of the definition line, the rest of
        # the line (possibly empty) is the description
        if hit_id.startswith('gnl|BL_ORD_ID|'):
            blast_hit_id = hit_id
            hit_id, _, hit_desc = hit_desc.partition(' ')
        else:
            blast_hit_id = ''

        # combine primary ID and defline first before splitting
        full_id_desc = hit_id + ' ' + hit_desc
        id_descs = []
        for entry in full_id_desc.split(' >'):
            # partition (instead of unpacking split(' ', 1)) so an entry
            # that has an ID but no description gets an empty description
            # rather than raising ValueError
            entry_id, _, entry_desc = entry.partition(' ')
            id_descs.append((entry_id.strip(), entry_desc.strip()))
        hit_id, hit_desc = id_descs[0]

        hsps = list(self._parse_hsp(hit_elem.find('Hit_hsps'),
                                    query_id, hit_id))
        hit = Hit(hsps)
        hit.description = hit_desc
        hit._id_alt = [x[0] for x in id_descs[1:]]
        hit._description_alt = [x[1] for x in id_descs[1:]]
        # blast_hit_id is only set if the hit ID is Blast-generated
        hit._blast_id = blast_hit_id

        for key, val_info in _ELEM_HIT.items():
            value = hit_elem.findtext(key)
            if value is not None:
                caster = val_info[1]
                # recast only if value is not intended to be str
                if caster is not str:
                    value = caster(value)
                setattr(hit, val_info[0], value)

        # delete element after we finish parsing it
        hit_elem.clear()
        yield hit
def _parse_hit(self, query_id):
    """Parse the hits of one query, yielding ``(hit, strand)`` tuples (PRIVATE).

    Lines are read through ``self.handle`` (using ``readline()`` and
    ``peekline()``) and the current line is kept in ``self.line``.
    ``strand`` is initialised to None and never reassigned in this method,
    so the second tuple item is always None here.

    :param query_id: ID of the query whose hits are parsed; it is passed
        to every HSPFragment and checked against the '>' query line.
    :raises ValueError: if the '>' query line does not match ``query_id``
        or a line is seen in a state that cannot accept it.
    """
    # skip ahead to the first hit header ('>>...')
    while True:
        self.line = self.handle.readline()
        if self.line.startswith('>>'):
            break

    state = _STATE_NONE
    strand = None
    hsp_list = []
    # define these up front so a hit that never sets one of them (e.g. no
    # '_len' attribute line) does not fail with UnboundLocalError
    hsp = None
    parsed_hsp = None
    hit_desc = None
    seq_len = None
    while True:
        peekline = self.handle.peekline()
        # yield hit if we've reached the start of a new query or
        # the end of the search
        if peekline.strip() in [">>><<<", ">>>///"] or \
                (not peekline.startswith('>>>') and '>>>' in peekline):
            # the current line has not been consumed by the branches below
            # yet, so append it to the last hit sequence / similarity string
            if state == _STATE_HIT_BLOCK:
                parsed_hsp['hit']['seq'] += self.line.strip()
            elif state == _STATE_CONS_BLOCK:
                hsp.aln_annotation['similarity'] += \
                    self.line.strip('\r\n')
            # process HSP alignment and coordinates
            _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
            hit = Hit(hsp_list)
            hit.description = hit_desc
            hit.seq_len = seq_len
            yield hit, strand
            hsp_list = []
            break
        # yield hit and create a new one if we're still in the same query
        elif self.line.startswith('>>'):
            # try yielding, if we have hsps
            if hsp_list:
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                hit = Hit(hsp_list)
                hit.description = hit_desc
                hit.seq_len = seq_len
                yield hit, strand
                hsp_list = []
            # try to get the hit id and desc, and handle cases without descs
            try:
                hit_id, hit_desc = self.line[2:].strip().split(' ', 1)
            except ValueError:
                hit_id = self.line[2:].strip().split(' ', 1)[0]
                hit_desc = ''
            # create the HSP object for Hit
            frag = HSPFragment(hit_id, query_id)
            hsp = HSP([frag])
            hsp_list.append(hsp)
            # set or reset the state to none
            state = _STATE_NONE
            parsed_hsp = {'query': {}, 'hit': {}}
        # create and append a new HSP if line starts with '>--'
        elif self.line.startswith('>--'):
            # set seq attributes of previous hsp
            _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
            # and create a new one
            frag = HSPFragment(hit_id, query_id)
            hsp = HSP([frag])
            hsp_list.append(hsp)
            # set the state ~ none yet
            state = _STATE_NONE
            parsed_hsp = {'query': {}, 'hit': {}}
        # this is either query or hit data in the HSP, depending on the state
        elif self.line.startswith('>'):
            if state == _STATE_NONE:
                # make sure it's the correct query; raised explicitly
                # because an assert would be skipped under ``python -O``
                if not query_id.startswith(self.line[1:].split(' ')[0]):
                    raise ValueError("%r vs %r" % (query_id, self.line))
                state = _STATE_QUERY_BLOCK
                parsed_hsp['query']['seq'] = ''
            elif state == _STATE_QUERY_BLOCK:
                # make sure it's the correct hit
                assert hit_id.startswith(self.line[1:].split(' ')[0])
                state = _STATE_HIT_BLOCK
                parsed_hsp['hit']['seq'] = ''
        # check for conservation block
        elif self.line.startswith('; al_cons'):
            state = _STATE_CONS_BLOCK
            hsp.fragment.aln_annotation['similarity'] = ''
        elif self.line.startswith(';'):
            # Fasta outputs do not make a clear distinction between Hit
            # and HSPs, so we check the attribute names to determine
            # whether it belongs to a Hit or HSP
            regx = re.search(_RE_ATTR, self.line.strip())
            name = regx.group(1)
            value = regx.group(2)
            # for values before the '>...' query block
            if state == _STATE_NONE:
                if name in _HSP_ATTR_MAP:
                    attr_name, caster = _HSP_ATTR_MAP[name]
                    if caster is not str:
                        value = caster(value)
                    # these two values are scaled by 100 before being stored
                    if name in ['_ident', '_sim']:
                        value *= 100
                    setattr(hsp, attr_name, value)
            # otherwise, pool the values for processing later
            elif state == _STATE_QUERY_BLOCK:
                parsed_hsp['query'][name] = value
            # for values in the hit block
            elif state == _STATE_HIT_BLOCK:
                # '_len' is kept aside for the Hit; the rest stays with the HSP
                if name == '_len':
                    seq_len = int(value)
                else:
                    parsed_hsp['hit'][name] = value
            # attribute lines are not expected in any other state
            else:
                raise ValueError("Unexpected line: %r" % self.line)
        # otherwise, it must be lines containing the sequences
        else:
            assert '>' not in self.line
            # if we're in hit, parse into hsp.hit
            if state == _STATE_HIT_BLOCK:
                parsed_hsp['hit']['seq'] += self.line.strip()
            elif state == _STATE_QUERY_BLOCK:
                parsed_hsp['query']['seq'] += self.line.strip()
            elif state == _STATE_CONS_BLOCK:
                # only line endings are stripped here, so leading and
                # trailing spaces of the similarity line are kept
                hsp.fragment.aln_annotation['similarity'] += \
                    self.line.strip('\r\n')
            # we should not get here!
            else:
                raise ValueError("Unexpected line: %r" % self.line)
        self.line = self.handle.readline()
def __iter__(self):
    """Yield one QueryResult per record produced by ``self.blast_iter``.

    Each record's alignments become Hit objects and each alignment's HSPs
    become single-fragment HSP objects. Leading '>' markers are removed
    from ``rec.query`` and ``aln.title`` in place on the source records.
    """
    for rec in self.blast_iter:
        # the query line is "<id> <description>", optionally led by '>'
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        qid, _, qdesc = rec.query.partition(' ')
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # the program decides which alphabet the fragments carry
        if qresult.program == 'blastn':
            alphabet = generic_dna
        elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
            alphabet = generic_protein

        # every alignment of the record is one hit
        for aln in rec.alignments:
            # the title is "<id> <description>", led by '> ' or '>'
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(' ')
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                frag.alphabet = alphabet
                # the second item of identities is used as the span
                frag.aln_span = bhsp.identities[1]

                # frames missing from the record fall back to a default
                # that depends on the program
                if qresult.program in ('blastp', 'tblastn'):
                    fallback_frame = 0
                else:
                    fallback_frame = 1
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = fallback_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = fallback_frame

                # coordinates: smaller value minus one is the start,
                # larger value is the end
                q_low, q_high = sorted((bhsp.query_start, bhsp.query_end))
                frag.query_start = q_low - 1
                frag.query_end = q_high
                h_low, h_high = sorted((bhsp.sbjct_start, bhsp.sbjct_end))
                frag.hit_start = h_low - 1
                frag.hit_end = h_high

                # rebuild query, hit and midline column by column; a column
                # with a blank query or hit character must be blank in all
                # three lines and is dropped
                q_chars, h_chars, m_chars = [], [], []
                for column in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    q_char, h_char, m_char = column
                    if q_char == ' ' or h_char == ' ':
                        assert all(' ' == x for x in column)
                        continue
                    q_chars.append(q_char)
                    h_chars.append(h_char)
                    m_chars.append(m_char)
                frag.query, frag.hit = ''.join(q_chars), ''.join(h_chars)
                frag.aln_annotation['similarity'] = ''.join(m_chars)

                # wrap the fragment and attach the HSP-level statistics
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # no gap entry in the record counts as zero gaps
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                # without a positives count the fragment span is used
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
def __iter__(self):
    """Yield one QueryResult per record produced by ``self.blast_iter``.

    Each record's alignments become Hit objects and each alignment's HSPs
    become single-fragment HSP objects. Leading '>' markers are removed
    from ``rec.query`` and ``aln.title`` in place on the source records.
    """
    for rec in self.blast_iter:
        # the query line is "<id> <description>", optionally led by '>'
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        qid, _, qdesc = rec.query.partition(' ')
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # the program decides which alphabet the fragments carry
        if qresult.program == 'blastn':
            alphabet = generic_dna
        elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
            alphabet = generic_protein

        # every alignment of the record is one hit
        for aln in rec.alignments:
            # the title is "<id> <description>", led by '> ' or '>'
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(' ')
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                frag.alphabet = alphabet
                # the second item of identities is used as the span
                frag.aln_span = bhsp.identities[1]

                # frames missing from the record fall back to a default
                # that depends on the program
                if qresult.program in ('blastp', 'tblastn'):
                    fallback_frame = 0
                else:
                    fallback_frame = 1
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = fallback_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = fallback_frame

                # coordinates: smaller value minus one is the start,
                # larger value is the end
                q_low, q_high = sorted((bhsp.query_start, bhsp.query_end))
                frag.query_start = q_low - 1
                frag.query_end = q_high
                h_low, h_high = sorted((bhsp.sbjct_start, bhsp.sbjct_end))
                frag.hit_start = h_low - 1
                frag.hit_end = h_high

                # rebuild query, hit and midline column by column; a column
                # with a blank query or hit character must be blank in all
                # three lines and is dropped
                q_chars, h_chars, m_chars = [], [], []
                for column in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    q_char, h_char, m_char = column
                    if q_char == ' ' or h_char == ' ':
                        assert all(' ' == x for x in column)
                        continue
                    q_chars.append(q_char)
                    h_chars.append(h_char)
                    m_chars.append(m_char)
                frag.query, frag.hit = ''.join(q_chars), ''.join(h_chars)
                frag.aln_annotation['similarity'] = ''.join(m_chars)

                # wrap the fragment and attach the HSP-level statistics
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # no gap entry in the record counts as zero gaps
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                # without a positives count the fragment span is used
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
def _parse_hit(self, root_hit_elem, query_id):
    """Generate Hit objects from the children of an Iteration_hits element.

    Arguments:
    root_hit_elem -- Element object of the Iteration_hits tag.
    query_id -- String of QueryResult ID of this Hit

    """
    # Each child of Iteration_hits follows this DTD:
    #   <!ELEMENT Hit (
    #       Hit_num, Hit_id, Hit_def, Hit_accession, Hit_len, Hit_hsps?)>

    # a missing Iteration_hits tag simply leaves nothing to iterate over
    hit_elems = [] if root_hit_elem is None else root_hit_elem

    for hit_elem in hit_elems:
        hit_id = hit_elem.findtext('Hit_id')
        hit_desc = hit_elem.findtext('Hit_def')

        # For BLAST-generated IDs the real ID is the first word of the
        # definition line and the rest of that line is the description.
        # blast_hit_id stays empty unless the ID is BLAST-generated.
        blast_hit_id = ''
        if hit_id.startswith('gnl|BL_ORD_ID|'):
            blast_hit_id = hit_id
            hit_id, _, hit_desc = hit_desc.partition(' ')

        hsp_elems = hit_elem.find('Hit_hsps')
        hit = Hit(list(self._parse_hsp(hsp_elems, query_id, hit_id)))
        hit.description = hit_desc
        hit._blast_id = blast_hit_id

        # copy over the remaining Hit-level fields that are present
        for tag, val_info in _ELEM_HIT.items():
            text = hit_elem.findtext(tag)
            if text is None:
                continue
            caster = val_info[1]
            # values meant to stay strings are stored as read
            if caster is not str:
                text = caster(text)
            setattr(hit, val_info[0], text)

        # release the element's contents once it has been parsed
        hit_elem.clear()
        yield hit
def __iter__(self):
    """Iterate over BlastTextParser, yields query results.

    Each record from ``self.blast_iter`` becomes one QueryResult; its
    alignments become Hit objects and each alignment's HSPs become
    single-fragment HSP objects. Leading '>' markers are removed from
    ``rec.query`` and ``aln.title`` in place on the source records.
    """
    for rec in self.blast_iter:
        # the query line is "<id> <description>", optionally led by '>'
        if rec.query.startswith(">"):
            rec.query = rec.query[1:]
        qid, _, qdesc = rec.query.partition(" ")
        qdesc = qdesc.replace("\n", "").replace("\r", "")

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # the program decides which molecule type the fragments carry
        if qresult.program == "blastn":
            molecule_type = "DNA"
        elif qresult.program in ["blastp", "blastx", "tblastn", "tblastx"]:
            molecule_type = "protein"

        # every alignment of the record is one hit
        for aln in rec.alignments:
            # the title is "<id> <description>", led by '> ' or '>'
            if aln.title.startswith("> "):
                aln.title = aln.title[2:]
            elif aln.title.startswith(">"):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(" ")
            hdesc = hdesc.replace("\n", "").replace("\r", "")

            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                frag.molecule_type = molecule_type
                # the second item of identities is used as the span
                frag.aln_span = bhsp.identities[1]

                # frames missing from the record fall back to a default
                # that depends on the program
                if qresult.program in ("blastp", "tblastn"):
                    fallback_frame = 0
                else:
                    fallback_frame = 1
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = fallback_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = fallback_frame

                # coordinates: smaller value minus one is the start,
                # larger value is the end
                q_low, q_high = sorted((bhsp.query_start, bhsp.query_end))
                frag.query_start = q_low - 1
                frag.query_end = q_high
                h_low, h_high = sorted((bhsp.sbjct_start, bhsp.sbjct_end))
                frag.hit_start = h_low - 1
                frag.hit_end = h_high

                # rebuild query, hit and midline column by column; a column
                # with a blank query or hit character must be blank in all
                # three lines and is dropped
                q_chars, h_chars, m_chars = [], [], []
                for column in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    q_char, h_char, m_char = column
                    if q_char == " " or h_char == " ":
                        assert all(" " == x for x in column)
                        continue
                    q_chars.append(q_char)
                    h_chars.append(h_char)
                    m_chars.append(m_char)
                frag.query, frag.hit = "".join(q_chars), "".join(h_chars)
                frag.aln_annotation["similarity"] = "".join(m_chars)

                # wrap the fragment and attach the HSP-level statistics
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # no gap entry in the record counts as zero gaps
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                # without a positives count the fragment span is used
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
def _parse_hit(self, root_hit_elem, query_id):
    """Generator that transforms Iteration_hits XML elements into Hit objects.

    For every Hit element the primary ID and description are resolved, any
    alternative ID/description pairs packed into the definition line
    (separated by ' >') are split off, the HSPs are parsed with
    ``self._parse_hsp`` and the fields listed in ``_ELEM_HIT`` are copied
    onto the Hit.

    :param root_hit_elem: root element of the Iteration_hits tag.
    :type root_hit_elem: XML element tag
    :param query_id: QueryResult ID of this Hit
    :type query_id: string
    """
    # Hit level processing
    # Hits are stored in the Iteration_hits tag, with the following
    # DTD
    # <!ELEMENT Hit (
    #        Hit_num,
    #        Hit_id,
    #        Hit_def,
    #        Hit_accession,
    #        Hit_len,
    #        Hit_hsps?)>

    # feed the loop below an empty list so iteration still works
    if root_hit_elem is None:
        root_hit_elem = []

    for hit_elem in root_hit_elem:
        hit_id = hit_elem.findtext('Hit_id')
        hit_desc = hit_elem.findtext('Hit_def')

        # handle blast searches against databases with Blast's IDs: the
        # real ID is the first word of the definition line, the rest of
        # the line (possibly empty) is the description
        if hit_id.startswith('gnl|BL_ORD_ID|'):
            blast_hit_id = hit_id
            hit_id, _, hit_desc = hit_desc.partition(' ')
        else:
            blast_hit_id = ''

        # combine primary ID and defline first before splitting
        full_id_desc = hit_id + ' ' + hit_desc
        id_descs = []
        for entry in full_id_desc.split(' >'):
            # partition (instead of unpacking split(' ', 1)) so an entry
            # that has an ID but no description gets an empty description
            # rather than raising ValueError
            entry_id, _, entry_desc = entry.partition(' ')
            id_descs.append((entry_id.strip(), entry_desc.strip()))
        hit_id, hit_desc = id_descs[0]

        hsps = list(self._parse_hsp(hit_elem.find('Hit_hsps'),
                                    query_id, hit_id))
        hit = Hit(hsps)
        hit.description = hit_desc
        hit._id_alt = [x[0] for x in id_descs[1:]]
        hit._description_alt = [x[1] for x in id_descs[1:]]
        # blast_hit_id is only set if the hit ID is Blast-generated
        hit._blast_id = blast_hit_id

        for key, val_info in _ELEM_HIT.items():
            value = hit_elem.findtext(key)
            if value is not None:
                caster = val_info[1]
                # recast only if value is not intended to be str
                if caster is not str:
                    value = caster(value)
                setattr(hit, val_info[0], value)

        # delete element after we finish parsing it
        hit_elem.clear()
        yield hit