def parse_hsps(self, hit_placeholders):
    """Parse a HMMER2 hsp block, beginning with the hsp table.

    Rows are consumed through ``self.read_next()`` until a line starting
    with ``Alignments`` or ``Histogram``, or a line equal to ``//``, is
    reached. Each data row yields one single-fragment HSP, which is attached
    to the Hit created from the placeholder carrying the same ID. The hits
    are finally appended to ``self.qresult`` in placeholder order.
    """
    # HSP rows need not follow the hit order, so hits are collected by ID
    # first and ordered afterwards
    hits_by_id = {}
    while self.read_next():
        row = self.line
        if row.startswith(("Alignments", "Histogram")) or row == "//":
            break
        # table headers and separator rows carry no data
        if row.startswith(("Model", "Sequence", "--------")):
            continue

        (id_, domain, seq_f, seq_t, seq_compl,
         hmm_f, hmm_t, hmm_compl, score, evalue) = row.split()

        frag = HSPFragment(id_, self.qresult.id)
        frag.alphabet = generic_protein
        # which side the hmm/seq columns describe depends on the program
        program = self._meta["program"]
        if program == "hmmpfam":
            frag.hit_start = int(hmm_f) - 1
            frag.hit_end = int(hmm_t)
            frag.query_start = int(seq_f) - 1
            frag.query_end = int(seq_t)
        elif program == "hmmsearch":
            frag.query_start = int(hmm_f) - 1
            frag.query_end = int(hmm_t)
            frag.hit_start = int(seq_f) - 1
            frag.hit_end = int(seq_t)

        hsp = HSP([frag])
        hsp.evalue = float(evalue)
        hsp.bitscore = float(score)
        # the domain column looks like "<index>/<total>"; keep the index
        hsp.domain_index = int(domain.split("/")[0])
        if program == "hmmpfam":
            hsp.hit_endtype = hmm_compl
            hsp.query_endtype = seq_compl
        elif program == "hmmsearch":
            hsp.query_endtype = hmm_compl
            hsp.hit_endtype = seq_compl

        if id_ in hits_by_id:
            known_hit = hits_by_id[id_]
            hsp.hit_description = known_hit.description
            known_hit.append(hsp)
        else:
            # first HSP for this ID: build the Hit from its placeholder
            matching = [p for p in hit_placeholders if p.id_ == id_]
            hits_by_id[id_] = matching[0].createHit([hsp])

    # The placeholder list is in the correct order, so use that order for
    # the Hit objects in the qresult
    for placeholder in hit_placeholders:
        self.qresult.append(hits_by_id[placeholder.id_])
def parse_hsps(self, hit_placeholders):
    """Parse a HMMER2 hsp block, beginning with the hsp table.

    Reads rows with ``self.read_next()`` until an ``Alignments`` /
    ``Histogram`` line or a ``//`` line is found, turns every data row into
    an HSP, groups the HSPs into Hit objects by ID, and appends those hits
    to ``self.qresult`` following the order of ``hit_placeholders``.
    """
    stop_prefixes = ('Alignments', 'Histogram')
    skip_prefixes = ('Model', 'Sequence', '--------')

    # HSPs may occur in different order than the hits
    # so store Hit objects separately first
    collected = {}
    while self.read_next():
        if self.line.startswith(stop_prefixes) or self.line == '//':
            break
        if self.line.startswith(skip_prefixes):
            continue

        columns = self.line.split()
        (id_, domain, seq_f, seq_t, seq_compl,
         hmm_f, hmm_t, hmm_compl, score, evalue) = columns

        frag = HSPFragment(id_, self.qresult.id)
        frag.alphabet = generic_protein

        # decide once which side (hit or query) the hmm columns describe;
        # the seq columns then describe the other side
        program = self._meta['program']
        if program == 'hmmpfam':
            hmm_side, seq_side = 'hit', 'query'
        elif program == 'hmmsearch':
            hmm_side, seq_side = 'query', 'hit'
        else:
            hmm_side = seq_side = None

        if hmm_side is not None:
            setattr(frag, hmm_side + '_start', int(hmm_f) - 1)
            setattr(frag, hmm_side + '_end', int(hmm_t))
            setattr(frag, seq_side + '_start', int(seq_f) - 1)
            setattr(frag, seq_side + '_end', int(seq_t))

        hsp = HSP([frag])
        hsp.evalue = float(evalue)
        hsp.bitscore = float(score)
        # only the part before the slash of the domain column is kept
        hsp.domain_index = int(domain.split('/')[0])
        if hmm_side is not None:
            setattr(hsp, hmm_side + '_endtype', hmm_compl)
            setattr(hsp, seq_side + '_endtype', seq_compl)

        if id_ not in collected:
            # first HSP seen for this hit: create the Hit from the first
            # placeholder with a matching ID
            candidates = [p for p in hit_placeholders if p.id_ == id_]
            collected[id_] = candidates[0].createHit([hsp])
        else:
            existing = collected[id_]
            hsp.hit_description = existing.description
            existing.append(hsp)

    # The placeholder list is in the correct order, so use that order for
    # the Hit objects in the qresult
    for p in hit_placeholders:
        self.qresult.append(collected[p.id_])
def _create_hsp(hid, qid, hspd):
    """Return one HSP object built from the given parsed HSP values (PRIVATE).

    One HSPFragment is created per entry of ``hspd["query_ranges"]``; a
    fragment whose "similarity" annotation contains ``#`` is replaced by
    the pieces returned from ``_split_fragment``. The optional keys listed
    at the end of the function are copied from ``hspd`` onto the HSP.
    """
    # these lookups do not depend on the block index
    hit_seqs = hspd.get("hit")
    query_seqs = hspd.get("query")
    annotations = hspd.get("aln_annotation", {})

    frags = []
    # we are iterating over query_ranges, but hit_ranges works just as well
    for idx, qcoords in enumerate(hspd["query_ranges"]):
        # sequences are optional; fall back to empty strings
        hseq = "" if hit_seqs is None else hit_seqs[idx]
        qseq = "" if query_seqs is None else query_seqs[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)

        # coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        hcoords = hspd["hit_ranges"][idx]
        frag.hit_start = hcoords[0]
        frag.hit_end = hcoords[1]

        # alignment annotation; an annotation list too short for this
        # block ends the copying and later keys are left unset
        try:
            for key, value in annotations.items():
                frag.aln_annotation[key] = value[idx]
        except IndexError:
            pass

        # strands
        frag.query_strand = hspd["query_strand"]
        frag.hit_strand = hspd["hit_strand"]

        # fragments whose similarity line contains '#' are split up and
        # the resulting pieces are stored instead of the fragment itself
        similarity = frag.aln_annotation.get("similarity")
        if similarity is not None and "#" in similarity:
            frags.extend(_split_fragment(frag))
            continue

        # try to set frame if there are translation in the alignment
        has_translation = (
            len(frag.aln_annotation) > 1
            or frag.query_strand == 0
            or ("vulgar_comp" in hspd
                and re.search(_RE_TRANS, hspd["vulgar_comp"]))
        )
        if has_translation:
            _set_frame(frag)
        frags.append(frag)

    # if the query is protein, we need to change the hit and query sequences
    # from three-letter amino acid codes to one letter, and adjust their
    # coordinates accordingly
    if len(frags[0].aln_annotation) == 2:  # 2 annotations == protein query
        frags = _adjust_aa_seq(frags)

    hsp = HSP(frags)
    # copy over whichever hsp-specific attributes were parsed
    optional_attrs = (
        "score",
        "hit_split_codons",
        "query_split_codons",
        "model",
        "vulgar_comp",
        "cigar_comp",
        "alphabet",
    )
    for attr in optional_attrs:
        if attr in hspd:
            setattr(hsp, attr, hspd[attr])

    return hsp
def _create_hsp(hid, qid, hspd):
    """Build a single HSP object from the given parsed HSP values.

    ``hspd`` is read as a mapping: "query_ranges" and "hit_ranges" give the
    per-block coordinates, "hit", "query" and "aln_annotation" are optional
    per-block lists, and "query_strand" / "hit_strand" apply to every block.
    """
    fragments = []
    hit_seq_list = hspd.get("hit")
    query_seq_list = hspd.get("query")
    aln_annot = hspd.get("aln_annotation", {})

    # we are iterating over query_ranges, but hit_ranges works just as well
    for block_no, query_range in enumerate(hspd["query_ranges"]):
        # get sequences, create object
        if hit_seq_list is None:
            hit_seq = ""
        else:
            hit_seq = hit_seq_list[block_no]
        if query_seq_list is None:
            query_seq = ""
        else:
            query_seq = query_seq_list[block_no]
        frag = HSPFragment(hid, qid, hit=hit_seq, query=query_seq)

        # coordinates
        frag.query_start = query_range[0]
        frag.query_end = query_range[1]
        frag.hit_start = hspd["hit_ranges"][block_no][0]
        frag.hit_end = hspd["hit_ranges"][block_no][1]

        # alignment annotation (best effort: stop at the first annotation
        # that has no entry for this block)
        try:
            for name, per_block in aln_annot.items():
                frag.aln_annotation[name] = per_block[block_no]
        except IndexError:
            pass

        # strands
        frag.query_strand = hspd["query_strand"]
        frag.hit_strand = hspd["hit_strand"]

        # a '#' in the similarity line means the fragment has to be split;
        # the pieces replace the fragment in the result
        sim_line = frag.aln_annotation.get("similarity")
        must_split = sim_line is not None and "#" in sim_line
        if must_split:
            fragments.extend(_split_fragment(frag))
        else:
            # try to set frame if there are translation in the alignment
            if len(frag.aln_annotation) > 1:
                translated = True
            elif frag.query_strand == 0:
                translated = True
            else:
                translated = ("vulgar_comp" in hspd and
                              re.search(_RE_TRANS, hspd["vulgar_comp"]))
            if translated:
                _set_frame(frag)
            fragments.append(frag)

    # if the query is protein, we need to change the hit and query sequences
    # from three-letter amino acid codes to one letter, and adjust their
    # coordinates accordingly
    if len(fragments[0].aln_annotation) == 2:  # 2 annotations == protein query
        fragments = _adjust_aa_seq(fragments)

    hsp = HSP(fragments)
    # set hsp-specific attributes, but only those present in hspd
    for name in ("score", "hit_split_codons", "query_split_codons",
                 "model", "vulgar_comp", "cigar_comp", "alphabet"):
        if name in hspd:
            setattr(hsp, name, hspd[name])
    return hsp
def _create_hsp(hid, qid, hspd):
    """Return an HSP object assembled from the given parsed HSP values.

    A fragment is created for each entry in ``hspd['query_ranges']``.
    Fragments whose 'homology' annotation contains ``#`` are expanded with
    ``_split_fragment``; all others may get their frame set before being
    stored. Optional hsp-level values are copied from ``hspd`` at the end.
    """
    seqs_hit = hspd.get('hit')
    seqs_query = hspd.get('query')
    annots = hspd.get('aln_annotation', {})

    frags = []
    # we are iterating over query_ranges, but hit_ranges works just as well
    for idx, qcoords in enumerate(hspd['query_ranges']):
        # get sequences, create object
        frag = HSPFragment(
            hid,
            qid,
            hit='' if seqs_hit is None else seqs_hit[idx],
            query='' if seqs_query is None else seqs_query[idx],
        )

        # coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        hit_range = hspd['hit_ranges'][idx]
        frag.hit_start = hit_range[0]
        frag.hit_end = hit_range[1]

        # alignment annotation; give up quietly once an annotation has no
        # value for this index
        try:
            for key, values in annots.items():
                frag.aln_annotation[key] = values[idx]
        except IndexError:
            pass

        # strands
        frag.query_strand = hspd['query_strand']
        frag.hit_strand = hspd['hit_strand']

        # fragments marked with '#' in the homology line are split and
        # their pieces stored in place of the fragment
        homology = frag.aln_annotation.get('homology')
        if homology is not None and '#' in homology:
            frags.extend(_split_fragment(frag))
            continue

        # try to set frame if there are translation in the alignment
        if (len(frag.aln_annotation) > 1
                or frag.query_strand == 0
                or ('vulgar_comp' in hspd
                    and re.search(_RE_TRANS, hspd['vulgar_comp']))):
            _set_frame(frag)
        frags.append(frag)

    # if the query is protein, we need to change the hit and query sequences
    # from three-letter amino acid codes to one letter, and adjust their
    # coordinates accordingly
    first_annotation_count = len(frags[0].aln_annotation)
    if first_annotation_count == 2:  # 2 annotations == protein query
        frags = _adjust_aa_seq(frags)

    hsp = HSP(frags)
    # set hsp-specific attributes
    hsp_attrs = ('score', 'hit_split_codons', 'query_split_codons',
                 'model', 'vulgar_comp', 'cigar_comp', 'alphabet')
    for attr in hsp_attrs:
        if attr in hspd:
            setattr(hsp, attr, hspd[attr])

    return hsp
def _create_qresult(self, hit_blocks):
    """Create the Biopython data structures from the parsed data (PRIVATE).

    Every entry of ``hit_blocks`` becomes one single-fragment HSP. HSPs that
    share a hit ID are grouped into one Hit, in order of first appearance.
    Returns a one-element list holding the resulting QueryResult.
    """
    query_id = self.query_id
    hits = OrderedDict()

    for output_index, block in enumerate(hit_blocks):
        hit_id = block['hit_id']
        description = block['description']

        frag = HSPFragment(hit_id, query_id)
        # frag.alphabet = generic_protein
        # starts are shifted down by one, except for falsy values (0 or
        # None), which are stored unchanged
        query_start = block['query_start']
        frag.query_start = query_start - 1 if query_start else query_start
        frag.query_end = block['query_end']
        hit_start = block['hit_start']
        frag.hit_start = hit_start - 1 if hit_start else hit_start
        frag.hit_end = block['hit_end']
        frag.hit = block['hit_seq']
        frag.query = block['query_seq']

        hsp = HSP([frag])
        hsp.hit_id = hit_id
        hsp.output_index = output_index
        hsp.query_id = query_id
        hsp.hit_description = description
        is_included = True  # Should everything be included?
        hsp.is_included = is_included
        hsp.evalue = block['evalue']
        hsp.score = block['score']
        hsp.prob = block['prob']
        hsp.hit_seq_len = block['hit_seq_len']
        hsp.text = block['text']

        if hit_id in hits:
            hits[hit_id].append(hsp)
            continue

        # first block for this ID: the Hit takes its values from it
        hit = Hit([hsp], hit_id)
        hit.description = description
        hit.is_included = is_included
        hit.evalue = block['evalue']
        hit.score = block['score']
        hits[hit_id] = hit

    qresult = QueryResult(hits.values(), query_id)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def _create_qresult(self, hit_blocks):
    """Create the Biopython data structures from the parsed data (PRIVATE).

    Each parsed block is wrapped in an HSPFragment and an HSP; HSPs are
    grouped into Hit objects keyed by hit ID, keeping first-seen order.
    The return value is a list containing the single QueryResult.
    """
    query_id = self.query_id
    grouped = OrderedDict()

    for position, block in enumerate(hit_blocks):
        hit_id = block['hit_id']

        frag = HSPFragment(hit_id, query_id)
        frag.alphabet = generic_protein
        # presumably converts 1-based starts to 0-based - TODO confirm
        frag.query_start = block['query_start'] - 1
        frag.query_end = block['query_end']
        frag.hit_start = block['hit_start'] - 1
        frag.hit_end = block['hit_end']
        frag.hit = block['hit_seq']
        frag.query = block['query_seq']

        hsp = HSP([frag])
        hsp.hit_id = hit_id
        hsp.output_index = position
        hsp.query_id = query_id
        hsp.hit_description = block['description']
        is_included = True  # Should everything be included?
        hsp.is_included = is_included
        hsp.evalue = block['evalue']
        hsp.score = block['score']
        hsp.prob = block['prob']

        known = grouped.get(hit_id)
        if known is not None:
            known.append(hsp)
        else:
            # hit-level values come from the first block with this ID
            new_hit = Hit([hsp], hit_id)
            new_hit.description = block['description']
            new_hit.is_included = is_included
            new_hit.evalue = block['evalue']
            new_hit.score = block['score']
            grouped[hit_id] = new_hit

    result = QueryResult(grouped.values(), query_id)
    result.program = _PROGRAM
    result.seq_len = self.seq_len
    return [result]
def _create_qresult(self, hit_blocks):
    """Create the Biopython data structures from the parsed data (PRIVATE).

    Turns the parsed ``hit_blocks`` into HSP objects (one fragment each),
    collects them into Hit objects by hit ID in order of first appearance,
    and returns ``[qresult]`` with the QueryResult wrapping those hits.
    """
    query_id = self.query_id
    hit_dict = OrderedDict()

    for output_index, block in enumerate(hit_blocks):
        hit_id = block["hit_id"]
        desc = block["description"]
        evalue = block["evalue"]
        score = block["score"]

        frag = HSPFragment(hit_id, query_id)
        frag.molecule_type = "protein"
        # start positions are stored one lower than parsed
        frag.query_start = block["query_start"] - 1
        frag.query_end = block["query_end"]
        frag.hit_start = block["hit_start"] - 1
        frag.hit_end = block["hit_end"]
        frag.hit = block["hit_seq"]
        frag.query = block["query_seq"]

        hsp = HSP([frag])
        hsp.hit_id = hit_id
        hsp.output_index = output_index
        hsp.query_id = query_id
        hsp.hit_description = desc
        is_included = True  # Should everything be included?
        hsp.is_included = is_included
        hsp.evalue = evalue
        hsp.score = score
        hsp.prob = block["prob"]

        if hit_id in hit_dict:
            # later blocks only contribute their HSP
            hit_dict[hit_id].append(hsp)
        else:
            hit = Hit([hsp], hit_id)
            hit.description = desc
            hit.is_included = is_included
            hit.evalue = evalue
            hit.score = score
            hit_dict[hit_id] = hit

    qresult = QueryResult(hit_dict.values(), query_id)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def _create_hits(self, hit_attrs, qid, qdesc):
    """Parse a HMMER3 hsp block, beginning with the hsp table.

    For every ``>>`` hit header the domain table below it is parsed into
    HSP objects, which are wrapped in a Hit. The attributes for that Hit
    are taken from the front of ``hit_attrs`` (the list is consumed in
    place). Parsing ends at the ``Internal pipeline`` line and the list of
    Hit objects is returned.
    """
    # read through until the beginning of the hsp block
    self._read_until(
        lambda line: line.startswith(('Internal pipeline', '>>')))

    # lines that end the domain table of the current hit: no hits at all,
    # no domains, end of results, start of alignments, or the next hit
    table_end = (
        ' [No targets detected that satisfy',
        ' [No individual domains',
        'Internal pipeline statistics summary:',
        ' Alignments for each domain:',
        '>>',
    )

    hit_list = []
    while not self.line.startswith('Internal pipeline'):
        assert self.line.startswith('>>')
        hid, hdesc = self.line[len('>> '):].split(' ', 1)

        # read through the hsp table header and move one more line
        self._read_until(
            lambda line: line.startswith((' --- ------ ----- --------',
                                          ' [No individual domains')))
        self.line = read_forward(self.handle)

        # parse the hsp table for the current hit
        hsp_list = []
        while not self.line.startswith(table_end):
            parsed = list(filter(None, self.line.strip().split(' ')))
            assert len(parsed) == 16

            frag = HSPFragment(hid, qid)
            # HMMER3 alphabets are always protein alphabets
            frag.alphabet = generic_protein
            # depending on whether the program is hmmsearch, hmmscan, or
            # phmmer, columns 6-7 and 9-10 describe either the hit or the
            # query; for hmmscan, hit is the hmm profile, query is the
            # sequence
            program = self._meta.get('program')
            # adjust 'from' and 'to' coordinates to 0-based ones
            if program == 'hmmscan':
                frag.hit_start = int(parsed[6]) - 1
                frag.hit_end = int(parsed[7])
                frag.query_start = int(parsed[9]) - 1
                frag.query_end = int(parsed[10])
            elif program in ['hmmsearch', 'phmmer']:
                frag.hit_start = int(parsed[9]) - 1
                frag.hit_end = int(parsed[10])
                frag.query_start = int(parsed[6]) - 1
                frag.query_end = int(parsed[7])
            # strand is always 0, since HMMER now only handles protein
            frag.hit_strand = frag.query_strand = 0

            hsp = HSP([frag])
            hsp.domain_index = int(parsed[0])
            hsp.is_included = parsed[1] == '!'
            hsp.bitscore = float(parsed[2])
            hsp.bias = float(parsed[3])
            hsp.evalue_cond = float(parsed[4])
            hsp.evalue = float(parsed[5])
            # end-type markers follow the same program-dependent mapping
            if program == 'hmmscan':
                hsp.hit_endtype = parsed[8]
                hsp.query_endtype = parsed[11]
            elif program in ['hmmsearch', 'phmmer']:
                hsp.hit_endtype = parsed[11]
                hsp.query_endtype = parsed[8]
            # adjust 'from' and 'to' coordinates to 0-based ones
            hsp.env_start = int(parsed[12]) - 1
            hsp.env_end = int(parsed[13])
            hsp.env_endtype = parsed[14]
            hsp.acc_avg = float(parsed[15])
            hsp_list.append(hsp)

            self.line = read_forward(self.handle)

        # the table is finished: build the hit from the collected hsps
        hit_attr = hit_attrs.pop(0)
        hit = Hit(hsp_list)
        for attr, value in hit_attr.items():
            setattr(hit, attr, value)
        if not hit:
            hit.query_description = qdesc
        hit_list.append(hit)

        # parse the hsp alignments
        if self.line.startswith(' Alignments for each domain:'):
            self._parse_aln_block(hid, hit.hsps)

    # by this time we should've emptied the hit attr list
    assert len(hit_attrs) == 0
    return hit_list
def _create_hits(self, hit_attrs, qid, qdesc):
    """Parse a HMMER3 hsp block, beginning with the hsp table.

    Each ``>>`` header starts a hit; the rows of its domain table become
    HSP objects and the hit-level attributes are popped from the front of
    ``hit_attrs``. A "description" attribute is not applied when the hit
    already has a description that starts with it. Returns the list of
    Hit objects once the ``Internal pipeline`` line is reached.
    """
    def at_block_start(line):
        return line.startswith('Internal pipeline') or line.startswith('>>')

    def at_table_start(line):
        return (line.startswith(' --- ------ ----- --------')
                or line.startswith(' [No individual domains'))

    # read through until the beginning of the hsp block
    self._read_until(at_block_start)

    end_markers = (
        ' [No targets detected that satisfy',
        ' [No individual domains',
        'Internal pipeline statistics summary:',
        ' Alignments for each domain:',
        '>>',
    )

    hit_list = []
    while not self.line.startswith('Internal pipeline'):
        assert self.line.startswith('>>')
        header = self.line[len('>> '):]
        hid, hdesc = header.split(' ', 1)
        hdesc = hdesc.strip()

        # read through the hsp table header and move one more line
        self._read_until(at_table_start)
        self.line = read_forward(self.handle)

        # parse the hsp table for the current hit; stop when there are no
        # hits, after the last hsp, or at the start of a new hit
        hsp_list = []
        while not self.line.startswith(end_markers):
            parsed = [col for col in self.line.strip().split(' ') if col]
            assert len(parsed) == 16

            frag = HSPFragment(hid, qid)
            # set query and hit descriptions if they are defined / nonempty
            if qdesc:
                frag.query_description = qdesc
            if hdesc:
                frag.hit_description = hdesc
            # HMMER3 alphabets are always protein alphabets
            frag.alphabet = generic_protein

            # for hmmscan, hit is the hmm profile and query is the
            # sequence; for hmmsearch and phmmer it is the other way round
            program = self._meta.get('program')
            hmm_is_hit = program == 'hmmscan'
            hmm_is_query = (not hmm_is_hit
                            and program in ['hmmsearch', 'phmmer'])

            # adjust 'from' and 'to' coordinates to 0-based ones
            if hmm_is_hit:
                frag.hit_start = int(parsed[6]) - 1
                frag.hit_end = int(parsed[7])
                frag.query_start = int(parsed[9]) - 1
                frag.query_end = int(parsed[10])
            elif hmm_is_query:
                frag.hit_start = int(parsed[9]) - 1
                frag.hit_end = int(parsed[10])
                frag.query_start = int(parsed[6]) - 1
                frag.query_end = int(parsed[7])
            # strand is always 0, since HMMER now only handles protein
            frag.hit_strand = frag.query_strand = 0

            hsp = HSP([frag])
            hsp.domain_index = int(parsed[0])
            hsp.is_included = parsed[1] == '!'
            hsp.bitscore = float(parsed[2])
            hsp.bias = float(parsed[3])
            hsp.evalue_cond = float(parsed[4])
            hsp.evalue = float(parsed[5])
            # columns 8 and 11 hold the end-type markers
            if hmm_is_hit:
                hsp.hit_endtype = parsed[8]
                hsp.query_endtype = parsed[11]
            elif hmm_is_query:
                hsp.hit_endtype = parsed[11]
                hsp.query_endtype = parsed[8]
            # adjust 'from' and 'to' coordinates to 0-based ones
            hsp.env_start = int(parsed[12]) - 1
            hsp.env_end = int(parsed[13])
            hsp.env_endtype = parsed[14]
            hsp.acc_avg = float(parsed[15])

            hsp_list.append(hsp)
            self.line = read_forward(self.handle)

        hit_attr = hit_attrs.pop(0)
        hit = Hit(hsp_list)
        for name, value in hit_attr.items():
            if name == "description":
                # keep an existing description that begins with this value
                current = hit.description
                if current and value and current.startswith(value):
                    continue
            setattr(hit, name, value)
        if not hit:
            hit.query_description = qdesc
        hit_list.append(hit)

        # parse the hsp alignments
        if self.line.startswith(' Alignments for each domain:'):
            self._parse_aln_block(hid, hit.hsps)

    # by this time we should've emptied the hit attr list
    assert len(hit_attrs) == 0
    return hit_list
def _create_hsp(hid, qid, psl):
    """Create an HSP object from one parsed PSL row.

    ``hid`` and ``qid`` are passed to every HSPFragment as hit and query
    identifiers. ``psl`` is read as a mapping holding the PSL columns
    ('strand', 'qstarts', 'tstarts', 'blocksizes', 'qsize', 'tsize',
    'qstart', 'qend', 'tstart', 'tend', the match / insert counters and,
    optionally, 'qseqs' and 'tseqs').

    One fragment is created per block. The assembled HSP is checked against
    the start / end / block size values in ``psl`` with assertions, then the
    counters and derived statistics are stored on it and it is returned.
    """
    # protein flag
    is_protein = _is_protein(psl)
    # strand
    # if query is protein, strand is 0
    if is_protein:
        qstrand = 0
    else:
        qstrand = 1 if psl['strand'][0] == '+' else -1
    # try to get hit strand, if it exists
    try:
        hstrand = 1 if psl['strand'][1] == '+' else -1
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    # query block starts
    qstarts = _reorient_starts(
        psl['qstarts'], psl['blocksizes'], psl['qsize'], qstrand)
    # hit block starts; only reoriented when a hit strand was given
    if len(psl['strand']) == 2:
        hstarts = _reorient_starts(
            psl['tstarts'], psl['blocksizes'], psl['tsize'], hstrand)
    else:
        hstarts = psl['tstarts']

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(psl['blocksizes'])
    # The ranges are materialised as lists because they are measured with
    # len() and indexed below; on Python 3 a bare zip() object supports
    # neither operation.
    query_range_all = list(
        zip(qstarts, [x + y for x, y in zip(qstarts, psl['blocksizes'])]))
    hit_range_all = list(
        zip(hstarts, [x + y for x, y in zip(hstarts, psl['blocksizes'])]))
    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert (len(psl['tseqs']) == len(psl['qseqs'])
                == len(query_range_all) == len(hit_range_all))
    else:
        assert len(query_range_all) == len(hit_range_all)

    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, qcoords in enumerate(query_range_all):
        # sequences are optional; fall back to empty strings
        hseqlist = psl.get('tseqs')
        hseq = '' if not hseqlist else hseqlist[idx]
        qseqlist = psl.get('qseqs')
        qseq = '' if not qseqlist else qseqlist[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        frag.hit_start = hit_range_all[idx][0]
        frag.hit_end = hit_range_all[idx][1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well
    assert hsp.query_span_all == hsp.hit_span_all == psl['blocksizes']
    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']

    # derived totals
    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2

    return hsp
def __iter__(self):
    """Yield one QueryResult per record of ``self.blast_iter``.

    Each record's alignments become Hit objects and each of their hsps
    becomes a single-fragment HSP. A leading ``>`` is stripped from
    ``rec.query`` and ``aln.title`` in place before IDs and descriptions
    are split off at the first space.
    """
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        qid, _, qdesc = rec.query.partition(' ')
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # determine alphabet based on program
        if qresult.program == 'blastn':
            alphabet = generic_dna
        elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
            alphabet = generic_protein

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(' ')
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                frag.alphabet = alphabet
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames; a missing frame falls back to 0 for blastp
                # and tblastn, and to 1 for every other program
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = (
                        0 if qresult.program in ('blastp', 'tblastn') else 1)
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = (
                        0 if qresult.program in ('blastp', 'tblastn') else 1)
                # set query coordinates
                frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates
                frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)

                # set query, hit sequences and its annotation; columns
                # with a space in the query or hit row are dropped
                qchars, hchars, mchars = [], [], []
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == ' ' or hchar == ' ':
                        assert all(' ' == x for x in seqtrio)
                        continue
                    qchars.append(qchar)
                    hchars.append(hchar)
                    mchars.append(mchar)
                frag.query, frag.hit = ''.join(qchars), ''.join(hchars)
                frag.aln_annotation['similarity'] = ''.join(mchars)

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
def __iter__(self):
    """Iterate over ``self.blast_iter``, yielding QueryResult objects.

    For every record a QueryResult is filled with one Hit per alignment and
    one single-fragment HSP per hsp of that alignment. ``rec.query`` and
    ``aln.title`` lose a leading ``>`` marker in place; the text before the
    first space is the ID and the rest (without line breaks) the
    description.
    """
    protein_programs = ['blastp', 'blastx', 'tblastn', 'tblastx']
    frameless_programs = ('blastp', 'tblastn')

    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        query_parts = rec.query.split(' ', 1)
        if len(query_parts) == 2:
            qid, qdesc = query_parts
        else:
            qid, qdesc = rec.query, ''
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # determine alphabet based on program
        if qresult.program == 'blastn':
            alphabet = generic_dna
        elif qresult.program in protein_programs:
            alphabet = generic_protein

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            title_parts = aln.title.split(' ', 1)
            if len(title_parts) == 2:
                hid, hdesc = title_parts
            else:
                hid, hdesc = aln.title, ''
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                frag.alphabet = alphabet
                # set alignment length
                frag.aln_span = bhsp.identities[1]

                # set frames
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    if qresult.program in frameless_programs:
                        frag.query_frame = 0
                    else:
                        frag.query_frame = 1
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    if qresult.program in frameless_programs:
                        frag.hit_frame = 0
                    else:
                        frag.hit_frame = 1

                # set query coordinates
                q_low = min(bhsp.query_start, bhsp.query_end)
                q_high = max(bhsp.query_start, bhsp.query_end)
                frag.query_start = q_low - 1
                frag.query_end = q_high
                # set hit coordinates
                h_low = min(bhsp.sbjct_start, bhsp.sbjct_end)
                h_high = max(bhsp.sbjct_start, bhsp.sbjct_end)
                frag.hit_start = h_low - 1
                frag.hit_end = h_high

                # set query, hit sequences and its annotation: keep only
                # the columns where neither query nor hit is a space
                kept = []
                for column in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    if column[0] == ' ' or column[1] == ' ':
                        assert all(' ' == x for x in column)
                    else:
                        kept.append(column)
                if kept:
                    qseq, hseq, midline = (
                        ''.join(chars) for chars in zip(*kept))
                else:
                    qseq = hseq = midline = ''
                frag.query, frag.hit = qseq, hseq
                frag.aln_annotation['similarity'] = midline

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity; a missing positives count falls back to
                # the alignment span of the fragment
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
def _create_hsp(hid, qid, psl):
    """Create high scoring pair object (PRIVATE).

    Builds one HSPFragment per PSL block from the parsed row ``psl`` and
    wraps them in an HSP. When ``_is_protein(psl)`` is true, block sizes
    are multiplied by three on the hit side. The HSP is validated against
    the coordinates in ``psl`` with assertions before its statistics are
    filled in.
    """
    is_protein = _is_protein(psl)
    strand = psl["strand"]
    blocksizes = psl["blocksizes"]

    # if query is protein, strand is 0
    if is_protein:
        qstrand = 0
    elif strand[0] == "+":
        qstrand = 1
    else:
        qstrand = -1

    # the hit strand is the optional second character and defaults to plus
    if len(strand) > 1:
        hstrand = 1 if strand[1] == "+" else -1
    else:
        hstrand = 1

    blocksize_multiplier = 3 if is_protein else 1

    # query block starts
    qstarts = _reorient_starts(psl["qstarts"], blocksizes, psl["qsize"],
                               qstrand)
    # hit block starts, reoriented only when a hit strand is given
    if len(strand) == 2:
        hit_blocksizes = [blocksize_multiplier * size for size in blocksizes]
        hstarts = _reorient_starts(psl["tstarts"], hit_blocksizes,
                                   psl["tsize"], hstrand)
    else:
        hstarts = psl["tstarts"]

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(blocksizes)
    query_range_all = [
        (start, start + size) for start, size in zip(qstarts, blocksizes)
    ]
    hit_range_all = [
        (start, start + size * blocksize_multiplier)
        for start, size in zip(hstarts, blocksizes)
    ]

    # check length of sequences and coordinates, all must match
    if "tseqs" in psl and "qseqs" in psl:
        assert (len(psl["tseqs"]) == len(psl["qseqs"])
                == len(query_range_all) == len(hit_range_all))
    else:
        assert len(query_range_all) == len(hit_range_all)

    hit_seqs = psl.get("tseqs")
    query_seqs = psl.get("qseqs")
    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, (qstart, qend) in enumerate(query_range_all):
        frag = HSPFragment(
            hid,
            qid,
            hit=hit_seqs[idx] if hit_seqs else "",
            query=query_seqs[idx] if query_seqs else "",
        )
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = qstart
        frag.query_end = qend
        frag.hit_start = hit_range_all[idx][0]
        frag.hit_end = hit_range_all[idx][1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl["qstart"]
    assert hsp.query_end == psl["qend"]
    assert hsp.hit_start == psl["tstart"]
    assert hsp.hit_end == psl["tend"]
    # and check block spans as well, scaling hit spans back to block units
    hit_spans = [span / blocksize_multiplier for span in hsp.hit_span_all]
    assert hit_spans == hsp.query_span_all == psl["blocksizes"]

    # counters copied straight from the PSL row
    hsp.match_num = psl["matches"]
    hsp.mismatch_num = psl["mismatches"]
    hsp.match_rep_num = psl["repmatches"]
    hsp.n_num = psl["ncount"]
    hsp.query_gapopen_num = psl["qnuminsert"]
    hsp.query_gap_num = psl["qbaseinsert"]
    hsp.hit_gapopen_num = psl["tnuminsert"]
    hsp.hit_gap_num = psl["tbaseinsert"]

    # derived totals
    hsp.ident_num = psl["matches"] + psl["repmatches"]
    hsp.gapopen_num = psl["qnuminsert"] + psl["tnuminsert"]
    hsp.gap_num = psl["qbaseinsert"] + psl["tbaseinsert"]
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl["strand"]) == 2

    return hsp
def _create_hits(self, hit_attrs, qid, qdesc):
    """Parse a HMMER3 hsp block, beginning with the hsp table (PRIVATE).

    Arguments:
     - hit_attrs - list of attribute dicts, one per hit; entries are popped
       from the front as hits are built and the list is asserted to be
       empty once the block ends.
     - qid - query identifier handed to each ``HSPFragment``.
     - qdesc - query description; copied to fragments when non-empty.

    Returns the list of ``Hit`` objects built from the block.
    """
    # NOTE(review): every prefix below is matched verbatim with
    # ``startswith``, so the spacing inside the literals is significant --
    # confirm it against a real HMMER3 report before editing any of them.

    # skip ahead to the beginning of the hsp block
    self._read_until(
        lambda line: line.startswith(("Internal pipeline", ">>")))

    # any of these prefixes ends the hsp table of the current hit
    table_end = (
        " [No targets detected that satisfy",
        " [No individual domains",
        "Internal pipeline statistics summary:",
        " Alignments for each domain:",
        ">>",
    )

    hits = []
    while not self.line.startswith("Internal pipeline"):
        assert self.line.startswith(">>")
        hid, hdesc = self.line[len(">> "):].split(" ", 1)
        hdesc = hdesc.strip()

        # skip the hsp table header, then step onto the first table row
        self._read_until(
            lambda line: line.startswith(
                (" --- ------ ----- --------", " [No individual domains")))
        self.line = read_forward(self.handle)

        # one HSP per table row of the current hit
        hsps = []
        while not self.line.startswith(table_end):
            cols = list(filter(None, self.line.strip().split(" ")))
            assert len(cols) == 16
            # columns as consumed below:
            #   0 domain index, 1 inclusion flag, 2 bitscore, 3 bias,
            #   4 conditional evalue, 5 evalue,
            #   6-7 hmm from/to, 8 their end type,
            #   9-10 ali from/to, 11 their end type,
            #   12-13 env from/to, 14 env end type, 15 acc_avg

            frag = HSPFragment(hid, qid)
            # descriptions are only set when they are non-empty
            if qdesc:
                frag.query_description = qdesc
            if hdesc:
                frag.hit_description = hdesc
            # HMMER3 results are always protein
            frag.molecule_type = "protein"

            # which columns describe the hit and which the query depends on
            # the program: for hmmscan the hit takes columns 6-8, for
            # hmmsearch / phmmer the query does; each triple is
            # (from, to, end type)
            program = self._meta.get("program")
            if program == "hmmscan":
                layout = ((6, 7, 8), (9, 10, 11))
            elif program in ["hmmsearch", "phmmer"]:
                layout = ((9, 10, 11), (6, 7, 8))
            else:
                layout = None

            if layout is not None:
                hit_cols, query_cols = layout
                # 'from' is shifted down by one; 'to' is used as is
                frag.hit_start = int(cols[hit_cols[0]]) - 1
                frag.hit_end = int(cols[hit_cols[1]])
                frag.query_start = int(cols[query_cols[0]]) - 1
                frag.query_end = int(cols[query_cols[1]])

            # strand is always 0, since HMMER now only handles protein
            frag.hit_strand = 0
            frag.query_strand = 0

            hsp = HSP([frag])
            hsp.domain_index = int(cols[0])
            hsp.is_included = cols[1] == "!"
            hsp.bitscore = float(cols[2])
            hsp.bias = float(cols[3])
            hsp.evalue_cond = float(cols[4])
            hsp.evalue = float(cols[5])
            if layout is not None:
                hsp.hit_endtype = cols[hit_cols[2]]
                hsp.query_endtype = cols[query_cols[2]]
            # envelope coordinates get the same 'from' minus one shift
            hsp.env_start = int(cols[12]) - 1
            hsp.env_end = int(cols[13])
            hsp.env_endtype = cols[14]
            hsp.acc_avg = float(cols[15])

            hsps.append(hsp)
            self.line = read_forward(self.handle)

        # the table is finished: build the hit from the collected HSPs and
        # the next pending attribute dict
        hit_attr = hit_attrs.pop(0)
        hit = Hit(hsps)
        for attr, value in hit_attr.items():
            if attr == "description":
                # keep the current description when it already starts with
                # the incoming value
                current = getattr(hit, attr)
                if current and value and current.startswith(value):
                    continue
            setattr(hit, attr, value)
        if not hit:
            # a falsy (HSP-less) hit gets the query description directly
            hit.query_description = qdesc
        hits.append(hit)

        # parse the hsp alignments
        if self.line.startswith(" Alignments for each domain:"):
            self._parse_aln_block(hid, hit.hsps)

    # by this time we should've emptied the hit attr list
    assert len(hit_attrs) == 0
    return hits
def _create_hsp(hid, qid, psl):
    """Create high scoring pair object (PRIVATE).

    Arguments:
     - hid - hit identifier handed to every ``HSPFragment``.
     - qid - query identifier handed to every ``HSPFragment``.
     - psl - mapping of parsed PSL columns (``strand``, ``blocksizes``,
       ``qstarts``, ``tstarts``, ``qsize``, ``tsize``, ``qstart``, ``qend``,
       ``tstart``, ``tend``, the match / insert counters and optionally
       ``tseqs`` / ``qseqs``).

    Returns an ``HSP`` holding one ``HSPFragment`` per alignment block, with
    the PSL statistics copied onto it.

    When ``_is_protein(psl)`` is true, hit-side block lengths are three
    times the listed block sizes.  Without that scaling the hit ends and
    spans computed here cannot match ``tend`` and the checks below fail.
    """
    # protein flag
    is_protein = _is_protein(psl)
    # strand
    # if query is protein, strand is 0
    if is_protein:
        qstrand = 0
    else:
        qstrand = 1 if psl['strand'][0] == '+' else -1
    # try to get hit strand, if it exists
    try:
        hstrand = 1 if psl['strand'][1] == '+' else -1
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    # length of one block-size unit in hit coordinates
    blocksize_multiplier = 3 if is_protein else 1
    hit_blocksizes = [blocksize_multiplier * size
                      for size in psl['blocksizes']]

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'], psl['blocksizes'],
                               psl['qsize'], qstrand)
    # hit block starts; only reoriented when a hit strand is present
    if len(psl['strand']) == 2:
        hstarts = _reorient_starts(psl['tstarts'], hit_blocksizes,
                                   psl['tsize'], hstrand)
    else:
        hstarts = psl['tstarts']

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(psl['blocksizes'])
    query_range_all = [
        (start, start + size)
        for start, size in zip(qstarts, psl['blocksizes'])]
    hit_range_all = [
        (start, start + size)
        for start, size in zip(hstarts, hit_blocksizes)]

    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
            len(query_range_all) == len(hit_range_all)
    else:
        assert len(query_range_all) == len(hit_range_all)

    # the sequence lists do not change per block, so look them up once
    hseqlist = psl.get('tseqs')
    qseqlist = psl.get('qseqs')

    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, qcoords in enumerate(query_range_all):
        hseq = '' if not hseqlist else hseqlist[idx]
        qseq = '' if not qseqlist else qseqlist[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        frag.hit_start = hit_range_all[idx][0]
        frag.hit_end = hit_range_all[idx][1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well; hit spans are compared against the
    # scaled block sizes so the comparison stays in integers
    assert hsp.query_span_all == psl['blocksizes']
    assert hsp.hit_span_all == hit_blocksizes

    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']

    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']

    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2

    return hsp
def __iter__(self):
    """Iterate over BlastTextParser, yields query results.

    Each record from ``self.blast_iter`` is converted into a ``QueryResult``
    with one ``Hit`` per alignment and one single-fragment ``HSP`` per BLAST
    HSP.  For both query and hit, the smaller reported position minus one
    becomes the start and the larger one becomes the end.

    The molecule type is only set on fragments for the programs this method
    knows (``blastn``, ``blastp``, ``blastx``, ``tblastn``, ``tblastx``);
    for any other program the fragment keeps whatever ``HSPFragment``
    defaults to.
    """
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc: text before the first space is the id, the rest
        # (with line breaks removed) is the description
        if rec.query.startswith(">"):
            rec.query = rec.query[1:]
        qid, _, qdesc = rec.query.partition(" ")
        qdesc = qdesc.replace("\n", "").replace("\r", "")

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # determine molecule_type based on program; it is reset for every
        # record so that an unrecognised program neither raises
        # UnboundLocalError nor inherits the previous record's value
        if qresult.program == "blastn":
            molecule_type = "DNA"
        elif qresult.program in ["blastp", "blastx", "tblastn", "tblastx"]:
            molecule_type = "protein"
        else:
            molecule_type = None

        # frame used when the BLAST HSP does not report one
        if qresult.program in ("blastp", "tblastn"):
            default_frame = 0
        else:
            default_frame = 1

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith("> "):
                aln.title = aln.title[2:]
            elif aln.title.startswith(">"):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(" ")
            hdesc = hdesc.replace("\n", "").replace("\r", "")

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                if molecule_type is not None:
                    frag.molecule_type = molecule_type
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = default_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = default_frame
                # set query coordinates
                frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates
                frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)

                # set query, hit sequences and its annotation; columns
                # where query or hit is a blank are dropped, and such a
                # column is expected to be blank in all three strings
                qchars = []
                hchars = []
                mchars = []
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == " " or hchar == " ":
                        assert all(" " == x for x in seqtrio)
                    else:
                        qchars.append(qchar)
                        hchars.append(hchar)
                        mchars.append(mchar)
                frag.query, frag.hit = "".join(qchars), "".join(hchars)
                frag.aln_annotation["similarity"] = "".join(mchars)

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    # no positives reported: fall back to the alignment span
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult