def _create_qresult(self, hit_blocks):
    """Assemble a QueryResult from the parsed hit blocks (PRIVATE).

    Every item of ``hit_blocks`` becomes one HSP wrapping a single
    HSPFragment. HSPs whose blocks share a ``hit_id`` are gathered under one
    Hit; that Hit takes its description, e-value and score from the first
    block seen for the ID. Hits keep the order in which their IDs first
    appear.

    Returns a one-element list containing the QueryResult.
    """
    qid = self.query_id
    hits_by_id = OrderedDict()

    for position, entry in enumerate(hit_blocks):
        hid = entry['hit_id']

        fragment = HSPFragment(hid, qid)
        # Truthy start coordinates are decremented by one; falsy ones
        # (e.g. 0 or None) are stored exactly as parsed.
        q_start = entry['query_start']
        fragment.query_start = q_start - 1 if q_start else q_start
        fragment.query_end = entry['query_end']
        h_start = entry['hit_start']
        fragment.hit_start = h_start - 1 if h_start else h_start
        fragment.hit_end = entry['hit_end']
        fragment.hit = entry['hit_seq']
        fragment.query = entry['query_seq']

        hsp = HSP([fragment])
        hsp.hit_id = hid
        hsp.output_index = position
        hsp.query_id = qid
        hsp.hit_description = entry['description']
        # Every block is currently flagged as included (open question in
        # the original code: should everything be included?).
        hsp.is_included = True
        hsp.evalue = entry['evalue']
        hsp.score = entry['score']
        hsp.prob = entry['prob']
        hsp.hit_seq_len = entry['hit_seq_len']
        hsp.text = entry['text']

        if hid in hits_by_id:
            # Additional HSP for a hit that was already created.
            hits_by_id[hid].append(hsp)
            continue

        hit = Hit([hsp], hid)
        hit.description = entry['description']
        hit.is_included = True
        hit.evalue = entry['evalue']
        hit.score = entry['score']
        hits_by_id[hid] = hit

    qresult = QueryResult(hits_by_id.values(), qid)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def _create_qresult(self, hit_blocks):
    """Turn the parsed hit blocks into a QueryResult (PRIVATE).

    One HSP (holding a single protein HSPFragment) is built per entry of
    ``hit_blocks``. HSPs are grouped into Hit objects keyed by ``hit_id``,
    in order of first appearance; each Hit copies its description, e-value
    and score from the first block that introduced it.

    Returns a list with the single resulting QueryResult.
    """
    qid = self.query_id
    hits = OrderedDict()

    for rank, parsed in enumerate(hit_blocks):
        hid = parsed["hit_id"]

        fragment = HSPFragment(hid, qid)
        fragment.molecule_type = "protein"
        # Start coordinates are stored as the parsed value minus one;
        # end coordinates are stored unchanged.
        fragment.query_start = parsed["query_start"] - 1
        fragment.query_end = parsed["query_end"]
        fragment.hit_start = parsed["hit_start"] - 1
        fragment.hit_end = parsed["hit_end"]
        fragment.hit = parsed["hit_seq"]
        fragment.query = parsed["query_seq"]

        hsp = HSP([fragment])
        hsp.hit_id = hid
        hsp.output_index = rank
        hsp.query_id = qid
        hsp.hit_description = parsed["description"]
        # Every block is currently flagged as included (open question in
        # the original code: should everything be included?).
        hsp.is_included = True
        hsp.evalue = parsed["evalue"]
        hsp.score = parsed["score"]
        hsp.prob = parsed["prob"]

        known_hit = hits.get(hid)
        if known_hit is None:
            # First block for this hit ID: create the Hit around this HSP.
            new_hit = Hit([hsp], hid)
            new_hit.description = parsed["description"]
            new_hit.is_included = True
            new_hit.evalue = parsed["evalue"]
            new_hit.score = parsed["score"]
            hits[hid] = new_hit
        else:
            known_hit.append(hsp)

    qresult = QueryResult(hits.values(), qid)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def _create_qresult(self, hit_blocks):
    """Convert the parsed hit blocks into a QueryResult (PRIVATE).

    For each entry of ``hit_blocks`` an HSPFragment (with the
    ``generic_protein`` alphabet) is created and wrapped in an HSP. HSPs
    with the same ``hit_id`` end up in the same Hit, whose description,
    e-value and score come from the first block carrying that ID. Hit order
    follows first appearance of each ID.

    Returns a one-element list holding the QueryResult.
    """
    query_name = self.query_id
    collected = OrderedDict()

    for block_no, data in enumerate(hit_blocks):
        target = data['hit_id']

        piece = HSPFragment(target, query_name)
        piece.alphabet = generic_protein
        # Starts are stored as the parsed value minus one; ends unchanged.
        piece.query_start = data['query_start'] - 1
        piece.query_end = data['query_end']
        piece.hit_start = data['hit_start'] - 1
        piece.hit_end = data['hit_end']
        piece.hit = data['hit_seq']
        piece.query = data['query_seq']

        hsp = HSP([piece])
        hsp.hit_id = target
        hsp.output_index = block_no
        hsp.query_id = query_name
        hsp.hit_description = data['description']
        # Every block is currently flagged as included (open question in
        # the original code: should everything be included?).
        hsp.is_included = True
        hsp.evalue = data['evalue']
        hsp.score = data['score']
        hsp.prob = data['prob']

        if target in collected:
            # Hit already exists; just attach the extra HSP.
            collected[target].append(hsp)
            continue

        hit = Hit([hsp], target)
        hit.description = data['description']
        hit.is_included = True
        hit.evalue = data['evalue']
        hit.score = data['score']
        collected[target] = hit

    qresult = QueryResult(collected.values(), query_name)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def __iter__(self):
    """Yield one QueryResult for each record produced by ``self.blast_iter``.

    For every record the query ID/description are split off ``rec.query``,
    each alignment becomes a Hit, and each of its HSPs becomes an HSP that
    wraps a single HSPFragment. Note that ``rec.query`` and ``aln.title``
    are modified in place when they carry a leading '>' marker.
    """
    for rec in self.blast_iter:
        # Strip the leading '>' and split "<id> <description>".
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        qid, _, qdesc = rec.query.partition(' ')
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version
        program = qresult.program

        # Pick the alphabet from the program name.
        # NOTE(review): `alphabet` is only assigned for the programs listed
        # here; for any other program it stays unbound (or keeps the value
        # from an earlier record) when the first HSP is processed.
        if program == 'blastn':
            alphabet = generic_dna
        elif program in ('blastp', 'blastx', 'tblastn', 'tblastx'):
            alphabet = generic_protein

        # Frame used when the parsed HSP has no frame entry at that index.
        fallback_frame = 0 if program in ('blastp', 'tblastn') else 1

        for aln in rec.alignments:
            # Strip the '> ' / '>' marker and split "<id> <description>".
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(' ')
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            hsps = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                frag.alphabet = alphabet
                # The alignment length is the second item of `identities`.
                frag.aln_span = bhsp.identities[1]

                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = fallback_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = fallback_frame

                # Start is the smaller coordinate minus one, end the larger.
                frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)

                # Walk the three alignment rows column by column. Columns
                # that are blank in the query or the hit are dropped; the
                # assert checks that all three rows are blank there.
                q_chars, h_chars, mid_chars = [], [], []
                for column in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    q_char, h_char, m_char = column
                    if q_char != ' ' and h_char != ' ':
                        q_chars.append(q_char)
                        h_chars.append(h_char)
                        mid_chars.append(m_char)
                        continue
                    assert all(ch == ' ' for ch in column)
                frag.query = ''.join(q_chars)
                frag.hit = ''.join(h_chars)
                frag.aln_annotation['similarity'] = ''.join(mid_chars)

                # Wrap the fragment in an HSP and copy the statistics.
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    # No gap entry was parsed for this HSP.
                    hsp.gap_num = 0
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    # No positives count: use the fragment's alignment span.
                    hsp.pos_num = hsp[0].aln_span

                hsps.append(hsp)

            hit = Hit(hsps)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
def __iter__(self):
    """Iterate over the parsed BLAST records, yielding QueryResult objects.

    Each record from ``self.blast_iter`` is converted to one QueryResult:
    every alignment becomes a Hit and every HSP in it becomes an HSP holding
    a single HSPFragment. ``rec.query`` and ``aln.title`` are rewritten in
    place when they start with a '>' marker.
    """
    for record in self.blast_iter:
        # Drop the leading ">" and separate the ID from the description.
        if record.query.startswith(">"):
            record.query = record.query[1:]
        query_id, _, query_desc = record.query.partition(" ")
        query_desc = query_desc.replace("\n", "").replace("\r", "")

        qresult = QueryResult(id=query_id)
        qresult.program = record.application.lower()
        qresult.target = record.database
        qresult.seq_len = record.query_letters
        qresult.version = record.version
        prog = qresult.program

        # Choose the molecule type from the program name.
        # NOTE(review): `molecule_type` is only assigned for the programs
        # listed here; for any other program it stays unbound (or keeps the
        # value from an earlier record) when the first HSP is processed.
        if prog == "blastn":
            molecule_type = "DNA"
        elif prog in ("blastp", "blastx", "tblastn", "tblastx"):
            molecule_type = "protein"

        # Frame used when the parsed HSP has no frame entry at that index.
        default_frame = 0 if prog in ("blastp", "tblastn") else 1

        for alignment in record.alignments:
            # Drop the "> " / ">" marker and separate ID from description.
            if alignment.title.startswith("> "):
                alignment.title = alignment.title[2:]
            elif alignment.title.startswith(">"):
                alignment.title = alignment.title[1:]
            hit_id, _, hit_desc = alignment.title.partition(" ")
            hit_desc = hit_desc.replace("\n", "").replace("\r", "")

            converted = []
            for raw in alignment.hsps:
                fragment = HSPFragment(hit_id, query_id)
                fragment.molecule_type = molecule_type
                # The alignment length is the second item of `identities`.
                fragment.aln_span = raw.identities[1]

                try:
                    fragment.query_frame = int(raw.frame[0])
                except IndexError:
                    fragment.query_frame = default_frame
                try:
                    fragment.hit_frame = int(raw.frame[1])
                except IndexError:
                    fragment.hit_frame = default_frame

                # Start is the smaller coordinate minus one, end the larger.
                fragment.query_start = min(raw.query_start, raw.query_end) - 1
                fragment.query_end = max(raw.query_start, raw.query_end)
                fragment.hit_start = min(raw.sbjct_start, raw.sbjct_end) - 1
                fragment.hit_end = max(raw.sbjct_start, raw.sbjct_end)

                # Go through the alignment rows one column at a time.
                # Columns blank in the query or hit row are skipped, after
                # asserting that all three rows are blank in that column.
                query_buf, hit_buf, match_buf = [], [], []
                for triple in zip(raw.query, raw.sbjct, raw.match):
                    q_ch, h_ch, m_ch = triple
                    if q_ch != " " and h_ch != " ":
                        query_buf.append(q_ch)
                        hit_buf.append(h_ch)
                        match_buf.append(m_ch)
                        continue
                    assert all(ch == " " for ch in triple)
                fragment.query = "".join(query_buf)
                fragment.hit = "".join(hit_buf)
                fragment.aln_annotation["similarity"] = "".join(match_buf)

                # Build the HSP around the fragment and copy the statistics.
                hsp = HSP([fragment])
                hsp.evalue = raw.expect
                hsp.bitscore = raw.bits
                hsp.bitscore_raw = raw.score
                try:
                    hsp.gap_num = raw.gaps[0]
                except IndexError:
                    # No gap entry was parsed for this HSP.
                    hsp.gap_num = 0
                hsp.ident_num = raw.identities[0]
                hsp.pos_num = raw.positives[0]
                if hsp.pos_num is None:
                    # No positives count: use the fragment's alignment span.
                    hsp.pos_num = hsp[0].aln_span

                converted.append(hsp)

            hit = Hit(converted)
            hit.seq_len = alignment.length
            hit.description = hit_desc
            qresult.append(hit)

        qresult.description = query_desc
        yield qresult