def _create_hsp(hid, qid, psl):
    """Create an HSP object from one parsed PSL row (PRIVATE).

    Arguments:
     - hid - hit identifier, passed to every ``HSPFragment``.
     - qid - query identifier, passed to every ``HSPFragment``.
     - psl - mapping of PSL column names to parsed values.  Keys read here:
       ``strand``, ``qstarts``, ``tstarts``, ``blocksizes``, ``qsize``,
       ``tsize``, ``qstart``, ``qend``, ``tstart``, ``tend``, the
       match/insert counters and, optionally, ``qseqs`` / ``tseqs``.

    Returns an ``HSP`` containing one ``HSPFragment`` per PSL block.

    Raises ``AssertionError`` if the number of blocks, the number of block
    sequences, or the computed coordinates disagree with the row's values.
    """
    # protein flag
    is_protein = _is_protein(psl)
    strand = psl['strand']
    # strand
    # if query is protein, strand is 0
    if is_protein:
        qstrand = 0
    else:
        qstrand = 1 if strand[0] == '+' else -1
    # try to get hit strand, if it exists
    try:
        hstrand = 1 if strand[1] == '+' else -1
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    blocksizes = psl['blocksizes']
    # Block sizes are expressed in query units, so for a protein query each
    # unit covers three positions on the hit.
    # NOTE(review): assumes _is_protein() is only true for protein queries
    # aligned against a translated nucleotide hit -- confirm against helper.
    blocksize_multiplier = 3 if is_protein else 1
    hit_blocksizes = [blocksize_multiplier * size for size in blocksizes]

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'], blocksizes, psl['qsize'],
                               qstrand)
    # hit block starts; only reoriented when the row carries a hit strand
    if len(strand) == 2:
        hstarts = _reorient_starts(psl['tstarts'], hit_blocksizes,
                                   psl['tsize'], hstrand)
    else:
        hstarts = psl['tstarts']

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(blocksizes)
    # Real lists (not lazy zip objects) are needed: the ranges are measured
    # with len() and indexed below.
    query_range_all = [(start, start + size)
                       for start, size in zip(qstarts, blocksizes)]
    hit_range_all = [(start, start + size)
                     for start, size in zip(hstarts, hit_blocksizes)]

    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
            len(query_range_all) == len(hit_range_all)
    else:
        assert len(query_range_all) == len(hit_range_all)

    hseqlist = psl.get('tseqs')
    qseqlist = psl.get('qseqs')
    frags = []
    for idx, (qcoords, hcoords) in enumerate(
            zip(query_range_all, hit_range_all)):
        # blocks without stored sequences get empty strings
        hseq = '' if not hseqlist else hseqlist[idx]
        qseq = '' if not qseqlist else qseqlist[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start, frag.query_end = qcoords
        frag.hit_start, frag.hit_end = hcoords
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well (hit spans in hit units)
    assert hsp.query_span_all == blocksizes
    assert list(hsp.hit_span_all) == hit_blocksizes

    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']

    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(strand) == 2

    return hsp
def __iter__(self):
    """Iterate over the wrapped BLAST text records, yielding query results.

    Each record obtained from ``self.blast_iter`` is converted into one
    ``QueryResult``; every alignment of the record becomes a ``Hit`` and
    every HSP of the alignment becomes an ``HSP`` with a single
    ``HSPFragment``.  Coordinates are stored zero-based, half-open.
    """
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        # everything after the first space is the description
        qid, _, qdesc = rec.query.partition(' ')
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version
        program = qresult.program

        # determine alphabet based on program; reset for every record so an
        # unrecognised program neither reuses the previous record's alphabet
        # nor hits an unbound name
        if program == 'blastn':
            alphabet = generic_dna
        elif program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
            alphabet = generic_protein
        else:
            alphabet = None

        # frame used when the parsed HSP carries no frame information
        fallback_frame = 0 if program in ('blastp', 'tblastn') else 1

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(' ')
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                if alphabet is not None:
                    # unknown programs keep the fragment's own default
                    frag.alphabet = alphabet
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = fallback_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = fallback_frame
                # set query coordinates (start/end may be given in either
                # order; convert to zero-based start)
                frag.query_start = min(bhsp.query_start,
                                       bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates
                frag.hit_start = min(bhsp.sbjct_start,
                                     bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                # set query, hit sequences and its annotation; columns where
                # query or hit is a blank are dropped, and must be blank in
                # all three rows
                qchars, hchars, mchars = [], [], []
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == ' ' or hchar == ' ':
                        assert all(' ' == x for x in seqtrio)
                    else:
                        qchars.append(qchar)
                        hchars.append(hchar)
                        mchars.append(mchar)
                frag.query, frag.hit = ''.join(qchars), ''.join(hchars)
                frag.aln_annotation['similarity'] = ''.join(mchars)

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    # no positives reported: fall back to alignment length
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
def _create_hsp(hid, qid, psl):
    """Assemble an HSP from a single parsed PSL row (PRIVATE).

    One ``HSPFragment`` is built per PSL block, the fragments are wrapped in
    an ``HSP``, the resulting coordinates are checked against the row and
    the row's statistics are copied onto the HSP, which is returned.
    """
    is_protein = _is_protein(psl)
    strand = psl["strand"]

    # a protein query carries no strand; otherwise read the first sign
    if is_protein:
        qstrand = 0
    elif strand[0] == "+":
        qstrand = 1
    else:
        qstrand = -1

    # the second sign, when present, is the hit strand (plus by default)
    try:
        hit_sign = strand[1]
    except IndexError:
        hstrand = 1
    else:
        hstrand = 1 if hit_sign == "+" else -1

    # hit blocks are three times as long as query blocks for protein queries
    blocksize_multiplier = 3 if is_protein else 1

    qstarts = _reorient_starts(
        psl["qstarts"], psl["blocksizes"], psl["qsize"], qstrand
    )
    if len(psl["strand"]) == 2:
        scaled_sizes = [blocksize_multiplier * i for i in psl["blocksizes"]]
        hstarts = _reorient_starts(
            psl["tstarts"], scaled_sizes, psl["tsize"], hstrand
        )
    else:
        hstarts = psl["tstarts"]

    # every block is assumed to be gapless, so end = start + size
    assert len(qstarts) == len(hstarts) == len(psl["blocksizes"])
    query_range_all = [
        (begin, begin + size)
        for begin, size in zip(qstarts, psl["blocksizes"])
    ]
    hit_range_all = [
        (begin, begin + size * blocksize_multiplier)
        for begin, size in zip(hstarts, psl["blocksizes"])
    ]

    # sequence lists, when both are present, must line up with the blocks
    if "tseqs" in psl and "qseqs" in psl:
        assert (
            len(psl["tseqs"])
            == len(psl["qseqs"])
            == len(query_range_all)
            == len(hit_range_all)
        )
    else:
        assert len(query_range_all) == len(hit_range_all)

    hit_seqs = psl.get("tseqs")
    query_seqs = psl.get("qseqs")
    frags = []
    for idx, (q_begin, q_end) in enumerate(query_range_all):
        h_begin, h_end = hit_range_all[idx]
        frag = HSPFragment(
            hid,
            qid,
            hit=hit_seqs[idx] if hit_seqs else "",
            query=query_seqs[idx] if query_seqs else "",
        )
        frag.alphabet = generic_dna
        frag.query_start = q_begin
        frag.query_end = q_end
        frag.hit_start = h_begin
        frag.hit_end = h_end
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    hsp = HSP(frags)

    # the assembled HSP must reproduce the row's overall coordinates
    assert hsp.query_start == psl["qstart"]
    assert hsp.query_end == psl["qend"]
    assert hsp.hit_start == psl["tstart"]
    assert hsp.hit_end == psl["tend"]
    # ... and its block spans, with hit spans scaled back to query units
    hit_spans = [span / blocksize_multiplier for span in hsp.hit_span_all]
    assert hit_spans == hsp.query_span_all == psl["blocksizes"]

    # statistics copied verbatim from the row
    for attr, key in (
        ("match_num", "matches"),
        ("mismatch_num", "mismatches"),
        ("match_rep_num", "repmatches"),
        ("n_num", "ncount"),
        ("query_gapopen_num", "qnuminsert"),
        ("query_gap_num", "qbaseinsert"),
        ("hit_gapopen_num", "tnuminsert"),
        ("hit_gap_num", "tbaseinsert"),
    ):
        setattr(hsp, attr, psl[key])

    # derived statistics
    hsp.ident_num = psl["matches"] + psl["repmatches"]
    hsp.gapopen_num = psl["qnuminsert"] + psl["tnuminsert"]
    hsp.gap_num = psl["qbaseinsert"] + psl["tbaseinsert"]
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # remembered so a writer can tell whether the hit strand was explicit
    hsp._has_hit_strand = len(psl["strand"]) == 2

    return hsp
def _create_hsp(hid, qid, psl):
    """Create high scoring pair object from a parsed PSL row (PRIVATE).

    Arguments:
     - hid - hit identifier given to each ``HSPFragment``.
     - qid - query identifier given to each ``HSPFragment``.
     - psl - mapping holding the parsed PSL columns (``strand``,
       ``qstarts``, ``tstarts``, ``blocksizes``, sizes, start/end
       coordinates, match/insert counters, optional ``qseqs``/``tseqs``).

    Returns the ``HSP`` built from one ``HSPFragment`` per block.

    Raises ``AssertionError`` when the block lists have different lengths
    or the assembled coordinates/spans do not match the row.
    """
    # protein flag
    is_protein = _is_protein(psl)
    # strand
    # if query is protein, strand is 0
    if is_protein:
        qstrand = 0
    else:
        qstrand = 1 if psl['strand'][0] == '+' else -1
    # try to get hit strand, if it exists
    try:
        hstrand = 1 if psl['strand'][1] == '+' else -1
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    # For protein queries a block of N query residues covers 3 * N hit
    # positions, so hit-side arithmetic uses scaled block sizes.
    # NOTE(review): relies on _is_protein() flagging only protein queries
    # against translated hits -- verify against that helper.
    scale = 3 if is_protein else 1
    query_sizes = psl['blocksizes']
    hit_sizes = [scale * size for size in query_sizes]

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'], query_sizes, psl['qsize'],
                               qstrand)
    # hit block starts
    if len(psl['strand']) == 2:
        hstarts = _reorient_starts(psl['tstarts'], hit_sizes, psl['tsize'],
                                   hstrand)
    else:
        hstarts = psl['tstarts']

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(query_sizes)
    query_range_all = list(
        zip(qstarts, [x + y for x, y in zip(qstarts, query_sizes)]))
    hit_range_all = list(
        zip(hstarts, [x + y for x, y in zip(hstarts, hit_sizes)]))

    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
            len(query_range_all) == len(hit_range_all)
    else:
        assert len(query_range_all) == len(hit_range_all)

    # the optional per-block sequences do not change between blocks
    hseqlist = psl.get('tseqs')
    qseqlist = psl.get('qseqs')

    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, qcoords in enumerate(query_range_all):
        hseq = '' if not hseqlist else hseqlist[idx]
        qseq = '' if not qseqlist else qseqlist[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        frag.hit_start = hit_range_all[idx][0]
        frag.hit_end = hit_range_all[idx][1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well; hit spans are compared in hit units
    assert hsp.query_span_all == query_sizes
    assert list(hsp.hit_span_all) == hit_sizes

    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']

    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2

    return hsp
def __iter__(self):
    """Iterate over BlastTextParser, yields query results.

    Every record from ``self.blast_iter`` is turned into a ``QueryResult``
    whose hits are built from the record's alignments; each parsed HSP
    becomes an ``HSP`` wrapping a single ``HSPFragment``.  Start
    coordinates are converted to zero-based values.
    """
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith(">"):
            rec.query = rec.query[1:]
        # split on the first space: id before it, description after it
        qid, _, qdesc = rec.query.partition(" ")
        qdesc = qdesc.replace("\n", "").replace("\r", "")

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version
        program = qresult.program

        # determine molecule_type based on program; assigned afresh for each
        # record so an unrecognised program cannot inherit the value of the
        # previous record (or fail on an unbound name for the first one)
        if program == "blastn":
            molecule_type = "DNA"
        elif program in ["blastp", "blastx", "tblastn", "tblastx"]:
            molecule_type = "protein"
        else:
            molecule_type = None

        # frame assigned when the parsed HSP has no frame entry
        default_frame = 0 if program in ("blastp", "tblastn") else 1

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith("> "):
                aln.title = aln.title[2:]
            elif aln.title.startswith(">"):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(" ")
            hdesc = hdesc.replace("\n", "").replace("\r", "")

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                if molecule_type is not None:
                    # leave the fragment's default for unknown programs
                    frag.molecule_type = molecule_type
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = default_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = default_frame
                # set query coordinates (order-independent, zero-based start)
                frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates
                frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                # set query, hit sequences and its annotation; alignment
                # columns with a blank in query or hit are skipped and are
                # required to be blank in all three rows
                query_chars = []
                hit_chars = []
                match_chars = []
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == " " or hchar == " ":
                        assert all(" " == x for x in seqtrio)
                    else:
                        query_chars.append(qchar)
                        hit_chars.append(hchar)
                        match_chars.append(mchar)
                frag.query = "".join(query_chars)
                frag.hit = "".join(hit_chars)
                frag.aln_annotation["similarity"] = "".join(match_chars)

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    # positives missing: use the alignment span instead
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult