def _start_Hit(self):
    """Open a new hit: add a fresh alignment/description pair to the record.

    The new ``Record.Alignment`` and ``Record.Description`` are appended to
    ``self._blast`` and remembered as ``self._hit`` / ``self._descr``.  The
    record's ``multiple_alignment`` list is reset and the description's
    ``num_alignments`` counter starts at zero.
    """
    hit = Record.Alignment()
    self._blast.alignments.append(hit)
    descr = Record.Description()
    self._blast.descriptions.append(descr)
    self._blast.multiple_alignment = []
    self._hit = hit
    self._descr = descr
    descr.num_alignments = 0
def _start_Hsp(self):
    """Open a new HSP inside the current hit.

    Appends a fresh ``Record.HSP`` to ``self._hit.hsps``, bumps the
    alignment counter of the current description and appends a fresh
    ``Record.MultipleAlignment`` to ``self._blast.multiple_alignment``.
    """
    # Relies on self._start_Hit() having been called beforehand, because
    # that is where self._hit, self._descr and
    # self._blast.multiple_alignment are set up.
    hsp = Record.HSP()
    self._hit.hsps.append(hsp)
    self._hsp = hsp
    self._descr.num_alignments += 1
    mult_al = Record.MultipleAlignment()
    self._blast.multiple_alignment.append(mult_al)
    self._mult_al = mult_al
def _start_Hit(self):
    """Begin a new hit and its description on the current record (PRIVATE).

    Resets ``self._blast.multiple_alignment`` and leaves the new objects
    available as ``self._hit`` and ``self._descr`` (whose
    ``num_alignments`` starts at 0).
    """
    new_hit = Record.Alignment()
    self._blast.alignments.append(new_hit)
    new_descr = Record.Description()
    self._blast.descriptions.append(new_descr)
    self._blast.multiple_alignment = []
    self._hit = new_hit
    self._descr = new_descr
    new_descr.num_alignments = 0
def _start_hit(self):
    """Begin a new hit and its description on the current record (PRIVATE).

    The description class depends on ``self._xml_version``: version 1 uses
    ``Record.Description``, anything else uses ``Record.DescriptionExt``.
    """
    self._blast.alignments.append(Record.Alignment())
    if self._xml_version == 1:
        descr = Record.Description()
    else:
        descr = Record.DescriptionExt()
    self._descr = descr
    self._blast.descriptions.append(descr)
    self._blast.multiple_alignment = []
    self._hit = self._blast.alignments[-1]
    descr.num_alignments = 0
def blastalignmentfromdict(indict):
    """Rebuild a ``Record.Alignment`` from its dictionary form.

    Every key of ``indict`` other than ``'hsps'`` is copied onto the new
    object as an attribute of the same name.  The ``'hsps'`` entry is
    converted item by item with ``hspfromdict``.

    Raises ``KeyError`` when ``indict`` has no ``'hsps'`` key.
    """
    out = Record.Alignment()
    attr_names = set(indict)
    # Fails with KeyError here when the mandatory 'hsps' entry is absent.
    attr_names.remove('hsps')
    for name in attr_names:
        setattr(out, name, indict[name])
    out.hsps = list(map(hspfromdict, indict['hsps']))
    return out
def blastfromdict(indict):
    """Rebuild a ``Record.Blast`` from its dictionary form.

    ``'descriptions'`` entries are converted with ``universalfromdict``
    (target class ``Record.Description``), ``'alignments'`` entries with
    ``blastalignmentfromdict``; every other key is copied verbatim as an
    attribute of the same name.
    """
    out = Record.Blast()
    for name, value in indict.items():
        if name == 'descriptions':
            out.descriptions = [
                universalfromdict(descr, Record.Description) for descr in value
            ]
            continue
        if name == 'alignments':
            out.alignments = [blastalignmentfromdict(alig) for alig in value]
            continue
        setattr(out, name, value)
    return out
def _start_Iteration(self):
    """Start a new iteration by installing an empty ``Record.Blast``."""
    fresh_record = Record.Blast()
    self._blast = fresh_record
def reset(self):
    """Clear all parsed state so this parser object can be reused.

    Empties the record list and replaces the header and parameters with
    new ``Record.Header`` / ``Record.Parameters`` instances.
    """
    self._records = []
    self._header = Record.Header()
    parameters = Record.Parameters()
    # 'filter' is set explicitly here; the original author wondered whether
    # the Record.Parameters class itself should be updated instead.
    parameters.filter = None
    self._parameters = parameters
def _start_Iteration(self):
    """Start a new iteration with an empty ``Record.Blast`` (PRIVATE)."""
    fresh_record = Record.Blast()
    self._blast = fresh_record
def _start_hit_descr_item(self):
    """XML v2: begin a hit description item.

    Stores a new ``Record.DescriptionExtItem`` as ``self._hit_descr_item``.
    """
    descr_item = Record.DescriptionExtItem()
    self._hit_descr_item = descr_item
def _start_blast_record(self):
    """Begin a new BLAST record with an empty ``Record.Blast`` (PRIVATE)."""
    fresh_record = Record.Blast()
    self._blast = fresh_record
def split_hsp(hsp, pos, blast_stats_calc):
    """Cut one HSP into two at alignment column ``pos``.

    The aligned query/subject strings are sliced at ``pos``; the edge of
    each half that faces the cut is then trimmed of columns that contain a
    gap or a mismatch.  Identities, gaps, coordinates, score, bit score and
    e-value are recomputed for both halves.

    :param hsp: source HSP; ``query``, ``sbjct``, ``frame``,
        ``query_start``/``query_end`` and ``sbjct_start``/``sbjct_end``
        are read from it.
    :param pos: position in the HSP to split at (relative to the start of
        the HSP alignment strings).
    :param blast_stats_calc: object providing ``calculate_alignment_score``,
        ``calculate_bit_score_from_alignment_score`` and
        ``calculate_e_value_from_bit_score``.
    :return: tuple ``(first, second)`` of new ``Record.HSP`` objects; a
        half whose alignment length is 0 is returned as ``None``.
    :raises RuntimeError: if ``hsp.frame[0]`` is not positive.
    """

    def trim_ends(q_seq, s_seq, reverse=True):
        """Drop gap/mismatch columns from one end of an aligned pair.

        With ``reverse`` true the right-hand end is trimmed, otherwise the
        left-hand end.  Returns the two trimmed strings and the number of
        columns removed.
        """
        columns = list(zip(q_seq, s_seq))
        if reverse:
            columns.reverse()
        trimmed = 0
        for q_base, s_base in columns:
            # Stop at the first column that is an ungapped exact match.
            if q_base != '-' and s_base != '-' and q_base == s_base:
                break
            trimmed += 1
        if trimmed == 0:
            return q_seq, s_seq, 0
        if reverse:
            return q_seq[:-trimmed], s_seq[:-trimmed], trimmed
        return q_seq[trimmed:], s_seq[trimmed:], trimmed

    def get_alignment_stats(q_seq, s_seq):
        """Return (midline, identities, mismatches, gap columns)."""
        symbols = []
        identities = 0
        mismatches = 0
        gap_columns = 0
        for idx, q_base in enumerate(q_seq):
            if q_base == '-' or s_seq[idx] == '-':
                symbols.append(' ')
                gap_columns += 1
            elif q_base != s_seq[idx]:
                symbols.append(' ')
                mismatches += 1
            else:
                symbols.append('|')
                identities += 1
        return ''.join(symbols), identities, mismatches, gap_columns

    # Fresh, empty Biopython HSP objects for the two halves.
    first = Record.HSP()
    second = Record.HSP()

    # Aligned sequences of each half; the trimmed-column count is not needed.
    first.query, first.sbjct, _ = trim_ends(hsp.query[:pos], hsp.sbjct[:pos])
    second.query, second.sbjct, _ = trim_ends(
        hsp.query[pos:], hsp.sbjct[pos:], False)

    first.align_length = len(first.query)
    second.align_length = len(second.query)

    # Per-half alignment statistics.
    first.match, first.identities, first_mismatches, first.gaps = (
        get_alignment_stats(first.query, first.sbjct))
    second.match, second.identities, second_mismatches, second.gaps = (
        get_alignment_stats(second.query, second.sbjct))
    # Positives are set equal to identities.
    first.positives = first.identities
    second.positives = second.identities

    # Query coordinates: only a positive query frame is supported.
    if not hsp.frame[0] > 0:
        raise RuntimeError(
            'Negative frame query?? We have not prepared for this situation!!!'
        )
    first.query_start = hsp.query_start
    first.query_end = first.query_start + (
        first.align_length - first.query.count('-') - 1)
    second.query_end = hsp.query_end
    second.query_start = hsp.query_end - (
        second.align_length - second.query.count('-') - 1)

    # Subject coordinates: they grow with the alignment for a positive
    # subject frame and shrink otherwise.
    sbjct_step = 1 if hsp.frame[1] > 0 else -1
    first.sbjct_start = hsp.sbjct_start
    first.sbjct_end = first.sbjct_start + sbjct_step * (
        first.align_length - first.sbjct.count('-') - 1)
    second.sbjct_end = hsp.sbjct_end
    second.sbjct_start = hsp.sbjct_end - sbjct_step * (
        second.align_length - second.sbjct.count('-') - 1)

    # Number of gap openings (query + subject) in each half.
    first_openings = (
        count_gap_openings(first.query) + count_gap_openings(first.sbjct))
    second_openings = (
        count_gap_openings(second.query) + count_gap_openings(second.sbjct))

    # Score -> bit score -> e-value, computed for both halves stage by stage.
    first.score = blast_stats_calc.calculate_alignment_score(
        first.identities, first_mismatches, first.gaps, first_openings)
    second.score = blast_stats_calc.calculate_alignment_score(
        second.identities, second_mismatches, second.gaps, second_openings)

    first.bits = blast_stats_calc.calculate_bit_score_from_alignment_score(
        first.score)
    second.bits = blast_stats_calc.calculate_bit_score_from_alignment_score(
        second.score)

    first.expect = blast_stats_calc.calculate_e_value_from_bit_score(
        first.bits)
    second.expect = blast_stats_calc.calculate_e_value_from_bit_score(
        second.bits)

    first.frame = hsp.frame
    second.frame = hsp.frame

    # A half with nothing left after trimming is reported as None.
    return (
        None if first.align_length == 0 else first,
        None if second.align_length == 0 else second,
    )
def read_aligns(f, R):
    """
    Read the alignments section of a plain-text BLAST report into ``R``.

    Parses all alignments up to the end of the report or up to the next
    query sequence (a line starting with ``Query=``).

    Known issue: GenBank switched to using accessions instead of GIs, and
    BLAST reports contain accession.version identifiers (and sometimes
    something different).

    :param f: handle of the report; it is passed to ``bread`` and
        ``_parse_features`` and ``f.tell()`` is used in a debug message.
        ``bread`` is defined elsewhere - presumably it returns the next
        line and ``''`` at end of file; TODO confirm.
    :param R: record whose ``alignments`` list receives one
        ``Record.Alignment`` per parsed hit.
    :return: tuple ``(R, txt)`` where ``txt`` is the last line read (the
        line that stopped the parse).
    """
    # parse_regexp = '[ACTGUactgu\-]+'
    # characters accepted in the query row of an alignment (upper and lower
    # case nucleotide codes plus the gap character)
    parse_regexp_q = '[ACGTUKSYMWRBDHVNacgtuksymwrbdhvn-]+'
    # NOTE(review): eor is never set to False below (the only assignment is
    # commented out), so every ``while txt and eor`` behaves as ``while txt``
    eor = True
    txt = bread(f)
    # get line after alignments
    # if txt[0] == '>':
    #     enter a record
    #     # create an alignment object if we are sure that there is one
    #     # start of parsing hits to single source sequence
    while txt and eor:
        # one iteration per hit (alignment to a single source sequence)
        if txt[:6] == 'Query=':
            break
        # hit identifier: the first token following '>'
        # NOTE(review): c is None when the line holds no '>' and c.group()
        # below then raises AttributeError
        c = re.search(r'(?<=>) *[\S|.]+', txt)
        # the definition may span several lines; join them until the
        # 'Length=' line is reached
        def_rem = bread(f)
        while not re.search('Length=', def_rem):
            txt += def_rem
            def_rem = bread(f)

        # get new alig object
        curr_alig = Record.Alignment()
        curr_alig.hit_id = c.group()
        # everything after the identifier (skipping one separator character)
        curr_alig.hit_def = txt[c.end() + 1:]
        curr_alig.length = int(re.search(r'(?<=Length=)\d+', def_rem).group())

        # draw next line
        txt = bread(f)
        while txt and eor:
            # loop the hsps of the record
            if txt[0] == '>' or txt[:6] == 'Query=':
                # R.alignments.append(curr_alig)
                break
            curr_hsp = Record.HSP()
            while txt and eor:
                # parse scores
                if txt[:8] == " Score =":
                    # changing "score" to bits
                    # because the score is in "bits" units
                    curr_hsp.bits = float(
                        re.search(r'(?<=Score =) *\d+\.?\d*', txt).group(0).lstrip())
                    # if re.search('bits \(', txt):
                    # changing "bits" to score, because the number in brackets is match score (in nucleotide blast
                    # same as number of identities)
                    curr_hsp.score = int(
                        re.search(r'(?<=bits \() *\d+(?=\))', txt).group(0).lstrip())
                    # the regexp for scientific format from here:
                    # http://stackoverflow.com/questions/18152597/extract-scientific-number-from-string
                    curr_hsp.expect = float(
                        re.search(
                            r'(?<=Expect =)-? *[0-9]+\.?[0-9]*(?:[Ee] *-? *[0-9]+)?',
                            txt).group(0).lstrip())
                elif txt[:13] == " Identities =":
                    # return only first int in identities field to mimic the XML parser
                    tmp_idtts = [
                        int(i) for i in re.search(
                            r'(?<=Identities =) *\d+/\d+(?= *\()', txt).group(
                                0).lstrip().split('/')
                    ]
                    tmp_gaps = [
                        int(i)
                        for i in re.search(r'(?<=Gaps =) *\d+/\d+(?= *\()',
                                           txt).group(0).lstrip().split('/')
                    ]
                    curr_hsp.identities = tmp_idtts[0]
                    curr_hsp.gaps = tmp_gaps[0]
                    # both fractions must share the same denominator, which
                    # is stored as the alignment length
                    assert tmp_gaps[1] == tmp_idtts[1]
                    curr_hsp.align_length = tmp_idtts[1]
                    del tmp_idtts
                    del tmp_gaps
                elif txt[:8] == " Strand=":
                    # stored as a tuple of the '/'-separated strand names
                    curr_hsp.strand = tuple(
                        re.search(r'(?<=Strand=)\S+',
                                  txt).group(0).rstrip().split('/'))
                elif txt[:10] == " Features ":
                    # must be enabled even if output isn't used, moves the pointer past the features field
                    parsed_features = _parse_features(f, txt)
                    curr_hsp.features = parsed_features
                elif txt[:5] == "Query":
                    # first row of the alignment block - header is finished
                    break
                else:
                    ml.debug("line ignored pos {} - {}".format(f.tell(), txt))
                # read next line
                txt = bread(f)

            # start of alignment (hsps) parse block
            # hsps starts with "Score ="
            qseq = ''
            mid = ''
            sseq = ''
            count = 1
            # while not re.search('Score =', txt):
            # NOTE(review): this condition is always true (a line cannot
            # start with both prefixes), so the loop only ends through the
            # ``break`` statements inside it
            while txt[:8] != " Score =" or txt[:10] != " Features ":
                # Features in this part
                # Features flanking this part

                # parsing individual hsps alignments
                # alignment may:
                # 1) continue with another line triple signaled by Query
                # 2) break
                #   a) continue with next hsps
                #   b) continue with next alignment (different source sequence (organism))
                #   c) be an EOF (this is not signaled) - tail was already parsed

                # read alignment
                # read alignment by triples
                if not txt[:5] == 'Query':
                    # this is eof (may not be)
                    # skip lines until the next hit, the next query or an
                    # empty line from bread
                    while txt != '':
                        # if 'Database' in txt or 'database' in txt:
                        #     pass
                        if (txt[:6] == 'Query=') or (txt[0] == ">"):
                            # raise AssertionError('parser failed - unparsed record at %s', txt)
                            break
                        txt = bread(f)
                    # eor = False
                    break
                # get query start only at first instance
                if count == 1:
                    query_start = int(re.search(r'\d+', txt[5:]).group())
                # match query end at each instance
                query_end = int(re.search(r'\d+$', txt).group())
                # allow lowercase masking sequences
                # add 5 to prevent matching in Query
                q_info = re.search(parse_regexp_q, txt[5:])
                qseq += q_info.group()
                # get middle line
                # the match row and the subject row are cut at the same
                # columns as the query sequence
                txt = bread(f)
                mid += txt[q_info.start() + 5:q_info.end() + 5]
                txt = bread(f)
                sseq += txt[q_info.start() + 5:q_info.end() + 5]
                if count == 1:
                    subject_start = int(re.search(r'\d+', txt[5:]).group())
                subject_end = int(re.search(r'\d+$', txt).group())
                # go next
                txt = bread(f)
                count += 1
                # NOTE(review): txt[0] raises IndexError if bread returned
                # an empty string here - confirm this cannot happen
                if (txt[0] == '>') or (txt[:8] == " Score =") or (
                        txt[:6] == 'Query=') or txt[:10] == " Features ":
                    break
            # if end of iteration save current hsp and go for next one
            curr_hsp.query = qseq
            curr_hsp.match = mid
            curr_hsp.sbjct = sseq
            curr_hsp.query_start = query_start
            curr_hsp.query_end = query_end
            curr_hsp.sbjct_start = subject_start
            curr_hsp.sbjct_end = subject_end
            assert len(qseq) == len(mid) == len(sseq)
            # append hsps to current alignment
            curr_alig.hsps.append(curr_hsp)
        R.alignments.append(curr_alig)
    # NOTE(review): unreachable as written, because eor is never False
    if not eor and len(R.alignments) == 0:
        R.alignments.append(curr_alig)
    if len(R.alignments) == 0:
        print('no hits parsed')
    return R, txt
def _parse_blast_body(f, common_info): """ run on open handle to file for each query return list of blast records web format differ from standalone export format standallone 2.2.8+ output is segmented by query, each query is ended with KH report contain all queries, if no hit for given query: "***** No hits found *****" web: output is segmented by queries, new query is signaled only by "Query=" if no hits are found, the query is not part of the report and not signaled in any way that that it was in the input if no hits are returned blast will not allow txt file download """ # start the parse loop txt = bread(f) record_holder = Record.Blast() update_blast_record(record_holder, common_info) do_query_name = True do_query_length = True R = copy.deepcopy(record_holder) while txt != '': # parse query name if do_query_name and re.search('^Query=', txt): # query_name = re.search('(?<=Query=)\ *\S+', txt).group().lstrip() # it is possible, that no query name is provided (web) R.query = re.search(r'(?<=Query=)\s*\S*', txt).group().lstrip() do_query_name = False txt = bread(f) continue # parse query Lenght if do_query_length and re.search('^Length=', txt): R.query_length = int( re.search(r'(?<=Length=) *\d+', txt).group().lstrip()) txt = bread(f) do_query_length = False continue # skip reference, database and table # skip table # parse hits to sequence if txt[:10] == 'ALIGNMENTS' or txt[0] == '>': # enter the alignments loop if txt[0] == '>': # seek before the line as the subsequent function expects it f.seek(f.tell() - len(txt) - 2) [R, txt] = read_aligns(f, R) # R.query_length = query_length # R.query = query_name # append blast record object to the temporary wrapper yield R R = copy.deepcopy(record_holder) do_query_name = True do_query_length = True elif re.search(r'\*{5}\sNo hits found\s\*{5}', txt) or re.search( "^Query=", txt): # new record yield R R = copy.deepcopy(record_holder) do_query_name = True do_query_length = True txt = bread(f) else: # get the next line txt = 
bread(f)