def is_positive_hit(blast_handle):
    """
    Report, for every query in a BLAST XML result, whether it produced a hit.

    :param blast_handle: Handle (or path) of BLAST output in 'blast-xml'
        format, passed straight to ``SearchIO.parse``.
    :return: List of ``(seq_id, found)`` tuples, one per query, where
        ``found`` is True when the query result holds at least one HSP.
    """
    records_dict = SearchIO.to_dict(
        SearchIO.parse(blast_handle, 'blast-xml'))

    # Maybe do some additional checks about the quality and/or length of
    # the match
    return [
        (seq_id, bool(qresult.hsps))
        for seq_id, qresult in records_dict.items()
    ]
def parse_blast_output(self, blast_handle):
    """
    Re-annotate contigs from BLAST hits and write them out as FASTA.

    For every query in the BLAST result, hits whose description contains
    one of the uninformative keywords are discarded. If informative hits
    remain, ``self.identify_mih`` picks the description to use (from the
    hit descriptions and the bit score of each hit's first HSP);
    otherwise the description falls back to "hypothetical protein". The
    chosen description is stored on ``self.contigs_dict[seq_id]`` and the
    record is written to ``self.output``.

    :param blast_handle: An xml Blast output file handle from io.StringIO.
        It is always closed before this method returns, even on error.
    :return: None
    """
    from Bio import SearchIO

    # Create the list of words to filter out uninformative hits
    filter_strings = [
        'putative', 'protein like', 'protein related',
        'contains similarity to', 'predicted', 'hypothetical protein',
        'unnamed protein product', 'unknown', 'expressed protein',
        'uncharacterized', 'probable', 'possible', 'potential',
    ]

    try:
        records_dict = SearchIO.to_dict(
            SearchIO.parse(blast_handle, 'blast-xml'))

        # Open output file handle in write mode
        with open(self.output, 'w') as f:
            for seq_id, qresult in records_dict.items():
                hit_descriptions = []
                bit_scores = []

                # Keep only the hits whose description is informative
                for hit in qresult.hits:
                    hit_desc = hit.description
                    lowered = str(hit_desc).lower()
                    if not any(word in lowered for word in filter_strings):
                        hit_descriptions.append(hit_desc)
                        bit_scores.append(hit.hsps[0].bitscore)

                # If hits are left after the filtering, find the MIH;
                # otherwise fall back to the generic description
                if hit_descriptions:
                    proper_desc = self.identify_mih(
                        hit_descriptions, bit_scores)
                else:
                    proper_desc = 'hypothetical protein'

                # Update the stored record with the chosen description
                self.contigs_dict[seq_id].description = proper_desc

                # Write the record with its (possibly new) description
                f.write('>{} {}\n{}\n'.format(
                    seq_id, proper_desc, self.contigs_dict[seq_id].seq))
    finally:
        # Close file StringIO handle, even when parsing or writing failed
        blast_handle.close()
def filter_blast(blast_handle, output_file, ordered_dict):
    """
    Write a FASTA file of queries filtered by their BLAST alignments.

    Queries without any HSP are written unchanged (id, ``desc`` and
    ``seq`` taken from ``ordered_dict``). For queries with HSPs, the
    positions that are not marked ``'|'`` in the 'similarity' annotation
    of *every* HSP are collected; if two consecutive such positions lie
    fewer than 21 apart, the query is written with those positions
    lower-cased via ``Methods.lower_indexes``. All other queries are
    omitted from the output.

    :param blast_handle: Handle (or path) of BLAST output in 'blast-xml'
        format, passed straight to ``SearchIO.parse``.
    :param output_file: Path of the FASTA file to (over)write.
    :param ordered_dict: Mapping from query id to a record exposing
        ``seq`` and ``desc`` attributes.
    :return: None
    """
    # NOTE(review): presumably the maximum distance (exclusive) between two
    # shared mismatches for them to count as clustered -- confirm.
    max_gap = 21
    # NOTE(review): the original scan stops at ``len - 3`` although only
    # ``i + 1`` is indexed, so the last two adjacent pairs are never
    # compared. Kept as-is to preserve output; confirm whether ``len - 1``
    # was intended.
    tail_skip = 3

    records_dict = SearchIO.to_dict(
        SearchIO.parse(blast_handle, 'blast-xml'))

    with open(output_file, 'w') as f:
        for seq_id, qresult in records_dict.items():
            hsps = qresult.hsps

            if not hsps:  # query is not in blast database
                record = ordered_dict[seq_id]
                seq = record.seq
                desc = record.desc
                f.write('>{} {}\n{}\n'.format(seq_id, desc, seq))
                continue

            # Per HSP: index of mismatches (every column not marked '|')
            # TODO: add check for alignment length
            index_list = [
                [i for i, char in enumerate(h.aln_annotation['similarity'])
                 if char != '|']
                for h in hsps
            ]

            # Positions mismatching in every HSP, in the order of the first
            # HSP. Sets make each membership test O(1).
            other_hsps = [set(idx) for idx in index_list[1:]]
            common_variants = [
                i for i in index_list[0]
                if all(i in positions for positions in other_hsps)
            ]

            # An empty range for short lists makes the original
            # ``len(common_variants) > 1`` guard implicit.
            clustered = any(
                common_variants[i + 1] - common_variants[i] < max_gap
                for i in range(len(common_variants) - tail_skip)
            )
            if clustered:
                seq = ordered_dict[seq_id].seq
                # make lower at indexes
                # NOTE(review): indexes are columns of the alignment
                # similarity string; this assumes they map directly onto
                # positions of ``seq`` -- confirm.
                f.write('>{} {}\n{}\n'.format(
                    seq_id, common_variants,
                    Methods.lower_indexes(seq.upper(), common_variants)))
def parse_xml(self):
    """
    Load the BLAST XML results into ``self.search_dict``.

    Reads ``self.input_file`` in 'blast-xml' format with ``SearchIO.parse``
    and stores the result of ``SearchIO.to_dict`` on the instance. A
    progress message is printed before each of the two steps.

    :return: None
    """
    print('Parsing blast xml file...')
    record_iter = SearchIO.parse(self.input_file, 'blast-xml')

    print('Converting to dictionary...')
    self.search_dict = SearchIO.to_dict(record_iter)