Beispiel #1
0
 def is_positive_hit(blast_handle):
     """Report, for every query in a BLAST XML result, whether it has any HSP.

     :param blast_handle: BLAST XML output, passed unchanged to
         ``SearchIO.parse`` with the ``'blast-xml'`` format.
     :return: list of ``(seq_id, found)`` tuples, one per entry of the
         dictionary built by ``SearchIO.to_dict``; ``found`` is ``True`` when
         the query result's ``hsps`` is non-empty and ``False`` otherwise.
     """
     records_dict = SearchIO.to_dict(
         SearchIO.parse(blast_handle, 'blast-xml'))
     # Maybe do some additional checks about the quality and/or length of the match
     return [(seq_id, bool(qresult.hsps))
             for seq_id, qresult in records_dict.items()]
Beispiel #2
0
    def parse_blast_output(self, blast_handle):
        """
        Re-describe contigs from a BLAST XML result and write them as FASTA.

        For every query, hits whose description contains one of the
        uninformative phrases below are discarded. When at least one hit
        remains, ``self.identify_mih`` is called with the remaining
        descriptions and their bit scores; its result is stored on
        ``self.contigs_dict[seq_id].description`` and the record is written to
        ``self.output``. Queries with no remaining hit are left untouched and
        are not written to the output file.

        :param blast_handle: An xml Blast output file handle from io.StringIO.
            It is closed by this method, also when parsing or writing fails.
        :return: None
        """
        from Bio import SearchIO

        # Create the list of words to filter out uninformative hits
        # (all lower case: they are matched against the lower-cased description)
        filter_strings = [
            'putative', 'protein like', 'protein related',
            'contains similarity to', 'predicted', 'hypothetical protein',
            'unnamed protein product', 'unknown', 'expressed protein',
            'uncharacterized', 'probable', 'possible', 'potential'
        ]

        try:
            records_dict = SearchIO.to_dict(
                SearchIO.parse(blast_handle, 'blast-xml'))

            # Open output file handle in write mode
            with open(self.output, 'w') as f:
                for seq_id, qresult in records_dict.items():
                    hit_descriptions = []
                    bit_scores = []

                    for hit in qresult.hits:
                        hit_desc = hit.description
                        desc_lower = str(hit_desc).lower()

                        # Filter out uninformative hits here, keep the rest
                        if any(word in desc_lower for word in filter_strings):
                            continue
                        hit_descriptions.append(hit_desc)
                        # Only the first HSP of each hit contributes a score
                        bit_scores.append(hit.hsps[0].bitscore)

                    # No informative hit left: keep the existing description
                    # and do not write this record
                    if not hit_descriptions:
                        continue

                    proper_desc = self.identify_mih(hit_descriptions,
                                                    bit_scores)

                    # Update description with the one picked by identify_mih
                    self.contigs_dict[seq_id].description = proper_desc

                    # Write to output file only the ones with new description
                    f.write('>{} {}\n{}\n'.format(
                        seq_id, proper_desc, self.contigs_dict[seq_id].seq))
        finally:
            # Close file StringIO handle, even if an error occurred above
            blast_handle.close()
Beispiel #3
0
    def filter_blast(blast_handle, output_file, ordered_dict):
        """Write a filtered FASTA file based on a BLAST XML result.

        For each query in the BLAST result:

        * if the query has no HSP, its record from ``ordered_dict`` is written
          unchanged (``seq_id``, ``.desc`` and ``.seq``);
        * otherwise the positions whose similarity character is not ``'|'``
          are collected for every HSP, and the positions shared by all HSPs
          ("common variants") are determined. The record is written, with the
          list of common variants as its description and the sequence passed
          through ``Methods.lower_indexes``, only when two neighbouring
          common variants less than 21 positions apart are found within the
          checked range (see the review note in the body).

        Queries that have HSPs but do not meet that condition are not written.

        :param blast_handle: BLAST XML output, passed to ``SearchIO.parse``.
        :param output_file: path of the FASTA file to create (overwritten).
        :param ordered_dict: mapping from query id to a record exposing
            ``.seq`` and ``.desc``.
        :return: None
        """
        records_dict = SearchIO.to_dict(
            SearchIO.parse(blast_handle, 'blast-xml'))

        with open(output_file, 'w') as f:
            for seq_id, qresult in records_dict.items():
                # query_len = qresult.seq_len
                hsps = qresult.hsps
                if not hsps:  # query is not in blast database
                    record = ordered_dict[seq_id]
                    seq = record.seq
                    desc = record.desc
                    f.write('>{} {}\n{}\n'.format(seq_id, desc, seq))
                    continue

                # Add check for alignment length
                # One list of mismatch indexes per HSP
                mismatch_lists = [
                    [i for i, char
                     in enumerate(h.aln_annotation['similarity'])
                     if char != '|']
                    for h in hsps
                ]

                # Indexes present in every HSP, in the order of the first HSP
                shared = set(mismatch_lists[0]).intersection(
                    *mismatch_lists[1:])
                common_variants = [i for i in mismatch_lists[0]
                                   if i in shared]

                # NOTE(review): the "- 3" bound means the last two adjacent
                # pairs are never inspected and at least four common variants
                # are required; kept as in the original -- confirm the intent.
                has_close_pair = any(
                    common_variants[i + 1] - common_variants[i] < 21
                    for i in range(len(common_variants) - 3))

                if has_close_pair:
                    seq = ordered_dict[seq_id].seq
                    # make lower at indexes (presumably what lower_indexes
                    # does; it is defined outside this block)
                    f.write('>{} {}\n{}\n'.format(
                        seq_id, common_variants,
                        Methods.lower_indexes(
                            seq.upper(), common_variants)))
Beispiel #4
0
 def parse_xml(self):
     """Load the BLAST XML file at ``self.input_file`` into ``self.search_dict``.

     The file is read with ``SearchIO.parse`` using the ``'blast-xml'`` format
     and the results are stored as the dictionary built by
     ``SearchIO.to_dict``. Two progress messages are printed to stdout.
     """
     print('Parsing blast xml file...')
     query_results = SearchIO.parse(self.input_file, 'blast-xml')

     # NOTE(review): SearchIO.parse is documented as returning a generator,
     # so most of the parsing work presumably happens while to_dict consumes it.
     print('Converting to dictionary...')
     self.search_dict = SearchIO.to_dict(query_results)