def get_fancy_results_dict(self, max_per_query = 10, defline_white_space_mask = None):
    """Collect search hits per query, annotated with HSP alignment strings.

    Reads hits from the B6 output at `self.output` and keeps at most
    `max_per_query` hits for every query id. For each kept hit, the regions
    of the query and subject sequences reported by the search are aligned
    to each other with `nw_align`, and the result is stored on a deep copy
    of the entry as `hsp_query`, `hsp_subject` and `hsp_match`, together
    with `coverage` (aligned query span as a percentage of `q_len`).

    Args:
        max_per_query: number of hits to keep per query id; further hits
            for the same query are skipped.
        defline_white_space_mask: when truthy, it is passed to
            `remove_white_space_mask_from_B6_entry` to unmask the current
            entry after the sequences have been looked up by id.

    Returns:
        dict mapping a query id to the list of annotated entries kept for
        it, in the order they were read.
    """
    b6 = b6lib.B6Source(self.output)

    try:
        # NOTE(review): these two sources are never closed here; confirm
        # whether SequenceSource needs explicit cleanup.
        input_fasta = u.SequenceSource(self.input)
        target_db = u.SequenceSource(self.target)

        hits_kept = {}
        fancy_results_dict = {}

        # NOTE(review): get_results_dict advances the reader with
        # `b6.next()` instead; confirm B6Source supports both spellings.
        while next(b6):
            query_id = b6.entry.query_id

            # enough hits collected for this query already. equality (not
            # >=) is deliberate: a negative or None limit never matches and
            # therefore means "no limit".
            if hits_kept.get(query_id, 0) == max_per_query:
                continue
            hits_kept[query_id] = hits_kept.get(query_id, 0) + 1

            # sequences are looked up with the ids exactly as they appear
            # in the B6 output, i.e. before any unmasking takes place.
            query_seq = input_fasta.get_seq_by_read_id(query_id).replace('-', '')
            target_seq = target_db.get_seq_by_read_id(b6.entry.subject_id)

            if defline_white_space_mask:
                b6.entry = remove_white_space_mask_from_B6_entry(b6.entry, defline_white_space_mask)

            # parts that were aligned during the search are being aligned
            # to each other to generate hsp_match data to include into
            # results (B6 coordinates are 1-based, hence the `- 1`).
            query_aligned, target_aligned = nw_align(
                query_seq[int(b6.entry.q_start) - 1:int(b6.entry.q_end)],
                target_seq[int(b6.entry.s_start) - 1:int(b6.entry.s_end)])
            query_aligned, target_aligned = query_aligned.upper(), target_aligned.upper()

            coverage = (b6.entry.q_end - (b6.entry.q_start - 1)) * 100.0 / b6.entry.q_len
            hsp_match = ''.join('|' if q_char == t_char else ' '
                                for q_char, t_char in zip(query_aligned, target_aligned))

            # copy the entry so the stored result is independent of
            # whatever the reader does with `b6.entry` afterwards.
            entry = copy.deepcopy(b6.entry)
            entry.coverage = coverage
            entry.hsp_query = query_aligned
            entry.hsp_subject = target_aligned
            entry.hsp_match = hsp_match

            entry = remove_white_space_mask_from_B6_entry(entry)

            # the list is created here, under the entry's final query id,
            # rather than up front under the id read from the file: if
            # unmasking rewrites query_id, a pre-created key would not
            # match and the append would raise KeyError.
            fancy_results_dict.setdefault(entry.query_id, []).append(entry)

        return fancy_results_dict
    finally:
        # release the B6 reader even if a lookup or alignment fails.
        b6.close()
def get_results_dict(self, mismatches = None, gaps = None, min_identity = None, max_identity = None, penalty_for_terminal_gaps = True):
    """Map each query id to the set of subject ids whose hits pass the filters.

    Reads hits from the B6 output at `self.output`. Hits of a sequence
    against itself (query id equal to subject id) are always ignored.

    Args:
        mismatches: when not None, keep only hits with exactly this many
            mismatches.
        gaps: when not None, keep only hits with exactly this many gaps
            (counted after the terminal gap correction, if enabled).
        min_identity: when not None, drop hits whose identity is below it.
        max_identity: when not None, drop hits whose identity is equal to
            or above it (the upper bound is exclusive).
        penalty_for_terminal_gaps: when true, unaligned overhangs at both
            ends of the alignment are added to the hit's gap count and
            subtracted from its identity before the filters are applied.

    Identities are compared after rounding both sides to one decimal.

    Returns:
        dict mapping a query id to the set of subject ids that were kept.
        Queries without any kept hit do not appear in the dict.
    """
    results_dict = {}

    b6 = b6lib.B6Source(self.output)

    try:
        # NOTE(review): get_fancy_results_dict advances the reader with
        # `next(b6)` instead; confirm B6Source supports both spellings.
        while b6.next():
            hit = b6.entry

            if hit.query_id == hit.subject_id:
                continue

            if penalty_for_terminal_gaps:
                # following correction is to take secret gaps into
                # consideration. because we are working with reads that are
                # supposed to be almost the same length, we want query and
                # target to be aligned 100%. sometimes it is not the case,
                # and mismatches are being calculated by the aligned part
                # of query or target. for instance if query is this:
                #
                #     ATCGATCG
                #
                # and target is this:
                #
                #     TATCGATCG
                #
                # the alignment discards the T at the beginning and gives 0
                # mismatches. here we introduce those gaps back.
                #
                # the longer overhang on each side is what counts. both
                # terms are 0 when the alignment reaches the respective
                # end, so no guard is needed around them (the previous
                # guard for the trailing side compared the leading gap
                # count to q_len and could skip a real overhang).
                leading_gaps = max(hit.q_start, hit.s_start) - 1
                trailing_gaps = max(hit.q_len - hit.q_end, hit.s_len - hit.s_end)
                additional_gaps = leading_gaps + trailing_gaps

                identity_penalty = additional_gaps * 100.0 / (hit.q_len + additional_gaps)

                if identity_penalty:
                    hit.gaps += additional_gaps
                    hit.identity -= identity_penalty

            # done correcting the hit. carry on.
            if max_identity is not None and round(hit.identity, 1) >= round(max_identity, 1):
                continue

            if min_identity is not None and round(hit.identity, 1) < round(min_identity, 1):
                continue

            if mismatches is not None and hit.mismatches != mismatches:
                continue

            if gaps is not None and hit.gaps != gaps:
                continue

            # if it made it here, we are interested in this one, after all.
            results_dict.setdefault(hit.query_id, set()).add(hit.subject_id)
    finally:
        # release the B6 reader even if reading or filtering fails.
        b6.close()

    return results_dict