def get_fancy_results_dict(self, max_per_query=10, defline_white_space_mask=None):
    """Collect search hits per query, annotated with a re-aligned HSP.

    Every entry read from ``b6lib.B6Source(self.output)`` is copied and the
    copy gets four extra attributes:

      * ``coverage``    -- ``(q_end - (q_start - 1)) * 100.0 / q_len``
      * ``hsp_query``   -- upper-cased aligned slice of the query sequence
      * ``hsp_subject`` -- upper-cased aligned slice of the target sequence
      * ``hsp_match``   -- one character per alignment column: ``'|'`` where
                           query and subject agree, ``' '`` elsewhere

    Args:
        max_per_query: number of entries kept for each query id; further
            entries with the same id are skipped (entries are taken in the
            order the source yields them).
        defline_white_space_mask: when truthy, the current entry is passed
            through ``remove_white_space_mask_from_B6_entry`` with this mask
            before the alignment step.

    Returns:
        dict mapping ``entry.query_id`` (as it reads after the final
        ``remove_white_space_mask_from_B6_entry`` call) to the list of
        annotated entry copies for that query.
    """
    b6 = b6lib.B6Source(self.output)

    input_fasta = u.SequenceSource(self.input)
    target_db = u.SequenceSource(self.target)

    query_counts = {}
    fancy_results_dict = {}

    while next(b6):
        # the id exactly as it appears in the search output; the sequence
        # lookup and the per-query counter both use this form.
        raw_query_id = b6.entry.query_id

        kept_so_far = query_counts.get(raw_query_id, 0)
        if kept_so_far == max_per_query:
            continue
        query_counts[raw_query_id] = kept_so_far + 1

        query_seq = input_fasta.get_seq_by_read_id(raw_query_id).replace('-', '')
        target_seq = target_db.get_seq_by_read_id(b6.entry.subject_id)

        if defline_white_space_mask:
            b6.entry = remove_white_space_mask_from_B6_entry(b6.entry, defline_white_space_mask)

        # parts that were aligned during the search are being aligned to each other to generate
        # hsp_match data to include into results
        query_aligned, target_aligned = nw_align(
            query_seq[int(b6.entry.q_start) - 1:int(b6.entry.q_end)],
            target_seq[int(b6.entry.s_start) - 1:int(b6.entry.s_end)])
        query_aligned = query_aligned.upper()
        target_aligned = target_aligned.upper()

        coverage = (b6.entry.q_end - (b6.entry.q_start - 1)) * 100.0 / b6.entry.q_len

        # indexing (rather than zip) keeps the IndexError if the subject
        # alignment is ever shorter than the query alignment.
        hsp_match = ''.join('|' if base == target_aligned[i] else ' '
                            for i, base in enumerate(query_aligned))

        entry = copy.deepcopy(b6.entry)
        entry.coverage = coverage
        entry.hsp_query = query_aligned
        entry.hsp_subject = target_aligned
        entry.hsp_match = hsp_match

        entry = remove_white_space_mask_from_B6_entry(entry)

        # NOTE(review): remove_white_space_mask_from_B6_entry presumably
        # rewrites query_id. Creating the list under the id read at the top
        # of the loop and appending under the id read here would then raise
        # KeyError, so the list is created on demand under the final id.
        fancy_results_dict.setdefault(entry.query_id, []).append(entry)

    return fancy_results_dict
def homopolymer_indel_exists(seq1, seq2):
    """Check whether the first gap between two aligned sequences is a homopolymer indel.

    The sequences are first passed through
    ``trim_uninformative_gaps_from_sequences``. If more than one gap character
    remains in total, both are stripped of gaps and re-aligned with
    ``nw_align``. The first gap (looked up in ``seq1``, then in ``seq2``) is
    then examined: it counts as a homopolymer indel when the bases adjacent
    to it form a run of at least three identical characters that is the same
    in both sequences, and the base aligned against the gap continues that
    run.

    Returns:
        ``True``  if such a run is found right before or right after the gap,
        ``False`` if neither sequence contains a gap,
        ``None``  if a gap exists but is not flanked by a qualifying run.
    """
    seq1, seq2 = trim_uninformative_gaps_from_sequences(seq1, seq2)

    # sometimes alignments look like this:
    #
    #    CCCGAAAAAA--TAT
    #    CCCGAAA---AATAT
    #
    # where the correct alignment should look like this:
    #
    #    CCCGAAAAAATAT
    #    CCCGAAAAA-TAT
    #
    # which causes this function to return false. in order to fix that
    # problem we perform a Needleman-Wunsch alignment here:
    if seq1.count('-') + seq2.count('-') > 1:
        seq1, seq2 = nw_align(seq1.replace('-', ''), seq2.replace('-', ''))

    gap_index = seq1.find('-')
    if gap_index == -1:
        gap_index = seq2.find('-')

        # so the gap is in seq2 (if anywhere). swap seq1 and seq2 so it is
        # certain that the sequence with the gap is seq1:
        seq1, seq2 = seq2, seq1

    if gap_index == -1:
        return False

    def is_hp(region):
        # a homopolymer region is made of exactly one distinct character;
        # the empty string does not qualify.
        return len(set(region)) == 1

    def is_hp_indel(span):
        # a nested function taking the (start, end) pair replaces the
        # original tuple-parameter lambda, which is a SyntaxError on Python 3.
        start, end = span
        return (seq1[start:end] == seq2[start:end]
                and is_hp(seq1[start:end])
                and seq2[gap_index] == seq2[start])

    def span_before_gap(sequence):
        # widen the window that ends at the gap while it stays a homopolymer.
        # the explicit lower bound stops at the start of the sequence instead
        # of relying on a negative slice index producing an empty string.
        i = 3
        while gap_index - i - 1 >= 0 and is_hp(sequence[gap_index - i - 1:gap_index]):
            i += 1
        return (gap_index - i, gap_index)

    def span_after_gap(sequence):
        # widen the window that starts right after the gap while it stays a
        # homopolymer. without the upper bound the slice is clamped at the
        # end of the sequence and stays a homopolymer forever when the run
        # reaches the last base (e.g. 'TTG-AAAA'), so the loop never exits.
        i = 4
        while (gap_index + i + 1 <= len(sequence)
               and is_hp(sequence[gap_index + 1:gap_index + i + 1])):
            i += 1
        return (gap_index + 1, gap_index + i)

    # check the bases that precede the gap
    if gap_index >= 3:
        if is_hp_indel(span_before_gap(seq1)):
            return True

    # check the bases that follow the gap
    if len(seq1) - gap_index > 3:
        if is_hp_indel(span_after_gap(seq1)):
            return True

    return None
def get_fancy_results_dict(self, max_per_query = 10, defline_white_space_mask = None):
    """Group search hits by query id and attach re-aligned HSP details.

    Entries are read one by one from ``b6lib.B6Source(self.output)``. For
    each entry that is kept, a deep copy is stored with these attributes
    added:

      * ``coverage``    -- ``(q_end - (q_start - 1)) * 100.0 / q_len``
      * ``hsp_query``   -- upper-cased aligned slice of the query sequence
      * ``hsp_subject`` -- upper-cased aligned slice of the target sequence
      * ``hsp_match``   -- ``'|'`` for every alignment column where the two
                           agree and ``' '`` for every other column

    Args:
        max_per_query: how many entries to keep per query id; later entries
            carrying the same id are skipped.
        defline_white_space_mask: when truthy, the current entry is run
            through ``remove_white_space_mask_from_B6_entry`` with this mask
            before the alignment step.

    Returns:
        dict whose keys are ``entry.query_id`` values (read after the final
        ``remove_white_space_mask_from_B6_entry`` call) and whose values are
        lists of the annotated entry copies.
    """
    b6 = b6lib.B6Source(self.output)

    input_fasta = u.SequenceSource(self.input)
    target_db = u.SequenceSource(self.target)

    query_counts = {}
    fancy_results_dict = {}

    # NOTE(review): b6.next() is kept as written; B6Source is not visible
    # here, so it is unknown whether it also supports the next() builtin.
    while b6.next():
        # id as reported in the search output; used for the per-query
        # counter and for fetching the query sequence.
        reported_id = b6.entry.query_id

        # dict.get replaces dict.has_key, which no longer exists in Python 3.
        already_kept = query_counts.get(reported_id, 0)
        if already_kept == max_per_query:
            continue
        query_counts[reported_id] = already_kept + 1

        query_seq = input_fasta.get_seq_by_read_id(reported_id).replace('-', '')
        target_seq = target_db.get_seq_by_read_id(b6.entry.subject_id)

        if defline_white_space_mask:
            b6.entry = remove_white_space_mask_from_B6_entry(b6.entry, defline_white_space_mask)

        # parts that were aligned during the search are being aligned to each other to generate
        # hsp_match data to include into results
        query_aligned, target_aligned = nw_align(
            query_seq[int(b6.entry.q_start) - 1:int(b6.entry.q_end)],
            target_seq[int(b6.entry.s_start) - 1:int(b6.entry.s_end)])
        query_aligned, target_aligned = query_aligned.upper(), target_aligned.upper()

        coverage = (b6.entry.q_end - (b6.entry.q_start - 1)) * 100.0 / b6.entry.q_len

        # target_aligned is indexed on purpose: a subject alignment shorter
        # than the query alignment still raises IndexError as before.
        hsp_match = ''.join(['|' if q_base == target_aligned[pos] else ' '
                             for pos, q_base in enumerate(query_aligned)])

        entry = copy.deepcopy(b6.entry)
        entry.coverage = coverage
        entry.hsp_query = query_aligned
        entry.hsp_subject = target_aligned
        entry.hsp_match = hsp_match

        entry = remove_white_space_mask_from_B6_entry(entry)

        # NOTE(review): the helper above presumably rewrites query_id. A list
        # pre-created under the id seen at the top of the loop would then be
        # missing for the id seen here (KeyError), so the list is created on
        # demand under the final id instead.
        fancy_results_dict.setdefault(entry.query_id, []).append(entry)

    return fancy_results_dict