def get_fancy_results_dict(self, max_per_query=10, defline_white_space_mask=None):
    """Group B6 search hits by query id, each enriched with HSP alignment data.

    Hits are read from ``self.output``; sequences are looked up in
    ``self.input`` (query) and ``self.target`` (subject). For every hit that
    is kept, the query and subject regions delimited by the hit's start/end
    coordinates are re-aligned with ``nw_align`` and a deep copy of the hit
    is stored with ``coverage``, ``hsp_query``, ``hsp_subject`` and
    ``hsp_match`` attributes attached.

    Args:
        max_per_query: Number of hits kept per query id; later hits for the
            same query are skipped.
        defline_white_space_mask: When truthy, the current entry is passed
            through ``remove_white_space_mask_from_B6_entry`` with this mask
            before the regions are aligned.

    Returns:
        dict mapping a query id to the list of enriched entries for that
        query, in the order the hits were read.
    """
    b6 = b6lib.B6Source(self.output)

    input_fasta = u.SequenceSource(self.input)
    target_db = u.SequenceSource(self.target)

    query_counts = {}
    fancy_results_dict = {}

    while b6.next():
        query_id = b6.entry.query_id

        # The counter starts at 1 and is bumped once per kept hit, so
        # ``count - 1`` is the number of hits already kept for this query.
        if query_id not in query_counts:
            query_counts[query_id] = 1

        if query_counts[query_id] - 1 == max_per_query:
            continue
        query_counts[query_id] += 1

        # Sequences are fetched with the ids exactly as they appear in the
        # B6 output, i.e. before any white-space mask is removed.
        query_seq = input_fasta.get_seq_by_read_id(query_id).replace('-', '')
        target_seq = target_db.get_seq_by_read_id(b6.entry.subject_id)

        if defline_white_space_mask:
            b6.entry = remove_white_space_mask_from_B6_entry(
                b6.entry, defline_white_space_mask)

        # parts that were aligned during the search are being aligned to each
        # other to generate hsp_match data to include into results
        query_aligned, target_aligned = nw_align(
            query_seq[int(b6.entry.q_start) - 1:int(b6.entry.q_end)],
            target_seq[int(b6.entry.s_start) - 1:int(b6.entry.s_end)])

        query_aligned = query_aligned.upper()
        target_aligned = target_aligned.upper()

        # Percentage of the query length spanned by the hit (1-based,
        # inclusive coordinates).
        coverage = (b6.entry.q_end - (b6.entry.q_start - 1)) * 100.0 / b6.entry.q_len

        # '|' under identical columns, ' ' elsewhere. Both strings come from
        # the same nw_align call and are expected to be equally long.
        hsp_match = ''.join(
            '|' if q_char == t_char else ' '
            for q_char, t_char in zip(query_aligned, target_aligned))

        entry = copy.deepcopy(b6.entry)
        entry.coverage = coverage
        entry.hsp_query = query_aligned
        entry.hsp_subject = target_aligned
        entry.hsp_match = hsp_match

        entry = remove_white_space_mask_from_B6_entry(entry)

        # Key on the id the stored entry carries *after* the helper ran. If
        # the helper rewrites query_id (its body is not visible here), a list
        # created under the pre-helper id would be missed and raise KeyError.
        fancy_results_dict.setdefault(entry.query_id, []).append(entry)

    return fancy_results_dict
def homopolymer_indel_exists(seq1, seq2):
    """Check whether the first gap between two aligned sequences lies in a homopolymer run.

    The sequences are first passed through
    ``trim_uninformative_gaps_from_sequences``. If more than one gap
    character remains in total, the gaps are stripped and the pair is
    re-aligned with ``nw_align``. The first gap is then located and the
    regions immediately before and after it are inspected.

    Args:
        seq1: First aligned sequence; gaps are written as ``'-'``.
        seq2: Second aligned sequence; gaps are written as ``'-'``.

    Returns:
        ``True`` if a region next to the gap is identical in both sequences,
        consists of one repeated character, and the character opposite the
        gap matches that region. ``False`` if neither sequence contains a
        gap. ``None`` if there is a gap but neither region qualifies.
    """
    seq1, seq2 = trim_uninformative_gaps_from_sequences(seq1, seq2)

    # sometimes alignments look like this:
    #
    #    CCCGAAAAAA--TAT
    #    CCCGAAA---AATAT
    #
    # where the correct alignment should look like this:
    #
    #    CCCGAAAAAATAT
    #    CCCGAAAAA-TAT
    #
    # which causes this function to return false. in order to fix that
    # problem we perform Needleman-Wunsch alignment here:
    if seq1.count('-') + seq2.count('-') > 1:
        seq1, seq2 = nw_align(seq1.replace('-', ''), seq2.replace('-', ''))

    gap_index = seq1.find('-')
    if gap_index == -1:
        gap_index = seq2.find('-')

        # so the gap (if any) is in seq2. swap the two so that from here on
        # the sequence carrying the gap is always seq1:
        seq1, seq2 = seq2, seq1

    if gap_index == -1:
        return False

    def _is_homopolymer(fragment):
        # True when the fragment is non-empty and made of one distinct character.
        return len(set(fragment)) == 1

    def _is_hp_indel(span):
        # A nested def that unpacks the span replaces the original
        # tuple-parameter lambda, which is a syntax error on Python 3.
        start, end = span
        return (seq1[start:end] == seq2[start:end]
                and _is_homopolymer(seq1[start:end])
                and seq2[gap_index] == seq2[start])

    def _span_before_gap(sequence):
        # Widen the window ending at the gap while it stays a homopolymer.
        # A start index that runs past the beginning yields an empty slice,
        # which is not a homopolymer, so this loop always stops.
        i = 3
        while _is_homopolymer(sequence[gap_index - i - 1:gap_index]):
            i += 1
        return (gap_index - i, gap_index)

    def _span_after_gap(sequence):
        # Widen the window starting right after the gap while it stays a
        # homopolymer. The length bound is required: once the window reaches
        # the end of the sequence the clipped slice stops changing, so a
        # sequence ending in a homopolymer would otherwise loop forever.
        i = 4
        while (gap_index + i + 1 <= len(sequence)
               and _is_homopolymer(sequence[gap_index + 1:gap_index + i + 1])):
            i += 1
        return (gap_index + 1, gap_index + i)

    # check the region preceding the gap
    if gap_index >= 3 and _is_hp_indel(_span_before_gap(seq1)):
        return True

    # check the region following the gap
    if len(seq1) - gap_index > 3 and _is_hp_indel(_span_after_gap(seq1)):
        return True

    # Kept as None (not False) so callers that distinguish the two still work.
    return None
def homopolymer_indel_exists(seq1, seq2):
    """Tell whether the first gap in an aligned pair falls inside a homopolymer region.

    Both sequences go through ``trim_uninformative_gaps_from_sequences``
    first. When the pair still holds more than one gap character in total,
    gaps are removed and the pair is aligned again with ``nw_align``. The
    first gap is then found and the stretches directly before and directly
    after it are examined.

    Args:
        seq1: First aligned sequence, with ``'-'`` marking gaps.
        seq2: Second aligned sequence, with ``'-'`` marking gaps.

    Returns:
        ``True`` when a stretch adjacent to the gap is the same in both
        sequences, is a single repeated character, and the character facing
        the gap equals that stretch's character. ``False`` when no gap is
        present in either sequence. ``None`` when a gap exists but neither
        adjacent stretch qualifies.
    """
    seq1, seq2 = trim_uninformative_gaps_from_sequences(seq1, seq2)

    # sometimes alignments look like this:
    #
    #    CCCGAAAAAA--TAT
    #    CCCGAAA---AATAT
    #
    # where the correct alignment should look like this:
    #
    #    CCCGAAAAAATAT
    #    CCCGAAAAA-TAT
    #
    # which causes this function to return false. in order to fix that
    # problem we perform Needleman-Wunsch alignment here:
    total_gaps = seq1.count('-') + seq2.count('-')
    if total_gaps > 1:
        seq1, seq2 = nw_align(seq1.replace('-', ''), seq2.replace('-', ''))

    gap_index = seq1.find('-')
    if gap_index == -1:
        gap_index = seq2.find('-')

        # the gap, if there is one, is in seq2; swap so the gapped sequence
        # is always referred to as seq1 below:
        seq1, seq2 = seq2, seq1

    if gap_index == -1:
        return False

    def is_hp(fragment):
        # Exactly one distinct character; an empty fragment does not count.
        return len(set(fragment)) == 1

    def is_hp_indel(bounds):
        # Takes the (start, end) pair as one argument and unpacks it here.
        # The original tuple-parameter lambda does not parse on Python 3.
        s, e = bounds
        return (seq1[s:e] == seq2[s:e]
                and is_hp(seq1[s:e])
                and seq2[gap_index] == seq2[s])

    def bounds_before_gap(sequence):
        # Grow the window that ends at the gap for as long as it remains a
        # homopolymer. Running past index 0 produces an empty slice, which
        # ends the loop.
        i = 3
        while is_hp(sequence[gap_index - i - 1:gap_index]):
            i += 1
        return (gap_index - i, gap_index)

    def bounds_after_gap(sequence):
        # Grow the window that begins after the gap for as long as it remains
        # a homopolymer. Stop at the end of the sequence: slicing past the
        # end keeps returning the same homopolymer, so without this bound a
        # homopolymer tail would never let the loop finish.
        seq_len = len(sequence)
        i = 4
        while gap_index + i + 1 <= seq_len and is_hp(sequence[gap_index + 1:gap_index + i + 1]):
            i += 1
        return (gap_index + 1, gap_index + i)

    # check the stretch before the gap
    if gap_index >= 3:
        if is_hp_indel(bounds_before_gap(seq1)):
            return True

    # check the stretch after the gap
    if len(seq1) - gap_index > 3:
        if is_hp_indel(bounds_after_gap(seq1)):
            return True

    # None rather than False is the original contract for this outcome.
    return None
def get_fancy_results_dict(self, max_per_query=10, defline_white_space_mask=None):
    """Collect B6 hits per query id and attach a pairwise alignment to each.

    Iterates over the hits in ``self.output``. For each hit that is kept,
    the query sequence (from ``self.input``) and the subject sequence (from
    ``self.target``) are cut to the hit's start/end coordinates and aligned
    with ``nw_align``; a deep copy of the hit is stored with ``coverage``,
    ``hsp_query``, ``hsp_subject`` and ``hsp_match`` set on it.

    Args:
        max_per_query: Upper limit of hits stored for a single query id.
            Hits beyond the limit are skipped.
        defline_white_space_mask: If truthy, the entry is run through
            ``remove_white_space_mask_from_B6_entry`` with this mask before
            alignment.

    Returns:
        dict of query id -> list of enriched entries, preserving read order.
    """
    b6 = b6lib.B6Source(self.output)

    input_fasta = u.SequenceSource(self.input)
    target_db = u.SequenceSource(self.target)

    query_counts = {}
    fancy_results_dict = {}

    while b6.next():
        hit = b6.entry

        # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
        # The counter is 1 + number of hits already stored for this query.
        if hit.query_id not in query_counts:
            query_counts[hit.query_id] = 1

        if query_counts[hit.query_id] - 1 == max_per_query:
            continue
        query_counts[hit.query_id] += 1

        # Look sequences up with the ids as read, before any mask removal.
        query_seq = input_fasta.get_seq_by_read_id(hit.query_id).replace('-', '')
        target_seq = target_db.get_seq_by_read_id(hit.subject_id)

        if defline_white_space_mask:
            b6.entry = remove_white_space_mask_from_B6_entry(
                b6.entry, defline_white_space_mask)
            hit = b6.entry

        # parts that were aligned during the search are being aligned to each
        # other to generate hsp_match data to include into results
        query_region = query_seq[int(hit.q_start) - 1:int(hit.q_end)]
        target_region = target_seq[int(hit.s_start) - 1:int(hit.s_end)]
        query_aligned, target_aligned = nw_align(query_region, target_region)
        query_aligned, target_aligned = query_aligned.upper(), target_aligned.upper()

        # Share of the query length covered by the hit, as a percentage.
        coverage = (hit.q_end - (hit.q_start - 1)) * 100.0 / hit.q_len

        # One marker per alignment column: '|' for a match, ' ' otherwise.
        # Both strings come from one nw_align call (expected equal length).
        hsp_match = ''.join(
            '|' if q == t else ' ' for q, t in zip(query_aligned, target_aligned))

        entry = copy.deepcopy(hit)
        entry.coverage = coverage
        entry.hsp_query = query_aligned
        entry.hsp_subject = target_aligned
        entry.hsp_match = hsp_match

        entry = remove_white_space_mask_from_B6_entry(entry)

        # Use the entry's id as it stands after the helper call. Creating the
        # list under the earlier id and indexing with this one would raise
        # KeyError whenever the helper changes query_id (helper not visible
        # here, so this is a defensive guard).
        fancy_results_dict.setdefault(entry.query_id, []).append(entry)

    return fancy_results_dict
def compare_seqs(knownseq, testseq, header):
    """Return a FASTA-formatted record for ``testseq`` if it is close to ``knownseq``.

    ``testseq`` is kept when either sequence is a substring of the other, or
    when ``distance`` over the compared window is at most 1.

    Args:
        knownseq: Reference sequence.
        testseq: Candidate sequence.
        header: Text placed after ``>`` on the record's first line.

    Returns:
        ``">" + header + "\\n" + testseq`` when the candidate is kept,
        otherwise the empty string.

    Raises:
        ValueError: If either window bound computed from ``find_ends`` is
            negative.
    """
    fasta_record = ">%s\n%s"

    # Exact containment in either direction: keep immediately.
    is_contained = knownseq in testseq or testseq in knownseq
    if is_contained:
        return fasta_record % (header, testseq)

    # Otherwise align the two and derive the comparison window from the
    # ends reported for each aligned string.
    aligned_known, aligned_test = nw_align(knownseq, testseq)
    known_lo, known_hi = find_ends(aligned_known)
    test_lo, test_hi = find_ends(aligned_test)

    lo = min(test_lo, known_lo)
    hi = max(test_hi, known_hi)
    if lo < 0 or hi < 0:
        raise ValueError("Start and end must be greater than 0!")

    # NOTE(review): the window is computed on the aligned strings but applied
    # to the unaligned inputs -- confirm this is intended.
    difference = distance(knownseq[lo:hi], testseq[lo:hi])
    return fasta_record % (header, testseq) if difference <= 1 else ""
def test_nw_align(self):
    """nw_align('ACGU', 'CAGU') yields the expected gapped pair and a score of 1."""
    aligned_pair, score = nw_align('ACGU', 'CAGU', return_score=True)
    first, second = aligned_pair

    self.assertEqual(first, 'AC-GU')
    self.assertEqual(second, '-CAGU')
    self.assertEqual(score, 1)
def test_nw_align_empty(self):
    """nw_align on two empty strings yields two empty strings and a score of 0."""
    aligned_pair, score = nw_align('', '', return_score=True)
    first, second = aligned_pair

    self.assertEqual(first, '')
    self.assertEqual(second, '')
    self.assertEqual(score, 0)
def test_nw_align(self):
    """Aligning "ACGU" with "CAGU" gives one gap in each sequence and score 1."""
    result = nw_align("ACGU", "CAGU", return_score=True)
    (first, second), score = result

    expected_first = "AC-GU"
    expected_second = "-CAGU"
    self.assertEqual(first, expected_first)
    self.assertEqual(second, expected_second)
    self.assertEqual(score, 1)
def test_nw_align_empty(self):
    """Aligning two empty strings gives empty output strings and score 0."""
    result = nw_align("", "", return_score=True)
    (first, second), score = result

    expected = ""
    self.assertEqual(first, expected)
    self.assertEqual(second, expected)
    self.assertEqual(score, 0)
def test_nw_align(self):
    """Check the aligned strings and score nw_align returns for 'ACGU' vs 'CAGU'."""
    alignment, score = nw_align('ACGU', 'CAGU', return_score=True)
    first, second = alignment

    # Same three checks as before, in the same order.
    self.assertEqual(first, 'AC-GU')
    self.assertEqual(second, '-CAGU')
    self.assertEqual(score, 1)