def guess_amplicons(fastq_r1,fastq_r2,number_of_reads_to_consider,flash_command,max_paired_end_reads_overlap,min_paired_end_reads_overlap,aln_matrix,needleman_wunsch_gap_open,needleman_wunsch_gap_extend,min_freq_to_consider=0.2,amplicon_similarity_cutoff=0.95):
    """
    guesses the amplicons used in an experiment by examining the most frequent read (giant caveat -- most frequent read should be unmodified)
    input:
    fastq_r1: path to fastq r1 (can be gzipped)
    fastq_r2: path to fastq r2 (can be gzipped)
    number_of_reads_to_consider: number of reads from the top of the file to examine
    flash_command: command to call flash
    min_paired_end_reads_overlap: min overlap in bp for flashing (merging) r1 and r2
    max_paired_end_reads_overlap: max overlap in bp for flashing (merging) r1 and r2
    aln_matrix: substitution matrix handed to CRISPResso2Align.global_align as `matrix`
    needleman_wunsch_gap_open: alignment penalty assignment used to determine similarity of two sequences
    needleman_wunsch_gap_extend: alignment penalty assignment used to determine similarity of two sequences
    min_freq_to_consider: selected amplicon must be frequent at least at this percentage in the population
    amplicon_similarity_cutoff: if the current amplicon has similarity of greater than this cutoff to any other existing amplicons, it won't be added

    returns:
    list of putative amplicons; the first entry is always the sequence on the
    first line returned by get_most_frequent_reads

    note: the first line of the read list is indexed unconditionally, so an
    empty read list fails on that lookup rather than returning an empty list
    """
    seq_lines = get_most_frequent_reads(fastq_r1,fastq_r2,number_of_reads_to_consider,flash_command,max_paired_end_reads_overlap,min_paired_end_reads_overlap)

    #each line is "<count> <sequence>"; the first sequence is accepted without testing
    _, seq = seq_lines[0].strip().split()
    amplicon_seq_arr = [seq]

    #for the remainder of the amplicons, test them before adding
    for i in range(1, len(seq_lines)):
        _, seq = seq_lines[i].strip().split()
        last_count, last_seq = seq_lines[i-1].strip().split()
        #NOTE(review): the frequency test uses the count of the PREVIOUS line
        #(last_count), not the count of the read being tested -- kept exactly
        #as in the original; confirm this is intended
        if float(last_count)/float(number_of_reads_to_consider) > min_freq_to_consider:
            #the reverse complement does not depend on the amplicon compared against
            seq_rc = reverse_complement(seq)
            #keep track of similarity to most-similar already-found amplicons
            this_amplicon_max_pct = 0
            for amp_seq in amplicon_seq_arr:
                #builtin int: the np.int alias was removed in NumPy 1.24
                ref_incentive = np.zeros(len(amp_seq)+1, dtype=int)
                fws1,fws2,fwscore = CRISPResso2Align.global_align(seq,amp_seq,matrix=aln_matrix,gap_incentive=ref_incentive,gap_open=needleman_wunsch_gap_open,gap_extend=needleman_wunsch_gap_extend,)
                rvs1,rvs2,rvscore = CRISPResso2Align.global_align(seq_rc,amp_seq,matrix=aln_matrix,gap_incentive=ref_incentive,gap_open=needleman_wunsch_gap_open,gap_extend=needleman_wunsch_gap_extend,)
                #NOTE(review): the score is normalised by the shorter of this read
                #and the previous read (last_seq), not amp_seq -- kept as in the
                #original; confirm this is intended
                min_len = min(len(last_seq), len(seq))
                max_score = max(fwscore, rvscore)
                this_amplicon_max_pct = max(this_amplicon_max_pct, max_score/float(min_len))
            #if this amplicon was maximally-similar to all other chosen amplicons by less than amplicon_similarity_cutoff, add to the list
            if this_amplicon_max_pct < amplicon_similarity_cutoff:
                amplicon_seq_arr.append(seq)
        else:
            #stop at the first line whose frequency test fails
            break

    return amplicon_seq_arr
def RunCRISPResso2(self, strQuerySeqAfterBarcode, strRefSeqAfterBarcode, npGapIncentive):
    """Globally align an upper-cased query against an upper-cased reference.

    Both sequences are upper-cased before the call. The substitution matrix
    and the gap open/extend penalties are read from the instance
    (``self.npAlnMatrix``, ``self.floOg``, ``self.floOe``); the gap incentive
    array is passed through untouched.

    Returns whatever ``CRISPResso2Align.global_align`` returns, unmodified.
    """
    strQueryUpper = strQuerySeqAfterBarcode.upper()
    strRefUpper = strRefSeqAfterBarcode.upper()
    return CRISPResso2Align.global_align(
        strQueryUpper,
        strRefUpper,
        matrix=self.npAlnMatrix,
        gap_open=self.floOg,
        gap_extend=self.floOe,
        gap_incentive=npGapIncentive,
    )
def test_global_align():
    """Aligning 'ATTA' to itself returns both strings unchanged with score 100."""
    no_incentive = np.array([0, 0, 0, 0, 0], dtype=int)
    aligned_first, aligned_second, aln_score = CRISPResso2Align.global_align(
        'ATTA', 'ATTA', matrix=ALN_MATRIX, gap_incentive=no_incentive)

    assert aligned_first == 'ATTA'
    assert aligned_second == 'ATTA'
    assert aln_score == 100
def guess_guides(amplicon_sequence, fastq_r1, fastq_r2, number_of_reads_to_consider, flash_command, max_paired_end_reads_overlap,
                 min_paired_end_reads_overlap, aln_matrix, needleman_wunsch_gap_open, needleman_wunsch_gap_extend,
                 min_edit_freq_to_consider=0.1, pam_seq="NGG", min_pct_subs_in_base_editor_win=0.8):
    """
    guesses the guides used in an experiment by identifying the most-frequently edited positions, editing types, and PAM sites
    input:
    amplicon_sequence - amplicon to analyze
    fastq_r1: path to fastq r1 (can be gzipped)
    fastq_r2: path to fastq r2 (can be gzipped)
    number_of_reads_to_consider: number of reads from the top of the file to examine
    flash_command: command to call flash
    min_paired_end_reads_overlap: min overlap in bp for flashing (merging) r1 and r2
    max_paired_end_reads_overlap: max overlap in bp for flashing (merging) r1 and r2
    aln_matrix: substitution matrix handed to CRISPResso2Align.global_align as `matrix`
    needleman_wunsch_gap_open: alignment penalty assignment used to determine similarity of two sequences
    needleman_wunsch_gap_extend: alignment penalty assignment used to determine similarity of two sequences
    min_edit_freq_to_consider: edits must be at least this frequency for consideration
    pam_seq: pam sequence to look for (can be regex or contain degenerate bases)
    min_pct_subs_in_base_editor_win: if at least this percent of substitutions happen in the predicted base editor window, return base editor flag

    returns:
    tuple of (putative guide, boolean is_base_editor) or (None, None)
    (None, None) is returned when no reads were counted, when the most-edited
    position is below min_edit_freq_to_consider, or when no candidate window
    lies inside the amplicon with a matching PAM
    """
    seq_lines = get_most_frequent_reads(fastq_r1, fastq_r2, number_of_reads_to_consider, flash_command, max_paired_end_reads_overlap, min_paired_end_reads_overlap)

    amp_len = len(amplicon_sequence)
    #builtin int: the np.int alias was removed in NumPy 1.24
    gap_incentive = np.zeros(amp_len + 1, dtype=int)
    include_idxs = set(range(0, amp_len))
    all_indel_count_vector = np.zeros(amp_len)
    all_sub_count_vector = np.zeros(amp_len)
    tot_count = 0
    #each line is "<count> <sequence>"; weight every edit position by the read count
    for seq_line in seq_lines:
        count, seq = seq_line.strip().split()
        count = int(count)
        tot_count += count
        fws1, fws2, fwscore = CRISPResso2Align.global_align(seq, amplicon_sequence, matrix=aln_matrix, gap_incentive=gap_incentive, gap_open=needleman_wunsch_gap_open, gap_extend=needleman_wunsch_gap_extend,)
        payload = CRISPRessoCOREResources.find_indels_substitutions(fws1, fws2, include_idxs)
        all_indel_count_vector[payload['all_insertion_positions']] += count
        all_indel_count_vector[payload['all_deletion_positions']] += count
        all_sub_count_vector[payload['all_substitution_positions']] += count

    #no reads counted: nothing to guess from (and avoids dividing by zero below)
    if tot_count == 0:
        return (None, None)

    max_loc = int(np.argmax(all_indel_count_vector))
    max_val = all_indel_count_vector[max_loc]

    #return nothing if the max edit doesn't break threshold
    if max_val / float(tot_count) < min_edit_freq_to_consider:
        return (None, None)

    #expand degenerate base codes into regex character classes; none of the
    #replacement texts contains a code letter, so the order is irrelevant
    pam_regex_string = pam_seq.upper()
    for code, char_class in (
            ('I', '[ATCG]'), ('N', '[ATCG]'), ('R', '[AG]'), ('Y', '[CT]'),
            ('S', '[GC]'), ('W', '[AT]'), ('K', '[GT]'), ('M', '[AC]'),
            ('B', '[CGT]'), ('D', '[AGT]'), ('H', '[ACT]'), ('V', '[ACG]')):
        pam_regex_string = pam_regex_string.replace(code, char_class)

    total_subs = all_sub_count_vector.sum()

    #offset from expected position
    for offset in (0, +1, -1, +2, +3, +4, -2):
        #each candidate is (pam_start, guide_start, guide_end, base_edit_start, base_edit_end);
        #the PAM window is always the 3 bases starting at pam_start
        candidate_windows = (
            #forward direction: PAM directly after the 20-base guide window
            (max_loc + 4 + offset, max_loc - 16 + offset, max_loc + 4 + offset, max_loc - 16 + offset, max_loc - 6 + offset),
            #reverse direction: PAM directly before the 20-base guide window
            (max_loc - 5 - offset, max_loc - 2 - offset, max_loc + 18 - offset, max_loc + 8 - offset, max_loc + 18 - offset),
        )
        for pam_start, guide_start, guide_end, base_edit_start, base_edit_end in candidate_windows:
            #a negative guide_start would make the slices below wrap around to the
            #end of the amplicon and yield a bogus (usually empty) guide
            if pam_start <= 0 or guide_start < 0 or guide_end >= amp_len:
                continue
            if not re.match(pam_regex_string, amplicon_sequence[pam_start:pam_start + 3]):
                continue
            guide_seq = amplicon_sequence[guide_start:guide_end]
            sum_base_edits = all_sub_count_vector[base_edit_start:base_edit_end].sum()
            #if a lot of edits are in the predicted base editor window, set base editor true
            #specifically, if at least min_pct_subs_in_base_editor_win % of substitutions happen in the predicted base editor window
            is_base_editor = bool(sum_base_edits > min_pct_subs_in_base_editor_win * total_subs)
            return (guide_seq, is_base_editor)

    return (None, None)
def test_global_align_gap_incentive_s1():
    """Gap-incentive cases where the first input is one base longer than the second.

    In every case after the baseline the first returned string is the
    unchanged first input, and the gap character appears in the second
    returned string at the position asserted below.
    """
    # Baseline: identical inputs, no incentive anywhere -> perfect score.
    first_aln, second_aln, aln_score = CRISPResso2Align.global_align(
        'ATTTA', 'ATTTA', matrix=ALN_MATRIX,
        gap_incentive=np.array([0, 0, 0, 0, 0, 0], dtype=int))
    assert first_aln == 'ATTTA'
    assert second_aln == 'ATTTA'
    assert aln_score == 100

    # (first input, second input, gap incentive, expected second aligned string)
    gap_cases = [
        ('ATTTA', 'ATTA', [1, 0, 0, 0, 0], 'ATT-A'),
        ('ATTTA', 'ATTA', [0, 1, 0, 0, 0], 'A-TTA'),
        ('ATTTA', 'ATTA', [0, 0, 1, 0, 0], 'AT-TA'),
        ('ATTTA', 'ATTA', [0, 0, 0, 1, 0], 'ATT-A'),
        ('ATTTA', 'ATTA', [0, 0, 0, 1, 0], 'ATT-A'),
        ('ATTTA', 'ATTA', [0, 0, 0, 0, 1], 'ATT-A'),
        ('TTTTT', 'TTTT', [0, 0, 0, 0, 0], 'TTTT-'),
        ('TTTTT', 'TTTT', [1, 0, 0, 0, 0], '-TTTT'),
        ('TTTTT', 'TTTT', [0, 1, 0, 0, 0], 'T-TTT'),
        ('TTTTT', 'TTTT', [0, 0, 1, 0, 0], 'TT-TT'),
        ('TTTTT', 'TTTT', [0, 0, 0, 1, 0], 'TTT-T'),
        ('TTTTT', 'TTTT', [0, 0, 0, 0, 1], 'TTTT-'),
    ]
    for longer, shorter, incentive, expected_second in gap_cases:
        first_aln, second_aln, aln_score = CRISPResso2Align.global_align(
            longer, shorter, matrix=ALN_MATRIX,
            gap_incentive=np.array(incentive, dtype=int))
        assert first_aln == longer
        assert second_aln == expected_second
        # four matching columns out of five
        assert round(aln_score, 3) == round(100 * 4 / 5.0, 3)
def test_global_align_gap_incentive_s2():
    """Gap-incentive cases where the second input is one base longer than the first.

    In every case the second returned string is the unchanged second input,
    and the gap character appears in the first returned string at the
    position asserted below.
    """
    # (first input, second input, gap incentive, expected first aligned string)
    gap_cases = [
        ('ATTA', 'ATTTA', [1, 0, 0, 0, 0, 0], 'ATT-A'),
        ('ATTA', 'ATTTA', [0, 1, 0, 0, 0, 0], 'A-TTA'),
        ('ATTA', 'ATTTA', [0, 0, 1, 0, 0, 0], 'AT-TA'),
        ('ATTA', 'ATTTA', [0, 0, 0, 1, 0, 0], 'ATT-A'),
        ('ATTA', 'ATTTA', [0, 0, 0, 0, 1, 0], 'ATT-A'),
        ('ATTA', 'ATTTA', [0, 0, 0, 0, 0, 1], 'ATT-A'),
        ('TTTT', 'TTTTT', [1, 0, 0, 0, 0, 0], '-TTTT'),
        ('TTTT', 'TTTTT', [0, 1, 0, 0, 0, 0], 'T-TTT'),
        ('TTTT', 'TTTTT', [0, 0, 1, 0, 0, 0], 'TT-TT'),
        ('TTTT', 'TTTTT', [0, 0, 0, 1, 0, 0], 'TTT-T'),
        ('TTTT', 'TTTTT', [0, 0, 0, 0, 1, 0], 'TTTT-'),
        ('TTTT', 'TTTTT', [0, 0, 0, 0, 0, 1], 'TTTT-'),
    ]
    for shorter, longer, incentive, expected_first in gap_cases:
        first_aln, second_aln, aln_score = CRISPResso2Align.global_align(
            shorter, longer, matrix=ALN_MATRIX,
            gap_incentive=np.array(incentive, dtype=int))
        assert first_aln == expected_first
        assert second_aln == longer
        # four matching columns out of five
        assert round(aln_score, 3) == round(100 * 4 / 5.0, 3)