def remaster_aligns(inp_fn, data):
  with open(inp_fn) as f:
    for i, line in enumerate(f):
      if i % 4 == 0:
        header = line.strip()
        # read start is 1-based, left-most genomic position of reference in alignment, according to SAM format.
        read_start = int(header.split('_')[-1])
      if i % 4 == 1:
        read = line.strip()
      if i % 4 == 2:
        genome = line.strip()
        genome = genome.upper()

        # read_start is where reference starts. Find expected cutsite by counting along reference until we reach master_expected_cutsite.
        global expected_cutsite
        # 1-based
        expected_cutsite = master_expected_cutsite - read_start
        if expected_cutsite <= 0 or expected_cutsite >= len(genome.replace('-', '')):
          continue

        # If we reversed this context in our library, reverse the alignment here. Adjust expected cutsite accordingly.
        if reverse_flag:
          read = compbio.reverse_complement(read)
          genome = compbio.reverse_complement(genome)
          expected_cutsite = len(genome.replace('-', '')) - expected_cutsite

        # Find where GG should be even if there are insertions in between
        gg_seq = genome[expected_cutsite:].replace('-', '')
        cutsite = get_cutsite_idx(read, genome)
        gg = genome[cutsite:].replace('-', '')[4:6]

        # Assert that cutsite to GG must be in genome
        if len(gg) != 2:
          continue
        assert gg == 'GG', 'No GG!'
        # Ensure that cutsite is determined in a consistent manner between normal and reversed alignments
        assert genome[cutsite - 1] != '-', 'Inconsistent cutsite!'

        # Main -- find indel category, assuming end gaps are meaningless
        category = categorize_alignment(read, genome)
        if category in ['del_notatcut', 'del_notcrispr']:
          read, genome, category = shift_single_deletion(read, genome, category)
        if category in ['ins_notatcut', 'ins_notcrispr']:
          read, genome, category = shift_single_insertion(read, genome, category)

        header += '_%s' % (cutsite)
        alignment = [header, read, genome, '']
        data[category] += alignment
  return
def nts_to_aas(seq_30nt, aa_frame, path_pos_wrt_grna, aa_strand_relative_to_seq):
  if aa_frame == 'intronic':
    return ''
  path_idx = path_pos_wrt_grna + 9
  if aa_strand_relative_to_seq == '-':
    seq_30nt = compbio.reverse_complement(seq_30nt)
    path_idx = len(seq_30nt) - path_idx - 1
  # aa_frame in [1, 2, 3] -> [0, 1, 2]
  aa_frame_0idx = int(aa_frame) - 1
  begin_frame = (aa_frame_0idx - path_idx) % 3
  first_triplet_start_idx = (3 - begin_frame) % 3
  aas = ''
  for idx in range(first_triplet_start_idx, len(seq_30nt) + 3, 3):
    triplet = seq_30nt[idx:idx + 3]
    if len(triplet) != 3:
      break
    aa = triplet_to_aa[triplet]
    # print(triplet, aa, idx)
    aas += aa
  return aas
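# A minimal worked example of the frame arithmetic above (our illustration; it assumes
# path_pos_wrt_grna = -9 maps to index 0 of seq_30nt, as implied by `path_idx = path_pos_wrt_grna + 9`):
#   path_pos_wrt_grna = -9, aa_frame = 1  ->  path_idx = 0, aa_frame_0idx = 0,
#     begin_frame = (0 - 0) % 3 = 0, first_triplet_start_idx = (3 - 0) % 3 = 0
#     (codons are read from index 0: seq_30nt[0:3], seq_30nt[3:6], ...)
#   path_pos_wrt_grna = -8, aa_frame = 1  ->  path_idx = 1, aa_frame_0idx = 0,
#     begin_frame = (0 - 1) % 3 = 2, first_triplet_start_idx = (3 - 2) % 3 = 1
#     (codons are read from index 1: seq_30nt[1:4], seq_30nt[4:7], ...)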
def create_gt_with_mutations(dfs):
  plus_strand = bool(dfs['Target strand'].iloc[0] == 0)
  gt = list(wt_gt) if plus_strand else list(rc_wt_gt)
  for idx, row in dfs.iterrows():
    try:
      assert gt[row['Position']] == row['Reference nucleotide'], 'Error: Probably bad strand'
    except:
      import code; code.interact(local=dict(globals(), **locals()))
    gt[row['Position']] = row['Mutated nucleotide']
  gt = ''.join(gt)
  return gt if plus_strand else compbio.reverse_complement(gt)
def find_matching_sequence(target, rows):
  for idx, row in rows.iterrows():
    orient = row['gRNA Orientation']
    seq = row['Alternative Sequence']
    cutsite = row['Cutsite']
    if orient == '-':
      seq = compbio.reverse_complement(seq)
      cutsite = len(seq) - cutsite
    cons_target = seq[cutsite - 27:cutsite + 28]
    if target == cons_target:
      return row
  assert False, 'Not found'
  return
def run_align_needleman_wunsch(srr, nm):
  inp_fn = inp_dir + f'{srr}.fastq'
  genome_fn = inp_dir + 'SP055-rpoZ-cMyc-Cry1Ac1-d123.fa'
  target = open(genome_fn).readlines()[1].strip()

  seq_align_tool = '/ahg/regevdata/projects/CRISPR-libraries/tools/seq-align/bin/needleman_wunsch'

  # Initialize an empty output file
  out_fn = out_dir + f'{nm}.fa'
  with open(out_fn, 'w') as out_f:
    pass

  alignment_buffer = []
  timer = util.Timer(total=util.line_count(inp_fn))
  with open(inp_fn) as f:
    for i, line in enumerate(f):
      if i % 4 == 0:
        header = line.strip()
        read_nm = header.split()[0].replace('@', '')
      if i % 4 == 1:
        read = line.strip()
      if i % 4 == 3:
        qs = [ord(s) - 33 for s in line.strip()]
        if np.mean(qs) >= 30:
          read = compbio.reverse_complement(read)
          command = f'{seq_align_tool} --match 1 --mismatch -1 --gapopen -5 --gapextend -1 --freestartgap --freeendgap {read} {target}'
          align = subprocess.check_output(command, shell=True).decode('utf-8')
          align = align[:-2]
          alignment_buffer.append(f'>{read_nm}\n{align}\n')

        if len(alignment_buffer) > 100:
          print(f'Dumping alignment buffer...')
          with open(out_fn, 'a') as out_f:
            for item in alignment_buffer:
              out_f.write(item)
          alignment_buffer = []

      timer.update()

  # Final flush
  print(f'Dumping alignment buffer...')
  with open(out_fn, 'a') as out_f:
    for item in alignment_buffer:
      out_f.write(item)
  alignment_buffer = []
  return
def create_gt_with_mutations(dfs):
  plus_strand = True
  gt = list(wt_gt) if plus_strand else list(rc_wt_gt)
  for idx, row in dfs.iterrows():
    try:
      assert gt[row['Position (0 based)']] == row['Reference nt'], 'Error: Probably bad strand'
    except:
      import code
      code.interact(local=dict(globals(), **locals()))
    gt[row['Position (0 based)']] = row['Mutated nt']
  gt = ''.join(gt)
  return gt if plus_strand else compbio.reverse_complement(gt)
def find_ulmi(line1):
  constant = 'ATGACGCGTCGCACCCATC'

  def get_match_score(query, ref):
    # Counts mismatching positions between query and ref
    return sum([1 for idx in range(len(query)) if query[idx] != ref[idx]])

  # Default: assume the constant region starts at position 61
  ulmi_idx = 61 + len(constant)
  best_ulmi = line1[ulmi_idx:ulmi_idx + 15]
  for start_pos in range(61 - 5, 61 + 5):
    query = line1[start_pos:start_pos + 19]
    if get_match_score(query, constant) <= 3:
      # Update ulmi_idx before extracting, so the returned index matches best_ulmi
      ulmi_idx = start_pos + len(constant)
      best_ulmi = line1[ulmi_idx:ulmi_idx + 15]
      break
  return compbio.reverse_complement(best_ulmi), ulmi_idx
def set_master_expected_cutsite(srr_id):
  global master_expected_cutsite
  T = _config.d.TABLE
  srr_row = T[T['Run'] == srr_id]
  if len(srr_row) == 0:
    return False, None
  cloc = str(srr_row['chromosome_loc']).split()[1]
  genome_build = str(cloc.split('_')[0])
  cloc = cloc.split('_')[1]
  chrm = str(cloc.split(':')[0])
  start = int(cloc.split(':')[1].split('-')[0])
  end = int(cloc.split(':')[1].split('-')[1])

  tool = '/cluster/mshen/tools/2bit/twoBitToFa'
  twobit_db = '/cluster/mshen/tools/2bit/%s.2bit' % (genome_build)
  twobit_start = start - 1
  command = '%s -seq=%s -start=%s -end=%s %s temp_%s.fa; cat temp_%s.fa' % (tool, chrm, twobit_start, end, twobit_db, srr_id, srr_id)
  query = subprocess.check_output(command, shell=True)
  genome = ''.join(query.split()[1:]).upper()

  reverse_context = False
  if genome[:2] == 'CC' and genome[-2:] != 'GG':
    master_expected_cutsite = start + 6
    reverse_context = True
  elif genome[:2] != 'CC' and genome[-2:] == 'GG':
    master_expected_cutsite = start + 23 - 6
  elif genome[:2] == 'CC' and genome[-2:] == 'GG':
    # If both CC and GG are available, default to GG.
    # Three out of 96 spacers have both CC/GG, all three are GG.
    master_expected_cutsite = start + 23 - 6
  else:
    print('ERROR: Expected gRNA lacks NGG on both strands')
    sys.exit(0)

  context = ''
  command = '%s -seq=%s -start=%s -end=%s %s temp_%s.fa; cat temp_%s.fa' % (tool, chrm, master_expected_cutsite - 101, master_expected_cutsite + 99, twobit_db, srr_id, srr_id)
  query = subprocess.check_output(command, shell=True)
  context = ''.join(query.split()[1:]).upper()
  if reverse_context:
    context = compbio.reverse_complement(context)
  assert context[104:106] == 'GG', 'Bad GG'
  return True, context
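# A brief note on the cut-site arithmetic above (a hedged reading of the code, not a statement
# from the source): the 23-nt window is protospacer + NGG PAM, and the SpCas9 blunt cut sits
# 3 bp from the PAM, so:
#   GG at the 3' end ('+' protospacer): master_expected_cutsite = start + 23 - 6 = start + 17
#   CC at the 5' end ('-' protospacer): master_expected_cutsite = start + 6
# Both place the expected cut 6 nt from the PAM-bearing end of the 23-mer, mirrored by strand.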
def make_bt_index():
  bt_fold = _config.DATA_DIR + f'bowtie2_index/'
  util.ensure_dir_exists(bt_fold)

  for idx, row in target_df.iterrows():
    nm = row['Name']
    # seq = row['Sequence context']
    assembly = row['Assembly']
    chrom = row['Chromosome']
    strand = row['Strand']
    start = row['Start']
    end = row['End']

    twobit = '/ahg/regevdata/projects/CRISPR-libraries/tools/2bit/twoBitToFa'
    twobit_ref = f'/ahg/regevdata/projects/CRISPR-libraries/tools/2bit/{assembly}.2bit'
    # Radius = 1000 needs to be longer than any single read for bowtie2 to work without local alignment
    command = f'{twobit} -seq={chrom} -start={start - 1001} -end={end + 1000} {twobit_ref} temp.fa; cat temp.fa'
    seq = subprocess.check_output(command, shell=True).decode('utf-8')
    seq = ''.join(seq.split()[1:])
    seq = seq.upper()

    if strand == '-':
      seq = compbio.reverse_complement(seq)

    try:
      assert seq.index(row['Spacer (20 nt)']) == 1000
    except:
      print(seq.index(row['Spacer (20 nt)']))
      import code; code.interact(local=dict(globals(), **locals()))

    print(len(seq))
    print(nm)

    ref_fn = _config.DATA_DIR + f'{nm}.fa'
    with open(ref_fn, 'w') as f:
      f.write(f'>{nm}\n{seq}\n')

    bt2_build = f'/ahg/regevdata/projects/CRISPR-libraries/tools/bowtie2-2.3.5.1-linux-x86_64/bowtie2-build'
    command = f'{bt2_build} {ref_fn} {bt_fold}/{nm}'
    result = subprocess.check_output(command, shell=True)
  return
def search_region(nm, spc, chrm, startpos, endpos):
  startpos, endpos = int(startpos), int(endpos)
  sq = compbio.get_genomic_seq_twoBitToFa(spc, chrm, str(startpos), str(endpos))

  headers, sqs = [], []
  timer = util.Timer(total=len(sq))
  for j in range(len(sq) - 3):
    found = False
    if sq[j:j + 2] == 'GG':
      found = True
      start, end = j - 21, j + 2
      cut_site = j - 4
      orient = '+'
    if sq[j:j + 2] == 'CC':
      found = True
      start, end = j, j + 23
      cut_site = j + 5
      orient = '-'

    if found:
      s = sq[start:end]
      if len(s) != 23:
        continue
      if orient == '+':
        pass
      if orient == '-':
        start, end = end, start
        s = compbio.reverse_complement(s)
      if s[0] != 'G' and s[1] == 'G':
        s = s[1:]
      elif s[0] != 'G' and s[1] != 'G':
        s = 'G' + s
      hdr = '>' + '__'.join([nm, chrm, str(startpos + start), str(startpos + end), str(startpos + cut_site), orient])
      headers.append(hdr)
      sqs.append(s)
    timer.update()
  return headers, sqs
def add_del_profiles(header, sequence, df_buffer):
  for idx in range(len(sequence)):
    seq = ''
    if sequence[idx:idx + 2] == 'CC':
      cutsite = idx + 6
      seq = sequence[cutsite - 30:cutsite + 30]
      seq = compbio.reverse_complement(seq)
    if sequence[idx:idx + 2] == 'GG':
      cutsite = idx - 4
      seq = sequence[cutsite - 30:cutsite + 30]

    if seq != '':
      if len(seq) != 60:
        continue
      local_cutsite = 30
      pred_df = _predict2.predict_mhdel(seq, local_cutsite)
      pred_df['header'] = header
      pred_df['seq'] = sequence
      pred_df['pam'] = sequence[idx:idx + 2]
      pred_df['cutsite'] = cutsite
      pred_df['shuffled'] = 'no'
      df_buffer = df_buffer.append(pred_df, ignore_index=True)

      # Shuffle everything except the central GG, then predict on the shuffled sequence
      pre, post = list(seq[:34]), list(seq[36:])
      random.shuffle(pre)
      random.shuffle(post)
      shuffled_seq = ''.join(pre) + 'GG' + ''.join(post)
      shuffled_pred_df = _predict2.predict_mhdel(shuffled_seq, local_cutsite)
      shuffled_pred_df['header'] = header
      shuffled_pred_df['seq'] = sequence
      shuffled_pred_df['pam'] = sequence[idx:idx + 2]
      shuffled_pred_df['cutsite'] = cutsite
      shuffled_pred_df['shuffled'] = 'yes'
      df_buffer = df_buffer.append(shuffled_pred_df, ignore_index=True)
  return df_buffer
def check_ins_templated(read, genome, is_pos, ins_len):
  # If the inserted sequence and some of the neighboring sequence is present in the wildtype sequence context, it's templated.
  def find_all_instances(query, seq):
    idxs = []
    for i in range(len(seq)):
      if seq[i:i + len(query)] == query:
        idxs.append(i)
    return idxs

  imer = read[is_pos:is_pos + ins_len]
  designed_genome = genome.replace('-', '')
  rc_designed_genome = compbio.reverse_complement(designed_genome)
  if imer not in designed_genome and imer not in rc_designed_genome:
    return 0, 'na', '', ''

  # Try extending 5' side
  for idx in range(is_pos - 1, -1, -1):
    new_imer = read[idx:is_pos + ins_len]
    if new_imer not in designed_genome and new_imer not in rc_designed_genome:
      break
    # Template cannot be only where we are
    if new_imer in designed_genome and new_imer not in rc_designed_genome:
      inst = find_all_instances(new_imer, designed_genome)
      if len(inst) == 1 and idx in inst:
        break
  fiveside = idx + 1

  # Try extending 3' side
  for idx in range(is_pos + ins_len + 1, len(read)):
    new_imer = read[fiveside:idx]
    if new_imer not in designed_genome and new_imer not in rc_designed_genome:
      break
    # Template cannot be only where we are
    if new_imer in designed_genome and new_imer not in rc_designed_genome:
      inst = find_all_instances(new_imer, designed_genome)
      if len(inst) == 1 and fiveside in inst:
        break
  threeside = idx - 1

  fiveside_seq = read[fiveside:is_pos]
  threeside_seq = read[is_pos + ins_len:threeside]
  # If no neighboring sequence is included in template, it's not templated.
  if len(fiveside_seq) == 0 or len(threeside_seq) == 0:
    return 0, 'na', '', ''

  template = read[fiveside:threeside]

  # Get p2 and mh2
  if template in genome[:is_pos] or template in compbio.reverse_complement(genome[:is_pos].replace('-', '')):
    p2 = fiveside_seq
    mh2 = threeside_seq
  else:
    p2 = threeside_seq
    mh2 = fiveside_seq

  # Get template orientation
  if template in designed_genome and template not in rc_designed_genome:
    template_orientation = '+'
  if template not in designed_genome and template in rc_designed_genome:
    template_orientation = '-'
  if template in designed_genome and template in rc_designed_genome:
    template_orientation = 'both'

  # A random 5-mer occurs in 55 bp at a ~5% rate. To threshold at various false positive rates, defer the decision and just return the length of the longest template match.
  return len(template), template_orientation, p2, mh2
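# A hedged sketch of how the deferred decision above might be applied downstream: call
# check_ins_templated and call an insertion "templated" only when the matched template
# exceeds a minimum length. The helper name and the 6-nt cutoff are illustrative
# assumptions, not values taken from the source.
def is_templated_insertion(read, genome, is_pos, ins_len, min_template_len=6):
  # Unpack the (template length, orientation, p2, mh2) tuple returned above
  template_len, orientation, p2, mh2 = check_ins_templated(read, genome, is_pos, ins_len)
  return template_len >= min_template_len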
def search_region(nm, spc, chrm, startpos, endpos, RepeatMasker):
  startpos, endpos = int(startpos), int(endpos)
  sq = compbio.get_genomic_seq_twoBitToFa(spc, chrm, str(startpos), str(endpos))

  dists = []
  headers, sqs = [], []
  prev, too_close_filtered = 0, 0
  num_repeats_found = 0
  timer = util.Timer(total=len(sq))
  for j in range(len(sq) - 3):
    found = False
    if sq[j:j + 2] == 'GG':
      found = True
      start, end = j - 21, j + 2
      cut_site = j - 4
      orient = '+'
    if sq[j:j + 2] == 'CC':
      found = True
      start, end = j, j + 23
      cut_site = j + 5
      orient = '-'

    if found:
      # Filter gRNAs that are too close together
      if cut_site - prev < _config.d.MIN_DIST:
        too_close_filtered += 1
        continue
      # Filter incomplete gRNAs
      s = sq[start:end]
      if len(s) != 23:
        continue
      # Filter repeats
      if RepeatMasker.search(chrm, startpos + j - 10, startpos + j + 10):
        num_repeats_found += 1
        continue

      # If all is ok
      if orient == '+':
        pass
      if orient == '-':
        start, end = end, start
        s = compbio.reverse_complement(s)

      # G-N19-NGG, G-N18-NGG, N20-NGG
      if s[0] != 'G' and s[1] == 'G':
        s = s[1:]
      elif s[0] != 'G' and s[1] != 'G':
        s = 'G' + s

      hdr = '>' + '__'.join([nm, chrm, str(startpos + start), str(startpos + end), str(startpos + cut_site), orient])
      if hdr not in headers:
        headers.append(hdr)
        sqs.append(s)
        dists.append(cut_site - prev)
        prev = cut_site
    timer.update()
  return headers, sqs, too_close_filtered, num_repeats_found, dists
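# A minimal illustration of the 5'-G spacer normalization above (comment "G-N19-NGG,
# G-N18-NGG, N20-NGG"); s is the 23-nt protospacer+PAM and the example bases are made up:
#   s starts with 'G'                         -> unchanged (G + N19 + NGG, 23 nt)
#   s[0] != 'G' and s[1] == 'G'               -> s[1:]    (G + N18 + NGG, 22 nt)
#   s[0] != 'G' and s[1] != 'G'               -> 'G' + s  (G + N20 + NGG, 24 nt)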
def bulk_predict(header, sequence, dd, dd_shuffled, df_out_dir):
  # Input: a specific sequence.
  # Find all Cas9 cutsites, gather metadata, and run inDelphi.
  try:
    ans = parse_header(header)
    gene_kgid, chrom, start, end = ans
  except:
    return

  for idx in range(len(sequence)):
    seq = ''
    if sequence[idx:idx + 2] == 'CC':
      cutsite = idx + 6
      seq = sequence[cutsite - 30:cutsite + 30]
      seq = compbio.reverse_complement(seq)
      orientation = '-'
    if sequence[idx:idx + 2] == 'GG':
      cutsite = idx - 4
      seq = sequence[cutsite - 30:cutsite + 30]
      orientation = '+'
    if seq == '':
      continue
    if len(seq) != 60:
      continue

    # Sanitize input
    seq = seq.upper()
    if 'N' in seq:
      continue
    if not re.match('^[ACGT]*$', seq):
      continue

    # Randomly query subset for broad shallow coverage
    r = np.random.random()
    if r > 0.05:
      continue

    # Shuffle everything but GG
    seq_nogg = list(seq[:34] + seq[36:])
    random.shuffle(seq_nogg)
    shuffled_seq = ''.join(seq_nogg[:34]) + 'GG' + ''.join(seq_nogg[36:])

    for d, seq_context, shuffled_nm in zip([dd, dd_shuffled], [seq, shuffled_seq], ['wt', 'shuffled']):
      #
      # Store metadata statistics
      #
      local_cutsite = 30
      grna = seq_context[13:33]
      cutsite_coord = start + idx
      unique_id = '%s_%s_hg38_%s_%s_%s' % (gene_kgid, grna, chrom, cutsite_coord, orientation)

      d['Sequence Context'].append(seq_context)
      d['Local Cutsite'].append(local_cutsite)
      d['Chromosome'].append(chrom)
      d['Cutsite Location'].append(cutsite_coord)
      d['Orientation'].append(orientation)
      d['Cas9 gRNA'].append(grna)
      d['Gene kgID'].append(gene_kgid)
      d['Unique ID'].append(unique_id)

      # Make predictions
      ans = _predict.predict_all(seq_context, local_cutsite, rate_model, bp_model, normalizer)
      pred_del_df, pred_all_df, total_phi_score, ins_del_ratio = ans

      # Save predictions
      # del_df_out_fn = df_out_dir + '%s_%s_%s.csv' % (unique_id, 'dels', shuffled_nm)
      # pred_del_df.to_csv(del_df_out_fn)
      # all_df_out_fn = df_out_dir + '%s_%s_%s.csv' % (unique_id, 'all', shuffled_nm)
      # pred_all_df.to_csv(all_df_out_fn)

      ## Translate predictions to indel length frequencies
      indel_len_pred, fs = get_indel_len_pred(pred_all_df)

      #
      # Store prediction statistics
      #
      d['Total Phi Score'].append(total_phi_score)
      d['1ins/del Ratio'].append(ins_del_ratio)
      d['1ins Rate Model'].append(rate_model_nm)
      d['1ins bp Model'].append(bp_model_nm)
      d['1ins normalizer'].append(normalizer_nm)
      d['Frameshift +0'].append(fs['+0'])
      d['Frameshift +1'].append(fs['+1'])
      d['Frameshift +2'].append(fs['+2'])
      d['Frameshift'].append(fs['+1'] + fs['+2'])

      crit = (pred_del_df['Genotype Position'] != 'e')
      s = pred_del_df[crit]['Predicted_Frequency']
      s = np.array(s) / sum(s)
      del_gt_precision = 1 - entropy(s) / np.log(len(s))
      d['Precision - Del Genotype'].append(del_gt_precision)

      dls = []
      for del_len in range(1, 60):
        dlkey = -1 * del_len
        dls.append(indel_len_pred[dlkey])
      dls = np.array(dls) / sum(dls)
      del_len_precision = 1 - entropy(dls) / np.log(len(dls))
      d['Precision - Del Length'].append(del_len_precision)

      crit = (pred_all_df['Genotype Position'] != 'e')
      s = pred_all_df[crit]['Predicted_Frequency']
      s = np.array(s) / sum(s)
      all_gt_precision = 1 - entropy(s) / np.log(len(s))
      d['Precision - All Genotype'].append(all_gt_precision)

      negthree_nt = seq_context[local_cutsite - 1]
      negfour_nt = seq_context[local_cutsite]
      d['-4 nt'].append(negfour_nt)
      d['-3 nt'].append(negthree_nt)

      crit = (pred_all_df['Category'] == 'ins')
      highest_ins_rate = max(pred_all_df[crit]['Predicted_Frequency'])
      crit = (pred_all_df['Category'] == 'del') & (pred_all_df['Genotype Position'] != 'e')
      highest_del_rate = max(pred_all_df[crit]['Predicted_Frequency'])
      d['Highest Ins Rate'].append(highest_ins_rate)
      d['Highest Del Rate'].append(highest_del_rate)
  return
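# A standalone sketch of the precision statistic used in bulk_predict: 1 - H(p)/log(n),
# where H is the Shannon entropy of the normalized frequency vector. It equals 1 for a
# point mass and 0 for a uniform distribution. The helper name is ours, for illustration.
import numpy as np
from scipy.stats import entropy

def normalized_precision(freqs):
  # freqs: nonnegative predicted frequencies; normalized to a probability vector here
  p = np.array(freqs, dtype=float)
  p = p / p.sum()
  return 1 - entropy(p) / np.log(len(p))

# normalized_precision([1, 0, 0, 0]) -> 1.0 ; normalized_precision([1, 1, 1, 1]) -> 0.0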
def matchmaker(nm, split):
  print(nm, split)
  stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
  util.exists_empty_fn(stdout_fn)
  out_dir = out_place + nm + '/' + split + '/'
  util.ensure_dir_exists(out_dir)

  read1_fn = inp_dir + '%s_R1_%s.fq' % (nm, split)
  read2_fn = inp_dir + '%s_R2_%s.fq' % (nm, split)

  lsh_dict = build_targets_better_lsh()
  alignment_buffer = init_alignment_buffer()
  prepare_outfns(out_dir)

  num_bad_matches = 0
  quality_pass = 0
  tot_lines = util.line_count(read1_fn)
  timer = util.Timer(total=tot_lines)
  with open(read1_fn) as f1, open(read2_fn) as f2:
    for i, (line1, line2) in enumerate(zip(f1, f2)):
      if i % 4 == 0:
        h1 = line1.strip()
        h2 = line2.strip()
      if i % 4 == 1:
        # RC of l1 contains target
        line1 = line1.strip()
        target_read = compbio.reverse_complement(line1[:61])
        ulmi, ulmi_idx = find_ulmi(line1)
        # l2 contains gRNA
        grna_read = line2.strip()
      if i % 4 == 3:
        q1, q2 = line1.strip(), line2.strip()
        read_q = q1[:61][::-1]
        ulmi_q = q1[ulmi_idx:ulmi_idx + len(ulmi)][::-1]
        grna_q = q2[18:22 + 20]

        qs = [ord(s) - 33 for s in read_q + ulmi_q + grna_q]
        if np.mean(qs) >= 28:
          quality_pass += 1

          align_header = '>1_%s_%s' % (ulmi, ulmi_q)

          # Try to find designed target from LSH
          cand_idxs = find_best_designed_target(target_read, lsh_dict)
          if len(cand_idxs) > 0:
            bad_match = compare_target_to_grna(cand_idxs, grna_read)
            if bad_match == 'ok':
              # Run alignment and store in buffer
              best_idx, align = alignment(target_read, cand_idxs)
              if align is None:
                continue
              store_alignment(alignment_buffer, best_idx, align_header, align, read_q)
            else:
              num_bad_matches += 1
          else:
            num_bad_matches += 1

      if i % int(tot_lines / 200) == 1 and i > 1:
        # Flush alignment buffer
        flush_alignments(alignment_buffer, out_dir)
        alignment_buffer = init_alignment_buffer()

        # Stats for the curious
        with open(stdout_fn, 'a') as outf:
          outf.write('Time: %s\n' % (datetime.datetime.now()))
          outf.write('Progress: %s\n' % (i / int(tot_lines / 100)))
          outf.write('Num. mismatched gRNA/target pairs: %s\n' % (num_bad_matches))
          outf.write('Frac. mismatched gRNA/target pairs: %s\n' % (num_bad_matches / quality_pass))

      timer.update()

  # Final flush
  flush_alignments(alignment_buffer, out_dir)
  return
def matchmaker(nm, split):
  print(split)
  stdout_fn = _config.SRC_DIR + f'nh_c_{nm}_{split}.out'
  util.exists_empty_fn(stdout_fn)
  out_dir = f'{out_place}{nm}/{split}/'
  util.ensure_dir_exists(out_dir)

  # Parse condition-specific settings
  exp_row = exp_design[exp_design['Name'] == nm].iloc[0]
  parent_fn = exp_row['Parent file']
  lib_nm = exp_row['Library']
  target_nm = exp_row['Target']

  # Library design
  global lib_design
  lib_design = pd.read_csv(_config.DATA_DIR + f'lib_{lib_nm}_design.csv')

  global prefixes
  global peptide_nms
  global prefix_to_peptide
  global suffixes
  global suffix_to_peptide
  prefixes = [s[:prefix_len] for s in lib_design['Sequence']]
  peptide_nms = list(lib_design['Name'])
  prefix_to_peptide = {prefix: nm for prefix, nm in zip(prefixes, peptide_nms)}
  suffixes = [compbio.reverse_complement(s[-suffix_len:]) for s in lib_design['Sequence']]
  suffix_to_peptide = {suffix: nm for suffix, nm in zip(suffixes, peptide_nms)}

  # Target
  target_row = target_design[target_design['Target'] == target_nm].iloc[0]
  target = target_row['Sequence']
  target_strand = target_row['gRNA orientation']

  zf_split = str(split).zfill(3)
  read1_fn = inp_dir + f'{parent_fn}_R1_{zf_split}.fq'
  read2_fn = inp_dir + f'{parent_fn}_R2_{zf_split}.fq'

  count_stats = defaultdict(lambda: 0)
  count_stats['Success'] = 0

  alignment_buffer = init_alignment_buffer()
  prepare_outfns(out_dir, peptide_nms)

  tot_lines = util.line_count(read1_fn)
  timer = util.Timer(total=tot_lines)
  with open(read1_fn) as f1, open(read2_fn) as f2:
    for i, (line1, line2) in enumerate(zip(f1, f2)):
      if i % 4 == 0:
        h1 = line1.strip()
        h2 = line2.strip()
      if i % 4 == 1:
        read1 = line1.strip()
        read2 = line2.strip()
      if i % 4 == 3:
        q1, q2 = line1.strip(), line2.strip()
        count_stats['Read count'] += 1

        qs = [ord(s) - 33 for s in q1 + q2]
        if np.mean(qs) < 25:
          count_stats['1a. Quality fail'] += 1
          continue

        res, msg = find_peptide1_nm(read2)
        if res is None:
          count_stats[f'2{msg}'] += 1
          continue
        p1_nm = res

        res, msg = find_peptide2_nm(read1)
        if res is None:
          count_stats[f'2{msg}'] += 1
          continue
        p2_nm = res

        peptide_nm = f'{p1_nm}-{p2_nm}'

        read1 = read1[6:]
        q1 = q1[6:]
        if target_strand == '-':
          read1 = compbio.reverse_complement(read1)
          q1 = q1[::-1]

        # Run alignment and store in buffer
        align_header = f'>1'
        align = alignment(read1, target)
        store_alignment(alignment_buffer, peptide_nm, align_header, align, q1)
        count_stats['Success'] += 1

      # flush_interval = 2000
      flush_interval = 200
      if i % int(tot_lines / flush_interval) == 1 and i > 1:
        # Flush alignment buffer
        flush_alignments(alignment_buffer, out_dir)
        alignment_buffer = init_alignment_buffer()

        # Stats for the curious
        with open(stdout_fn, 'a') as outf:
          outf.write(f'Time: {datetime.datetime.now()}\n')
          outf.write(f'Progress: {i / int(tot_lines / 100)}\n')
          outf.write(f'Line: {i}\n')
          for key in sorted(list(count_stats.keys())):
            outf.write(f'{key}, {count_stats[key]}\n')
        # break

      timer.update()

  # Final flush
  flush_alignments(alignment_buffer, out_dir)

  stats_df = pd.DataFrame(count_stats, index=[0])
  sorted_cols = sorted([s for s in stats_df.columns])
  stats_df = stats_df[sorted_cols]
  stats_df.to_csv(out_dir + f'stats_{nm}_{split}.csv')
  return
def wildtype_repairs(row):
  orient = row['gRNA Orientation']
  cutsite = row['Cutsite']
  seq = row['Alternative Sequence']
  wt_seq = row['Reference Sequence']
  if orient == '-':
    seq = compbio.reverse_complement(seq)
    wt_seq = compbio.reverse_complement(wt_seq)
    cutsite = len(seq) - cutsite

  # Detect wildtypes with iterative cutting - expect 0 at these
  wt_repairable_flag = 'yes'
  fs_repairable_flag = 'yes'
  grna = seq[cutsite - 10:cutsite + 3]
  for wt_seq_s in [wt_seq, compbio.reverse_complement(wt_seq)]:
    if grna in wt_seq_s:
      try:
        pam = wt_seq[wt_seq.index(grna) + 14:wt_seq.index(grna) + 16]
      except:
        wt_repairable_flag = 'iterwt'
        fs_repairable_flag = 'iterwt'
        continue
      if pam in ['GG', 'AG', 'GA']:
        wt_repairable_flag = 'iterwt'
        fs_repairable_flag = 'iterwt'

  repair_gts = []
  repair_dls = []
  longest_wt_mh = -1
  longest_nonwt_mh = -1
  for del_len in range(1, 27 + 1):
    for start_pos in range(0, del_len + 1):
      repair_gt = seq[:cutsite - del_len + start_pos] + seq[cutsite + start_pos:]
      l = seq[cutsite - del_len:cutsite]
      r = seq[cutsite:cutsite + del_len]
      mhs = find_microhomologies(l, r)
      if repair_gt == wt_seq:
        repair_gts.append(start_pos)
        repair_dls.append(del_len)
        for mh in mhs:
          if start_pos in mh:
            mh_len = len(mh) - 1
            if mh_len > longest_wt_mh:
              longest_wt_mh = mh_len
      else:
        for mh in mhs:
          if start_pos in mh:
            mh_len = len(mh) - 1
            if mh_len > longest_nonwt_mh:
              longest_nonwt_mh = mh_len

  if len(repair_gts) == 0:
    wt_repairable_flag = 'no'

  if longest_wt_mh > longest_nonwt_mh:
    longest_mh_wt = 'yes'
  else:
    longest_mh_wt = 'no'

  fs = row['Needed Frameshift']
  if fs == 0:
    fs_repairable_flag = 'no'

  return repair_gts, repair_dls, wt_repairable_flag, fs, fs_repairable_flag, longest_mh_wt
import sys
sys.path.append('/home/unix/maxwshen/')
import numpy as np
from collections import defaultdict
from mylib import util, compbio
import pandas as pd
import _config

# Default params
inp_dir = _config.OUT_PLACE + f'ill_b2_merge_n_paired_reads/'
NAME = util.get_fn(__file__)
out_dir = _config.OUT_PLACE + NAME + '/'
util.ensure_dir_exists(out_dir)

exp_design = pd.read_csv(_config.DATA_DIR + f'Badran2015_SraRunTable.csv')

wt_gt = open(_config.DATA_DIR + f'SP055-rpoZ-cMyc-Cry1Ac1-d123.fa').readlines()[1].strip()
rc_wt_gt = compbio.reverse_complement(wt_gt)

params = {
  'num_splits': 10,
}

##
# Primary
##
def merge_n_paired_reads(nm):
  mdf = pd.DataFrame()
  for split in range(params['num_splits']):
    df = pd.read_csv(inp_dir + f'{nm}_{split}_read_idxs.csv', index_col=0)
    mdf = mdf.append(df, ignore_index=True, sort=False)
def matchmaker(nm, split):
  print(nm, split)
  stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
  util.exists_empty_fn(stdout_fn)
  out_dir = out_place + nm + '/' + split + '/'
  util.ensure_dir_exists(out_dir)

  inp_fn = inp_dir + '%s_r2_%s.fq' % (nm, split)

  lsh_dict = build_targets_better_lsh()
  alignment_buffer = init_alignment_buffer()
  prepare_outfns(out_dir)

  qf = 0
  tot_reads = util.line_count(inp_fn)
  timer = util.Timer(total=tot_reads)
  with open(inp_fn) as f:
    for i, line in enumerate(f):
      if i % 4 == 0:
        pass
      if i % 4 == 1:
        l2 = line.strip()
      if i % 4 == 3:
        # Quality filter
        q2 = line.strip()
        qs = [ord(s) - 33 for s in q2]
        if np.mean(qs) < 28:
          qf += 1
          continue

        l2 = compbio.reverse_complement(l2)
        align_header = '>1'

        # Try to find designed target from LSH
        cand_idxs = find_best_designed_target(l2, lsh_dict)
        if len(cand_idxs) == 0:
          continue

        # Run alignment
        best_idx, align = alignment(l2, cand_idxs)

        # Store alignment into buffer
        store_alignment(alignment_buffer, best_idx, align_header, align)

      if i % int(tot_reads / 100) == 1 and i > 1:
        # Flush alignment buffer
        flush_alignments(alignment_buffer, out_dir)
        alignment_buffer = init_alignment_buffer()

        # Stats for the curious
        with open(stdout_fn, 'a') as outf:
          outf.write('Time: %s\n' % (datetime.datetime.now()))
          outf.write('Progress: %s\n' % (i / int(tot_reads / 100)))
          outf.write('Quality filtered pct: %s\n' % (qf / (i / 4)))

      timer.update()

  # Final flush
  flush_alignments(alignment_buffer, out_dir)
  return