def split(inp_fn, out_nm, num_splits=60):
    """Split ``inp_fn`` into about ``num_splits`` chunk files using tail/head.

    Chunks are written to ``out_dir + out_nm + '_<k>.fq'`` with k counting
    from 0. A chunk file that already exists and is non-empty is not
    regenerated.

    Args:
        inp_fn: Path of the file to split.
        out_nm: Prefix used for the chunk file names.
        num_splits: Target number of chunks (default 60).
    """
    inp_fn_numlines = util.line_count(inp_fn)
    if inp_fn_numlines <= 0:
        # An empty input would give a chunk size of 0 and make range() raise.
        return

    # Ceiling division, then round up to a multiple of 4 so every chunk
    # starts on a 4-line boundary (presumably one FASTQ record = 4 lines).
    split_size = (inp_fn_numlines + num_splits - 1) // num_splits
    split_size = ((split_size + 3) // 4) * 4

    timer = util.Timer(total=num_splits)
    # Line numbers are 1-based for `tail -n +K`; the end bound is inclusive
    # of the last line so a trailing single-line chunk is not dropped.
    starts = range(1, inp_fn_numlines + 1, split_size)
    for split_num, start in enumerate(starts):
        out_fn = out_dir + out_nm + '_%s.fq' % (split_num)

        already_done = os.path.isfile(out_fn) and os.path.getsize(out_fn) > 0
        if not already_done:
            command = 'tail -n +%s %s | head -n %s > %s' % (
                start, inp_fn, split_size, out_fn)
            subprocess.check_output(command, shell=True)

        timer.update()

    return
def split(inp_fn, out_nm, num_splits=15):
    """Print the shell commands that split ``inp_fn`` into chunk files.

    Nothing is executed here; one ``tail | head`` command per chunk is
    printed, targeting ``out_dir + out_nm + '_<k>.fastq'``.

    Args:
        inp_fn: Path of the file to split.
        out_nm: Prefix used for the chunk file names.
        num_splits: Target number of chunks (default 15).
    """
    inp_fn_numlines = util.line_count(inp_fn)
    if inp_fn_numlines <= 0:
        # An empty input would give a chunk size of 0 and make range() raise.
        return

    # Ceiling division, then round up to a multiple of 4 so every chunk
    # starts on a 4-line boundary (presumably one FASTQ record = 4 lines).
    split_size = (inp_fn_numlines + num_splits - 1) // num_splits
    split_size = ((split_size + 3) // 4) * 4

    # 1-based start lines; the inclusive end bound keeps a trailing
    # single-line chunk from being skipped.
    starts = range(1, inp_fn_numlines + 1, split_size)
    for split_num, start in enumerate(starts):
        out_fn = out_dir + out_nm + '_%s.fastq' % (split_num)
        command = 'tail -n +%s %s | head -n %s > %s' % (
            start, inp_fn, split_size, out_fn)
        print(command)

    return
def predict(inp_fn):
    """Run deletion-profile prediction over every sequence in ``inp_fn``.

    The file is read as alternating header / sequence lines. Sequences
    shorter than 60 characters are skipped. Results accumulate in a buffer
    that is flushed through ``flush_df_buffer`` whenever it grows past
    100000 entries, and once more at the end so the tail is not lost.

    Args:
        inp_fn: Path to the two-line-per-record input file.
    """
    _predict2.init_model(run_iter='aay', param_iter='aae')

    df_buffer = init_df_buffer()
    # Header of the first record seen since the last flush; it is passed to
    # flush_df_buffer along with the buffer.
    df_buffer_nm = ''
    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 2 == 0:
                header = line.strip()
                if df_buffer_nm == '':
                    df_buffer_nm = header
            else:
                sequence = line.strip()
                if len(sequence) < 60:
                    continue
                df_buffer = add_del_profiles(header, sequence, df_buffer)

                print(len(df_buffer))
                if len(df_buffer) > 100000:
                    flush_df_buffer(df_buffer, df_buffer_nm)
                    df_buffer_nm = ''
                    df_buffer = init_df_buffer()
            timer.update()

    # Write whatever accumulated after the last threshold-triggered flush.
    if df_buffer_nm != '' and len(df_buffer) > 0:
        flush_df_buffer(df_buffer, df_buffer_nm)

    return
def demultiplex(split):
    """Quality-filter one FASTQ split and append reads to per-sample files.

    For each 4-line record in ``inp_dir + '<split>.fq'`` the mean Phred
    quality (ASCII offset 33) is computed; records below 30 are rejected.
    Remaining reads are assigned by ``match`` and appended in FASTA form to
    ``out_dir + '<demultiplex_id>/<split>.fa'``.

    Args:
        split: Identifier of the split; used in input and output file names.
    """
    inp_fn = inp_dir + '%s.fq' % (split)
    # Make sure every possible destination exists and starts out empty.
    for name in list(exp_design['Name']) + ['other']:
        util.ensure_dir_exists(out_dir + name)
        util.exists_empty_fn(out_dir + name + '/%s.fa' % (split))

    lc = util.line_count(inp_fn)
    num_bad_q, num_tot = 0, 0
    timer = util.Timer(total=lc)
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 4 == 0:
                header = line.strip()
            if i % 4 == 1:
                read = line.strip()
            if i % 4 == 3:
                num_tot += 1
                quals = [ord(s) - 33 for s in line.strip()]
                if np.mean(quals) < 30:
                    num_bad_q += 1
                    continue

                demultiplex_id, trimmed_read = match(read, header)

                out_fn = out_dir + '%s/%s.fa' % (demultiplex_id, split)
                # Separate name for the output handle so the input handle
                # `f` being iterated is not shadowed.
                with open(out_fn, 'a') as out_f:
                    out_f.write('>' + header[1:] + '\n' + trimmed_read + '\n')

            timer.update()

    # True division regardless of interpreter version; NaN when no reads.
    if num_tot:
        rejected_frac = float(num_bad_q) / num_tot
    else:
        rejected_frac = float('nan')
    print('Rejected %s fraction of reads' % (rejected_frac))
    return
def split_by_lines(inp_dir):
    """Split every matching file in ``inp_dir`` into line-range pieces.

    Each file whose name matches ``_parallel_config.REGEX_FILTER`` (applied
    with ``fnmatch``, i.e. as a shell-style pattern) is cut into
    ``_parallel_config.SPLITS`` pieces. Piece ``i`` is written with ``sed``
    to ``inp_dir + 'split<i>/' + fn``; those directories are not created
    here.

    Args:
        inp_dir: Directory containing the input files (with trailing slash,
            since paths are built by plain concatenation).
    """
    num_splits = _parallel_config.SPLITS
    divisor = _parallel_config.LINES_DIVISOR

    for fn in os.listdir(inp_dir):
        if not fnmatch.fnmatch(fn, _parallel_config.REGEX_FILTER):
            continue

        nl = util.line_count(inp_dir + fn)
        # Floor division keeps line numbers integral on every interpreter
        # version; the piece size is rounded down to a multiple of `divisor`.
        jump = nl // num_splits
        jump = (jump // divisor) * divisor

        for i in range(num_splits):
            first = jump * i + 1
            if i < num_splits - 1:
                last = jump * (i + 1)
                # Print first..last, then quit on the following line.
                arg = '%s,%sp;%sq' % (first, last, last + 1)
            else:
                # The final piece takes everything that remains.
                arg = '%s,%sp' % (first, nl)

            # sed grabs a range of lines in a file
            subprocess.call(
                'sed -n \'' + arg + '\' ' + inp_dir + fn + ' > ' +
                inp_dir + 'split' + str(i) + '/' + fn,
                shell=True)
    return
def call_mutations(nm):
    """Collect mutations and per-position read counts from ``<nm>.sam``.

    Header lines, reads mapped to a target other than
    'SP055-rpoZ-cMyc-Cry1Ac1-d123', and reads with cigar '*' are ignored.
    Every other read is handed to ``add_mutations``. Three CSV files are
    written to ``out_dir``: ``<nm>.csv``, ``<nm>_readcounts.csv`` and
    ``<nm>_read_idxs.csv``.

    Args:
        nm: Sample name; used for the input SAM and the output CSV names.
    """
    sam_path = inp_dir + f'{nm}.sam'

    mutations = defaultdict(list)
    depth_by_pos = defaultdict(lambda: 0)
    read_idxs = defaultdict(list)

    progress = util.Timer(total=util.line_count(sam_path))
    with open(sam_path) as sam_file:
        for record in sam_file:
            progress.update()
            if record.startswith('@'):
                continue

            fields = record.split()
            sam = {}
            sam['read_nm'] = fields[0]
            sam['target'] = fields[2]
            sam['1-based pos'] = int(fields[3])
            sam['cigar'] = fields[5]
            sam['seq'] = fields[9]

            wrong_target = sam['target'] != 'SP055-rpoZ-cMyc-Cry1Ac1-d123'
            if wrong_target or sam['cigar'] == '*':
                continue

            # Call mutation and track total readcount per position
            add_mutations(mutations, depth_by_pos, read_idxs, sam)

    pd.DataFrame(mutations).to_csv(out_dir + f'{nm}.csv')

    depth_table = defaultdict(list)
    for pos in range(len(ref)):
        depth_table['Position (0 based)'].append(pos)
        depth_table['Read count'].append(depth_by_pos[pos])
    pd.DataFrame(depth_table).to_csv(out_dir + f'{nm}_readcounts.csv')

    # Important note on the read-index table written below:
    # - Many paired reads appear to have sequenced the same molecule.
    #   Mutations observed on paired reads are combined; overlapping paired
    #   reads are also expected to be combined. This is done in
    #   ill_b2_merge_n_paired_reads.py
    pd.DataFrame(read_idxs).to_csv(out_dir + f'{nm}_read_idxs.csv')

    return
def run_align_needleman_wunsch(srr, nm):
    """Align quality-passing reads of ``<srr>.fastq`` to the reference.

    Reads whose mean Phred quality (ASCII offset 33) is at least 30 are
    reverse-complemented and aligned against the second line of the
    reference FASTA with the external ``needleman_wunsch`` binary. Results
    are appended to ``out_dir + '<nm>.fa'`` in batches.

    Args:
        srr: Name of the input FASTQ (without extension) in ``inp_dir``.
        nm: Name of the output file (without extension) in ``out_dir``.

    Raises:
        subprocess.CalledProcessError: If the aligner exits non-zero.
    """
    inp_fn = inp_dir + f'{srr}.fastq'
    genome_fn = inp_dir + 'SP055-rpoZ-cMyc-Cry1Ac1-d123.fa'
    with open(genome_fn) as genome_f:
        target = genome_f.readlines()[1].strip()

    seq_align_tool = '/ahg/regevdata/projects/CRISPR-libraries/tools/seq-align/bin/needleman_wunsch'
    # Argument list (no shell) so read text is never interpreted by a shell.
    base_command = [
        seq_align_tool,
        '--match', '1',
        '--mismatch', '-1',
        '--gapopen', '-5',
        '--gapextend', '-1',
        '--freestartgap',
        '--freeendgap',
    ]

    out_fn = out_dir + f'{nm}.fa'
    # Truncate any previous output; batches are appended below.
    with open(out_fn, 'w'):
        pass

    def dump(buffered):
        """Append the buffered alignment records to the output file."""
        print('Dumping alignment buffer...')
        with open(out_fn, 'a') as out_f:
            for item in buffered:
                out_f.write(item)

    alignment_buffer = []
    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 4 == 0:
                header = line.strip()
                read_nm = header.split()[0].replace('@', '')
            if i % 4 == 1:
                read = line.strip()
            if i % 4 == 3:
                qs = [ord(s) - 33 for s in line.strip()]
                if np.mean(qs) >= 30:
                    read = compbio.reverse_complement(read)

                    raw = subprocess.check_output(base_command + [read, target])
                    # Drop the last two characters of the tool's output.
                    align = raw.decode('utf-8')[:-2]
                    alignment_buffer.append(f'>{read_nm}\n{align}\n')

                    if len(alignment_buffer) > 100:
                        dump(alignment_buffer)
                        alignment_buffer = []

            timer.update()

    dump(alignment_buffer)
    return
def convert_sam_to_text(ref, sample_id):
    """Parse ``<ref>/<sample_id>.sam`` into mutation and read-count CSVs.

    The reference sequence is the last line of
    ``_config.DATA_DIR + '<ref>.fa'``. Header lines and reads with cigar '*'
    are skipped; every other read is passed to ``get_alignment``. Output
    goes to ``out_dir + '<ref>/'`` as ``<sample_id>.csv`` and
    ``n_<sample_id>.csv``.

    Args:
        ref: Reference name (sub-directory and FASTA base name).
        sample_id: Sample name (SAM and CSV base name).
    """
    inp_fn = inp_dir + f'{ref}/{sample_id}.sam'
    ref_fn = _config.DATA_DIR + f'{ref}.fa'
    # Context manager so the reference file handle is closed promptly.
    with open(ref_fn) as ref_f:
        ref_seq = ref_f.readlines()[-1].strip()

    # Parse SAM
    mut_dd = defaultdict(list)
    nd = {idx: 0 for idx in range(len(ref_seq))}
    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for line in f:
            timer.update()
            if line[0] == '@':
                continue

            w = line.split()
            sam = {
                'read_nm': w[0],
                'target': w[2],
                '1-based pos': int(w[3]),
                'cigar': w[5],
                'seq': w[9],
                'qs': w[10],
            }
            if sam['cigar'] == '*':
                continue

            get_alignment(mut_dd, sam, nd, ref_seq)

    mut_df = pd.DataFrame(mut_dd)
    ref_out_dir = out_dir + f'{ref}/'
    util.ensure_dir_exists(ref_out_dir)
    mut_df.to_csv(ref_out_dir + f'{sample_id}.csv')

    ndd = {
        'Position (0 based)': list(nd.keys()),
        'Read count': list(nd.values()),
    }
    ndf = pd.DataFrame(ndd)
    ndf.to_csv(ref_out_dir + f'n_{sample_id}.csv')

    return
def convert_alignment(srr_id, out_dir):
    """Collapse the SAM alignments of one run into counted alignment strings.

    Alignments starting more than 1000 positions from the expected position
    are counted as distant and skipped. Each remaining read is rebuilt into
    an alignment string via ``construct_align``; identical strings are
    counted and written, most frequent first, to ``out_dir + '<srr_id>.txt'``.

    Args:
        srr_id: Run identifier; must be in ``_config.d.RUNS_SET``.
        out_dir: Output directory (with trailing slash).

    Returns:
        An error string when ``srr_id`` is unknown, otherwise None.
    """
    print(srr_id)
    if srr_id not in _config.d.RUNS_SET:
        return 'Bad srr_id %s' % (srr_id)

    sam_fn = _config.d.sam_fn(srr_id)
    genome_build, exp_chrm, exp_pos = get_expected_chrm_pos(srr_id)

    num_aligns, num_distant = 0, 0
    align_collection = defaultdict(lambda: 0)
    timer = util.Timer(total=util.line_count(sam_fn))
    with open(sam_fn) as f:
        for i, line in enumerate(f):
            if not line.startswith('@'):
                num_aligns += 1
                # Split once and pick the SAM columns that are needed.
                fields = line.split()
                chrm = fields[2]
                start = int(fields[3])
                cigar = fields[5]
                read = fields[9]

                if abs(exp_pos - start) > 1000:
                    num_distant += 1
                    continue

                align_len = get_align_len(read, cigar)
                genome = query_genome(genome_build, chrm, start,
                                      align_len, srr_id)

                align = construct_align(read, genome, cigar, start)
                align_collection[align] += 1

            timer.update()

    sorted_aligns = sorted(align_collection, key=align_collection.get,
                           reverse=True)
    out_fn = out_dir + '%s.txt' % (srr_id)
    with open(out_fn, 'w') as f:
        for align in sorted_aligns:
            count = align_collection[align]
            f.write('>%s_%s' % (count, align))

    # True division on every interpreter version; NaN when the SAM file
    # holds no alignment lines.
    if num_aligns:
        distant_frac = float(num_distant) / num_aligns
    else:
        distant_frac = float('nan')
    print('%s distant out of %s alignments: %s' % (
        num_distant, num_aligns, distant_frac))
    print('Done')
    return
def find_cutsites_and_predict(inp_fn, data_nm, split):
    """Run bulk prediction on every usable sequence in ``inp_fn``.

    The file is read as alternating header / sequence lines. Sequences
    shorter than 60 or longer than 500000 characters are skipped. Results
    accumulate in two dicts that ``maybe_flush`` writes out as it sees fit;
    a forced flush is issued at the end.

    Args:
        inp_fn: Path to the two-line-per-record input file.
        data_nm: Either 'exons' or 'introns'; selects the output directory.
        split: Split identifier, forwarded to ``maybe_flush``.

    Raises:
        ValueError: If ``data_nm`` is neither 'exons' nor 'introns'.
    """
    _predict.init_model(run_iter='aax', param_iter='aag')
    dd = defaultdict(list)
    dd_shuffled = defaultdict(list)

    if data_nm == 'exons':
        df_out_dir = exon_dfs_out_dir
    elif data_nm == 'introns':
        df_out_dir = intron_dfs_out_dir
    else:
        # Fail up front instead of hitting an unbound local mid-run.
        raise ValueError(
            "data_nm must be 'exons' or 'introns', got %r" % (data_nm,))

    num_flushed = 0
    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 2 == 0:
                header = line.strip()
            else:
                sequence = line.strip()
                if len(sequence) < 60:
                    continue
                if len(sequence) > 500000:
                    continue
                bulk_predict(header, sequence, dd, dd_shuffled, df_out_dir)
                dd, dd_shuffled, num_flushed = maybe_flush(
                    dd, dd_shuffled, data_nm, split, num_flushed)

            if (i - 1) % 50 == 0 and i > 1:
                # Floor division keeps the integer value on Python 3 too.
                print('%s pct, %s' % (i // 500, datetime.datetime.now()))

            timer.update()

    maybe_flush(dd, dd_shuffled, data_nm, split, num_flushed, force=True)
    return
def count_grna(exp, lib, split):
    """Count reads per designed gRNA for one split of one experiment.

    Reads come from ``inp_dir + exp + '/<split>.fa'`` (alternating header /
    read lines). Each read is matched with ``find_grna``; a match adds one
    to the count of that gRNA.

    Args:
        exp: Experiment name (sub-directory of ``inp_dir``).
        lib: Table with a 'gRNA sequence' column.
        split: Split identifier used in the input file name.

    Returns:
        A list of counts aligned with ``lib['gRNA sequence']``. Duplicate
        gRNAs are counted only at their first occurrence.
    """
    reads_fn = inp_dir + exp + '/%s.fa' % (split)

    # Handle potential duplicates in designed gRNAs by placing counts only
    # in the first occurrence. A single pass with setdefault records the
    # first index of each sequence without repeated list scans.
    grna_list = list(lib['gRNA sequence'])
    grna_set = set(grna_list)
    idxs = dict()
    for idx, grna in enumerate(grna_list):
        idxs.setdefault(grna, idx)

    # Init list to be joined to lib dataframe in same order as gRNA sequence
    counts = [0] * len(grna_list)

    tot = 0
    num_reads_matched = 0
    timer = util.Timer(total=util.line_count(reads_fn))
    with open(reads_fn) as f:
        for i, line in enumerate(f):
            if i % 2 == 1:
                read = line.strip()
                matched_grna = find_grna(exp, grna_set, read)
                if matched_grna is not False:
                    counts[idxs[matched_grna]] += 1
                    num_reads_matched += 1
                tot += 1
            timer.update()

    try:
        pct_reads_matched = float(num_reads_matched) / tot
    except ZeroDivisionError:
        pct_reads_matched = np.nan

    print('%s / %s' % (num_reads_matched, tot))
    print('%s %%  reads matched' % (pct_reads_matched * 100))
    return counts
def divide():
    """Print the shell commands that split 'SHE2655.fq' into 60 chunks.

    Nothing is executed here; one ``tail | head`` command per chunk is
    printed, targeting ``out_dir + '<k>.fq'`` with k counting from 0.
    """
    inp_fn = inp_dir + 'SHE2655.fq'
    inp_fn_numlines = util.line_count(inp_fn)
    if inp_fn_numlines <= 0:
        # An empty input would give a chunk size of 0 and make range() raise.
        return

    num_splits = 60
    # Ceiling division, then round up to a multiple of 4 so every chunk
    # starts on a 4-line boundary (presumably one FASTQ record = 4 lines).
    split_size = (inp_fn_numlines + num_splits - 1) // num_splits
    split_size = ((split_size + 3) // 4) * 4
    print('Using split size %s' % (split_size))

    # 1-based start lines; the inclusive end bound keeps a trailing
    # single-line chunk from being skipped.
    starts = range(1, inp_fn_numlines + 1, split_size)
    for split_num, start in enumerate(starts):
        out_fn = out_dir + '%s.fq' % (split_num)
        command = 'tail -n +%s %s | head -n %s > %s' % (
            start, inp_fn, split_size, out_fn)
        print(command)

    return
def matchmaker(nm, split):
    """Match quality-passing reads to designed targets and align them.

    Reads ``inp_dir + '<nm>_r2_<split>.fq'`` as 4-line records. Records
    whose mean Phred quality (ASCII offset 33) is below 28 are dropped. The
    remaining reads are reverse-complemented, candidate targets are looked
    up through the LSH dictionary, and the resulting alignment is stored in
    a buffer that is flushed periodically and once at the end. Progress
    statistics are appended to a status file in ``_config.SRC_DIR``.

    Args:
        nm: Sample name.
        split: Split identifier (string; concatenated into paths).
    """
    print('%s %s' % (nm, split))
    stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    inp_fn = inp_dir + '%s_r2_%s.fq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()

    prepare_outfns(out_dir)

    qf = 0
    # NOTE: despite the name this is the number of lines, not of reads.
    tot_reads = util.line_count(inp_fn)
    timer = util.Timer(total=tot_reads)
    # One hundredth of the file; clamped so files shorter than 100 lines do
    # not cause a modulo by zero.
    flush_step = max(1, tot_reads // 100)

    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 4 == 1:
                l2 = line.strip()
            if i % 4 == 3:
                # Quality filter
                q2 = line.strip()
                qs = [ord(s) - 33 for s in q2]
                if np.mean(qs) < 28:
                    qf += 1
                    continue

                l2 = compbio.reverse_complement(l2)
                align_header = '>1'

                # Try to find designed target from LSH
                cand_idxs = find_best_designed_target(l2, lsh_dict)
                if len(cand_idxs) == 0:
                    continue

                # Run alignment
                best_idx, align = alignment(l2, cand_idxs)

                # Store alignment into buffer
                store_alignment(alignment_buffer, best_idx, align_header,
                                align)

            if i % flush_step == 1 and i > 1:
                # Flush alignment buffer
                flush_alignments(alignment_buffer, out_dir)
                alignment_buffer = init_alignment_buffer()

                # Stats for the curious
                records_seen = max(1, i // 4)
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i // flush_step))
                    outf.write('Quality filtered pct: %s\n' % (
                        float(qf) / records_seen))

            timer.update()

    # Final flush
    flush_alignments(alignment_buffer, out_dir)

    return
def matchmaker(nm, split):
    """Assign paired reads to peptide pairs and align read 1 to the target.

    Looks up the experiment row for ``nm``, loads the library design, and
    rebinds the module globals ``lib_design``, ``prefixes``, ``peptide_nms``,
    ``prefix_to_peptide``, ``suffixes`` and ``suffix_to_peptide``. Paired
    FASTQ files are then read in lockstep; records with a mean Phred quality
    (ASCII offset 33, both mates) below 25 are rejected. For the remaining
    records both peptide names are resolved, read 1 is trimmed and aligned
    to the target, and the alignment is buffered. Buffers are flushed
    periodically and at the end; per-outcome counters are written to
    ``stats_<nm>_<split>.csv`` in the output directory.

    Args:
        nm: Experiment name (value in the 'Name' column of ``exp_design``).
        split: Split number; zero-padded to three digits in input names.
    """
    print(split)
    stdout_fn = _config.SRC_DIR + f'nh_c_{nm}_{split}.out'
    util.exists_empty_fn(stdout_fn)
    out_dir = f'{out_place}{nm}/{split}/'
    util.ensure_dir_exists(out_dir)

    # Parse condition-specific settings
    exp_row = exp_design[exp_design['Name'] == nm].iloc[0]
    parent_fn = exp_row['Parent file']
    lib_nm = exp_row['Library']
    target_nm = exp_row['Target']

    # Library design
    global lib_design
    lib_design = pd.read_csv(_config.DATA_DIR + f'lib_{lib_nm}_design.csv')

    global prefixes
    global peptide_nms
    global prefix_to_peptide
    global suffixes
    global suffix_to_peptide
    prefixes = [s[:prefix_len] for s in lib_design['Sequence']]
    peptide_nms = list(lib_design['Name'])
    prefix_to_peptide = dict(zip(prefixes, peptide_nms))
    suffixes = [compbio.reverse_complement(s[-suffix_len:])
                for s in lib_design['Sequence']]
    suffix_to_peptide = dict(zip(suffixes, peptide_nms))

    # Target
    target_row = target_design[target_design['Target'] == target_nm].iloc[0]
    target = target_row['Sequence']
    target_strand = target_row['gRNA orientation']

    zf_split = str(split).zfill(3)
    read1_fn = inp_dir + f'{parent_fn}_R1_{zf_split}.fq'
    read2_fn = inp_dir + f'{parent_fn}_R2_{zf_split}.fq'

    count_stats = defaultdict(lambda: 0)
    count_stats['Success'] = 0

    alignment_buffer = init_alignment_buffer()
    prepare_outfns(out_dir, peptide_nms)

    tot_lines = util.line_count(read1_fn)
    timer = util.Timer(total=tot_lines)

    # Step sizes are clamped to 1 so short files cannot trigger a modulo or
    # division by zero.
    flush_interval = 200
    flush_step = max(1, int(tot_lines / flush_interval))
    progress_step = max(1, int(tot_lines / 100))

    with open(read1_fn) as f1, open(read2_fn) as f2:
        for i, (line1, line2) in enumerate(zip(f1, f2)):
            if i % 4 == 1:
                read1 = line1.strip()
                read2 = line2.strip()
            if i % 4 == 3:
                q1, q2 = line1.strip(), line2.strip()
                count_stats['Read count'] += 1

                qs = [ord(s) - 33 for s in q1 + q2]
                if np.mean(qs) < 25:
                    count_stats['1a. Quality fail'] += 1
                    continue

                res, msg = find_peptide1_nm(read2)
                if res is None:
                    count_stats[f'2{msg}'] += 1
                    continue
                p1_nm = res

                res, msg = find_peptide2_nm(read1)
                if res is None:
                    count_stats[f'2{msg}'] += 1
                    continue
                p2_nm = res

                peptide_nm = f'{p1_nm}-{p2_nm}'

                # Drop the first 6 bases of read 1 and their qualities.
                read1 = read1[6:]
                q1 = q1[6:]

                if target_strand == '-':
                    read1 = compbio.reverse_complement(read1)
                    q1 = q1[::-1]

                # Run alignment and store in buffer
                align_header = '>1'
                align = alignment(read1, target)
                store_alignment(alignment_buffer, peptide_nm, align_header,
                                align, q1)
                count_stats['Success'] += 1

            if i % flush_step == 1 and i > 1:
                # Flush alignment buffer
                flush_alignments(alignment_buffer, out_dir)
                alignment_buffer = init_alignment_buffer()

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write(f'Time: {datetime.datetime.now()}\n')
                    outf.write(f'Progress: {i / progress_step}\n')
                    outf.write(f'Line: {i}\n')
                    for key in sorted(count_stats.keys()):
                        outf.write(f'{key}, {count_stats[key]}\n')

            timer.update()

    # Final flush
    flush_alignments(alignment_buffer, out_dir)

    stats_df = pd.DataFrame(count_stats, index=[0])
    sorted_cols = sorted(stats_df.columns)
    stats_df = stats_df[sorted_cols]
    stats_df.to_csv(out_dir + f'stats_{nm}_{split}.csv')

    return
def matchmaker(nm, split):
    """Pair target reads with gRNA reads and align the consistent ones.

    Reads ``<nm>_R1_<split>.fq`` and ``<nm>_R2_<split>.fq`` from ``inp_dir``
    in lockstep. The reverse complement of the first 61 bases of read 1 is
    the target read; read 2 is the gRNA read. A record passes when the mean
    Phred quality (ASCII offset 33) over the target, ULMI and gRNA quality
    slices is at least 28. Passing records are matched to designed targets
    via the LSH dictionary; when ``compare_target_to_grna`` returns 'ok' the
    target read is aligned and buffered, otherwise the record counts as a
    bad match. Buffers are flushed periodically and at the end, and
    statistics are appended to a status file in ``_config.SRC_DIR``.

    Args:
        nm: Sample name.
        split: Split identifier (string; concatenated into paths).
    """
    print(nm, split)
    stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    read1_fn = inp_dir + '%s_R1_%s.fq' % (nm, split)
    read2_fn = inp_dir + '%s_R2_%s.fq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()

    prepare_outfns(out_dir)

    num_bad_matches = 0
    quality_pass = 0

    tot_lines = util.line_count(read1_fn)
    timer = util.Timer(total=tot_lines)
    # Step sizes are clamped to 1 so short files cannot trigger a modulo or
    # division by zero.
    flush_step = max(1, int(tot_lines / 200))
    progress_step = max(1, int(tot_lines / 100))

    with open(read1_fn) as f1, open(read2_fn) as f2:
        for i, (line1, line2) in enumerate(zip(f1, f2)):
            if i % 4 == 1:
                # RC of l1 contains target
                line1 = line1.strip()
                target_read = compbio.reverse_complement(line1[:61])
                ulmi, ulmi_idx = find_ulmi(line1)

                # l2 contains gRNA
                grna_read = line2.strip()

            if i % 4 == 3:
                q1, q2 = line1.strip(), line2.strip()
                # Quality slices are reversed to follow the
                # reverse-complemented sequences.
                read_q = q1[:61][::-1]
                ulmi_q = q1[ulmi_idx:ulmi_idx + len(ulmi)][::-1]
                grna_q = q2[18:22 + 20]

                qs = [ord(s) - 33 for s in read_q + ulmi_q + grna_q]
                if np.mean(qs) >= 28:
                    quality_pass += 1
                    align_header = '>1_%s_%s' % (ulmi, ulmi_q)

                    # Try to find designed target from LSH
                    cand_idxs = find_best_designed_target(
                        target_read, lsh_dict)
                    if len(cand_idxs) > 0:
                        bad_match = compare_target_to_grna(
                            cand_idxs, grna_read)
                        if bad_match == 'ok':
                            # Run alignment and store in buffer
                            best_idx, align = alignment(
                                target_read, cand_idxs)
                            if align is None:
                                continue
                            store_alignment(alignment_buffer, best_idx,
                                            align_header, align, read_q)
                        else:
                            num_bad_matches += 1
                    else:
                        num_bad_matches += 1

            if i % flush_step == 1 and i > 1:
                # Flush alignment buffer
                flush_alignments(alignment_buffer, out_dir)
                alignment_buffer = init_alignment_buffer()

                # Stats for the curious; the fraction is NaN until at least
                # one record has passed the quality filter.
                if quality_pass:
                    frac_bad = num_bad_matches / quality_pass
                else:
                    frac_bad = float('nan')
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / progress_step))
                    outf.write('Num. mismatched gRNA/target pairs: %s\n' % (
                        num_bad_matches))
                    outf.write('Frac. mismatched gRNA/target pairs: %s\n' % (
                        frac_bad))

            timer.update()

    # Final flush
    flush_alignments(alignment_buffer, out_dir)

    return
def matchmaker(nm, split):
    """Extract UMI / library pairs from paired reads and align read 1.

    Reads ``<nm>_1_sequence_<split>.fastq`` and ``<nm>_2_sequence_<split>.fastq``
    from ``inp_dir`` in lockstep, four lines per record. A record is kept
    when both quality lines average at least 28 (ASCII offset 33), both
    constant sequences are found, and neither the UMI nor the library
    segment contains an 'N'. The library segment is matched to a designed
    target through the LSH dictionary and read 1 is locally aligned to that
    target; the formatted result is appended to a per-UMI buffer that is
    periodically handed to ``flush_tuples``.

    NOTE(review): this function uses Python 2 only syntax (print statement,
    ``except X, e``, ``file.next()``).

    Args:
        nm: Sample name.
        split: Split identifier (string; concatenated into paths).
    """
    ##CUSTOM CODE FOR DICTIONARY CREATION
    from Bio import pairwise2
    from Bio.pairwise2 import format_alignment
    from Bio.Seq import Seq
    # NOTE(review): generic_dna is not used below, and Bio.Alphabet is
    # believed to be absent from recent Biopython releases -- confirm the
    # pinned version.
    from Bio.Alphabet import generic_dna

    # Reverse complement helper; not called in this function.
    def rc(inp):
        d = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
        return "".join([d[e] for e in inp.strip()[::-1]])

    #UNSPLICED DATA PROCESSING
    READ1_TEMPLATE = "NNNtaccagctgccctcgTCGaCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGNNNNNNNNNNNNNNNNNNNNNNNNtgattacacatatagacacgcGAGCAGCCATCTTTTATAGAATGGGtagaacccgtcctaaggactcagattgagcatcgtttgcttctcgagtactacctgg"
    READ2_TEMPLATE = "NNNaaccgctgtgttctgcACGCGTNNNNNNNNNNNNNNNNNNACCGGTgcaggtaatgggccttactatcagtctcagtccttgtacagctcgtccatgccgagagtgatcccggcggcggtcacgaactccagcaggaccatgtgatcgcgcttctcgttggggtctttgctca"
    # r1_seq / r2_seq are built but not referenced again in this function.
    r1_seq = Seq(READ1_TEMPLATE, "generic_dna".upper())
    r2_seq = Seq(READ2_TEMPLATE, "generic_dna".upper())

    # Mean Phred score of a quality line (ASCII offset 33).
    def quality(line):
        q_1 = line.strip()
        qs = [ord(s) - 33 for s in q_1]
        return np.mean(qs)

    i = -1

    # Per-outcome counters reported in the status file.
    qc_rejection_count = 0
    read1_rejection_count = 0
    constant_region_rejection_count = 0
    accepted_count = 0
    nolib_rejection_count = 0

    print nm, split
    #fq_unspliced_1 = open("/cluster/bh0085/prj/exons/data/{0}_1_sequence.fastq".format(nm))
    #fq_unspliced_2 = open("/cluster/bh0085/prj/exons/data/{0}_2_sequence.fastq".format(nm))

    stdout_fn = _config.SRC_DIR + 'b3_status_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    inp_fn1 = inp_dir + '%s_1_sequence_%s.fastq' % (nm, split)
    inp_fn2 = inp_dir + '%s_2_sequence_%s.fastq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    umis_alignments_buffer = init_umis_alignments_buffer()
    short_outputs = []

    prepare_outfns(out_dir)

    # NOTE(review): qf is never incremented in this function, so the
    # 'Quality filtered pct' line below always reports 0.
    qf = 0
    print inp_fn1
    # Line count of read-1 file (not the number of reads).
    tot_reads = util.line_count(inp_fn1)
    timer = util.Timer(total=tot_reads)
    i = -1
    print "OPENING FILES"
    with open(inp_fn1) as f1:
        with open(inp_fn2) as f2:
            while 1:
                i += 1
                # Advance both files together; stop when either runs out.
                try:
                    r2_l = f2.next()
                    r1_l = f1.next()
                except StopIteration as e:
                    break

                # Second line of a record: the sequences (not stripped).
                if i % 4 == 1:
                    read1 = r1_l
                    read2 = r2_l

                # Fourth line of a record: the qualities.
                if i % 4 == 3:
                    if quality(r2_l) < 28 or quality(r1_l) < 28:
                        qc_rejection_count += 1
                        continue

                    r1_library_constant = "TACCAGCTGCCCTCGTCGAC".upper()
                    r1_library_start = len(r1_library_constant)
                    r1_library_format = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGNNNNNNNNNNNNNNNNNNNNNNNN"
                    r1_library_intron_format = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAG"
                    r1_library_ag_pos = len(
                        "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")
                    r1_library_exon_format = "NNNNNNNNNNNNNNNNNNNNNNNN"

                    r2_library_constant = "ggggtgttctgctggtagtggtc".upper()
                    r2_library_start = len(r2_library_constant)
                    r2_umi_format = "NNNNNNNNNNNNNNN"

                    # Locate the constant sequences; None marks "not found".
                    try:
                        a1_offset = read1.upper().index(
                            r1_library_constant.upper())
                    except ValueError, e:
                        a1_offset = None

                    try:
                        a2_offset = read2.upper().index(r2_library_constant)
                    except ValueError, e:
                        a2_offset = None

                    if a1_offset is None or a2_offset is None:
                        constant_region_rejection_count += 1
                        continue

                    # Slices directly after each constant sequence.
                    read1_const = read1[a1_offset:a1_offset + r1_library_start]
                    read1_content = read1[
                        a1_offset + r1_library_start:][:len(r1_library_format)]
                    read1_extended_content = read1[a1_offset + r1_library_start:]

                    # NOTE(review): the next line and read2_extended_content
                    # slice read1 using read-2 offsets; looks like read2 was
                    # intended. Both values are unused below.
                    read2_const = read1[a2_offset:a2_offset + r2_library_start]
                    read2_content = read2[a2_offset + r2_library_start:][:len(r2_umi_format
                                                                              )]
                    read2_extended_content = read1[a2_offset + r2_library_start:]

                    r1_ag = read1_content[len(r1_library_intron_format) -
                                          2:len(r1_library_intron_format)]

                    #check to see that the splice acceptor is in the right position
                    #and that the read1 constant sequence aligned
                    #if a1_tag_score <20:
                    #    read1_rejection_count+=1
                    #    continue

                    tag = "TACCANCTGCCCTCGTCGAC"

                    umi = read2_content[:len(r2_umi_format)]
                    lib = read1_content[:len(r1_library_format)]
                    lib_extended = read1_extended_content[:len(
                        r1_library_format) + 20]

                    # Records with an ambiguous base in UMI or library
                    # segment are dropped without being counted.
                    if umi.count("N") != 0 or lib.count("N") != 0:
                        continue

                    #no longer check for perfect matches. Just align
                    exp = target_names.get(lib, None)

                    cand_idxs = find_best_designed_target(lib, lsh_dict)
                    if len(cand_idxs) == 0:
                        print "rejecting for no good match"
                        nolib_rejection_count += 1
                        continue

                    # Only the first candidate is used.
                    best_idx = cand_idxs[0]

                    #extends a target alignment region to include an extra 20 bases to anchor the alignment for long r1 deletions
                    target_alignment_region = names_targets[
                        best_idx] + "tgattacacatatagacacg".upper()
                    # Local alignment: match 2, mismatch -1, gap open -5,
                    # gap extend -.1; the first returned alignment is taken.
                    align = pairwise2.align.localms(target_alignment_region,
                                                    read1_extended_content, 2,
                                                    -1, -5, -.1)[0]

                    output_complete = """>1\n{0}\n{1}\n{2}\n{3}\n""".format(
                        umi, best_idx, align[2],
                        "\n".join(format_alignment(*align).splitlines()[:3]))
                    output_short = (umi, best_idx)

                    umis_alignments_buffer[umi].append(output_complete)
                    short_outputs.append(output_short)
                    accepted_count += 1

                # NOTE(review): int(tot_reads / 1000) is 0 when the file has
                # fewer than 1000 lines, which makes this modulo raise.
                # The `< 4` window means this can fire on several
                # consecutive lines -- confirm that is intended.
                if i % int(tot_reads / 1000) < 4 and i > 1:
                    print i
                    print "FLUSHING!"
                    # Flush alignment buffer
                    flush_tuples(umis_alignments_buffer, out_dir)
                    print len(umis_alignments_buffer.keys())

                    # Stats for the curious
                    with open(stdout_fn, 'a') as outf:
                        outf.write('Time: %s\n' % (datetime.datetime.now()))
                        outf.write('Progress: %s\n' %
                                   (i / int(tot_reads / 100)))
                        outf.write('Quality filtered pct: %s\n' %
                                   (qf / (i / 4)))
                        outf.write(
                            "accepted {0}, rejected {1} bad read1, {2} bad lib\n"
                            .format(accepted_count, read1_rejection_count,
                                    nolib_rejection_count))

                timer.update()
def matchmaker(nm, split):
    """Extract UMI / gRNA pairs from paired reads and look up the design.

    Reads ``<nm>_1_sequence_<split>.fastq`` and ``<nm>_2_sequence_<split>.fastq``
    from ``inp_dir`` in lockstep, four lines per record. A record is kept
    when both quality lines average at least 28 (ASCII offset 33) and both
    prefix constants are found. The 20-base, then the 19-base, segment
    after the read-1 prefix is looked up in ``exp_design``; on a hit the
    UMI from read 2 and the design's identifier are appended to a per-UMI
    buffer that is periodically handed to ``flush_tuples``.

    NOTE(review): this function uses Python 2 only syntax (print statement,
    ``except X, e``, ``file.next()``) and prints debugging output for every
    accepted record.

    Args:
        nm: Sample name.
        split: Split identifier (string; concatenated into paths).
    """
    # Per-outcome counters reported in the status file.
    read_constant_rejection_count = 0
    qc_rejection_count = 0
    accepted_count = 0
    grna_failure_count = 0
    read1_rejection_count = 0

    ##CUSTOM CODE FOR DICTIONARY CREATION
    # NOTE(review): none of the Bio imports is used for alignment here;
    # only Seq is called. Bio.Alphabet is believed to be absent from recent
    # Biopython releases -- confirm the pinned version.
    from Bio import pairwise2
    from Bio.pairwise2 import format_alignment
    from Bio.Seq import Seq
    from Bio.Alphabet import generic_dna

    # Reverse complement helper; not called in this function.
    def rc(inp):
        d = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
        return "".join([d[e] for e in inp.strip()[::-1]])

    #UNSPLICED DATA PROCESSING
    READ1_TEMPLATE = "NNNtaccagctgccctcgTCGaCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGNNNNNNNNNNNNNNNNNNNNNNNNtgattacacatatagacacgcGAGCAGCCATCTTTTATAGAATGGGtagaacccgtcctaaggactcagattgagcatcgtttgcttctcgagtactacctgg"
    READ2_TEMPLATE = "NNNaaccgctgtgttctgcACGCGTNNNNNNNNNNNNNNNNNNACCGGTgcaggtaatgggccttactatcagtctcagtccttgtacagctcgtccatgccgagagtgatcccggcggcggtcacgaactccagcaggaccatgtgatcgcgcttctcgttggggtctttgctca"
    # r1_seq / r2_seq are built but not referenced again in this function.
    r1_seq = Seq(READ1_TEMPLATE, "generic_dna".upper())
    r2_seq = Seq(READ2_TEMPLATE, "generic_dna".upper())

    # Mean Phred score of a quality line (ASCII offset 33).
    def quality(line):
        q_1 = line.strip()
        qs = [ord(s) - 33 for s in q_1]
        return np.mean(qs)

    i = -1
    print nm, split

    umis_alignments_buffer = init_umis_alignments_buffer()
    stdout_fn = _config.SRC_DIR + 'b7_status_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    inp_fn1 = inp_dir + '%s_1_sequence_%s.fastq' % (nm, split)
    inp_fn2 = inp_dir + '%s_2_sequence_%s.fastq' % (nm, split)

    short_outputs = []
    prepare_outfns(out_dir)

    # NOTE(review): qf is never incremented in this function, so the
    # 'Quality filtered pct' line below always reports 0.
    qf = 0
    # Line count of read-1 file (not the number of reads).
    tot_reads = util.line_count(inp_fn1)
    timer = util.Timer(total=tot_reads)
    #raise Exception()
    i = -1
    with open(inp_fn1) as f1:
        with open(inp_fn2) as f2:
            while 1:
                i += 1
                # Advance both files together; stop when either runs out.
                try:
                    r2_l = f2.next()
                    r1_l = f1.next()
                except StopIteration as e:
                    break

                # Second line of a record: the sequences (not stripped).
                if i % 4 == 1:
                    read1 = r1_l
                    read2 = r2_l

                # Fourth line of a record: the qualities.
                if i % 4 == 3:
                    if quality(r2_l) < 28 or quality(r1_l) < 28:
                        qc_rejection_count += 1
                        continue

                    print read1
                    print read2
                    print len(read2)

                    r1_grna19_format = "N" * 19
                    r1_grna20_format = "N" * 20
                    r2_umi_format = "N" * 15

                    r1_prefix_constant = "GACGAAACACCG".upper()
                    r1_grna_start = len(r1_prefix_constant)

                    r2_prefix_constant = "tcaaacaggacggcagcgtgcagctcgcc".upper(
                    )
                    r2_umi_start = len(r2_prefix_constant)
                    r2_umi_format = "N" * 15
                    r2_post_umi_format = "gaccactaccagcagaacacccc".upper()

                    print "working"
                    # Locate the read-1 prefix; any exception rejects the
                    # record.
                    try:
                        print r1_prefix_constant
                        a1_offset = read1.upper().index(
                            r1_prefix_constant.upper())
                    except Exception, e:
                        read1_rejection_count += 1
                        a1_offset = None
                        print "A1 EXCEPTION"
                        continue

                    # Locate the read-2 prefix; any exception rejects the
                    # record.
                    try:
                        a2_offset = read2.upper().index(
                            r2_prefix_constant.upper())
                    except Exception, e:
                        a2_offset = None
                        read_constant_rejection_count += 1
                        print "A2 REJECTION"
                        continue

                    # Segments directly after each prefix constant.
                    read1_grna19 = read1[a1_offset + r1_grna_start:][:len(r1_grna19_format
                                                                          )]
                    read1_grna20 = read1[a1_offset + r1_grna_start:][:len(r1_grna20_format
                                                                          )]
                    read2_umi_content = read2[a2_offset + r2_umi_start:][:len(r2_umi_format
                                                                              )]

                    print a2_offset
                    print r2_umi_start
                    print len(r2_umi_format)
                    print len(read2_umi_content)
                    #raise Exception()

                    # Try the 20-base segment first, then the 19-base one.
                    design_row = exp_design.loc[exp_design[
                        "Designed gRNA (NGG orientation, 19 and 20)"] ==
                                                read1_grna20]
                    if len(design_row) == 0:
                        design_row = exp_design.loc[exp_design[
                            "Designed gRNA (NGG orientation, 19 and 20)"] ==
                                                    read1_grna19]
                    if len(design_row) == 0:
                        grna_failure_count += 1
                        continue

                    # Only the first matching design row is used.
                    design_row = design_row.iloc[0]

                    output_complete = """>1\n{0}\n{1}""".format(
                        read2_umi_content, design_row["Identifier number"])
                    output_short = (read2_umi_content,
                                    design_row["Identifier number"])
                    print output_short

                    umis_alignments_buffer[read2_umi_content].append(
                        output_complete)
                    short_outputs.append(output_short)
                    accepted_count += 1

                # NOTE(review): int(tot_reads / 10) is 0 when the file has
                # fewer than 10 lines, which makes this modulo raise.
                # The `< 4` window means this can fire on several
                # consecutive lines -- confirm that is intended.
                if i % int(tot_reads / 10) < 4 and i > 1:
                    print "FLUSHING!"
                    print accepted_count
                    # Flush alignment buffer
                    flush_tuples(umis_alignments_buffer, out_dir)
                    print len(umis_alignments_buffer.keys())

                    # Stats for the curious
                    with open(stdout_fn, 'a') as outf:
                        outf.write('Time: %s\n' % (datetime.datetime.now()))
                        outf.write('Progress: %s\n' %
                                   (i / int(tot_reads / 100)))
                        outf.write('Quality filtered pct: %s\n' %
                                   (qf / (i / 4)))
                        outf.write(
                            "accepted {0}, rejected {1} bad read1\n{2} rc rejection\n"
                            .format(accepted_count, read1_rejection_count,
                                    read_constant_rejection_count))

                timer.update()