# Standard-library and third-party imports used by the functions below;
# `util` and `_config` are project-local modules. Module-level globals such as
# inp_place, out_place, inp_dir, out_dir, out_root_dir, LIBRARY_DF, and the
# exp_design_* / exp_test_strs_* tables are assumed to be defined elsewhere
# in this file, as are helpers like gen_qsubs, remaster_aligns, and
# reverse_complement.
import os, re, json, subprocess, datetime
import itertools as it
from collections import defaultdict
import numpy as np
import pandas as pd

import _config, util


def main(nm='', start='', end=''):
  if nm == '' and start == '' and end == '':
    gen_qsubs()
    return

  start, end = int(start), int(end)
  out_dir = out_place + nm + '/'
  util.ensure_dir_exists(out_dir)

  print('Preparing alignment output directories...')
  prepare_align_outdirs(out_dir, start, end)
  print('Done')

  global expected_cutsite
  expected_cutsite = 30

  inp_dir = inp_place + nm + '/'
  # range(start, end) yields end - start experiments; the timer total is kept
  # consistent with that (the original total was off by one).
  timer = util.Timer(total=end - start)
  for iter_exp in range(start, end):
    data = defaultdict(list)
    for split in os.listdir(inp_dir):
      if split == 'aligns':
        continue
      inp_fn = inp_dir + '%s/%s.txt' % (split, iter_exp)
      remaster_aligns(inp_fn, data)
    save_alignments(data, out_dir, iter_exp)
    timer.update()
  return
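# Input/output layout inferred from the paths constructed in main() above:
#   inp_place/<nm>/<split>/<iter_exp>.txt  -- per-split alignment files read
#                                             by remaster_aligns()
#   out_place/<nm>/<iter_exp>/             -- per-experiment output dirs
#                                             created by prepare_align_outdirs()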
def prepare_align_outdirs(out_plc, start, end):
  util.ensure_dir_exists(out_plc)
  timer = util.Timer(total=end - start + 1)
  for exp in range(start, end + 1):
    out_idx_dir = out_plc + str(exp) + '/'
    util.ensure_dir_exists(out_idx_dir)
    # Clear any stale output from a previous run.
    if len(os.listdir(out_idx_dir)) > 0:
      subprocess.check_output('rm -rf %s*' % (out_idx_dir), shell=True)
    timer.update()
  return
def genotype_data(inp_dir, out_dir, nm, start, end):
  start, end = int(start), int(end)
  master_df = pd.DataFrame()

  global crispr_cutsite

  timer = util.Timer(total=end - start)
  for iter_exp in range(start, end):
    exp = iter_exp
    # Note: this was changed to the zero-based cutsite in reduced.csv.
    crispr_cutsite = 34
    exp_dir = '%s%s/' % (inp_dir, iter_exp)
    # Stop at the first missing experiment directory.
    if not os.path.isdir(exp_dir):
      return

    # Noise categories
    master_df = get_homopolymer(master_df, exp, exp_dir)
    master_df = get_hasN(master_df, exp, exp_dir)
    master_df = get_pcr_recombination(master_df, exp, exp_dir)
    master_df = get_poormatches(master_df, exp, exp_dir)
    master_df = get_cutsite_not_sequenced(master_df, exp, exp_dir)
    master_df = get_read_too_short(master_df, exp, exp_dir)

    # Primary categories
    master_df = get_deletions(master_df, exp, exp_dir)
    master_df = get_insertions(master_df, exp, exp_dir)
    master_df = get_combination_indels(master_df, exp, exp_dir)

    # Secondary categories
    master_df = get_forgiven_indels(master_df, exp, exp_dir)
    master_df = get_forgiven_combination_indels(master_df, exp, exp_dir)

    # Other categories
    master_df = get_combination_indels_notcrispr(master_df, exp, exp_dir)
    master_df = get_other(master_df, exp, exp_dir)

    # Wildtypes
    master_df = get_wildtype(master_df, exp, exp_dir)
    timer.update()

  # Annotate each experiment with its designed sequence context, supporting
  # both known column names in LIBRARY_DF.
  seq_contexts = []
  for s in master_df['_Experiment']:
    crit = (LIBRARY_DF['Name'] == s)
    if 'Designed sequence (61-bp, cutsite at position 34 by 0-index)' in LIBRARY_DF.columns:
      seq = LIBRARY_DF[crit]['Designed sequence (61-bp, cutsite at position 34 by 0-index)'].iloc[0]
    elif 'targetseq_61' in LIBRARY_DF.columns:
      seq = LIBRARY_DF[crit]['targetseq_61'].iloc[0]
    else:
      raise ValueError('LIBRARY_DF has no designed-sequence column')
    seq_contexts.append(seq)
  master_df['_Sequence Context'] = seq_contexts
  master_df['_Cutsite'] = crispr_cutsite

  master_df.to_csv(out_dir + '%s_genotypes_%s.csv' % (nm, start))
  return
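# Illustrative call (paths and run name are hypothetical):
#   genotype_data('/data/aligns/run1/', '/data/genotypes/', 'run1', 0, 192)
# would write /data/genotypes/run1_genotypes_0.csv.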
def demultiplex(split, filename):
  # Select the experiment design matching this sequencing run.
  if 'AH3W5GBGX9' in filename:
    exp_design = exp_design_2955
    exp_test_strs = exp_test_strs_2955
  else:
    exp_design = exp_design_3447
    exp_test_strs = exp_test_strs_3447

  for name in list(exp_design['Name']) + ['other']:
    util.ensure_dir_exists(os.path.join(out_dir, '%s' % (filename), name))
    util.exists_empty_fn(os.path.join(out_dir, '%s/%s/R1_%s.fa' % (filename, name, split)))
    util.exists_empty_fn(os.path.join(out_dir, '%s/%s/R2_%s.fa' % (filename, name, split)))
    print(os.path.join(out_dir, filename, name))

  # Group input FASTQs by split number and keep only this split's files.
  lane_key = lambda x: re.compile(r'(\d+)\.fastq').search(x).groups()[0]
  for snum, sgroup in it.groupby(sorted(os.listdir(inp_dir), key=lane_key), key=lane_key):
    if snum != split:
      continue
    files = list(sgroup)
    fns = list([sf for sf in files if filename in sf])
    print('LANE: {0}, FILES: {1}'.format(snum, fns))
    read_files = dict([[int(re.compile(r'R(\d+)').search(e).group(1)), e] for e in fns])
    inp_fn1 = os.path.join(inp_dir, read_files[1])
    inp_fn2 = os.path.join(inp_dir, read_files[2])
    lc = util.line_count(inp_fn1)

    num_bad_q, num_tot, num_other, num_mapped = 0, 0, 0, 0
    timer = util.Timer(total=lc)
    i = -1

    def match(r1, r2, h1, h2):
      # Assign a read pair to the first experiment whose test string appears
      # in the R1 header; unmatched pairs go to 'other'.
      for k, v in list(exp_test_strs.items()):
        try:
          idx = h1.index(v)
          return k, r1
        except ValueError:
          continue
      return 'other', r1

    with open(inp_fn1) as f1, open(inp_fn2) as f2:
      print(inp_fn1)
      print(inp_fn2)
      while True:
        i += 1
        if i % 10000 == 0:
          print('{0} records, ({1}%) [{2} bad] [{3} other]'.format(
              i // 4, 100 * float(i) / lc, num_bad_q, num_other))
        try:
          line1 = next(f1)
          line2 = next(f2)
        except StopIteration:
          break
        if i % 4 == 0:
          h1 = line1.strip()
          h2 = line2.strip()
        if i % 4 == 1:
          r1 = line1.strip()
          r2 = line2.strip()
        if i % 4 == 3:
          num_tot += 1
          qs1 = line1.strip()
          qs2 = line2.strip()

          # Reject the pair if either read's mean Phred+33 quality is < 30.
          markbad = False
          for qs in [qs1, qs2]:
            quals = [ord(s) - 33 for s in qs]
            if np.mean(quals) < 30:
              markbad = True
          if markbad:
            num_bad_q += 1
            continue

          demultiplex_id, trimmed_read = match(r1, r2, h1, h2)
          if demultiplex_id == 'other':
            num_other += 1

          out1_fn = out_dir + '%s/%s/R1_%s.fa' % (filename, demultiplex_id, split)
          # Sanity check: each FASTA record written here must be exactly two lines.
          if len(('>' + h1[1:] + '\n' + r1 + '\n').splitlines()) > 2:
            print('>' + h1[1:] + '\n' + r1 + '\n')
            raise Exception()
          with open(out1_fn, 'a') as f:
            f.write('>' + h1[1:] + '\n' + r1 + '\n')
          out2_fn = out_dir + '%s/%s/R2_%s.fa' % (filename, demultiplex_id, split)
          with open(out2_fn, 'a') as f:
            f.write('>' + h2[1:] + '\n' + r2 + '\n')
          num_mapped += 1
          # timer.update()

    print('Rejected %s fraction of reads' % (num_bad_q / num_tot))
    print('<json>' + json.dumps({
        'num_bad_q': num_bad_q,
        'num_tot': num_tot,
        'num_other': num_other,
        'num_mapped': num_mapped,
    }) + '</json>')
  return
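# Both demultiplex() above and matchmaker() below decode quality strings as
# Phred+33 before mean-filtering. A minimal standalone sketch of that decoding;
# `phred33_mean` is an illustrative helper, not part of this pipeline.
def phred33_mean(qual_line):
  # Sanger/Illumina 1.8+ FASTQ stores quality Q for each base as chr(Q + 33).
  return sum(ord(c) - 33 for c in qual_line) / len(qual_line)

# e.g. phred33_mean('IIII') == 40.0 and phred33_mean('!!!!') == 0.0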
def matchmaker(nm, split):
  print(nm, split)
  stdout_fn = os.path.join(_config.LOGS_DIR, 'nh_c_%s_%s.out' % (nm, split))
  util.exists_empty_fn(stdout_fn)
  out_dir = os.path.join(out_root_dir, nm, split)
  util.ensure_dir_exists(out_dir)

  inp_fn = inp_dir + '%s_R2_%s.fastq' % (nm, split)

  lsh_dict = build_targets_better_lsh()
  alignment_buffer = init_alignment_buffer()
  prepare_outfns(out_dir)

  qf = 0
  print(inp_fn)
  tot_reads = util.line_count(inp_fn)
  # Flush/report roughly once per 1% of lines; max(..., 1) guards against a
  # zero modulus on very small inputs.
  report_interval = max(int(tot_reads / 100), 1)
  timer = util.Timer(total=tot_reads)
  with open(inp_fn) as f:
    for i, line in enumerate(f):
      if i % 4 == 0:
        # FASTQ header line; nothing to do.
        pass
      if i % 4 == 1:
        l2 = line.strip()
      if i % 4 == 3:
        # Quality filter: skip reads with mean Phred+33 quality below 28.
        q2 = line.strip()
        qs = [ord(s) - 33 for s in q2]
        if np.mean(qs) < 28:
          qf += 1
          continue

        l2 = reverse_complement(l2)
        align_header = '>1'

        # Try to find the designed target from LSH
        cand_idxs = find_best_designed_target(l2, lsh_dict)
        if len(cand_idxs) == 0:
          continue

        # Run alignment against the candidate targets
        best_idx, align = alignment(l2, cand_idxs)
        align = align.decode('utf-8')

        # Store alignment into buffer
        store_alignment(alignment_buffer, best_idx, align_header, align)

      if i % report_interval == 1 and i > 1:
        # Flush alignment buffer
        alignment_buffer = flush_alignments(alignment_buffer, out_dir)

        # Stats for the curious
        with open(stdout_fn, 'a') as outf:
          outf.write('Time: %s\n' % (datetime.datetime.now()))
          outf.write('Progress: %s\n' % (i / report_interval))
          outf.write('Quality filtered pct: %s\n' % (qf / (i / 4)))
      # timer.update()

  # Final flush
  alignment_buffer = flush_alignments(alignment_buffer, out_dir)
  return
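# Hedged entry-point sketch: the original driver for this module is not shown
# in this section, so the argument handling below is an assumption based on
# main()'s (nm, start, end) signature.
if __name__ == '__main__':
  import sys
  if len(sys.argv) > 1:
    main(nm=sys.argv[1], start=sys.argv[2], end=sys.argv[3])
  else:
    main()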