def split(inp_fn, out_nm):
    """Print shell commands that split a FASTQ file into ~30 chunks.

    Each chunk size is rounded up to a multiple of 4 so that no FASTQ
    record (4 lines: header, sequence, '+', qualities) is cut across two
    chunks. Output paths are ``out_dir + out_nm + '_<k>.fastq'``.

    NOTE(review): the commands are only printed, not executed — presumably
    they are run by a caller / cluster wrapper; confirm before relying on
    side effects.

    Args:
        inp_fn: path to the input FASTQ file.
        out_nm: basename prefix for the chunk files.
    """
    num_splits = 30
    inp_fn_numlines = util.line_count(inp_fn)
    if inp_fn_numlines == 0:
        # An empty input would make split_size 0 and range(1, 0, 0) raise
        # "ValueError: range() arg 3 must not be zero".
        print('Input %s is empty; nothing to split' % (inp_fn))
        return
    # Ceiling division, then round up to the next multiple of 4
    # (equivalent to the floor / +1 / increment-until-%4 sequence).
    split_size = -(-inp_fn_numlines // num_splits)
    split_size += (-split_size) % 4
    print('Using split size %s' % (split_size))
    for split_num, start in enumerate(range(1, inp_fn_numlines, split_size)):
        out_fn = out_dir + out_nm + '_%s.fastq' % (split_num)
        # tail -n +K is 1-indexed: emit lines [start, start + split_size).
        command = 'tail -n +%s %s | head -n %s > %s' % (
            start, inp_fn, split_size, out_fn)
        print(command)
    return
def demultiplex(split, filename):
    """Demultiplex one paired-end FASTQ split into per-experiment FASTA files.

    Selects an experiment design by flowcell ID found in ``filename``
    (NOTE(review): "AH3W5GBGX9" presumably identifies run 2955 — confirm),
    creates one output directory per experiment name plus 'other', then
    walks the R1/R2 FASTQ pair for the given split in lockstep. Reads whose
    mean Phred+33 quality (on either mate) is below 30 are discarded; the
    rest are routed by matching experiment test strings against the R1
    header and appended as FASTA to ``out_dir/<filename>/<name>/R{1,2}_<split>.fa``.
    Prints a summary and a machine-readable "<json>...</json>" stats line.

    Args:
        split:    split number (string) — selects which '<n>.fastq' group
                  of files in inp_dir to process.
        filename: substring identifying the sequencing run / lane files.
    """
    if "AH3W5GBGX9" in filename:
        print()
        exp_design = exp_design_2955
        exp_test_strs = exp_test_strs_2955
    else:
        exp_design = exp_design_3447
        exp_test_strs = exp_test_strs_3447
    # Pre-create an (empty) output directory and R1/R2 FASTA file for every
    # experiment name, including the catch-all 'other' bucket, since reads
    # are later appended in 'a' mode.
    for name in list(exp_design["Name"]) + ['other']:
        util.ensure_dir_exists(os.path.join(out_dir, '%s' % (filename), name))
        util.exists_empty_fn(
            os.path.join(out_dir, '%s/%s/R1_%s.fa' % (filename, name, split)))
        util.exists_empty_fn(
            os.path.join(out_dir, '%s/%s/R2_%s.fa' % (filename, name, split)))
        print(os.path.join(out_dir, name, '%s' % (filename)))
    # Group input files by the split number embedded in '<n>.fastq'
    # (input is sorted by the same key, as groupby requires).
    for snum, sgroup in it.groupby(sorted(
            os.listdir(inp_dir),
            key=lambda x: re.compile("(\d+)\.fastq").search(x).groups()[0]),
            key=lambda x: re.compile("(\d+)\.fastq").
            search(x).groups()[0]):
        if snum != split:
            continue
        files = list(sgroup)
        fns = list([sf for sf in files if filename in sf])
        print(("LANE: {0}, FILES: {1}".format(snum, fns)))
        # Map read number (R1 / R2) -> file name for this split.
        read_files = dict([[int(re.compile("R(\d+)").search(e).group(1)), e]
                           for e in fns])
        inp_fn1 = os.path.join(inp_dir, read_files[1])
        inp_fn2 = os.path.join(inp_dir, read_files[2])
        lc = util.line_count(inp_fn1)
        num_bad_q, num_tot, num_other, num_mapped = 0, 0, 0, 0
        timer = util.Timer(total=lc)
        i = -1

        ##
        # Functions
        ##
        def match(r1, r2, h1, h2):
            # Route a read pair: return (experiment_name, r1) for the first
            # experiment whose test string occurs in the R1 header h1,
            # else ("other", r1). NOTE(review): r2/h2 are accepted but
            # unused, and the read is returned untrimmed.
            for k, v in list(exp_test_strs.items()):
                try:
                    idx = h1.index(v)
                    return k, r1
                except ValueError as e:
                    continue
            return "other", r1

        with open(inp_fn1) as f1:
            with open(inp_fn2) as f2:
                print(inp_fn1)
                print(inp_fn2)
                # Walk both FASTQ files in lockstep; i % 4 tracks the
                # position inside the current 4-line record.
                while 1:
                    i += 1
                    if i % 10000 == 0:
                        print((
                            "{0} records, ({1}%) [{2} bad] [{3} other]".format(
                                i / 4, 100 * float(i) / lc, num_bad_q,
                                num_other)))
                    try:
                        line1 = next(f1)
                        line2 = next(f2)
                    except StopIteration as e:
                        break
                    if i % 4 == 0:
                        # Record headers ('@...') for both mates.
                        h1 = line1.strip()
                        h2 = line2.strip()
                    if i % 4 == 1:
                        # Read sequences for both mates.
                        r1 = line1.strip()
                        r2 = line2.strip()
                    if i % 4 == 3:
                        # Quality line: filter, then demultiplex and write.
                        num_tot += 1
                        qs1 = line1.strip()
                        qs2 = line2.strip()
                        markbad = False
                        # Reject the pair if either mate's mean Phred+33
                        # quality is below 30.
                        for qs in [qs1, qs2]:
                            quals = [ord(s) - 33 for s in qs]
                            if np.mean(quals) < 30:
                                markbad = True
                        if markbad:
                            num_bad_q += 1
                            continue
                        demultiplex_id, trimmed_read = match(r1, r2, h1, h2)
                        if demultiplex_id == 'other':
                            num_other += 1
                        out1_fn = out_dir + '%s/%s/R1_%s.fa' % (
                            filename, demultiplex_id, split)
                        # Sanity check: a multi-line FASTA record here would
                        # mean the header or read contained a newline.
                        if len(('>' + h1[1:] + '\n' + r1 + '\n').splitlines()) > 2:
                            print('>' + h1[1:] + '\n' + r1 + '\n')
                            raise Exception()
                        #print('>' + h1[1:] + '\n' + r1 + '\n')
                        # Append as FASTA ('@' header becomes '>').
                        with open(out1_fn, 'a') as f:
                            f.write('>' + h1[1:] + '\n' + r1 + '\n')
                        out2_fn = out_dir + '%s/%s/R2_%s.fa' % (
                            filename, demultiplex_id, split)
                        with open(out2_fn, 'a') as f:
                            f.write('>' + h2[1:] + '\n' + r2 + '\n')
                        num_mapped += 1
                    #timer.update()
        #logs = pd.Series({"num_bad_q":num_bad_q,
        #                  "num_tot":num_tot})
        #logs.to_csv(os.path.join(LOGS_DIR,f"{datetime.date.today().isoformat}_(unknown)_{split}.csv"))
        # NOTE(review): raises ZeroDivisionError if the input had no records
        # (num_tot == 0).
        print(('Rejected %s fraction of reads' % (num_bad_q / num_tot)))
        print("<json>" + json.dumps({
            "num_bad_q": num_bad_q,
            "num_tot": num_tot,
            "num_other": num_other,
            "num_mapped": num_mapped,
        }) + "</json>")
    return
def matchmaker(nm, split):
    """Align quality-filtered R2 reads of one split against designed targets.

    Streams ``inp_dir + '<nm>_R2_<split>.fastq'``, drops reads whose mean
    Phred+33 quality is below 28, reverse-complements the rest, finds
    candidate designed targets via LSH, aligns against them, and buffers
    the alignments, flushing to ``out_root_dir/<nm>/<split>`` roughly every
    1% of the input (with progress notes appended to a log file).

    Args:
        nm:    sample / experiment name used in file paths.
        split: split identifier (string) selecting the input FASTQ chunk.
    """
    print(nm, split)
    stdout_fn = os.path.join(_config.LOGS_DIR, 'nh_c_%s_%s.out' % (nm, split))
    util.exists_empty_fn(stdout_fn)
    out_dir = os.path.join(out_root_dir, nm, split)
    util.ensure_dir_exists(out_dir)
    inp_fn = inp_dir + '%s_R2_%s.fastq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()
    prepare_outfns(out_dir)

    qf = 0  # count of quality-filtered (rejected) reads
    print(inp_fn)
    tot_reads = util.line_count(inp_fn)
    # Flush every ~1% of lines. Guard with max(1, ...): for inputs under
    # 100 lines, int(tot_reads / 100) is 0 and the modulo below would
    # raise ZeroDivisionError.
    flush_interval = max(1, int(tot_reads / 100))
    timer = util.Timer(total=tot_reads)
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 4 == 1:
                # Sequence line of the current FASTQ record.
                l2 = line.strip()
            if i % 4 == 3:
                # Quality line: filter on mean Phred+33 quality >= 28.
                q2 = line.strip()
                qs = [ord(s) - 33 for s in q2]
                if np.mean(qs) < 28:
                    qf += 1
                    continue

                #l2 = compbio.reverse_complement(l2)
                #l2 = l2[82]  # -- note, changed from :61 to 61:. Can comment out entirely?
                l2 = reverse_complement(l2)
                #l2 = l2[-62:]
                align_header = '>1'

                # Try to find designed target from LSH
                cand_idxs = find_best_designed_target(l2, lsh_dict)
                if len(cand_idxs) == 0:
                    continue

                # Run alignment
                best_idx, align = alignment(l2, cand_idxs)
                align = align.decode("utf-8")

                # Store alignment into buffer
                store_alignment(alignment_buffer, best_idx, align_header, align)

            if i % flush_interval == 1 and i > 1:
                # Flush alignment buffer
                alignment_buffer = flush_alignments(alignment_buffer, out_dir)

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / flush_interval))
                    outf.write('Quality filtered pct: %s\n' % (qf / (i / 4)))
            #timer.update()

    # Final flush
    alignment_buffer = flush_alignments(alignment_buffer, out_dir)
    return