def check_sequence_name(path_R1, path_R2):
    """Return True iff the two FASTQ files look like a proper R1/R2 pair.

    For every record pair, the titles must be the same length and differ in
    exactly one character, and that character (read as a digit) must be one
    greater in R2 than in R1 (e.g. ".../1" vs ".../2").
    """
    with open(path_R1) as inf_R1, open(path_R2) as inf_R2:
        records = zip(read_fastq(inf_R1), read_fastq(inf_R2))
        for rec_R1, rec_R2 in records:
            name_R1, name_R2 = rec_R1[0], rec_R2[0]
            if len(name_R1) != len(name_R2):
                return False
            # positions where the two titles disagree
            mismatches = [i for i, (a, b) in enumerate(zip(name_R1, name_R2))
                          if a != b]
            if len(mismatches) != 1:
                return False
            idx = mismatches[0]
            if int(name_R2[idx]) - int(name_R1[idx]) != 1:
                return False
    return True
def subsample_fastqs(path_fastqs, num_files=10, num_sequences=1000):
    """Yield a limited record generator for up to ``num_files`` FASTQ paths.

    Each yielded value is ``limit_fastq(...)`` over at most ``num_sequences``
    records of one file. The consumer must exhaust each yielded generator
    before advancing, since the underlying file is closed afterwards.

    Fix: the file-count cutoff is now tested BEFORE opening the file, so the
    original version's needless open-then-break of file ``num_files`` is gone.
    """
    for i, path_fastq in enumerate(path_fastqs):
        if i >= num_files:
            break
        with open(path_fastq) as fastq_inf:
            fastq_gen = read_fastq(fastq_inf)
            yield limit_fastq(fastq_gen, num_sequences=num_sequences)
def trimmer_learning(flash_output_filenames):
    """Derive quality-filter and quality-trim cutoffs from FASTQ files.

    Returns ``(filter_q, trim_q)``: ``filter_q`` is the floor of the mean
    Phred quality over every base seen; ``trim_q`` is derived from the mean
    quality of the 10 leading + 10 trailing bases of reads >= 20 bp, minus 1,
    clamped to be no lower than ``filter_q - 3``.

    Raises ValueError when the inputs contain no bases at all (the original
    crashed with an opaque ZeroDivisionError). When no read is >= 20 bp,
    ``trim_q`` falls back to the clamp floor instead of dividing by zero.
    """
    filter_q_sum = 0   # sum of Phred scores over every base seen
    trim_q_sum = 0     # sum of Phred scores over the 20 edge bases of long reads
    totbases = 0
    tottrim = 0
    num = 0
    for fq_path in flash_output_filenames:
        with open(fq_path) as fq_inf:
            for record in read_fastq(fq_inf):
                num += 1
                # record[2] is the quality string; decode Phred+33
                qualities = [ord(q) - 33 for q in record[2]]
                totbases += len(qualities)
                filter_q_sum += sum(qualities)
                if len(qualities) >= 20:
                    # only reads long enough to have distinct head/tail windows
                    trim_q_sum += sum(qualities[:10]) + sum(qualities[-10:])
                    tottrim += 20
    if totbases == 0:
        raise ValueError(
            "trimmer_learning: no bases found in input FASTQs")
    logging.info('num seqs: %d' % num)
    logging.info('filter_q_sum: %d' % filter_q_sum)
    logging.info('trim_q_sum: %d' % trim_q_sum)
    logging.info('total bases considered: %d (trim: %d)' % (totbases, tottrim))
    logging.info('filter_q: %d' % (filter_q_sum / totbases))
    filter_q = math.floor(filter_q_sum / totbases)
    if tottrim == 0:
        # No read reached 20 bp; use the clamp floor rather than divide by 0.
        trim_q = filter_q - 3
    else:
        logging.info('trim_q: %d' % (trim_q_sum / tottrim))
        trim_q = math.floor(trim_q_sum / tottrim) - 1
        trim_q = trim_q if trim_q > filter_q - 3 else filter_q - 3
    return filter_q, trim_q
def test(self):
    """Cross-check read_fastq: the total number of records it yields from the
    test FASTQ files must equal the total raw line count divided by four."""
    paths = [os.path.join('testfq', name)
             for name in os.listdir('testfq') if name.endswith('fastq')]
    records = 0
    raw_lines = 0
    for path in paths:
        # First pass: count parsed records.
        with open(path) as handle:
            for _record in read_fastq(handle):
                records += 1
        # Second pass: count raw lines of the same file.
        with open(path) as handle:
            raw_lines += sum(1 for _ in handle)
    assert records == raw_lines / 4
def main():
    """Learn shi7 preprocessing parameters from the FASTQs in ``args.input``
    and write ``shi7_cmd.sh`` plus ``learning_params.txt`` into ``args.output``.

    Fixes over the original: the builtin ``input`` is no longer shadowed; the
    output-directory variable is no longer rebound to two file handles; empty
    input FASTQs now raise a clear IOError instead of ZeroDivisionError.
    """
    start_time = datetime.now()
    parser = make_arg_parser()
    args = parser.parse_args()
    learning_params = ["shi7.py"]
    learning_pretty = ["SHI7 version", VERSION]
    input_dir = os.path.abspath(args.input)    # renamed: don't shadow builtin input()
    output_dir = os.path.abspath(args.output)
    # Make output folder
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Put in the logging file
    logging.basicConfig(filename=os.path.join(output_dir, 'shi7_learning.log'),
                        filemode='w', level=logging.DEBUG,
                        format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    # Make temp outfolder (recreated fresh each run)
    temp_dir = os.path.join(args.output, 'temp')
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
        logging.info('Existing temp directory deleted.')
    os.makedirs(temp_dir)
    path_fastqs = [os.path.join(input_dir, f) for f in os.listdir(input_dir)
                   if f.endswith('fastq') or f.endswith('fq')]
    if len(path_fastqs) == 0:
        msg = "No FASTQS found in input folder {}".format(input_dir)
        logging.critical(msg)
        raise IOError(msg)
    # Record the input
    results, addon = template_input(input_dir)
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    # Write temp subsampled fastqs
    subsampled_fastq_path = os.path.join(output_dir, 'temp', 'subsampled')
    os.makedirs(subsampled_fastq_path)
    totbases = totseqs = 0
    for fq_file in path_fastqs:
        basename = os.path.basename(fq_file)
        with open(fq_file) as fastq_inf:
            fastq_gen = read_fastq(fastq_inf)
            with open(os.path.join(subsampled_fastq_path, basename), 'w') as outf:
                for header, seq, quality in limit_fastq(fastq_gen):
                    outf.write("@{header}\n{seq}\n+\n{quality}\n".format(
                        header=header, seq=seq, quality=quality))
                    totbases += len(seq)
                    totseqs += 1
    if totseqs == 0:
        # Guard: previously this crashed with ZeroDivisionError below.
        msg = "Input FASTQs in {} contain no sequences".format(input_dir)
        logging.critical(msg)
        raise IOError(msg)
    avlen = totbases / totseqs
    path_fastqs = glob(os.path.join(subsampled_fastq_path, "*"))
    # Detect if paired end
    paired_end, pair_obj = detect_paired_end(path_fastqs)
    path_fastqs = pair_obj[0]
    link_outdir = os.path.join(output_dir, 'temp', 'link')
    os.makedirs(link_outdir)
    snames = [os.path.basename(n) for n in path_fastqs]
    path_fastqs = link_manicured_names(path_fastqs, snames, link_outdir,
                                       not paired_end, pair_obj[1:])
    results, addon = template_paired_end(paired_end)
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    learning_pretty += ["Paired end", paired_end]
    # Detect adapters
    axe_adaptors_path = os.path.join(output_dir, 'temp', 'axe_adaptors')
    os.makedirs(axe_adaptors_path)
    best_adap, best_size, fastq_paths = choose_axe_adaptors(
        path_fastqs, paired_end, axe_adaptors_path, int(args.threads))
    results, addon = template_choose_axe_adaptors(best_adap, best_size)
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    learning_pretty += ["Detected adaptors", best_adap]
    # Detect output folder
    results, addon = template_output(output_dir)
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    # Detect stitching
    stitched_path = os.path.join(output_dir, 'temp', 'flash')
    os.makedirs(stitched_path)
    if paired_end:
        stitches, do_outies, fastq_paths = flash_stitchable_and_check_outies(
            fastq_paths, stitched_path, int(args.threads))
    else:
        stitches, do_outies = False, False
    results, addon = template_flash(stitches, do_outies)
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    if paired_end:
        learning_pretty += ["Stitching", stitches]
        if stitches:
            learning_pretty += ["Outies allowed", do_outies]
    filt_q, trim_q = trimmer_learning(fastq_paths)
    results, addon = template_trim(int(filt_q), int(trim_q))
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    learning_pretty += ["Filter quality", filt_q, "Trimming quality", trim_q]
    # Check whether to implement stitching bounds
    if stitches:
        cv, mean = flash_check_cv(stitched_path)
        if cv < 0.1:
            # Low coefficient of variation => amplicon-style fixed-length reads
            learning_pretty += ["Amplicon mode", True]
            logging.info("CV: %f, Mean: %f, Avlen: %f" % (cv, mean, avlen))
            if avlen > mean:
                avlen = mean
            mr = math.ceil(cv * mean)
            logging.info("SD was: %d" % mr)
            minstitch, maxstitch = int(2 * avlen - mean - mr), int(2 * avlen - mean + mr)
            if minstitch < 8:
                minstitch = 8
            logging.info("Amplicon mode: stitch range [%d, %d]" % (minstitch, maxstitch))
            results, addon = template_cv(minstitch, maxstitch)
            logging.info(results)
            if addon:
                learning_params.extend(addon)
            learning_pretty += ["Amplicon stitch minimum", minstitch]
            learning_pretty += ["Amplicon stitch maximum", maxstitch]
        else:
            learning_pretty += ["Amplicon mode", False]
    # Write the learned shi7 command line (renamed handles: the original
    # rebound the `output` directory variable to these file objects)
    with open(os.path.join(args.output, "shi7_cmd.sh"), "w") as cmd_outf:
        cmd = " ".join(learning_params)
        cmd_outf.write(cmd)
        print(cmd)
    with open(os.path.join(args.output, "learning_params.txt"), "w") as params_outf:
        for ix in range(0, len(learning_pretty), 2):
            params_outf.write(str(learning_pretty[ix]) + "\t" +
                              str(learning_pretty[ix + 1]) + "\n")
    if not args.debug:
        shutil.rmtree(os.path.join(args.output, 'temp'))
    logging.info('Execution time: %s' % (datetime.now() - start_time))