import fileinput
import re

from os import remove, stat
from os.path import basename, commonprefix, splitext
from os.path import exists as ope
from os.path import join as opj
from shutil import copyfile
from sys import exit

# The helpers used below (Log, plain_or_gzip, splitext_gz, grouper, make_dirs,
# trimmomatic_se, trimmomatic_pe, run_rcorrector_se, run_rcorrector_pe) are
# assumed to be imported from elsewhere in this project; their import lines
# are not reproduced here.


def user_fastq_files(fq_se, fq_pe):
    if len(fq_se) > 0 or len(fq_pe) > 0:
        print()
        Log.inf('Preparing user provided FASTQ files.')

    se_fastq_files = {}
    pe_fastq_files = {}

    fq_type_1_regex = r'(.*)_L\d\d\d(_R.)_\d\d\d(.*)'

    for se in fq_se:
        tax_id = se[0]
        path = se[1]
        base = basename(path)
        if plain_or_gzip(base)[4] != '':
            base = splitext(base)[0]
        base = splitext(base)[0]
        fq_type_1_match = re.findall(fq_type_1_regex, base)
        if len(fq_type_1_match) > 0 and len(fq_type_1_match[0]) == 3:
            base = fq_type_1_match[0][0]
        sample_base_name = base
        se_fastq_files[sample_base_name] = {'path': path}
        se_fastq_files[sample_base_name]['src'] = 'usr'
        se_fastq_files[sample_base_name]['avg_len'] = None
        se_fastq_files[sample_base_name]['tax_id'] = tax_id
        Log.msg(sample_base_name + ':', basename(path))

    for pe in fq_pe:
        tax_id = pe[0]
        path = pe[1]
        base = basename(path[0])
        if plain_or_gzip(base)[4] != '':
            base = splitext(base)[0]
        base = splitext(base)[0]
        fq_type_1_match = re.findall(fq_type_1_regex, base)
        if len(fq_type_1_match) > 0 and len(fq_type_1_match[0]) == 3:
            base = fq_type_1_match[0][0]
        else:
            base = basename(commonprefix(path)).rstrip('_- R')
        sample_base_name = base
        pe_fastq_files[sample_base_name] = {'path': path}
        pe_fastq_files[sample_base_name]['src'] = 'usr'
        pe_fastq_files[sample_base_name]['avg_len'] = None
        pe_fastq_files[sample_base_name]['tax_id'] = tax_id
        Log.msg(sample_base_name + ':',
                basename(path[0]) + '\n' +
                ' ' * (len(sample_base_name) + 2) + basename(path[1]))

    return se_fastq_files, pe_fastq_files
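# A minimal illustration (not used by the pipeline) of what fq_type_1_regex in
# user_fastq_files() is meant to extract from an Illumina-style file name. The
# file name below is hypothetical.
def _example_fq_type_1_match():
    base = 'SampleA_S1_L001_R1_001'
    match = re.findall(r'(.*)_L\d\d\d(_R.)_\d\d\d(.*)', base)
    # match == [('SampleA_S1', '_R1', '')]; the sample base name is the first
    # captured group, 'SampleA_S1'.
    return match[0][0]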
def filter_unc_se(in_file, out_file, log_file=None):
    r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(in_file)

    counter = 0
    cor_count = 0
    unc_count = 0

    with fqopen(in_file, r_mode) as in_f, fqopen(out_file, w_mode) as out_f:
        entries = grouper(in_f, 4)
        for entry in entries:
            counter += 1
            # if counter % 100000 == 0:
            #     print('{} reads processed.'.format(counter))
            head, seq, plhld, qual = [i.strip() for i in entry]

            if 'unfixable' in head:
                unc_count += 1
            else:
                if 'cor' in head:
                    cor_count += 1
                # Keep the label information before the Rcorrector flags
                # (low kmer stat, 'cor' and 'unfixable error')
                head = head.split('l:')[0][:-1]
                entry_corrected = '\n'.join([head, seq, plhld, qual])
                out_f.write('{}\n'.format(entry_corrected))

    if log_file is not None:
        log_str = ('Total SE reads: {}\n'
                   'Uncorrectable SE reads removed: {} - {:.2f}%\n'
                   'Corrected SE reads retained: {} - {:.2f}%\n'
                   'Total SE reads retained: {} - {:.2f}%\n')

        with open(log_file, 'w') as f:
            f.write(log_str.format(
                counter,
                unc_count, (unc_count / counter) * 100,
                cor_count, (cor_count / counter) * 100,
                counter - unc_count,
                ((counter - unc_count) / counter) * 100))
def filter_unc_pe(in_file_1, in_file_2, out_file_1, out_file_2, log_file=None):
    r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(in_file_1)

    counter = 0
    cor_1_count = 0
    cor_2_count = 0
    cor_12_count = 0
    unc_count = 0

    with fqopen(in_file_1, r_mode) as in_f_1, \
            fqopen(in_file_2, r_mode) as in_f_2, \
            fqopen(out_file_1, w_mode) as out_f_1, \
            fqopen(out_file_2, w_mode) as out_f_2:

        entries_1 = grouper(in_f_1, 4)
        entries_2 = grouper(in_f_2, 4)

        for entry_1 in entries_1:
            entry_2 = next(entries_2)
            counter += 1
            # if counter % 100000 == 0:
            #     print('{} reads processed.'.format(counter))
            head_1, seq_1, plhld_1, qual_1 = [i.strip() for i in entry_1]
            head_2, seq_2, plhld_2, qual_2 = [j.strip() for j in entry_2]

            if 'unfixable' in head_1 or 'unfixable' in head_2:
                unc_count += 1
            else:
                if 'cor' in head_1 and 'cor' in head_2:
                    cor_12_count += 1
                else:
                    if 'cor' in head_1:
                        cor_1_count += 1
                    elif 'cor' in head_2:
                        cor_2_count += 1

                # Keep the label information before the Rcorrector flags
                # (low kmer stat, 'cor' and 'unfixable error')
                head_1 = head_1.split('l:')[0][:-1]
                entry_1_corrected = '\n'.join([head_1, seq_1, plhld_1, qual_1])
                out_f_1.write('{}\n'.format(entry_1_corrected))

                head_2 = head_2.split('l:')[0][:-1]
                entry_2_corrected = '\n'.join([head_2, seq_2, plhld_2, qual_2])
                out_f_2.write('{}\n'.format(entry_2_corrected))

    if log_file is not None:
        log_str = ('Total PE reads: {}\n'
                   'Uncorrectable PE reads removed: {} - {:.2f}%\n'
                   'Corrected R1 reads retained: {} - {:.2f}%\n'
                   'Corrected R2 reads retained: {} - {:.2f}%\n'
                   'Corrected PE reads retained: {} - {:.2f}%\n'
                   'Total PE reads retained: {} - {:.2f}%\n')

        with open(log_file, 'w') as f:
            f.write(log_str.format(
                counter,
                unc_count, (unc_count / counter) * 100,
                cor_1_count, (cor_1_count / counter) * 100,
                cor_2_count, (cor_2_count / counter) * 100,
                cor_12_count, (cor_12_count / counter) * 100,
                counter - unc_count,
                ((counter - unc_count) / counter) * 100))
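# The two filters above read FASTQ records four lines at a time via grouper()
# and strip the flags Rcorrector appends to read headers. Below is a minimal
# sketch of grouper(), assuming it follows the standard itertools recipe (the
# real helper is imported from elsewhere in this project and may differ), plus
# a comment illustrating the header trimming on a made-up header.
def _grouper_sketch(iterable, n, fillvalue=None):
    # Collect items into fixed-length chunks: 'ABCDEFG', 3 -> ABC DEF G--.
    from itertools import zip_longest
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


# Header trimming example (hypothetical header):
#     head = '@SRR123.1 1 l:12 m:34 h:56 cor'
#     head.split('l:')[0][:-1]  # -> '@SRR123.1 1'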
def run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, fpatt, threads):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running Trimmomatic.')
        if trimmomatic is None:
            Log.err('trimmomatic is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, se)
        fq_path = se_fastq_files[se]['cor_path_fq']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, se + '.txt')
        out_f = opj(dir_fq_trim_data_sample, se + '.fastq' + ext)
        se_fastq_files[se]['trim_path_fq'] = out_f

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('SE mode:', se)
            trimmomatic_se(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file=fq_path,
                           out_file=out_f,
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

    for pe in pe_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, pe)
        fq_path_1 = pe_fastq_files[pe]['cor_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['cor_path_fq'][1]
        fq_path_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        if len(pe_fastq_files[pe]['cor_path_fq']) == 3:
            fq_path_3 = pe_fastq_files[pe]['cor_path_fq'][2]
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, pe + '.txt')
        out_fs = [x.replace('@D@', dir_fq_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]
        pe_fastq_files[pe]['trim_path_fq'] = out_fs

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('PE mode:', pe)
            trimmomatic_pe(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file_1=fq_path_1,
                           in_file_2=fq_path_2,
                           out_file_paired_1=out_fs[0],
                           out_file_paired_2=out_fs[1],
                           out_file_unpaired_1=out_fs[2],
                           out_file_unpaired_2=out_fs[3],
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

            if fq_path_3 is not None:
                out_f = opj(dir_fq_trim_data_sample, 'unpaired.fastq' + ext)
                stats_f = opj(dir_fq_trim_data_sample, pe + '_unpaired.txt')
                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)
                trimmomatic_se(trimmomatic=trimmomatic,
                               adapters=adapters,
                               in_file=fq_path_3,
                               out_file=out_f,
                               stats_file=stats_f,
                               threads=threads,
                               minlen=min_acc_len)

                # Append the trimmed orphan reads onto the unpaired R1 file.
                _ = opj(dir_fq_trim_data_sample, 'temp.fastq' + ext)
                f_temp = fqopen(_, w_mode)
                with fileinput.FileInput(
                        files=[out_fs[2], out_f],
                        openhook=fileinput.hook_compressed) as f:
                    for line in f:
                        f_temp.write(line)
                f_temp.close()
                remove(out_fs[2])
                remove(out_f)
                copyfile(_, out_fs[2])
                remove(_)
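# Note on the fpatt argument used by run_trimmomatic() (and by the second
# run_rcorrector() below): it is expected to be a list of four path templates
# in the order [paired R1, paired R2, unpaired R1, unpaired R2], with '@D@'
# standing for the per-sample output directory and '@N@' for the sample name.
# The templates below are hypothetical, shown only to illustrate the
# substitution performed above:
#
#     fpatt = ['@D@/paired_1_@N@.fastq', '@D@/paired_2_@N@.fastq',
#              '@D@/unpaired_1_@N@.fastq', '@D@/unpaired_2_@N@.fastq']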
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, should_run):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')
            if rcorrector is None:
                Log.err('Rcorrector is not available. Cannot continue. '
                        'Exiting.')
                exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['path']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)
        se_fastq_files[se]['cor_path_fq'] = out_f

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('SE mode:', se)
            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=fq_path,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
            fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext
            filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)
            remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)
        fq_path_1 = pe_fastq_files[pe]['path'][0]
        fq_path_2 = pe_fastq_files[pe]['path'][1]
        fq_path_3 = None
        out_f_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        log_f = opj(dir_fq_cor_data_sample, pe + '.txt')
        out_f_1 = opj(dir_fq_cor_data_sample, pe + '_R1.fastq' + ext)
        out_f_2 = opj(dir_fq_cor_data_sample, pe + '_R2.fastq' + ext)
        pe_fastq_files[pe]['cor_path_fq'] = [out_f_1, out_f_2]

        if len(pe_fastq_files[pe]['path']) == 3:
            fq_path_3 = pe_fastq_files[pe]['path'][2]
            out_f_3 = opj(dir_fq_cor_data_sample, pe + '_R3.fastq' + ext)
            pe_fastq_files[pe]['cor_path_fq'].append(out_f_3)

        if should_run is False:
            pe_fastq_files[pe]['cor_path_fq'] = [fq_path_1, fq_path_2]
            if fq_path_3 is not None:
                pe_fastq_files[pe]['cor_path_fq'].append(fq_path_3)
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('PE mode:', pe)
            run_rcorrector_pe(rcorrector=rcorrector,
                              in_file_1=fq_path_1,
                              in_file_2=fq_path_2,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
            fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
            fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
            fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

            filter_unc_pe(in_file_1=fq_cor_path_1,
                          in_file_2=fq_cor_path_2,
                          out_file_1=out_f_1,
                          out_file_2=out_f_2,
                          log_file=log_f)
            remove(fq_cor_path_1)
            remove(fq_cor_path_2)

            if fq_path_3 is not None:
                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)
                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_3,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_3 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_3))
                fq_cor_path_3 = splitext_gz(fq_base_path_3)[0] + '.cor.fq' + ext
                log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired.txt')
                filter_unc_se(in_file=fq_cor_path_3,
                              out_file=out_f_3,
                              log_file=log_f_3)
                remove(fq_cor_path_3)
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, fpatt, should_run):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')
            if rcorrector is None:
                Log.err(
                    'Rcorrector is not available. Cannot continue. Exiting.')
                exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['trim_path_fq']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)
        se_fastq_files[se]['cor_path_fq'] = out_f

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('SE mode:', se)
            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=fq_path,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
            fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext
            filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)
            remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)
        fq_path_1 = pe_fastq_files[pe]['trim_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['trim_path_fq'][1]
        fq_path_3 = pe_fastq_files[pe]['trim_path_fq'][2]
        fq_path_4 = pe_fastq_files[pe]['trim_path_fq'][3]
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        log_f = opj(dir_fq_cor_data_sample, pe + '_paired.txt')
        out_fs = [x.replace('@D@', dir_fq_cor_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]
        pe_fastq_files[pe]['cor_path_fq'] = out_fs

        if should_run is False:
            pe_fastq_files[pe]['cor_path_fq'] = [fq_path_1, fq_path_2,
                                                 fq_path_3, fq_path_4]
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('PE mode:', pe)
            run_rcorrector_pe(rcorrector=rcorrector,
                              in_file_1=fq_path_1,
                              in_file_2=fq_path_2,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
            fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
            fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
            fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

            filter_unc_pe(in_file_1=fq_cor_path_1,
                          in_file_2=fq_cor_path_2,
                          out_file_1=out_fs[0],
                          out_file_2=out_fs[1],
                          log_file=log_f)
            remove(fq_cor_path_1)
            remove(fq_cor_path_2)

            # unpaired 1
            if stat(fq_path_3).st_size != 0:
                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_3,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_3 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_3))
                fq_cor_path_3 = splitext_gz(fq_base_path_3)[0] + '.cor.fq' + ext
                log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired_1.txt')
                filter_unc_se(in_file=fq_cor_path_3,
                              out_file=out_fs[2],
                              log_file=log_f_3)
                remove(fq_cor_path_3)
            else:
                with open(out_fs[2], 'w') as f:
                    f.write('')

            # unpaired 2
            if stat(fq_path_4).st_size != 0:
                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_4,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_4 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_4))
                fq_cor_path_4 = splitext_gz(fq_base_path_4)[0] + '.cor.fq' + ext
                log_f_4 = opj(dir_fq_cor_data_sample, pe + '_unpaired_2.txt')
                filter_unc_se(in_file=fq_cor_path_4,
                              out_file=out_fs[3],
                              log_file=log_f_4)
                remove(fq_cor_path_4)
            else:
                with open(out_fs[3], 'w') as f:
                    f.write('')
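# The two path helpers this module leans on are sketched below purely to
# document the return shapes the code above assumes; the real implementations
# live elsewhere in this project and may differ in detail.
def _plain_or_gzip_sketch(file_name):
    # Assumed contract: returns (read_mode, write_mode, append_mode, opener,
    # extension), where extension is '.gz' for gzip-compressed files and ''
    # otherwise, matching the unpacking used throughout this module:
    #     r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(path)
    import gzip
    if file_name.endswith('.gz'):
        return 'rt', 'wt', 'at', gzip.open, '.gz'
    return 'r', 'w', 'a', open, ''


def _splitext_gz_sketch(path):
    # Assumed contract: like os.path.splitext, but peels a trailing '.gz'
    # first; element [0] is the path without the FASTQ and gz extensions,
    # which is what the '.cor.fq' output names above are built from.
    base, ext_gz = splitext(path)
    if ext_gz != '.gz':
        base, ext_gz = path, ''
    base, ext = splitext(base)
    return base, ext, ext_gz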