Example #1
0
def user_fastq_files(fq_se, fq_pe):
    """Index user-supplied FASTQ files by a sample name derived from the path.

    Args:
        fq_se: Iterable of single-end records. Element 0 of each record is
            stored as ``'tax_id'`` and element 1 is the FASTQ path.
        fq_pe: Iterable of paired-end records. Element 0 is stored as
            ``'tax_id'`` and element 1 is a sequence whose first two items
            are the paths of the two mate files.

    Returns:
        A ``(single_end, paired_end)`` tuple of dicts keyed by sample name.
        Every value holds ``'path'``, ``'src'`` (always ``'usr'``),
        ``'avg_len'`` (always ``None``) and ``'tax_id'``.
    """
    # De Morgan form of "at least one of the two inputs is non-empty".
    if not (len(fq_se) == 0 and len(fq_pe) == 0):
        print()
        Log.inf('Preparing user provided FASTQ files.')

    # Names shaped like "<sample>_L###_R?_###<rest>".
    lane_pattern = r'(.*)_L\d\d\d(_R.)_\d\d\d(.*)'

    def stem_of(file_path):
        # File name without its directory, without the extra suffix that
        # plain_or_gzip reports (if any) and without the final extension.
        stem = basename(file_path)
        if plain_or_gzip(stem)[4] != '':
            stem = splitext(stem)[0]
        return splitext(stem)[0]

    def lane_prefix(stem):
        # Text ahead of "_L###_R?_###", or None when the stem has no such
        # part. An empty prefix is a valid (non-None) result.
        hits = re.findall(lane_pattern, stem)
        if hits and len(hits[0]) == 3:
            return hits[0][0]
        return None

    single_end = {}
    for record in fq_se:
        tax_id, fq_path = record[0], record[1]
        stem = stem_of(fq_path)
        prefix = lane_prefix(stem)
        sample_name = stem if prefix is None else prefix
        single_end[sample_name] = {
            'path': fq_path,
            'src': 'usr',
            'avg_len': None,
            'tax_id': tax_id,
        }
        Log.msg(sample_name + ':', basename(fq_path))

    paired_end = {}
    for record in fq_pe:
        tax_id, fq_paths = record[0], record[1]
        prefix = lane_prefix(stem_of(fq_paths[0]))
        if prefix is None:
            # Fall back to what both mate names share, minus the trailing
            # separator / read-marker characters.
            sample_name = basename(commonprefix(fq_paths)).rstrip('_- R')
        else:
            sample_name = prefix
        paired_end[sample_name] = {
            'path': fq_paths,
            'src': 'usr',
            'avg_len': None,
            'tax_id': tax_id,
        }
        # The second mate is aligned under the first one in the log output.
        indent = ' ' * (len(sample_name) + 2)
        Log.msg(
            sample_name + ':',
            basename(fq_paths[0]) + '\n' + indent + basename(fq_paths[1]))

    return single_end, paired_end
Example #2
0
def filter_unc_se(in_file, out_file, log_file=None):
    """Copy a single-end FASTQ file, dropping reads flagged as unfixable.

    The input is read in groups of four lines. A record whose header
    contains ``'unfixable'`` is skipped; every other record is written to
    ``out_file`` with the header cut just before the first ``'l:'``.

    Args:
        in_file: Path of the FASTQ file to filter. It is also passed to
            ``plain_or_gzip`` to obtain the open function and file modes
            used for both the input and the output.
        out_file: Path of the filtered FASTQ file to write.
        log_file: Optional path. When given, a plain-text summary of the
            read counts and percentages is written there.
    """
    r_mode, w_mode, _, fqopen, _ = plain_or_gzip(in_file)

    counter = 0
    cor_count = 0
    unc_count = 0

    with fqopen(in_file, r_mode) as in_f, fqopen(out_file, w_mode) as out_f:
        for entry in grouper(in_f, 4):
            counter += 1
            head, seq, plhld, qual = [i.strip() for i in entry]

            if 'unfixable' in head:
                unc_count += 1
                continue

            if 'cor' in head:
                cor_count += 1

            # Keep the label information before the Rcorrector flags
            # (low kmer stat, 'cor' and 'unfixable error'). Trailing
            # whitespace is stripped instead of blindly dropping one
            # character, so a header without an 'l:' block is left intact.
            head = head.split('l:')[0].rstrip()
            out_f.write('\n'.join([head, seq, plhld, qual]) + '\n')

    if log_file is None:
        return

    def pct(part):
        # An empty input has no reads to take a share of; report 0.00%
        # rather than raising ZeroDivisionError.
        return (part / counter) * 100 if counter else 0.0

    retained = counter - unc_count

    log_str = ('Total SE reads: {}\n'
               'Uncorrectable SE reads removed: {} - {:.2f}%\n'
               'Corrected SE reads retained: {} - {:.2f}%\n'
               'Total SE reads retained: {} - {:.2f}%\n')

    with open(log_file, 'w') as f:
        f.write(
            log_str.format(counter, unc_count, pct(unc_count),
                           cor_count, pct(cor_count),
                           retained, pct(retained)))
Example #3
0
def filter_unc_pe(in_file_1, in_file_2, out_file_1, out_file_2, log_file=None):
    """Copy a pair of FASTQ files, dropping pairs with an unfixable mate.

    Both inputs are read in lockstep in groups of four lines. A pair is
    skipped when either header contains ``'unfixable'``; otherwise both
    records are written to their outputs with each header cut just before
    the first ``'l:'``.

    Args:
        in_file_1: Path of the first-mate FASTQ file. It is also passed to
            ``plain_or_gzip`` to obtain the open function and file modes
            used for all four files.
        in_file_2: Path of the second-mate FASTQ file.
        out_file_1: Path of the filtered first-mate file to write.
        out_file_2: Path of the filtered second-mate file to write.
        log_file: Optional path. When given, a plain-text summary of the
            pair counts and percentages is written there.
    """
    r_mode, w_mode, _, fqopen, _ = plain_or_gzip(in_file_1)

    counter = 0
    cor_1_count = 0
    cor_2_count = 0
    cor_12_count = 0
    unc_count = 0

    with fqopen(in_file_1, r_mode) as in_f_1, \
            fqopen(in_file_2, r_mode) as in_f_2, \
            fqopen(out_file_1, w_mode) as out_f_1, \
            fqopen(out_file_2, w_mode) as out_f_2:

        entries_1 = grouper(in_f_1, 4)
        entries_2 = grouper(in_f_2, 4)

        for entry_1 in entries_1:
            # NOTE(review): if the second file holds fewer records this
            # raises StopIteration; extra records in the second file are
            # ignored. Kept as in the original.
            entry_2 = next(entries_2)
            counter += 1
            head_1, seq_1, plhld_1, qual_1 = [i.strip() for i in entry_1]
            head_2, seq_2, plhld_2, qual_2 = [j.strip() for j in entry_2]

            if 'unfixable' in head_1 or 'unfixable' in head_2:
                unc_count += 1
                continue

            # Each retained pair is counted in at most one "corrected"
            # bucket: both mates, only R1, or only R2.
            if 'cor' in head_1 and 'cor' in head_2:
                cor_12_count += 1
            elif 'cor' in head_1:
                cor_1_count += 1
            elif 'cor' in head_2:
                cor_2_count += 1

            # Keep the label information before the Rcorrector flags
            # (low kmer stat, 'cor' and 'unfixable error'). Trailing
            # whitespace is stripped instead of blindly dropping one
            # character, so a header without an 'l:' block is left intact.
            head_1 = head_1.split('l:')[0].rstrip()
            out_f_1.write('\n'.join([head_1, seq_1, plhld_1, qual_1]) + '\n')

            head_2 = head_2.split('l:')[0].rstrip()
            out_f_2.write('\n'.join([head_2, seq_2, plhld_2, qual_2]) + '\n')

    if log_file is None:
        return

    def pct(part):
        # An empty input has no pairs to take a share of; report 0.00%
        # rather than raising ZeroDivisionError.
        return (part / counter) * 100 if counter else 0.0

    retained = counter - unc_count

    log_str = ('Total PE reads: {}\n'
               'Uncorrectable PE reads removed: {} - {:.2f}%\n'
               'Corrected R1 reads retained: {} - {:.2f}%\n'
               'Corrected R2 reads retained: {} - {:.2f}%\n'
               'Corrected PE reads retained: {} - {:.2f}%\n'
               'Total PE reads retained: {} - {:.2f}%\n')

    with open(log_file, 'w') as f:
        f.write(
            log_str.format(counter, unc_count, pct(unc_count),
                           cor_1_count, pct(cor_1_count),
                           cor_2_count, pct(cor_2_count),
                           cor_12_count, pct(cor_12_count),
                           retained, pct(retained)))
Example #4
0
def run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, fpatt, threads):
    """Trim every sample with Trimmomatic and record the output paths.

    Each sample dict gains a ``'trim_path_fq'`` entry: a single path for
    single-end samples, a list of paths built from ``fpatt`` for paired-end
    samples. A sample whose output directory already exists is reported
    as done and is not trimmed again.

    Args:
        se_fastq_files: Mapping of sample name to a dict holding at least
            ``'cor_path_fq'`` (input path) and ``'min_acc_len'``.
        pe_fastq_files: Mapping of sample name to a dict holding at least
            ``'cor_path_fq'`` (two input paths, optionally a third with
            unpaired reads) and ``'min_acc_len'``.
        dir_fq_trim_data: Directory under which one sub-directory per
            sample is created.
        trimmomatic: Passed through to ``trimmomatic_se`` /
            ``trimmomatic_pe``. If ``None`` and there is anything to
            process, the process exits.
        adapters: Passed through to the Trimmomatic wrappers.
        fpatt: Output path templates for paired-end samples; ``'@D@'`` is
            replaced with the sample directory and ``'@N@'`` with the
            sample name. The first four are used as paired-1, paired-2,
            unpaired-1 and unpaired-2.
        threads: Passed through to the Trimmomatic wrappers.
    """
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running Trimmomatic.')
        if trimmomatic is None:
            Log.err('trimmomatic is not available. Cannot continue. Exiting.')
            # NOTE(review): this error path exits with status 0; kept for
            # compatibility with existing callers - confirm it is intended.
            exit(0)

    for se in se_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, se)
        fq_path = se_fastq_files[se]['cor_path_fq']
        # Only the extension reported for the input is needed here.
        ext = plain_or_gzip(fq_path)[4]
        min_acc_len = se_fastq_files[se]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, se + '.txt')
        out_f = opj(dir_fq_trim_data_sample, se + '.fastq' + ext)
        se_fastq_files[se]['trim_path_fq'] = out_f

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('SE mode:', se)
            trimmomatic_se(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file=fq_path,
                           out_file=out_f,
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

    for pe in pe_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, pe)
        fq_path_1 = pe_fastq_files[pe]['cor_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['cor_path_fq'][1]
        fq_path_3 = None
        _, w_mode, _, fqopen, ext = plain_or_gzip(fq_path_1)
        if len(pe_fastq_files[pe]['cor_path_fq']) == 3:
            fq_path_3 = pe_fastq_files[pe]['cor_path_fq'][2]
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, pe + '.txt')
        out_fs = [x.replace('@D@', dir_fq_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]
        pe_fastq_files[pe]['trim_path_fq'] = out_fs

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('PE mode:', pe)
            trimmomatic_pe(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file_1=fq_path_1,
                           in_file_2=fq_path_2,
                           out_file_paired_1=out_fs[0],
                           out_file_paired_2=out_fs[1],
                           out_file_unpaired_1=out_fs[2],
                           out_file_unpaired_2=out_fs[3],
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

            if fq_path_3 is not None:

                out_f = opj(dir_fq_trim_data_sample, 'unpaired.fastq' + ext)
                stats_f = opj(dir_fq_trim_data_sample, pe + '_unpaired.txt')

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                trimmomatic_se(trimmomatic=trimmomatic,
                               adapters=adapters,
                               in_file=fq_path_3,
                               out_file=out_f,
                               stats_file=stats_f,
                               threads=threads,
                               minlen=min_acc_len)

                # Append the separately trimmed unpaired reads to the
                # unpaired-1 output by concatenating both into a temporary
                # file. Both handles are context-managed so neither leaks
                # if the copy fails part-way.
                temp_f = opj(dir_fq_trim_data_sample, 'temp.fastq' + ext)
                with fqopen(temp_f, w_mode) as f_temp, \
                        fileinput.FileInput(
                            files=[out_fs[2], out_f],
                            openhook=fileinput.hook_compressed) as f:
                    for line in f:
                        f_temp.write(line)

                # Replace the unpaired-1 output with the merged file.
                remove(out_fs[2])
                remove(out_f)
                copyfile(temp_f, out_fs[2])
                remove(temp_f)
Example #5
0
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, should_run):
    """Run Rcorrector on every sample and record the resulting paths.

    Each sample dict gains a ``'cor_path_fq'`` entry. When ``should_run``
    is ``False`` that entry simply points at the original input path(s)
    and nothing is executed. Otherwise each sample is corrected, reads
    flagged as unfixable are filtered out, and the intermediate
    ``.cor.fq`` files are removed. A sample whose output directory already
    exists is reported as done and is not processed again.

    Args:
        se_fastq_files: Mapping of sample name to a dict holding at least
            ``'path'`` (input FASTQ path).
        pe_fastq_files: Mapping of sample name to a dict holding at least
            ``'path'`` (two input paths, optionally a third with unpaired
            reads).
        dir_fq_cor_data: Directory under which one sub-directory per
            sample is created.
        rcorrector: Passed through to ``run_rcorrector_se`` /
            ``run_rcorrector_pe``. If ``None`` while correction is
            requested and there is anything to process, the process exits.
        threads: Passed through to the Rcorrector wrappers.
        dir_temp: Passed through to the Rcorrector wrappers.
        should_run: ``False`` skips correction; any other value runs it.
    """
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')

            # The executable is only required when correction actually
            # runs; a skipped step must not abort the whole pipeline.
            if rcorrector is None:
                Log.err(
                    'Rcorrector is not available. Cannot continue. Exiting.')
                # NOTE(review): this error path exits with status 0; kept
                # for compatibility - confirm it is intended.
                exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['path']
        ext = plain_or_gzip(fq_path)[4]
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)

        se_fastq_files[se]['cor_path_fq'] = out_f

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('SE mode:', se)
            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=fq_path,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            # Intermediate file: input base name with '.cor.fq' plus the
            # input's extension suffix, inside the sample directory.
            fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
            fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext

            filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)

            remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)
        fq_path_1 = pe_fastq_files[pe]['path'][0]
        fq_path_2 = pe_fastq_files[pe]['path'][1]
        fq_path_3 = None
        out_f_3 = None
        ext = plain_or_gzip(fq_path_1)[4]
        log_f = opj(dir_fq_cor_data_sample, pe + '.txt')
        out_f_1 = opj(dir_fq_cor_data_sample, pe + '_R1.fastq' + ext)
        out_f_2 = opj(dir_fq_cor_data_sample, pe + '_R2.fastq' + ext)

        pe_fastq_files[pe]['cor_path_fq'] = [out_f_1, out_f_2]

        if len(pe_fastq_files[pe]['path']) == 3:
            fq_path_3 = pe_fastq_files[pe]['path'][2]
            out_f_3 = opj(dir_fq_cor_data_sample, pe + '_R3.fastq' + ext)
            pe_fastq_files[pe]['cor_path_fq'].append(out_f_3)

        if should_run is False:
            pe_fastq_files[pe]['cor_path_fq'] = [fq_path_1, fq_path_2]
            if fq_path_3 is not None:
                pe_fastq_files[pe]['cor_path_fq'].append(fq_path_3)
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('PE mode:', pe)
            run_rcorrector_pe(rcorrector=rcorrector,
                              in_file_1=fq_path_1,
                              in_file_2=fq_path_2,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
            fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
            fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
            fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

            filter_unc_pe(in_file_1=fq_cor_path_1,
                          in_file_2=fq_cor_path_2,
                          out_file_1=out_f_1,
                          out_file_2=out_f_2,
                          log_file=log_f)

            remove(fq_cor_path_1)
            remove(fq_cor_path_2)

            if fq_path_3 is not None:

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_3,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_3 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_3))
                # Same naming as the SE and PE branches: the extension
                # suffix must be appended, otherwise the intermediate file
                # is not found when the input carries one.
                fq_cor_path_3 = splitext_gz(
                    fq_base_path_3)[0] + '.cor.fq' + ext
                log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired.txt')

                filter_unc_se(in_file=fq_cor_path_3,
                              out_file=out_f_3,
                              log_file=log_f_3)

                remove(fq_cor_path_3)
Example #6
0
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, fpatt, should_run):
    """Run Rcorrector on trimmed reads and record the resulting paths.

    Each sample dict gains a ``'cor_path_fq'`` entry. When ``should_run``
    is ``False`` that entry points at the sample's ``'trim_path_fq'``
    path(s) and nothing is executed. Otherwise each sample is corrected,
    reads flagged as unfixable are filtered out and the intermediate
    ``.cor.fq`` files are removed. A sample whose output directory already
    exists is reported as done and is not processed again.

    Args:
        se_fastq_files: Mapping of sample name to a dict holding at least
            ``'trim_path_fq'`` (input FASTQ path).
        pe_fastq_files: Mapping of sample name to a dict holding at least
            ``'trim_path_fq'`` (four paths: paired 1, paired 2,
            unpaired 1, unpaired 2).
        dir_fq_cor_data: Directory under which one sub-directory per
            sample is created.
        rcorrector: Passed through to the Rcorrector wrappers. If ``None``
            while correction is requested and there is anything to
            process, the process exits.
        threads: Passed through to the Rcorrector wrappers.
        dir_temp: Passed through to the Rcorrector wrappers.
        fpatt: Output path templates for paired-end samples; ``'@D@'`` is
            replaced with the sample directory and ``'@N@'`` with the
            sample name.
        should_run: ``False`` skips correction; any other value runs it.
    """
    anything_to_do = len(se_fastq_files) > 0 or len(pe_fastq_files) > 0
    if anything_to_do:
        print()
        if should_run is not False:
            Log.inf('Running Rcorrector.')
            if rcorrector is None:
                Log.err(
                    'Rcorrector is not available. Cannot continue. Exiting.')
                exit(0)
        else:
            Log.wrn('Skipping Rcorrector as requested.')

    # ---- single-end samples ------------------------------------------------
    for se in se_fastq_files:
        sample = se_fastq_files[se]
        sample_dir = opj(dir_fq_cor_data, se)
        trimmed_path = sample['trim_path_fq']
        ext = plain_or_gzip(trimmed_path)[4]
        summary_path = opj(sample_dir, se + '.txt')
        corrected_path = opj(sample_dir, se + '.fastq' + ext)

        if should_run is False:
            # Downstream steps read the trimmed file directly.
            sample['cor_path_fq'] = trimmed_path
            continue
        sample['cor_path_fq'] = corrected_path

        if ope(sample_dir):
            Log.msg('Corrected FASTQ file already exists:', se)
            continue

        make_dirs(sample_dir)
        Log.msg('SE mode:', se)
        run_rcorrector_se(rcorrector=rcorrector,
                          in_file=trimmed_path,
                          out_dir=sample_dir,
                          threads=threads,
                          dir_temp=dir_temp)

        raw_cor_path = splitext_gz(
            opj(sample_dir, basename(trimmed_path)))[0] + '.cor.fq' + ext
        filter_unc_se(in_file=raw_cor_path,
                      out_file=corrected_path,
                      log_file=summary_path)
        remove(raw_cor_path)

    # ---- paired-end samples ------------------------------------------------
    for pe in pe_fastq_files:
        sample = pe_fastq_files[pe]
        sample_dir = opj(dir_fq_cor_data, pe)

        trimmed = sample['trim_path_fq']
        trimmed_paths = [trimmed[0], trimmed[1], trimmed[2], trimmed[3]]

        ext = plain_or_gzip(trimmed_paths[0])[4]
        paired_summary = opj(sample_dir, pe + '_paired.txt')

        out_fs = [
            template.replace('@D@', sample_dir).replace('@N@', pe) + ext
            for template in fpatt
        ]

        if should_run is False:
            # Downstream steps read the trimmed files directly.
            sample['cor_path_fq'] = trimmed_paths
            continue
        sample['cor_path_fq'] = out_fs

        if ope(sample_dir):
            Log.msg('Corrected FASTQ files already exist:', pe)
            continue

        make_dirs(sample_dir)
        Log.msg('PE mode:', pe)
        run_rcorrector_pe(rcorrector=rcorrector,
                          in_file_1=trimmed_paths[0],
                          in_file_2=trimmed_paths[1],
                          out_dir=sample_dir,
                          threads=threads,
                          dir_temp=dir_temp)

        raw_cor_1 = splitext_gz(
            opj(sample_dir, basename(trimmed_paths[0])))[0] + '.cor.fq' + ext
        raw_cor_2 = splitext_gz(
            opj(sample_dir, basename(trimmed_paths[1])))[0] + '.cor.fq' + ext

        filter_unc_pe(in_file_1=raw_cor_1,
                      in_file_2=raw_cor_2,
                      out_file_1=out_fs[0],
                      out_file_2=out_fs[1],
                      log_file=paired_summary)

        remove(raw_cor_1)
        remove(raw_cor_2)

        # Unpaired 1 and unpaired 2 are handled the same way, in that
        # order; only the list index and the summary file name differ.
        unpaired_jobs = ((2, '_unpaired_1.txt'), (3, '_unpaired_2.txt'))
        for idx, summary_suffix in unpaired_jobs:
            unpaired_in = trimmed_paths[idx]

            if stat(unpaired_in).st_size == 0:
                # Zero-byte input: just create an empty output file.
                with open(out_fs[idx], 'w') as f:
                    f.write('')
                continue

            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=unpaired_in,
                              out_dir=sample_dir,
                              threads=threads,
                              dir_temp=dir_temp)

            raw_cor_unpaired = splitext_gz(
                opj(sample_dir, basename(unpaired_in)))[0] + '.cor.fq' + ext
            unpaired_summary = opj(sample_dir, pe + summary_suffix)

            filter_unc_se(in_file=raw_cor_unpaired,
                          out_file=out_fs[idx],
                          log_file=unpaired_summary)

            remove(raw_cor_unpaired)