def main():
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Do naive overlap...')
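    # Naive overlap (as opposed to IDR) keeps peaks from the pooled-replicate
    # call (peak_pooled) that are also overlapped by a peak in each individual
    # replicate (peak1, peak2); the exact overlap criterion is defined inside
    # naive_overlap().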
    overlap_peak = naive_overlap(
        args.prefix,
        args.peak1,
        args.peak2,
        args.peak_pooled,
        args.peak_type,
        args.nonamecheck,
        args.mem_gb,
        args.out_dir,
    )

    log.info('Blacklist-filtering peaks...')
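    # Remove peaks overlapping the blacklist regions and keep only chromosomes
    # whose names match args.regex_bfilt_peak_chr_name (e.g. to drop
    # non-standard contigs).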
    bfilt_overlap_peak = blacklist_filter(overlap_peak, args.blacklist,
                                          args.regex_bfilt_peak_chr_name,
                                          args.out_dir)

    log.info('Checking if output is empty...')
    assert_file_not_empty(bfilt_overlap_peak)

    log.info('Converting peak to bigbed...')
    peak_to_bigbed(bfilt_overlap_peak, args.peak_type, args.chrsz, args.mem_gb,
                   args.out_dir)

    log.info('Converting peak to starch...')
    peak_to_starch(bfilt_overlap_peak, args.out_dir)

    log.info('Converting peak to hammock...')
    peak_to_hammock(bfilt_overlap_peak, args.mem_gb, args.out_dir)
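
    # FRiP (fraction of reads in peaks): the fraction of reads in the tag-align
    # file that fall within the blacklist-filtered peaks. For ChIP-seq the reads
    # are first shifted according to the estimated fragment length (frip_shifted);
    # for ATAC-seq no fragment length is given, so plain frip() is used.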

    if args.ta:  # if TAG-ALIGN is given
        if args.fraglen:  # chip-seq
            log.info('Shifted FRiP with fragment length...')
            frip_shifted(args.ta, bfilt_overlap_peak, args.chrsz, args.fraglen,
                         args.out_dir)
        else:  # atac-seq
            log.info('FRiP without fragment length...')
            frip(args.ta, bfilt_overlap_peak, args.out_dir)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def main():
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Do IDR...')
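    # idr() runs the IDR (Irreproducible Discovery Rate) analysis on the two
    # replicate peak sets, using the pooled-replicate peaks as the reference set,
    # and keeps peaks whose IDR value passes args.idr_thresh, ranked by the
    # column given by args.idr_rank.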
    idr_peak, idr_plot, idr_out_gz, idr_stdout = idr(
        args.prefix,
        args.peak1, args.peak2, args.peak_pooled, args.peak_type,
        args.chrsz,
        args.idr_thresh, args.idr_rank, args.mem_gb, args.out_dir,
    )

    log.info('Checking if output is empty...')
    assert_file_not_empty(idr_peak, help=
        'No IDR peaks found. IDR threshold might be too stringent '
        'or replicates have very poor concordance.')

    log.info('Blacklist-filtering peaks...')
    bfilt_idr_peak = blacklist_filter(
        idr_peak, args.blacklist, args.regex_bfilt_peak_chr_name, args.out_dir)

    log.info('Converting peak to bigbed...')
    peak_to_bigbed(bfilt_idr_peak, args.peak_type, args.chrsz,
                   args.mem_gb, args.out_dir)

    log.info('Converting peak to starch...')
    peak_to_starch(bfilt_idr_peak, args.out_dir)

    log.info('Converting peak to hammock...')
    peak_to_hammock(bfilt_idr_peak, args.mem_gb, args.out_dir)

    if args.ta:  # if TAG-ALIGN is given
        if args.fraglen:  # chip-seq
            log.info('Shifted FRiP with fragment length...')
            frip_shifted(args.ta, bfilt_idr_peak,
                         args.chrsz, args.fraglen, args.out_dir)
        else:  # atac-seq
            log.info('FRiP without fragment length...')
            frip(args.ta, bfilt_idr_peak, args.out_dir)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def main():
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Blacklist-filtering peaks...')
    bfilt_peak = blacklist_filter(args.peak, args.blacklist,
                                  args.regex_bfilt_peak_chr_name, args.out_dir)

    log.info('Checking if output is empty...')
    assert_file_not_empty(bfilt_peak)

    log.info('Converting peak to bigbed...')
    peak_to_bigbed(bfilt_peak, args.peak_type, args.chrsz, args.mem_gb,
                   args.out_dir)

    log.info('Converting peak to starch...')
    peak_to_starch(bfilt_peak, args.out_dir)

    log.info('Converting peak to hammock...')
    peak_to_hammock(bfilt_peak, args.mem_gb, args.out_dir)

    log.info('Shifted FRiP with fragment length...')
    frip_qc = frip_shifted(args.ta, bfilt_peak, args.chrsz, args.fraglen,
                           args.out_dir)

    log.info('Calculating (blacklist-filtered) peak region size QC/plot...')
    region_size_qc, region_size_plot = get_region_size_metrics(bfilt_peak)

    log.info('Calculating number of peaks (blacklist-filtered)...')
    num_peak_qc = get_num_peaks(bfilt_peak)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def test_starch_and_unstarch(tmp_path):
    """Cannot use md5hash of output starch file since it includes timestamp.
    So unstarch it and calculate md5 hash.
    This is actually an integration test of starch/unstarch.
    Two functions are tested together:
        - peak_to_starch(): cannot control starch's timestamp inclusion.
        - starch_to_bed_gz(): this function gzips with -n (excluding timestamp).
    """
    peak = tmp_path / 'idr_peak.gz'
    peak.write_text(IDR_PEAK_FILE_CONTENTS)

    starch = peak_to_starch(peak, tmp_path)
    bed_gz = starch_to_bed_gz(starch, tmp_path)

    with open(bed_gz, 'rb') as fp:
        assert hashlib.md5(fp.read()).hexdigest() == MD5_HASH_IDR_PEAK_UNSTARCHED
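

# The test above relies on two properties: a .starch archive embeds a creation
# timestamp (so its md5 changes between runs), while `gzip -n` omits the original
# file name and timestamp from the gzip header (so the re-compressed BED has a
# stable md5). Below is a minimal sketch of that deterministic round-trip,
# assuming the BEDOPS `unstarch` binary and `gzip` are on PATH; this is a
# hypothetical helper, not the pipeline's actual starch_to_bed_gz() implementation.
import subprocess


def unstarch_to_deterministic_gz(starch_path, out_bed_gz):
    """Uncompress a .starch file and re-compress it with `gzip -n` so that the
    output's md5 does not depend on when the file was created."""
    with open(out_bed_gz, 'wb') as fp:
        unstarch = subprocess.Popen(
            ['unstarch', str(starch_path)], stdout=subprocess.PIPE)
        subprocess.check_call(['gzip', '-nc'], stdin=unstarch.stdout, stdout=fp)
        unstarch.stdout.close()
        if unstarch.wait():
            raise RuntimeError('unstarch failed on {}'.format(starch_path))
    return out_bed_gz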
def main():
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Reproducibility QC...')
    # description for variables
    # N: list of number of peaks in peak files from pseudo replicates
    # Nt: top number of peaks in peak files
    #     from true replicates (rep-x_vs_rep-y)
    # Np: number of peaks in peak files from pooled pseudo replicate
    N = [get_num_lines(peak) for peak in args.peaks_pr]
    if len(args.peaks):
        # multiple replicate case
        num_rep = infer_n_from_nC2(len(args.peaks))
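        # args.peaks holds one peak file per pairwise comparison of true
        # replicates (rep-x_vs_rep-y), i.e. n*(n-1)/2 files for n replicates,
        # so infer_n_from_nC2() recovers n by inverting that count.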
        num_peaks_tr = [get_num_lines(peak) for peak in args.peaks]

        Nt = max(num_peaks_tr)
        Np = get_num_lines(args.peak_ppr)
        rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
        self_consistency_ratio = float(max(N)) / float(min(N))
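        # Rescue ratio: disagreement between the pooled pseudo-replicate call (Np)
        # and the best true-replicate pair (Nt). Self-consistency ratio: spread
        # between the largest and smallest per-replicate pseudo-replicate peak
        # counts (N). Values near 1.0 indicate good concordance.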

        Nt_idx = num_peaks_tr.index(Nt)
        label_tr = infer_pair_label_from_idx(num_rep, Nt_idx)

        conservative_set = label_tr
        conservative_peak = args.peaks[Nt_idx]
        N_conservative = Nt
        if Nt > Np:
            optimal_set = conservative_set
            optimal_peak = conservative_peak
            N_optimal = N_conservative
        else:
            optimal_set = "pooled-pr1_vs_pooled-pr2"
            optimal_peak = args.peak_ppr
            N_optimal = Np
    else:
        # single replicate case
        num_rep = 1

        Nt = 0
        Np = 0
        rescue_ratio = 0.0
        self_consistency_ratio = 1.0

        conservative_set = 'rep1-pr1_vs_rep1-pr2'
        conservative_peak = args.peaks_pr[0]
        N_conservative = N[0]
        optimal_set = conservative_set
        optimal_peak = conservative_peak
        N_optimal = N_conservative

    reproducibility = 'pass'
    if rescue_ratio > 2.0 or self_consistency_ratio > 2.0:
        reproducibility = 'borderline'
    if rescue_ratio > 2.0 and self_consistency_ratio > 2.0:
        reproducibility = 'fail'
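
    # Worked example (illustrative numbers): Np=120000, Nt=100000 gives
    # rescue_ratio=1.2; N=[90000, 60000] gives self_consistency_ratio=1.5;
    # both are <= 2.0, so reproducibility='pass'. One ratio above 2.0 would
    # give 'borderline', both above 2.0 would give 'fail'.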

    log.info('Writing optimal/conservative peak files...')
    optimal_peak_file = os.path.join(
        args.out_dir, '{}optimal_peak.{}.gz'.format(
            (args.prefix + '.') if args.prefix else '', args.peak_type))
    conservative_peak_file = os.path.join(
        args.out_dir, '{}conservative_peak.{}.gz'.format(
            (args.prefix + '.') if args.prefix else '', args.peak_type))
    copy_f_to_f(optimal_peak, optimal_peak_file)
    copy_f_to_f(conservative_peak, conservative_peak_file)

    if args.chrsz:
        log.info('Converting peak to bigbed...')
        peak_to_bigbed(optimal_peak_file, args.peak_type, args.chrsz,
                       args.out_dir)
        peak_to_bigbed(conservative_peak_file, args.peak_type, args.chrsz,
                       args.out_dir)

        log.info('Converting peak to starch...')
        peak_to_starch(optimal_peak_file, args.out_dir)
        peak_to_starch(conservative_peak_file, args.out_dir)

        log.info('Converting peak to hammock...')
        peak_to_hammock(optimal_peak_file, args.out_dir)
        peak_to_hammock(conservative_peak_file, args.out_dir)

    log.info('Writing reproducibility QC log...')
    if args.prefix:
        reproducibility_qc = '{}.reproducibility.qc'.format(args.prefix)
    else:
        reproducibility_qc = 'reproducibility.qc'
    reproducibility_qc = os.path.join(args.out_dir, reproducibility_qc)

    with open(reproducibility_qc, 'w') as fp:
        header = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            'Nt',
            '\t'.join(['N{}'.format(i + 1) for i in range(num_rep)]),
            'Np',
            'N_opt',
            'N_consv',
            'opt_set',
            'consv_set',
            'rescue_ratio',
            'self_consistency_ratio',
            'reproducibility',
        )
        line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            Nt, '\t'.join([str(i) for i in N]), Np, N_optimal, N_conservative,
            optimal_set, conservative_set, rescue_ratio,
            self_consistency_ratio, reproducibility)
        fp.write(header)
        fp.write(line)
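        # Resulting TSV layout for num_rep=2 (illustrative values matching the
        # worked example above; the consv_set label format comes from
        # infer_pair_label_from_idx()):
        # Nt      N1     N2     Np      N_opt   N_consv opt_set                   consv_set       rescue_ratio  self_consistency_ratio  reproducibility
        # 100000  90000  60000  120000  120000  100000  pooled-pr1_vs_pooled-pr2  rep-1_vs_rep-2  1.2           1.5                     pass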

    log.info('Calculating (optimal) peak region size QC/plot...')
    region_size_qc, region_size_plot = get_region_size_metrics(
        optimal_peak_file)

    log.info('Calculating number of peaks (optimal)...')
    get_num_peaks(optimal_peak_file)

    log.info('All done.')