def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, out_dir):
    """Run ``macs2 callpeak`` on ``ta`` and return a capped, clipped peak file.

    Steps performed:
      1. If ``ctl_ta`` is given and ``ctl_subsample`` is truthy, the control is
         replaced by the output of ``subsample_ta_pe`` / ``subsample_ta_se``
         (chosen by ``ctl_paired_end``).
      2. ``macs2 callpeak`` is run with ``-f BED --nomodel --keep-dup all -B
         --SPMR``, ``--extsize fraglen`` and, when available, ``-c ctl_ta``.
      3. ``<prefix>_peaks.narrowPeak`` is sorted on column 8 (descending,
         general numeric), peaks are renamed ``Peak_<row number>``, negative
         columns 2/3 are set to 0 and a column 10 of -1 is replaced by an
         offset computed from columns 2 and 3.
      4. The first ``cap_num_peak`` rows are kept and passed to ``bed_clip``
         together with ``chrsz``.
      5. Intermediate files and everything matching ``<prefix>_*`` are removed.

    Args:
        ta: Treatment input passed to MACS2 via ``-t``.
        ctl_ta: Optional control input passed via ``-c``; falsy to omit.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        gensz: Value for MACS2 ``-g``.
        pval_thresh: Value for MACS2 ``-p``; also embedded in the output name.
        shift: Accepted but not used here; ``--shift`` is always 0.
        fraglen: Value for MACS2 ``--extsize``.
        cap_num_peak: Maximum number of peaks kept (``head -n``).
        ctl_subsample: If truthy, subsample the control to this amount.
        ctl_paired_end: Selects the paired-end subsampling helper.
        out_dir: Directory used for all outputs.

    Returns:
        Path of the resulting ``*.narrowPeak.gz`` file.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))

    # Default to the treatment-only name; extended below when a control exists.
    basename_prefix = basename_ta
    if ctl_ta:
        if ctl_subsample:
            shared_kwargs = dict(non_mito=False,
                                 mito_chr_name=None,
                                 out_dir=out_dir)
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta, ctl_subsample,
                                         r1_only=False, **shared_kwargs)
            else:
                ctl_ta = subsample_ta_se(ctl_ta, ctl_subsample,
                                         **shared_kwargs)

        basename_prefix = '{}_x_{}'.format(
            basename_ta, os.path.basename(strip_ext_ta(ctl_ta)))
        # UNIX cannot have len(filename) > 255, so fall back to a short name.
        if len(basename_prefix) > 200:
            basename_prefix = '{}_x_control'.format(basename_ta)

    prefix = os.path.join(out_dir, basename_prefix)
    npeak = '{}.pval{}.{}.narrowPeak.gz'.format(
        prefix, pval_thresh, human_readable_number(cap_num_peak))
    sorted_tmp = npeak + '.tmp'
    capped_tmp = npeak + '.tmp2'

    callpeak_cmd = (
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n {prefix} -g {gensz} -p {pval} '
        '--nomodel --shift {shiftsize} --extsize {extsize} '
        '--keep-dup all -B --SPMR'
    ).format(
        ta=ta,
        ctl_param='-c {}'.format(ctl_ta) if ctl_ta else '',
        prefix=prefix,
        gensz=gensz,
        pval=pval_thresh,
        shiftsize=0,
        extsize=fraglen,
    )
    run_shell_cmd(callpeak_cmd)

    # awk program kept as a plain literal (no str.format brace escaping).
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    sort_stage = 'LC_COLLATE=C sort -k 8gr,8gr "{}"_peaks.narrowPeak'.format(
        prefix)
    run_shell_cmd("{} | awk '{}' > {}".format(sort_stage, awk_prog,
                                              sorted_tmp))

    run_shell_cmd('head -n {} {} > {}'.format(cap_num_peak, sorted_tmp,
                                              capped_tmp))

    # clip peaks between 0-chromSize.
    bed_clip(capped_tmp, chrsz, npeak)

    rm_f([sorted_tmp, capped_tmp])

    # remove the files MACS2 wrote under "<prefix>_"
    rm_f(['{}_*'.format(prefix)])

    return npeak
Beispiel #2
0
def idr(basename_prefix, peak1, peak2, peak_pooled, peak_type, chrsz,
        thresh, rank, out_dir):
    """Run ``idr`` on two peak files and produce a thresholded peak file.

    The ``idr`` command is run with ``--plot`` on ``peak1``/``peak2`` against
    ``peak_pooled``. Its unthresholded output is clipped with ``bed_clip``,
    rows whose column 12 is at least ``-log10(thresh)`` are kept,
    de-duplicated and sorted on the column returned by
    ``get_npeak_col_by_rank(rank)``, and the first 10 columns are written
    gzipped. The clipped unthresholded output is gzipped as well.

    Args:
        basename_prefix: Basename (without directory) for all outputs.
        peak1: First ``--samples`` peak file.
        peak2: Second ``--samples`` peak file.
        peak_pooled: Peak file passed as ``--peak-list``.
        peak_type: Used as the extension of the final peak file.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        thresh: IDR threshold; must be > 0 because ``math.log10`` is applied.
        rank: Value for ``idr --rank``; also selects the sort column.
        out_dir: Directory used for all outputs.

    Returns:
        Tuple ``(idr_peak, idr_plot, idr_out_gz, idr_stdout)`` of paths.
        ``idr_plot`` is only a path computed here; it is presumably written
        by ``idr --plot`` (verify against the idr tool).
    """
    prefix = os.path.join(out_dir, basename_prefix)
    prefix += '.idr{}'.format(thresh)
    idr_peak = '{}.{}.gz'.format(prefix, peak_type)
    idr_plot = '{}.unthresholded-peaks.txt.png'.format(prefix)
    idr_stdout = '{}.log'.format(prefix)
    # temporary
    # Build the scratch file from ``prefix`` (not ``peak_type`` alone) so it
    # lives in ``out_dir`` and is unique per run; a bare
    # "<peak_type>.12-col.bed.gz" in the current working directory would be
    # shared, and clobbered, by concurrent runs.
    idr_12col_bed = '{}.{}.12-col.bed.gz'.format(prefix, peak_type)
    idr_out = '{}.unthresholded-peaks.txt'.format(prefix)
    idr_tmp = '{}.unthresholded-peaks.txt.tmp'.format(prefix)
    idr_out_gz = '{}.unthresholded-peaks.txt.gz'.format(prefix)

    cmd1 = 'idr --samples {} {} --peak-list {} --input-file-type narrowPeak '
    cmd1 += '--output-file {} --rank {} --soft-idr-threshold {} '
    cmd1 += '--plot --use-best-multisummit-IDR --log-output-file {}'
    cmd1 = cmd1.format(
        peak1,
        peak2,
        peak_pooled,
        idr_out,
        rank,
        thresh,
        idr_stdout)
    run_shell_cmd(cmd1)

    # clip peaks between 0-chromSize.
    bed_clip(idr_out, chrsz, idr_tmp, no_gz=True)

    col = get_npeak_col_by_rank(rank)
    neg_log10_thresh = -math.log10(thresh)
    # Keep rows with column 12 >= -log10(thresh), clamp negative starts to 0,
    # drop duplicate rows, then sort descending on the rank column.
    cmd2 = 'awk \'BEGIN{{OFS="\\t"}} $12>={} '
    cmd2 += '{{if ($2<0) $2=0; '
    cmd2 += 'print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' {} '
    cmd2 += '| sort | uniq | sort -grk{},{} | gzip -nc > {}'
    cmd2 = cmd2.format(
        neg_log10_thresh,
        idr_tmp,
        col,
        col,
        idr_12col_bed)
    run_shell_cmd(cmd2)

    # Reduce the 12-column intermediate to the first 10 columns.
    cmd3 = 'zcat {} | '
    cmd3 += 'awk \'BEGIN{{OFS="\\t"}} '
    cmd3 += '{{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}}\' | '
    cmd3 += 'gzip -nc > {}'
    cmd3 = cmd3.format(
        idr_12col_bed,
        idr_peak)
    run_shell_cmd(cmd3)

    # Keep a gzipped copy of the clipped, unthresholded output.
    cmd4 = 'cat {} | gzip -nc > {}'.format(idr_tmp, idr_out_gz)
    run_shell_cmd(cmd4)

    rm_f([idr_out, idr_tmp, idr_12col_bed])
    rm_f('{}.*.noalternatesummitpeaks.png'.format(prefix))
    return idr_peak, idr_plot, idr_out_gz, idr_stdout
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, mem_gb,
          out_dir):
    """Run ``macs2 callpeak --call-summits`` on ``ta`` and cap/clip the peaks.

    ``--extsize`` is ``smooth_win`` and ``--shift`` is minus half of it
    (rounded). The resulting ``<prefix>_peaks.narrowPeak`` is sorted on
    column 8 (descending, general numeric) with the extra options returned by
    ``get_gnu_sort_param``, peaks are renamed ``Peak_<row number>``, the first
    ``cap_num_peak`` rows are kept and passed to ``bed_clip`` with ``chrsz``.
    Intermediate files and everything matching ``<prefix>_*`` are removed.

    Args:
        ta: Input passed to MACS2 via ``-t`` (``-f BED``).
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        gensz: Value for MACS2 ``-g``.
        pval_thresh: Value for MACS2 ``-p``; also embedded in the output name.
        smooth_win: Smoothing window; drives ``--extsize`` and ``--shift``.
        cap_num_peak: Maximum number of peaks kept (``head -n``).
        mem_gb: Memory in GB; converted to bytes for ``get_gnu_sort_param``.
        out_dir: Directory used for all outputs.

    Returns:
        Path of the resulting ``*.narrowPeak.gz`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    npeak = '{}.pval{}.{}.narrowPeak.gz'.format(
        prefix, pval_thresh, human_readable_number(cap_num_peak))
    # temporary files
    sorted_tmp = npeak + '.tmp'
    capped_tmp = npeak + '.tmp2'

    half_win_shift = -int(round(float(smooth_win) / 2.0))

    callpeak_parts = [
        'macs2 callpeak',
        '-t {} -f BED'.format(ta),
        '-n {}'.format(prefix),
        '-g {}'.format(gensz),
        '-p {}'.format(pval_thresh),
        '--shift {}'.format(half_win_shift),
        '--extsize {}'.format(smooth_win),
        '--nomodel -B --SPMR --keep-dup all --call-summits',
    ]
    run_shell_cmd(' '.join(callpeak_parts))

    # awk program kept as a plain literal (no str.format brace escaping).
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    sort_stage = 'LC_COLLATE=C sort -k 8gr,8gr {} "{}"_peaks.narrowPeak'.format(
        get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5), prefix)
    awk_stage = "awk '{}' > {}".format(awk_prog, sorted_tmp)
    run_shell_cmd(' | '.join([sort_stage, awk_stage]))

    run_shell_cmd('head -n {} {} > {}'.format(cap_num_peak, sorted_tmp,
                                              capped_tmp))

    # clip peaks between 0-chromSize.
    bed_clip(capped_tmp, chrsz, npeak)

    rm_f([sorted_tmp, capped_tmp])

    # remove the files MACS2 wrote under "<prefix>_"
    rm_f(['{}_*'.format(prefix)])

    return npeak
Beispiel #4
0
def spp(ta, ctl_ta, chrsz, fraglen, cap_num_peak, fdr_thresh, ctl_subsample,
        ctl_paired_end, nth, out_dir):
    """Run ``run_spp.R`` on ``ta`` against ``ctl_ta`` and clip the peaks.

    When ``ctl_subsample`` is truthy the control is first replaced by the
    output of ``subsample_ta_pe`` / ``subsample_ta_se`` (chosen by
    ``ctl_paired_end``). The file saved by ``run_spp.R`` is rewritten with
    columns 2 and 3 forced to integers (negative starts set to 0) and then
    passed to ``bed_clip`` with ``chrsz``.

    Args:
        ta: Treatment input passed as ``-c=``.
        ctl_ta: Control input passed as ``-i=``.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        fraglen: Value for ``-speak=``.
        cap_num_peak: Value for ``-npeak=``; also embedded in the output name.
        fdr_thresh: Value for ``-fdr=``.
        ctl_subsample: If truthy, subsample the control to this amount.
        ctl_paired_end: Selects the paired-end subsampling helper.
        nth: Thread count; ``-p=<nth>`` is added only when ``nth >= 2``.
        out_dir: Directory used for all outputs.

    Returns:
        Path of the resulting ``*.regionPeak.gz`` file.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))

    if ctl_subsample:
        shared_kwargs = dict(non_mito=False,
                             mito_chr_name=None,
                             out_dir=out_dir)
        if ctl_paired_end:
            ctl_ta = subsample_ta_pe(ctl_ta, ctl_subsample,
                                     r1_only=False, **shared_kwargs)
        else:
            ctl_ta = subsample_ta_se(ctl_ta, ctl_subsample, **shared_kwargs)

    basename_prefix = '{}_x_{}'.format(
        basename_ta, os.path.basename(strip_ext_ta(ctl_ta)))
    # UNIX cannot have filename > 255, so fall back to a short name.
    if len(basename_prefix) > 200:
        basename_prefix = '{}_x_control'.format(basename_ta)

    nth_param = ''
    if nth >= 2:
        nth_param = '-p={}'.format(nth)

    prefix = os.path.join(out_dir, basename_prefix)
    rpeak = '{}.{}.regionPeak.gz'.format(
        prefix, human_readable_number(cap_num_peak))
    spp_save_path = rpeak + '.tmp'
    spp_saved_gz = rpeak + '.tmp.gz'
    int_coord_tmp = rpeak + '.tmp2'

    spp_cmd = (
        'Rscript --max-ppsize=500000 $(which run_spp.R) -c={ta} -i={ctl} '
        '-npeak={npeak} -odir={odir} -speak={speak} -savr={savr} '
        '-fdr={fdr} -rf {threads}'
    ).format(
        ta=ta,
        ctl=ctl_ta,
        npeak=cap_num_peak,
        odir=os.path.abspath(out_dir),
        speak=fraglen,
        savr=spp_save_path,
        fdr=fdr_thresh,
        threads=nth_param,
    )
    run_shell_cmd(spp_cmd)

    # if we have scientific representation of chr coord. then convert it to int
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{if ($2<0) $2=0; '
        'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}'
    )
    run_shell_cmd("zcat -f {} | awk '{}' > {}".format(
        spp_saved_gz, awk_prog, int_coord_tmp))
    rm_f(spp_saved_gz)

    # clip peaks between 0-chromSize.
    bed_clip(int_coord_tmp, chrsz, rpeak)
    rm_f(int_coord_tmp)

    return rpeak
Beispiel #5
0
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, out_dir):
    """Run ``macs2 callpeak --call-summits`` on ``ta`` and cap/clip the peaks.

    ``--extsize`` is ``smooth_win`` and ``--shift`` is minus half of it
    (rounded). The resulting ``<prefix>_peaks.narrowPeak`` is sorted on
    column 8 (descending, general numeric), peaks are renamed
    ``Peak_<row number>``, the first ``cap_num_peak`` rows are kept and passed
    to ``bed_clip`` with ``chrsz``. Intermediate files and everything matching
    ``<prefix>_*`` are removed.

    Args:
        ta: Input passed to MACS2 via ``-t`` (``-f BED``).
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        gensz: Value for MACS2 ``-g``.
        pval_thresh: Value for MACS2 ``-p``; also embedded in the output name.
        smooth_win: Smoothing window; drives ``--extsize`` and ``--shift``.
        cap_num_peak: Maximum number of peaks kept (``head -n``).
        out_dir: Directory used for all outputs.

    Returns:
        Path of the resulting ``*.narrowPeak.gz`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    npeak = '{}.pval{}.{}.narrowPeak.gz'.format(
        prefix, pval_thresh, human_readable_number(cap_num_peak))
    # temporary files
    sorted_tmp = npeak + '.tmp'
    capped_tmp = npeak + '.tmp2'

    half_win_shift = -int(round(float(smooth_win) / 2.0))

    # NOTE: the trailing space after --call-summits is part of the command.
    callpeak_cmd = (
        'macs2 callpeak '
        '-t {ta} -f BED -n {prefix} -g {gensz} -p {pval} '
        '--shift {shiftsize} --extsize {extsize} '
        '--nomodel -B --SPMR '
        '--keep-dup all --call-summits '
    ).format(
        ta=ta,
        prefix=prefix,
        gensz=gensz,
        pval=pval_thresh,
        shiftsize=half_win_shift,
        extsize=smooth_win,
    )
    run_shell_cmd(callpeak_cmd)

    # awk program kept as a plain literal (no str.format brace escaping).
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    sort_stage = 'LC_COLLATE=C sort -k 8gr,8gr "{}"_peaks.narrowPeak'.format(
        prefix)
    run_shell_cmd("{} | awk '{}' > {}".format(sort_stage, awk_prog,
                                              sorted_tmp))

    run_shell_cmd('head -n {n} {src} > {dst}'.format(
        n=cap_num_peak, src=sorted_tmp, dst=capped_tmp))

    # clip peaks between 0-chromSize.
    bed_clip(capped_tmp, chrsz, npeak)

    rm_f([sorted_tmp, capped_tmp])

    # remove the files MACS2 wrote under "<prefix>_"
    rm_f(['{}_*'.format(prefix)])

    return npeak
Beispiel #6
0
def test_bed_clip(tmp_path):
    """Exercise ``bed_clip``: pass-through, error, truncation and gzip output.

    All fixture files are created under the pytest-provided ``tmp_path``.
    The expected right bound 16569 is presumably the chrM size listed in
    ``CHRSZ_HG38`` (defined outside this function).
    """
    def write_fixture(name, text):
        # Create a file under tmp_path and return its path as a str.
        path = tmp_path / name
        path.write_text(text)
        return str(path)

    chrsz_path = write_fixture('chrsz', CHRSZ_HG38)
    plain_out = tmp_path / 'out.bed'

    # no change if it's inside 0-chrsz
    inside = write_fixture('test.in.bed', 'chrM\t2\t10000\n')
    bed_clip(inside, chrsz_path, str(plain_out), no_gz=True)
    assert plain_out.read_text() == 'chrM\t2\t10000\n'

    # out-of-bound error should occur
    fully_outside = write_fixture('test.oob1.bed', 'chrM\t-10\t-1\n')
    with pytest.raises(Exception):
        bed_clip(fully_outside, chrsz_path, str(plain_out), no_gz=True)

    # should be truncated to 0 (left) or chrsz (right)
    left_crossing = write_fixture('test.lc.bed', 'chrM\t-1\t13000\n')
    right_crossing = write_fixture('test.rc.bed', 'chrM\t13000\t17000\n')
    both_crossing = write_fixture('test.l.bed', 'chrM\t-1\t17000\n')

    truncation_cases = [
        (left_crossing, 'chrM\t0\t13000\n'),
        (right_crossing, 'chrM\t13000\t16569\n'),
        (both_crossing, 'chrM\t0\t16569\n'),
    ]
    for bed_path, expected in truncation_cases:
        bed_clip(bed_path, chrsz_path, str(plain_out), no_gz=True)
        assert plain_out.read_text() == expected

    # test no_gz flag
    gz_out = tmp_path / 'out.bed.gz'
    bed_clip(both_crossing, chrsz_path, str(gz_out), no_gz=False)
    with gzip.open(str(gz_out), 'rb') as fp:
        assert fp.read().decode() == 'chrM\t0\t16569\n'
Beispiel #7
0
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    """Run ``macs2 callpeak`` on ``ta`` and return a capped, clipped peak file.

    Steps performed:
      1. If ``ctl_ta`` is given and ``ctl_subsample`` is truthy, the control is
         replaced by the output of ``subsample_ta_pe`` / ``subsample_ta_se``
         (chosen by ``ctl_paired_end``).
      2. ``macs2 callpeak`` is run with ``-f BED --nomodel --keep-dup all -B
         --SPMR``, ``--extsize fraglen`` and, when available, ``-c ctl_ta``.
      3. ``<prefix>_peaks.narrowPeak`` is sorted on column 8 (descending,
         general numeric) with the extra options from ``get_gnu_sort_param``,
         peaks are renamed ``Peak_<row number>``, negative columns 2/3 are set
         to 0 and a column 10 of -1 is replaced by an offset computed from
         columns 2 and 3.
      4. The first ``cap_num_peak`` rows are kept and passed to ``bed_clip``
         together with ``chrsz``.
      5. Intermediate files and everything matching ``<prefix>_*`` are removed.

    Args:
        ta: Treatment input passed to MACS2 via ``-t``.
        ctl_ta: Optional control input passed via ``-c``; falsy to omit.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        gensz: Value for MACS2 ``-g``.
        pval_thresh: Value for MACS2 ``-p``; also embedded in the output name.
        shift: Accepted but not used here; ``--shift`` is always 0.
        fraglen: Value for MACS2 ``--extsize``.
        cap_num_peak: Maximum number of peaks kept (``head -n``).
        ctl_subsample: If truthy, subsample the control to this amount.
        ctl_paired_end: Selects the paired-end subsampling helper.
        mem_gb: Memory in GB; converted to bytes for ``get_gnu_sort_param``.
        out_dir: Directory used for all outputs.

    Returns:
        Path of the resulting ``*.narrowPeak.gz`` file.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))

    # Default to the treatment-only name; extended below when a control exists.
    basename_prefix = basename_ta
    if ctl_ta:
        if ctl_subsample:
            shared_kwargs = dict(non_mito=False,
                                 mito_chr_name=None,
                                 out_dir=out_dir)
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta, ctl_subsample,
                                         r1_only=False, **shared_kwargs)
            else:
                ctl_ta = subsample_ta_se(ctl_ta, ctl_subsample,
                                         **shared_kwargs)

        basename_prefix = '{}_x_{}'.format(
            basename_ta, os.path.basename(strip_ext_ta(ctl_ta)))
        # UNIX cannot have len(filename) > 255, so fall back to a short name.
        if len(basename_prefix) > 200:
            basename_prefix = '{}_x_control'.format(basename_ta)

    prefix = os.path.join(out_dir, basename_prefix)
    npeak = '{}.pval{}.{}.narrowPeak.gz'.format(
        prefix, pval_thresh, human_readable_number(cap_num_peak))
    sorted_tmp = npeak + '.tmp'
    capped_tmp = npeak + '.tmp2'

    ctl_param = '-c {}'.format(ctl_ta) if ctl_ta else ''
    callpeak_cmd = ' macs2 callpeak '
    callpeak_cmd += '-t {} {} -f BED -n {} -g {} -p {} '.format(
        ta, ctl_param, prefix, gensz, pval_thresh)
    callpeak_cmd += '--nomodel --shift {} --extsize {} '.format(0, fraglen)
    callpeak_cmd += '--keep-dup all -B --SPMR'
    run_shell_cmd(callpeak_cmd)

    # awk program kept as a plain literal (no str.format brace escaping).
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    sort_stage = 'LC_COLLATE=C sort -k 8gr,8gr {} "{}_peaks.narrowPeak"'.format(
        get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5), prefix)
    awk_stage = "awk '{}' > {}".format(awk_prog, sorted_tmp)
    run_shell_cmd(' | '.join([sort_stage, awk_stage]))

    run_shell_cmd('head -n {} {} > {}'.format(cap_num_peak, sorted_tmp,
                                              capped_tmp))

    # clip peaks between 0-chromSize.
    bed_clip(capped_tmp, chrsz, npeak)

    rm_f([sorted_tmp, capped_tmp])

    # remove the files MACS2 wrote under "<prefix>_"
    rm_f(['{}_*'.format(prefix)])

    return npeak
def idr(basename_prefix, peak1, peak2, peak_pooled, peak_type, chrsz,
        thresh, rank, mem_gb, out_dir):
    """Run ``idr`` on two peak files and produce a thresholded peak file.

    The ``idr`` command is run with ``--plot`` on ``peak1``/``peak2`` against
    ``peak_pooled``. Its unthresholded output is clipped with ``bed_clip``,
    rows whose column 12 is at least ``-log10(thresh)`` are kept,
    de-duplicated and sorted on the column returned by
    ``get_npeak_col_by_rank(rank)``, and the first 10 columns are written
    gzipped. The clipped unthresholded output is gzipped as well.

    Args:
        basename_prefix: Basename (without directory) for all outputs.
        peak1: First ``--samples`` peak file.
        peak2: Second ``--samples`` peak file.
        peak_pooled: Peak file passed as ``--peak-list``.
        peak_type: Used as the extension of the final peak file.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        thresh: IDR threshold; must be > 0 because ``math.log10`` is applied.
        rank: Value for ``idr --rank``; also selects the sort column.
        mem_gb: Memory in GB; converted to bytes for ``get_gnu_sort_param``.
        out_dir: Directory used for all outputs.

    Returns:
        Tuple ``(idr_peak, idr_plot, idr_out_gz, idr_stdout)`` of paths.
        ``idr_plot`` is only a path computed here; it is presumably written
        by ``idr --plot`` (verify against the idr tool).
    """
    prefix = os.path.join(out_dir, basename_prefix)
    prefix += '.idr{}'.format(thresh)
    idr_peak = '{}.{}.gz'.format(prefix, peak_type)
    idr_plot = '{}.unthresholded-peaks.txt.png'.format(prefix)
    idr_stdout = '{}.log'.format(prefix)
    # temporary
    # Build the scratch file from ``prefix`` (not ``peak_type`` alone) so it
    # lives in ``out_dir`` and is unique per run; a bare
    # "<peak_type>.12-col.bed.gz" in the current working directory would be
    # shared, and clobbered, by concurrent runs.
    idr_12col_bed = '{}.{}.12-col.bed.gz'.format(prefix, peak_type)
    idr_out = '{}.unthresholded-peaks.txt'.format(prefix)
    idr_tmp = '{}.unthresholded-peaks.txt.tmp'.format(prefix)
    idr_out_gz = '{}.unthresholded-peaks.txt.gz'.format(prefix)

    run_shell_cmd(
        'idr --samples {peak1} {peak2} --peak-list {peak_pooled} --input-file-type narrowPeak '
        '--output-file {idr_out} --rank {rank} --soft-idr-threshold {thresh} '
        '--plot --use-best-multisummit-IDR --log-output-file {idr_stdout}'.format(
            peak1=peak1,
            peak2=peak2,
            peak_pooled=peak_pooled,
            idr_out=idr_out,
            rank=rank,
            thresh=thresh,
            idr_stdout=idr_stdout,
        )
    )

    # clip peaks between 0-chromSize.
    bed_clip(idr_out, chrsz, idr_tmp, no_gz=True)

    col = get_npeak_col_by_rank(rank)
    neg_log10_thresh = -math.log10(thresh)
    # Keep rows with column 12 >= -log10(thresh), clamp negative starts to 0,
    # drop duplicate rows, then sort descending on the rank column. Both sort
    # invocations receive the same extra options from get_gnu_sort_param.
    run_shell_cmd(
        'awk \'BEGIN{{OFS="\\t"}} $12>={neg_log10_thresh} '
        '{{if ($2<0) $2=0; '
        'print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' {idr_tmp} '
        '| sort {sort_param} | uniq | sort -grk{col},{col} {sort_param} | gzip -nc > {idr_12col_bed}'.format(
            neg_log10_thresh=neg_log10_thresh,
            idr_tmp=idr_tmp,
            sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
            col=col,
            idr_12col_bed=idr_12col_bed,
        )
    )

    # Reduce the 12-column intermediate to the first 10 columns.
    run_shell_cmd(
        'zcat {idr_12col_bed} | '
        'awk \'BEGIN{{OFS="\\t"}} '
        '{{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}}\' | '
        'gzip -nc > {idr_peak}'.format(
            idr_12col_bed=idr_12col_bed,
            idr_peak=idr_peak,
        )
    )

    # Keep a gzipped copy of the clipped, unthresholded output.
    run_shell_cmd(
        'cat {idr_tmp} | gzip -nc > {idr_out_gz}'.format(
            idr_tmp=idr_tmp,
            idr_out_gz=idr_out_gz,
        )
    )

    rm_f([idr_out, idr_tmp, idr_12col_bed])
    rm_f('{prefix}.*.noalternatesummitpeaks.png'.format(prefix=prefix))
    return idr_peak, idr_plot, idr_out_gz, idr_stdout