def spp(ta, ctl_ta, fraglen, cap_num_peak, nth, out_dir):
    """Call peaks with run_spp.R and return a gzipped regionPeak file.

    Args:
        ta: Path of the file passed to run_spp.R as ``-c=``.
        ctl_ta: Path of the control file passed to run_spp.R as ``-i=``.
        fraglen: Value passed as ``-speak=``.
        cap_num_peak: Value passed as ``-npeak=``; also embedded in the
            output file name.
        nth: Number of threads. ``-p=<nth>`` is only added when it is at
            least 2.
        out_dir: Output directory (passed to run_spp.R as an absolute path).

    Returns:
        Path of the resulting ``*.regionPeak.gz`` file.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))
    basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
    basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
    if len(basename_prefix) > 200:  # UNIX cannot have filename > 255
        basename_prefix = '{}_x_control'.format(basename_ta)
    # Request parallel processing only when more than one thread is given.
    # (The previous `nth < 2` test was inverted and dropped the option
    # exactly when multiple threads were requested.)
    nth_param = '-p={}'.format(nth) if nth >= 2 else ''
    prefix = os.path.join(out_dir, basename_prefix)
    rpeak = '{}.{}.regionPeak.gz'.format(prefix,
                                         human_readable_number(cap_num_peak))
    rpeak_tmp = '{}.tmp'.format(rpeak)
    rpeak_tmp_gz = '{}.tmp.gz'.format(rpeak)

    cmd0 = 'Rscript --max-ppsize=500000 $(which run_spp.R) -c={} -i={} '
    cmd0 += '-npeak={} -odir={} -speak={} -savr={} -rf {}'
    cmd0 = cmd0.format(ta, ctl_ta, cap_num_peak, os.path.abspath(out_dir),
                       fraglen, rpeak_tmp, nth_param)
    run_shell_cmd(cmd0)

    # if we have scientific representation of chr coord. then convert it to int
    # (negative start coordinates are also clamped to 0)
    cmd1 = 'zcat -f {} | awk \'BEGIN{{OFS="\\t"}}'
    cmd1 += '{{if ($2<0) $2=0; '
    cmd1 += 'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}\' | '
    cmd1 += 'gzip -f -nc > {}'
    cmd1 = cmd1.format(rpeak_tmp, rpeak)
    run_shell_cmd(cmd1)
    # remove both possible forms of the intermediate file
    rm_f([rpeak_tmp, rpeak_tmp_gz])

    return rpeak
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, out_dir):
    """Call narrow peaks with ``macs2 callpeak``, optionally against a control.

    When a control is given and ``ctl_subsample`` is truthy, the control is
    first subsampled (paired-end or single-ended helper depending on
    ``ctl_paired_end``). Peaks are sorted by column 8 (descending), renamed
    ``Peak_<rank>``, capped to ``cap_num_peak`` lines and clipped with
    ``bed_clip`` against ``chrsz``.

    NOTE(review): ``shift`` is accepted but not used; ``--shift`` is always
    passed as 0.

    Returns:
        Path of the resulting ``*.narrowPeak.gz`` file.
    """
    ta_name = os.path.basename(strip_ext_ta(ta))
    if not ctl_ta:
        out_name = ta_name
    else:
        if ctl_subsample and ctl_paired_end:
            ctl_ta = subsample_ta_pe(ctl_ta,
                                     ctl_subsample,
                                     non_mito=False,
                                     mito_chr_name=None,
                                     r1_only=False,
                                     out_dir=out_dir)
        elif ctl_subsample:
            ctl_ta = subsample_ta_se(ctl_ta,
                                     ctl_subsample,
                                     non_mito=False,
                                     mito_chr_name=None,
                                     out_dir=out_dir)

        ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{}_x_{}'.format(ta_name, ctl_name)
        # UNIX cannot have len(filename) > 255
        if len(out_name) > 200:
            out_name = '{}_x_control'.format(ta_name)

    prefix = os.path.join(out_dir, out_name)
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix, 'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    sorted_peaks = '{}.tmp'.format(npeak)
    capped_peaks = '{}.tmp2'.format(npeak)

    control_opt = '-c {}'.format(ctl_ta) if ctl_ta else ''
    run_shell_cmd(
        ' macs2 callpeak '
        '-t {ta} {control} -f BED -n {prefix} -g {gensz} -p {pval} '
        '--nomodel --shift {shift} --extsize {extsize} --keep-dup all -B --SPMR'
        .format(ta=ta, control=control_opt, prefix=prefix, gensz=gensz,
                pval=pval_thresh, shift=0, extsize=fraglen))

    # rank peaks, clamp negative coordinates and fill in a missing summit
    run_shell_cmd(
        'LC_COLLATE=C sort -k 8gr,8gr "{prefix}"_peaks.narrowPeak | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {sorted_peaks}'
        .format(prefix=prefix, sorted_peaks=sorted_peaks))

    run_shell_cmd('head -n {} {} > {}'.format(cap_num_peak, sorted_peaks,
                                              capped_peaks))

    # clip peaks between 0-chromSize.
    bed_clip(capped_peaks, chrsz, npeak)

    rm_f([sorted_peaks, capped_peaks])

    # remove the files macs2 wrote under the "<prefix>_" name
    rm_f(["{}_*".format(prefix)])

    return npeak
Example #3
0
def spp(ta, ctl_ta, chrsz, fraglen, cap_num_peak, fdr_thresh, ctl_subsample,
        ctl_paired_end, nth, out_dir):
    """Call peaks with run_spp.R and return a clipped regionPeak file.

    The control is subsampled first when ``ctl_subsample`` is truthy
    (paired-end or single-ended helper depending on ``ctl_paired_end``).
    The run_spp.R output is read back from ``<rpeak>.tmp.gz``, coordinates
    are converted to integers, and the result is clipped with ``bed_clip``.

    Returns:
        Path of the resulting ``*.regionPeak.gz`` file.
    """
    ta_name = os.path.basename(strip_ext_ta(ta))

    if ctl_subsample and ctl_paired_end:
        ctl_ta = subsample_ta_pe(ctl_ta,
                                 ctl_subsample,
                                 non_mito=False,
                                 mito_chr_name=None,
                                 r1_only=False,
                                 out_dir=out_dir)
    elif ctl_subsample:
        ctl_ta = subsample_ta_se(ctl_ta,
                                 ctl_subsample,
                                 non_mito=False,
                                 mito_chr_name=None,
                                 out_dir=out_dir)

    ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
    out_name = '{}_x_{}'.format(ta_name, ctl_name)
    # UNIX cannot have filename > 255
    if len(out_name) > 200:
        out_name = '{}_x_control'.format(ta_name)

    # the thread option is only passed for two or more threads
    if nth >= 2:
        threads_opt = '-p={}'.format(nth)
    else:
        threads_opt = ''

    prefix = os.path.join(out_dir, out_name)
    rpeak = '{}.{}.regionPeak.gz'.format(prefix,
                                         human_readable_number(cap_num_peak))
    spp_out_prefix = '{}.tmp'.format(rpeak)
    spp_out_gz = '{}.tmp.gz'.format(rpeak)
    int_coord_peaks = '{}.tmp2'.format(rpeak)

    run_shell_cmd(
        'Rscript --max-ppsize=500000 $(which run_spp.R) -c={ta} -i={ctl} '
        '-npeak={npeak} -odir={odir} -speak={speak} -savr={savr} '
        '-fdr={fdr} -rf {threads}'.format(
            ta=ta,
            ctl=ctl_ta,
            npeak=cap_num_peak,
            odir=os.path.abspath(out_dir),
            speak=fraglen,
            savr=spp_out_prefix,
            fdr=fdr_thresh,
            threads=threads_opt,
        ))

    # if we have scientific representation of chr coord. then convert it to int
    run_shell_cmd(
        'zcat -f {src} | awk \'BEGIN{{OFS="\\t"}}'
        '{{if ($2<0) $2=0; '
        'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}\' > {dst}'.format(
            src=spp_out_gz,
            dst=int_coord_peaks,
        ))
    rm_f(spp_out_gz)

    # clip peaks between 0-chromSize.
    bed_clip(int_coord_peaks, chrsz, rpeak)
    rm_f(int_coord_peaks)

    return rpeak
def spr_se(ta, out_dir):
    """Split a single-ended TAG-ALIGN into two shuffled pseudoreplicates.

    Lines are shuffled with ``shuf`` using a random source keyed by the
    uncompressed byte count of ``ta``, then split into two halves of
    ``nlines`` lines each (the first half gets the extra line when the
    count is odd).

    Args:
        ta: Path of the input TAG-ALIGN (gzipped or plain).
        out_dir: Directory receiving the two output files.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of gzipped pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # names produced by `split -d` with the "<prefix>." prefix
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    nlines = int((get_num_lines(ta) + 1) / 2)

    # bash-only (process substitution)
    # `zcat -f` is used for the data stream too, matching the seed
    # sub-command, so an uncompressed input is passed through unchanged
    # instead of making zcat fail.
    cmd1 = 'zcat -f {} | shuf --random-source=<(openssl enc '
    cmd1 += '-aes-256-ctr -pass pass:$(zcat -f {} | wc -c) '
    cmd1 += '-nosalt </dev/zero 2>/dev/null) | '
    cmd1 += 'split -d -l {} - {}.'
    cmd1 = cmd1.format(ta, ta, nlines, prefix)
    run_shell_cmd(cmd1)

    cmd2 = 'gzip -nc {} > {}'
    cmd2 = cmd2.format(tmp_pr1, ta_pr1)
    run_shell_cmd(cmd2)

    cmd3 = 'gzip -nc {} > {}'
    cmd3 = cmd3.format(tmp_pr2, ta_pr2)
    run_shell_cmd(cmd3)

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
def count_signal_track(ta, chrsz, out_dir):
    """Build per-strand 5'-end coverage bigWig tracks from a TAG-ALIGN.

    For each strand the input is sorted and piped through
    ``bedtools genomecov -5 -bg``; the resulting bedGraph files are then
    converted with ``bedGraphToBigWig`` and removed.

    Returns:
        Tuple ``(pos_bw, neg_bw)`` of bigWig paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    pos_bw = '{}.positive.bigwig'.format(prefix)
    neg_bw = '{}.negative.bigwig'.format(prefix)
    # temporary files
    pos_bedgraph = '{}.positive.bedgraph'.format(prefix)
    neg_bedgraph = '{}.negative.bedgraph'.format(prefix)

    tracks = (
        ('+', pos_bedgraph, pos_bw),
        ('-', neg_bedgraph, neg_bw),
    )

    # first both coverage computations ...
    for strand, bedgraph, _ in tracks:
        run_shell_cmd(
            'zcat -f {ta} | sort -k1,1 -k2,2n | '
            'bedtools genomecov -5 -bg -strand {strand} -g {chrsz} '
            '-i stdin > {bedgraph}'.format(
                ta=ta,
                strand=strand,
                chrsz=chrsz,
                bedgraph=bedgraph,
            ))

    # ... then both bigWig conversions
    for _, bedgraph, bigwig in tracks:
        run_shell_cmd('bedGraphToBigWig {bedgraph} {chrsz} {bigwig}'.format(
            bedgraph=bedgraph,
            chrsz=chrsz,
            bigwig=bigwig,
        ))

    # remove temporary files
    rm_f([pos_bedgraph, neg_bedgraph])

    return pos_bw, neg_bw
def subsample_ta_se(ta, subsample, non_mito, mito_chr_name, out_dir):
    """Filter and/or randomly subsample a single-ended TAG-ALIGN.

    Args:
        ta: Path of the input TAG-ALIGN.
        subsample: Number of lines to keep; values <= 0 keep everything.
        non_mito: When truthy, lines starting with ``mito_chr_name`` are
            dropped.
        mito_chr_name: Chromosome name used for the mitochondrial filter.
        out_dir: Directory receiving the output.

    Returns:
        Path of the gzipped output TAG-ALIGN.
    """
    out_prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    mito_tag = 'no_chrM.' if non_mito else ''
    if subsample > 0:
        size_tag = '{}.'.format(human_readable_number(subsample))
    else:
        size_tag = ''
    ta_subsampled = '{}.{}{}tagAlign.gz'.format(out_prefix, mito_tag, size_tag)

    # bash-only (process substitution in the shuf stage)
    # The stages are joined before formatting, so mito_chr_name goes
    # through str.format together with the rest of the template.
    stages = ['zcat -f {} | ']
    if non_mito:
        stages.append('grep -v \'^' + mito_chr_name + '\\b\' | ')
    if subsample > 0:
        # random source keyed by the uncompressed byte count of the input
        stages.append('shuf -n {} --random-source=<(openssl enc -aes-256-ctr ')
        stages.append('-pass pass:$(zcat -f {} | wc -c) -nosalt ')
        stages.append('</dev/zero 2>/dev/null) | ')
        stages.append('gzip -nc > {}')
        fmt_args = (ta, subsample, ta, ta_subsampled)
    else:
        stages.append('gzip -nc > {}')
        fmt_args = (ta, ta_subsampled)

    run_shell_cmd(''.join(stages).format(*fmt_args))
    return ta_subsampled
def pool_ta(tas, out_dir):
    """Concatenate several TAG-ALIGN files into one gzipped pooled file.

    With fewer than two inputs nothing is pooled; the result of
    ``make_hard_link(tas[0], out_dir)`` is returned instead.

    Returns:
        Path of the pooled file, named after the first input.
    """
    if len(tas) <= 1:
        return make_hard_link(tas[0], out_dir)

    first_name = os.path.basename(strip_ext_ta(tas[0]))
    pooled_ta = '{}.pooled.tagAlign.gz'.format(
        os.path.join(out_dir, first_name))

    inputs = ' '.join(tas)
    run_shell_cmd('zcat -f {src} | gzip -nc > {dst}'.format(src=inputs,
                                                           dst=pooled_ta))
    return pooled_ta
Example #8
0
def tn5_shift_ta(ta, out_dir):
    """Apply the Tn5 offset to a TAG-ALIGN and return the shifted file.

    Lines whose 6th column is "+" get column 2 increased by 4; lines whose
    6th column is "-" get column 3 decreased by 5. Other lines pass through
    unchanged.

    Returns:
        Path of the gzipped ``*.tn5.tagAlign.gz`` file.
    """
    out_prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    shifted_ta = '{}.tn5.tagAlign.gz'.format(out_prefix)

    run_shell_cmd(
        'zcat -f {src} | '
        'awk \'BEGIN {{OFS = "\\t"}}'
        '{{ if ($6 == "+") {{$2 = $2 + 4}} '
        'else if ($6 == "-") {{$3 = $3 - 5}} print $0}}\' | '
        'gzip -nc > {dst}'.format(src=ta, dst=shifted_ta))
    return shifted_ta
Example #9
0
def spr_pe(ta, pseudoreplication_random_seed, out_dir):
    """Split a paired-end TAG-ALIGN into two shuffled pseudoreplicates.

    Consecutive line pairs are merged into one line, shuffled with a seeded
    random source, split into two chunks, and expanded back into two lines
    per pair.

    Args:
        ta: Path of the input TAG-ALIGN.
        pseudoreplication_random_seed: Seed for shuffling; 0 means "use the
            output of ``zcat -f ta | wc -c``" as the seed.
        out_dir: Directory receiving the two output files.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of gzipped pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # chunk names produced by `split -d`
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    # number of read pairs per chunk (two lines per pair in the input)
    nlines = int((get_num_lines(ta) / 2 + 1) / 2)

    if pseudoreplication_random_seed == 0:
        # the return value of run_shell_cmd is used directly as the seed
        random_seed = run_shell_cmd('zcat -f {} | wc -c'.format(ta))
        seed_msg = ("Using input file's size {} as random seed "
                    "for pseudoreplication.")
    else:
        random_seed = pseudoreplication_random_seed
        seed_msg = ('Using a fixed integer {} as random seed '
                    'for pseudoreplication.')
    log.info(seed_msg.format(random_seed))

    # bash-only (process substitution)
    shuffle_cmd = (
        'zcat -f {} | sed \'N;s/\\n/\\t/\' | '
        'shuf --random-source=<(openssl enc -aes-256-ctr '
        '-pass pass:{} -nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {} - {}.'
    )
    run_shell_cmd(shuffle_cmd.format(ta, random_seed, nlines, prefix))

    # expand every 12-column merged line back into two 6-column lines
    for chunk, pseudorep in ((tmp_pr1, ta_pr1), (tmp_pr2, ta_pr2)):
        run_shell_cmd('zcat -f {} | '
                      'awk \'BEGIN{{OFS="\\t"}} '
                      '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
                      '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
                      '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
                      'gzip -nc > {}'.format(chunk, pseudorep))

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
def main():
    """Compute read fractions in annotated regions and write a QC file.

    For each of the optional region files (DNase, blacklist, promoter,
    enhancer) given on the command line, ``get_fract_reads_in_regions`` is
    called on the input and one tab-separated row
    ``<label> <reads> <fraction>`` is written to
    ``<out_dir>/<basename>.annot_enrich.qc``.
    """
    # read params
    args = parse_arguments()
    final_bed = args.ta
    output_prefix = os.path.join(args.out_dir,
                                 os.path.basename(strip_ext_ta(final_bed)))

    def usable(path):
        # an unset path or one whose basename is "null" disables the region
        if path and os.path.basename(path) != 'null':
            return path
        return ''

    # (QC label, region file) in the order the rows are written
    regions = [
        ('fraction_of_reads_in_universal_DHS_regions', usable(args.dnase)),
        ('fraction_of_reads_in_blacklist_regions', usable(args.blacklist)),
        ('fraction_of_reads_in_promoter_regions', usable(args.prom)),
        ('fraction_of_reads_in_enhancer_regions', usable(args.enh)),
    ]

    rows = []
    for label, region_bed in regions:
        if not region_bed:
            continue
        n_reads, fraction = get_fract_reads_in_regions(final_bed, region_bed)
        rows.append((label, str(n_reads), str(fraction)))

    annot_enrich_qc = output_prefix + '.annot_enrich.qc'
    with open(annot_enrich_qc, 'w') as fp:
        for row in rows:
            fp.write('\t'.join(row) + '\n')

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, mem_gb,
          out_dir):
    """Call narrow peaks with ``macs2 callpeak`` (no control).

    Reads are shifted by minus half of ``smooth_win`` and extended by
    ``smooth_win``. Peaks are sorted by column 8 (descending), renamed
    ``Peak_<rank>``, capped to ``cap_num_peak`` lines and clipped with
    ``bed_clip`` against ``chrsz``. ``mem_gb`` is converted to bytes and
    handed to ``get_gnu_sort_param`` to build extra ``sort`` options.

    Returns:
        Path of the resulting ``*.narrowPeak.gz`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix, 'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    # temporary files
    sorted_peaks = '{}.tmp'.format(npeak)
    capped_peaks = '{}.tmp2'.format(npeak)

    shiftsize = -int(round(float(smooth_win) / 2.0))

    callpeak = 'macs2 callpeak '
    callpeak += '-t {} -f BED -n {} -g {} -p {} '
    callpeak += '--shift {} --extsize {} '
    callpeak += '--nomodel -B --SPMR --keep-dup all --call-summits'
    run_shell_cmd(callpeak.format(ta, prefix, gensz, pval_thresh, shiftsize,
                                  smooth_win))

    # rank peaks, clamp negative coordinates and fill in a missing summit
    rank = 'LC_COLLATE=C sort -k 8gr,8gr {} "{}"_peaks.narrowPeak | '
    rank += 'awk \'BEGIN{{OFS="\\t"}}'
    rank += '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
    rank += '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {}'
    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
    run_shell_cmd(rank.format(sort_param, prefix, sorted_peaks))

    run_shell_cmd('head -n {} {} > {}'.format(cap_num_peak, sorted_peaks,
                                              capped_peaks))

    # clip peaks between 0-chromSize.
    bed_clip(capped_peaks, chrsz, npeak)

    rm_f([sorted_peaks, capped_peaks])

    # remove the files macs2 wrote under the "<prefix>_" name
    rm_f(["{}_*".format(prefix)])

    return npeak
def subsample_ta_pe(ta, subsample, non_mito, mito_chr_name, r1_only, out_dir):
    """Filter and/or randomly subsample a paired-end TAG-ALIGN.

    Consecutive line pairs are merged into one line so that both mates are
    kept or dropped together, then expanded again (or reduced to the first
    mate only when ``r1_only`` is truthy).

    Args:
        ta: Path of the input TAG-ALIGN.
        subsample: Number of pairs to keep; values <= 0 keep everything.
        non_mito: When truthy, lines starting with ``mito_chr_name`` are
            dropped before pairing.
        mito_chr_name: Chromosome name used for the mitochondrial filter.
        r1_only: When truthy, only the first six columns of each merged
            line are written.
        out_dir: Directory receiving the output.

    Returns:
        Path of the gzipped output TAG-ALIGN.
    """
    out_prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    name_tags = [
        'no_chrM.' if non_mito else '',
        'R1.' if r1_only else '',
        '{}.'.format(human_readable_number(subsample)) if subsample > 0 else '',
    ]
    ta_subsampled = '{}.{}{}{}tagAlign.gz'.format(out_prefix, *name_tags)
    ta_tmp = '{}.tagAlign.tmp'.format(out_prefix)

    # Stage 1: merge mate pairs onto single lines, optionally subsample.
    # mito_chr_name is spliced in before str.format is applied.
    pairing = 'zcat -f {} | '
    if non_mito:
        pairing += 'grep -v \'^' + mito_chr_name + '\\b\' | '
    pairing += 'sed \'N;s/\\n/\\t/\' '
    if subsample > 0:
        # random source keyed by the uncompressed byte count of the input
        pairing += (
            '| shuf -n {} --random-source=<(openssl enc -aes-256-ctr '
            '-pass pass:$(zcat -f {} | wc -c) -nosalt '
            '</dev/zero 2>/dev/null) > {}'
        )
        pairing_args = (ta, subsample, ta, ta_tmp)
    else:
        pairing += '> {}'
        pairing_args = (ta, ta_tmp)
    run_shell_cmd(pairing.format(*pairing_args))

    # Stage 2: expand merged lines back into TAG-ALIGN lines.
    if r1_only:
        awk_body = ('{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
                    '",$1,$2,$3,$4,$5,$6}}\' | ')
    else:
        awk_body = ('{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
                    '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
                    '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | ')
    unpair = 'cat {} | awk \'BEGIN{{OFS="\\t"}} ' + awk_body + 'gzip -nc > {}'
    run_shell_cmd(unpair.format(ta_tmp, ta_subsampled))

    rm_f(ta_tmp)
    return ta_subsampled
Example #13
0
def count_signal_track(ta, chrsz, mem_gb, out_dir):
    """Build per-strand 5'-end coverage bigWig tracks from a TAG-ALIGN.

    For each strand the input is sorted (with extra options from
    ``get_gnu_sort_param``, fed ``mem_gb`` converted to bytes) and piped
    through ``bedtools genomecov -5 -bg``; the bedGraph files are then
    converted with ``bedGraphToBigWig`` and removed.

    Returns:
        Tuple ``(pos_bw, neg_bw)`` of bigWig paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    pos_bw = '{}.positive.bigwig'.format(prefix)
    neg_bw = '{}.negative.bigwig'.format(prefix)
    # temporary files
    pos_bedgraph = '{}.positive.bedgraph'.format(prefix)
    neg_bedgraph = '{}.negative.bedgraph'.format(prefix)

    tracks = (
        ('+', pos_bedgraph, pos_bw),
        ('-', neg_bedgraph, neg_bw),
    )

    # first both coverage computations ...
    coverage = 'zcat -f {} | sort -k1,1 -k2,2n {} | '
    coverage += 'bedtools genomecov -5 -bg -strand {} -g {} -i stdin > {}'
    for strand, bedgraph, _ in tracks:
        sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
        run_shell_cmd(
            coverage.format(ta, sort_param, strand, chrsz, bedgraph))

    # ... then both bigWig conversions
    for _, bedgraph, bigwig in tracks:
        run_shell_cmd(
            'bedGraphToBigWig {} {} {}'.format(bedgraph, chrsz, bigwig))

    # remove temporary files
    rm_f([pos_bedgraph, neg_bedgraph])

    return pos_bw, neg_bw
Example #14
0
def pool_ta(tas, col, basename_prefix, out_dir):
    """Concatenate several TAG-ALIGN (or BED) files into one pooled file.

    Args:
        tas: List of input paths; at least two are required.
        col: When not None, only columns 1..col are kept (``cut -f 1-col``).
        basename_prefix: Output base name; when None the base name of the
            first input is used.
        out_dir: Directory receiving the pooled file.

    Returns:
        Path of the gzipped pooled file.

    Raises:
        ValueError: If fewer than two inputs are given.
    """
    if len(tas) <= 1:
        raise ValueError('Needs at least two TAs (or BEDs) to be pooled.')

    if basename_prefix is None:
        out_name = os.path.basename(strip_ext_ta(tas[0]))
    else:
        out_name = basename_prefix
    pooled_ta = '{}.pooled.tagAlign.gz'.format(os.path.join(out_dir, out_name))

    stages = ['zcat -f {} | ']
    if col is not None:
        stages.append('cut -f 1-{} | '.format(col))
    stages.append('gzip -nc > {}')

    run_shell_cmd(''.join(stages).format(' '.join(tas), pooled_ta))
    return pooled_ta
def pool_ta(tas, col, basename_prefix, out_dir):
    """Concatenate several TAG-ALIGN files into one gzipped pooled file.

    Args:
        tas: List of input paths.
        col: When not None, only columns 1..col are kept (``cut -f 1-col``).
        basename_prefix: Output base name; when None the base name of the
            first input is used.
        out_dir: Directory receiving the pooled file.

    Returns:
        Path of the pooled file, or the result of
        ``make_hard_link(tas[0], out_dir)`` when fewer than two inputs are
        given.
    """
    if len(tas) > 1:
        if basename_prefix is not None:
            # use the caller-supplied name (previously the literal string
            # 'basename_prefix' was joined here by mistake)
            prefix = os.path.join(out_dir, basename_prefix)
        else:
            prefix = os.path.join(out_dir,
                                  os.path.basename(strip_ext_ta(tas[0])))
        pooled_ta = '{}.pooled.tagAlign.gz'.format(prefix)

        cmd = 'zcat -f {} | '
        if col is not None:
            cmd += 'cut -f 1-{} | '.format(col)
        cmd += 'gzip -nc > {}'
        cmd = cmd.format(
            ' '.join(tas),
            pooled_ta)
        run_shell_cmd(cmd)
        return pooled_ta
    else:
        # nothing to pool: hand the single input to make_hard_link
        return make_hard_link(tas[0], out_dir)
Example #16
0
def xcor(ta,
         speak,
         mito_chr_name,
         nth,
         out_dir,
         chip_seq_type=None,
         exclusion_range_min=None,
         exclusion_range_max=None):
    """Run cross-correlation analysis with run_spp.R.

    Args:
        ta: Path of the input passed to run_spp.R as ``-c=``.
        speak: Passed as ``-speak=`` when it is >= 0, otherwise omitted.
        mito_chr_name: Passed as ``-filtchr=``.
        nth: Passed as ``-p=``.
        out_dir: Directory receiving all outputs.
        chip_seq_type: Together with ``exclusion_range_min`` enables the
            ``-x=min:max`` option.
        exclusion_range_min: Lower bound of the exclusion range.
        exclusion_range_max: Upper bound; when None it is obtained from
            ``get_exclusion_range_max(ta, chip_seq_type)``.

    Returns:
        Tuple ``(plot_pdf, plot_png, score_file, fraglen_txt)``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    xcor_plot_pdf = '{}.cc.plot.pdf'.format(prefix)
    xcor_score = '{}.cc.qc'.format(prefix)
    fraglen_txt = '{}.cc.fraglen.txt'.format(prefix)

    # the exclusion option needs both the assay type and a lower bound
    exclusion_opt = ''
    if chip_seq_type is not None and exclusion_range_min is not None:
        if exclusion_range_max is None:
            exclusion_range_max = get_exclusion_range_max(ta, chip_seq_type)
        exclusion_opt = ' -x={}:{}'.format(exclusion_range_min,
                                           exclusion_range_max)

    speak_opt = '-speak={}'.format(speak) if speak >= 0 else ''
    # the exclusion option is appended before formatting
    template = ('Rscript --max-ppsize=500000 $(which run_spp.R) '
                '-rf -c={} -p={} '
                '-filtchr="{}" -savp={} -out={} {}') + exclusion_opt
    run_shell_cmd(template.format(ta, nth, mito_chr_name, xcor_plot_pdf,
                                  xcor_score, speak_opt))

    # strip everything from a comma up to the next tab, in place
    run_shell_cmd('sed -r \'s/,[^\\t]+//g\' -i {}'.format(xcor_score))

    # parse xcor_score and write the estimated fragment length to file
    fraglen = parse_xcor_score(xcor_score)['estimated_fragment_len']
    run_shell_cmd('echo {} > {}'.format(fraglen, fraglen_txt))

    xcor_plot_png = pdf2png(xcor_plot_pdf, out_dir)
    return xcor_plot_pdf, xcor_plot_png, xcor_score, fraglen_txt
Example #17
0
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, out_dir):
    """Call narrow peaks with ``macs2 callpeak`` (no control).

    Reads are shifted by minus half of ``smooth_win`` and extended by
    ``smooth_win``. Peaks are sorted by column 8 (descending), renamed
    ``Peak_<rank>``, capped to ``cap_num_peak`` lines and clipped with
    ``bed_clip`` against ``chrsz``.

    Returns:
        Path of the resulting ``*.narrowPeak.gz`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix, 'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    # temporary files
    sorted_peaks = '{}.tmp'.format(npeak)
    capped_peaks = '{}.tmp2'.format(npeak)

    shiftsize = -int(round(float(smooth_win) / 2.0))

    run_shell_cmd('macs2 callpeak '
                  '-t {ta} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
                  '--shift {shiftsize} --extsize {extsize} '
                  '--nomodel -B --SPMR '
                  '--keep-dup all --call-summits '.format(
                      ta=ta,
                      prefix=prefix,
                      gensz=gensz,
                      pval_thresh=pval_thresh,
                      shiftsize=shiftsize,
                      extsize=smooth_win,
                  ))

    # rank peaks, clamp negative coordinates and fill in a missing summit
    run_shell_cmd(
        'LC_COLLATE=C sort -k 8gr,8gr "{prefix}"_peaks.narrowPeak | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {sorted_peaks}'.format(
            prefix=prefix,
            sorted_peaks=sorted_peaks,
        ))

    run_shell_cmd('head -n {limit} {src} > {dst}'.format(
        limit=cap_num_peak,
        src=sorted_peaks,
        dst=capped_peaks,
    ))

    # clip peaks between 0-chromSize.
    bed_clip(capped_peaks, chrsz, npeak)

    rm_f([sorted_peaks, capped_peaks])

    # remove the files macs2 wrote under the "<prefix>_" name
    rm_f(["{}_*".format(prefix)])

    return npeak
Example #18
0
def spr_se(ta, pseudoreplication_random_seed, out_dir):
    """Split a single-ended TAG-ALIGN into two shuffled pseudoreplicates.

    Lines are shuffled with ``shuf`` using a seeded random source and split
    into two halves of ``nlines`` lines each (the first half gets the extra
    line when the count is odd).

    Args:
        ta: Path of the input TAG-ALIGN (gzipped or plain).
        pseudoreplication_random_seed: Seed for shuffling; 0 means "use the
            output of ``zcat -f ta | wc -c``" as the seed.
        out_dir: Directory receiving the two output files.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of gzipped pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # names produced by `split -d` with the "<prefix>." prefix
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    nlines = int((get_num_lines(ta) + 1) / 2)

    if pseudoreplication_random_seed == 0:
        # the return value of run_shell_cmd is used directly as the seed
        random_seed = run_shell_cmd('zcat -f {ta} | wc -c'.format(ta=ta))
        log.info(
            'Using input file\'s size {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))
    else:
        random_seed = pseudoreplication_random_seed
        log.info(
            'Using a fixed integer {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))

    # bash-only (process substitution)
    # `zcat -f` matches the seed command above, so an uncompressed input is
    # passed through unchanged instead of making zcat fail.
    run_shell_cmd('zcat -f {ta} | shuf --random-source=<(openssl enc '
                  '-aes-256-ctr -pass pass:{random_seed} '
                  '-nosalt </dev/zero 2>/dev/null) | '
                  'split -d -l {nlines} - {prefix}.'.format(
                      ta=ta,
                      random_seed=random_seed,
                      nlines=nlines,
                      prefix=prefix,
                  ))

    run_shell_cmd('gzip -nc {tmp_pr1} > {ta_pr1}'.format(tmp_pr1=tmp_pr1,
                                                         ta_pr1=ta_pr1))
    run_shell_cmd('gzip -nc {tmp_pr2} > {ta_pr2}'.format(tmp_pr2=tmp_pr2,
                                                         ta_pr2=ta_pr2))

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
def spr_pe(ta, out_dir):
    """Split a paired-end TAG-ALIGN into two shuffled pseudoreplicates.

    Consecutive line pairs are merged into one line, shuffled with a random
    source keyed by the uncompressed byte count of ``ta``, split into two
    chunks, and expanded back into two lines per pair.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of gzipped pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # chunk names produced by `split -d`
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    # number of read pairs per chunk (two lines per pair in the input)
    nlines = int((get_num_lines(ta) / 2 + 1) / 2)

    # bash-only (process substitution)
    run_shell_cmd(
        'zcat -f {ta} | sed \'N;s/\\n/\\t/\' | '
        'shuf --random-source=<(openssl enc -aes-256-ctr '
        '-pass pass:$(zcat -f {ta} | wc -c) '
        '-nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {nlines} - {prefix}.'.format(
            ta=ta,
            nlines=nlines,
            prefix=prefix,
        ))

    # expand every 12-column merged line back into two 6-column lines
    for chunk, pseudorep in ((tmp_pr1, ta_pr1), (tmp_pr2, ta_pr2)):
        run_shell_cmd(
            'zcat -f {chunk} | '
            'awk \'BEGIN{{OFS="\\t"}} '
            '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
            '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
            '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
            'gzip -nc > {pseudorep}'.format(
                chunk=chunk,
                pseudorep=pseudorep,
            ))

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
Example #20
0
def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, mem_gb,
                       out_dir):
    """Generate fold-enrichment and p-value signal bigWigs with MACS2.

    Runs ``macs2 callpeak`` once, then for each of the two ``macs2 bdgcmp``
    methods (``FE`` and ``ppois``) clips the bedGraph to ``chrsz``, sorts it
    while dropping overlapping rows, and converts it to bigWig. ``mem_gb`` is
    converted to bytes and handed to ``get_gnu_sort_param`` to build extra
    ``sort`` options.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    shiftsize = -int(round(float(smooth_win) / 2.0))

    def _bdg_to_bigwig(method, bedgraph, bedgraph_srt, bigwig):
        # Clip "<prefix>_<method>.bdg", sort/de-overlap it, convert to bigWig.
        clip = 'bedtools slop -i "{}"_{}.bdg -g {} -b 0 | '
        clip += 'bedClip stdin {} {}'
        run_shell_cmd(clip.format(prefix, method, chrsz, chrsz, bedgraph))

        # sort and remove any overlapping regions in bedgraph by comparing
        # two lines in a row
        dedup = 'LC_COLLATE=C sort -k1,1 -k2,2n {} {} | '
        dedup += 'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '
        dedup += '|| prev_chr==$1 && prev_chr_e<=$2)) '
        dedup += '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'
        sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
        run_shell_cmd(dedup.format(sort_param, bedgraph, bedgraph_srt))
        rm_f(bedgraph)

        run_shell_cmd(
            'bedGraphToBigWig {} {} {}'.format(bedgraph_srt, chrsz, bigwig))
        rm_f(bedgraph_srt)

    callpeak = 'macs2 callpeak '
    callpeak += '-t {} -f BED -n {} -g {} -p {} '
    callpeak += '--shift {} --extsize {} '
    callpeak += '--nomodel -B --SPMR '
    callpeak += '--keep-dup all --call-summits '
    run_shell_cmd(callpeak.format(ta, prefix, gensz, pval_thresh, shiftsize,
                                  smooth_win))

    # fold-enrichment track
    fe_cmp = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    fe_cmp += '-c "{}"_control_lambda.bdg '
    fe_cmp += '--o-prefix "{}" -m FE '
    run_shell_cmd(fe_cmp.format(prefix, prefix, prefix))
    _bdg_to_bigwig('FE', fc_bedgraph, fc_bedgraph_srt, fc_bigwig)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    # p-value track
    ppois_cmp = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    ppois_cmp += '-c "{}"_control_lambda.bdg '
    ppois_cmp += '--o-prefix {} -m ppois -S {}'
    run_shell_cmd(ppois_cmp.format(prefix, prefix, prefix, sval))
    _bdg_to_bigwig('ppois', pval_bedgraph, pval_bedgraph_srt, pval_bigwig)

    # remove the files macs2 wrote under the "<prefix>_" name
    rm_f(["{}_*".format(prefix)])

    return fc_bigwig, pval_bigwig
def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, out_dir):
    """Generate fold-enrichment and p-value signal bigWigs with MACS2.

    Runs ``macs2 callpeak`` on ``ta`` alone (no control file is passed) and
    then ``macs2 bdgcmp`` twice on the resulting pileup/lambda bedGraphs:
    once with ``-m FE`` and once with ``-m ppois``. Each comparison track is
    piped through ``bedtools slop -b 0`` and ``bedClip``, sorted, filtered
    for overlapping records and converted with ``bedGraphToBigWig``.

    Args:
        ta: Path of the input file given to ``macs2 callpeak -t`` (read as
            ``-f BED``); its line count also scales the ppois track.
        chrsz: Chromosome sizes file handed to ``bedtools slop -g``,
            ``bedClip`` and ``bedGraphToBigWig``.
        gensz: Value for ``macs2 callpeak -g``.
        pval_thresh: Value for ``macs2 callpeak -p``.
        smooth_win: Used as ``--extsize``; minus half of it (rounded) is
            used as ``--shift``.
        out_dir: Directory in which every output path is built.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)`` of the two bigWig paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    fc_bigwig, pval_bigwig = (
        '{}.{}.signal.bigwig'.format(prefix, kind) for kind in ('fc', 'pval'))

    # Sort by chromosome/start, then print a record only if it is the first
    # one, sits on a different chromosome than the previous record, or
    # starts at/after the previous record's end.
    sort_dedup_tpl = (
        'LC_COLLATE=C sort -k1,1 -k2,2n {src} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '
        '|| prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {dst}'
    )

    def _bdg_to_bigwig(method, kind, bigwig):
        """Turn ``"<prefix>"_<method>.bdg`` into the bigWig at ``bigwig``."""
        # temporary files, each removed as soon as it has been consumed
        bedgraph = '{}.{}.signal.bedgraph'.format(prefix, kind)
        bedgraph_srt = '{}.{}.signal.srt.bedgraph'.format(prefix, kind)

        run_shell_cmd(
            'bedtools slop -i "{prefix}"_{method}.bdg -g {chrsz} -b 0 | '
            'bedClip stdin {chrsz} {bedgraph}'.format(
                prefix=prefix,
                method=method,
                chrsz=chrsz,
                bedgraph=bedgraph,
            ))
        run_shell_cmd(sort_dedup_tpl.format(src=bedgraph, dst=bedgraph_srt))
        rm_f(bedgraph)

        run_shell_cmd('bedGraphToBigWig {srt} {chrsz} {bigwig}'.format(
            srt=bedgraph_srt,
            chrsz=chrsz,
            bigwig=bigwig,
        ))
        rm_f(bedgraph_srt)

    half_win = int(round(float(smooth_win) / 2.0))
    run_shell_cmd(
        'macs2 callpeak '
        '-t {ta} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--shift {shiftsize} --extsize {extsize} '
        '--nomodel -B --SPMR '
        '--keep-dup all --call-summits '.format(
            ta=ta,
            prefix=prefix,
            gensz=gensz,
            pval_thresh=pval_thresh,
            shiftsize=-half_win,
            extsize=smooth_win,
        ))

    # both bdgcmp runs read the same treatment/control bedGraph pair
    bdgcmp_inputs = ('macs2 bdgcmp -t "{0}"_treat_pileup.bdg '
                     '-c "{0}"_control_lambda.bdg ').format(prefix)

    # fold-enrichment track
    run_shell_cmd(bdgcmp_inputs + '--o-prefix "{}" -m FE '.format(prefix))
    _bdg_to_bigwig('FE', 'fc', fc_bigwig)

    # p-value track; -S receives the line count of ta expressed in millions
    sval = float(get_num_lines(ta)) / 1000000.0
    run_shell_cmd(bdgcmp_inputs +
                  '--o-prefix {} -m ppois -S {}'.format(prefix, sval))
    _bdg_to_bigwig('ppois', 'pval', pval_bigwig)

    # remove whatever is left that matches "<prefix>_*"
    rm_f(['{}_*'.format(prefix)])

    return fc_bigwig, pval_bigwig
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    """Generate fold-enrichment and p-value signal bigWigs with MACS2.

    Optionally subsamples the control, runs ``macs2 callpeak`` and then
    ``macs2 bdgcmp`` twice (``-m FE`` and ``-m ppois``). Each comparison
    track goes through ``bedtools slop -b 0``, an awk filter, ``bedClip``,
    a memory-bounded sort with overlap filtering and ``bedGraphToBigWig``.

    Args:
        ta: Path of the treatment file for ``macs2 callpeak -t`` (read as
            ``-f BED``); its line count also scales the ppois track.
        ctl_ta: Control file for ``-c``; when falsy no control is passed
            and the output prefix is derived from ``ta`` only.
        chrsz: Chromosome sizes file handed to ``bedtools slop -g``,
            ``bedClip`` and ``bedGraphToBigWig``.
        gensz: Value for ``macs2 callpeak -g``.
        pval_thresh: Value for ``macs2 callpeak -p``.
        shift: Not used by this function; ``--shift 0`` is always passed.
        fraglen: Value for ``macs2 callpeak --extsize``.
        ctl_subsample: When truthy (and ``ctl_ta`` is given) the control is
            first replaced by the result of ``subsample_ta_pe`` or
            ``subsample_ta_se`` called with this value.
        ctl_paired_end: Chooses ``subsample_ta_pe`` over
            ``subsample_ta_se`` for the control subsampling.
        mem_gb: Memory in GB; ``mem_gb * 1024**3`` is given to
            ``get_gnu_sort_param`` to build the sort options.
        out_dir: Directory in which every output path is built.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)`` of the two bigWig paths.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))
    basename_prefix = basename_ta
    if ctl_ta:
        if ctl_subsample:
            shared_kwargs = dict(non_mito=False,
                                 mito_chr_name=None,
                                 out_dir=out_dir)
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta, ctl_subsample,
                                         r1_only=False, **shared_kwargs)
            else:
                ctl_ta = subsample_ta_se(ctl_ta, ctl_subsample,
                                         **shared_kwargs)
        basename_prefix = '{}_x_{}'.format(
            basename_ta, os.path.basename(strip_ext_ta(ctl_ta)))
        # UNIX cannot have len(filename) > 255, so fall back to a short name
        if len(basename_prefix) > 200:
            basename_prefix = '{}_x_control'.format(basename_ta)

    prefix = os.path.join(out_dir, basename_prefix)
    fc_bigwig, pval_bigwig = (
        '{}.{}.signal.bigwig'.format(prefix, kind) for kind in ('fc', 'pval'))

    def _bdg_to_bigwig(method, kind, bigwig):
        """Turn ``<prefix>_<method>.bdg`` into the bigWig at ``bigwig``."""
        # temporary files, each removed as soon as it has been consumed
        bedgraph = '{}.{}.signal.bedgraph'.format(prefix, kind)
        bedgraph_srt = '{}.{}.signal.srt.bedgraph'.format(prefix, kind)

        # the awk stage drops records whose third column equals -1
        run_shell_cmd(
            'bedtools slop -i "{}_{}.bdg" -g {} -b 0 | '
            'awk \'{{if ($3 != -1) print $0}}\' |'
            'bedClip stdin {} {}'.format(prefix, method, chrsz, chrsz,
                                         bedgraph))

        # sort, then print a record only if it is the first one, sits on a
        # different chromosome than the previous record, or starts at/after
        # the previous record's end
        run_shell_cmd(
            'LC_COLLATE=C sort -k1,1 -k2,2n {} {} | '
            'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
            'prev_chr==$1 && prev_chr_e<=$2)) '
            '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
                get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
                bedgraph,
                bedgraph_srt))
        rm_f(bedgraph)

        run_shell_cmd('bedGraphToBigWig {} {} {}'.format(
            bedgraph_srt, chrsz, bigwig))
        rm_f(bedgraph_srt)

    ctl_param = '-c {}'.format(ctl_ta) if ctl_ta else ''
    run_shell_cmd(
        ' macs2 callpeak '
        '-t {} {} -f BED -n {} -g {} -p {} '
        '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'.format(
            ta, ctl_param, prefix, gensz, pval_thresh, 0, fraglen))

    # both bdgcmp runs read the same treatment/control bedGraph pair
    bdgcmp_inputs = ('macs2 bdgcmp -t "{0}_treat_pileup.bdg" '
                     '-c "{0}_control_lambda.bdg" ').format(prefix)

    # fold-enrichment track
    run_shell_cmd(bdgcmp_inputs + '--o-prefix "{}" -m FE '.format(prefix))
    _bdg_to_bigwig('FE', 'fc', fc_bigwig)

    # p-value track; -S receives the line count of ta expressed in millions
    sval = float(get_num_lines(ta)) / 1000000.0
    run_shell_cmd(bdgcmp_inputs +
                  '--o-prefix {} -m ppois -S {}'.format(prefix, sval))
    _bdg_to_bigwig('ppois', 'pval', pval_bigwig)

    # remove whatever is left that matches "<prefix>_*"
    rm_f(['{}_*'.format(prefix)])

    return fc_bigwig, pval_bigwig
Example #23
0
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, out_dir):
    """Generate fold-enrichment and p-value signal bigWigs with MACS2.

    Optionally subsamples the control, runs ``macs2 callpeak`` and then
    ``macs2 bdgcmp`` twice (``-m FE`` and ``-m ppois``). Each comparison
    track goes through ``bedtools slop -b 0``, an awk filter, ``bedClip``,
    a sort with overlap filtering and ``bedGraphToBigWig``.

    Args:
        ta: Path of the treatment file for ``macs2 callpeak -t`` (read as
            ``-f BED``); its line count also scales the ppois track.
        ctl_ta: Control file for ``-c``; when falsy no control is passed
            and the output prefix is derived from ``ta`` only.
        chrsz: Chromosome sizes file handed to ``bedtools slop -g``,
            ``bedClip`` and ``bedGraphToBigWig``.
        gensz: Value for ``macs2 callpeak -g``.
        pval_thresh: Value for ``macs2 callpeak -p``.
        shift: Not used by this function; ``--shift 0`` is always passed.
        fraglen: Value for ``macs2 callpeak --extsize``.
        ctl_subsample: When truthy (and ``ctl_ta`` is given) the control is
            first replaced by the result of ``subsample_ta_pe`` or
            ``subsample_ta_se`` called with this value.
        ctl_paired_end: Chooses ``subsample_ta_pe`` over
            ``subsample_ta_se`` for the control subsampling.
        out_dir: Directory in which every output path is built.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)`` of the two bigWig paths.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))
    basename_prefix = basename_ta
    if ctl_ta:
        if ctl_subsample:
            shared_kwargs = dict(non_mito=False,
                                 mito_chr_name=None,
                                 out_dir=out_dir)
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta, ctl_subsample,
                                         r1_only=False, **shared_kwargs)
            else:
                ctl_ta = subsample_ta_se(ctl_ta, ctl_subsample,
                                         **shared_kwargs)
        basename_prefix = '{}_x_{}'.format(
            basename_ta, os.path.basename(strip_ext_ta(ctl_ta)))
        # UNIX cannot have len(filename) > 255, so fall back to a short name
        if len(basename_prefix) > 200:
            basename_prefix = '{}_x_control'.format(basename_ta)

    prefix = os.path.join(out_dir, basename_prefix)
    fc_bigwig, pval_bigwig = (
        '{}.{}.signal.bigwig'.format(prefix, kind) for kind in ('fc', 'pval'))

    # Sort by chromosome/start, then print a record only if it is the first
    # one, sits on a different chromosome than the previous record, or
    # starts at/after the previous record's end.
    sort_dedup_tpl = (
        'LC_COLLATE=C sort -k1,1 -k2,2n {src} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
        'prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {dst}'
    )

    def _bdg_to_bigwig(method, kind, bigwig):
        """Turn ``"<prefix>"_<method>.bdg`` into the bigWig at ``bigwig``."""
        # temporary files, each removed as soon as it has been consumed
        bedgraph = '{}.{}.signal.bedgraph'.format(prefix, kind)
        bedgraph_srt = '{}.{}.signal.srt.bedgraph'.format(prefix, kind)

        # the awk stage drops records whose third column equals -1
        run_shell_cmd(
            'bedtools slop -i "{prefix}"_{method}.bdg -g {chrsz} -b 0 | '
            'awk \'{{if ($3 != -1) print $0}}\' |'
            'bedClip stdin {chrsz} {bedgraph}'.format(
                prefix=prefix,
                method=method,
                chrsz=chrsz,
                bedgraph=bedgraph,
            ))
        run_shell_cmd(sort_dedup_tpl.format(src=bedgraph, dst=bedgraph_srt))
        rm_f(bedgraph)

        run_shell_cmd('bedGraphToBigWig {srt} {chrsz} {bigwig}'.format(
            srt=bedgraph_srt,
            chrsz=chrsz,
            bigwig=bigwig,
        ))
        rm_f(bedgraph_srt)

    run_shell_cmd(
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} '
        '--keep-dup all -B --SPMR'.format(
            ta=ta,
            ctl_param='-c {}'.format(ctl_ta) if ctl_ta else '',
            prefix=prefix,
            gensz=gensz,
            pval_thresh=pval_thresh,
            shiftsize=0,
            extsize=fraglen,
        ))

    # both bdgcmp runs read the same treatment/control bedGraph pair
    bdgcmp_inputs = ('macs2 bdgcmp -t "{0}"_treat_pileup.bdg '
                     '-c "{0}"_control_lambda.bdg ').format(prefix)

    # fold-enrichment track
    run_shell_cmd(bdgcmp_inputs + '--o-prefix "{}" -m FE '.format(prefix))
    _bdg_to_bigwig('FE', 'fc', fc_bigwig)

    # p-value track; -S receives the line count of ta expressed in millions
    sval = float(get_num_lines(ta)) / 1000000.0
    run_shell_cmd(bdgcmp_inputs +
                  '--o-prefix {} -m ppois -S {}'.format(prefix, sval))
    _bdg_to_bigwig('ppois', 'pval', pval_bigwig)

    # remove whatever is left that matches "<prefix>_*"
    rm_f(['{}_*'.format(prefix)])

    return fc_bigwig, pval_bigwig
Example #24
0
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    """Call peaks with MACS2 and return a capped, clipped narrowPeak path.

    Optionally subsamples the control, runs ``macs2 callpeak``, sorts
    ``<prefix>_peaks.narrowPeak`` on column 8 (descending), renames the
    peaks, keeps the first ``cap_num_peak`` lines and passes the result
    through ``bed_clip``.

    Args:
        ta: Path of the treatment file for ``macs2 callpeak -t`` (read as
            ``-f BED``).
        ctl_ta: Control file for ``-c``; when falsy no control is passed
            and the output prefix is derived from ``ta`` only.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        gensz: Value for ``macs2 callpeak -g``.
        pval_thresh: Value for ``macs2 callpeak -p``; also embedded in the
            output file name as ``pval<pval_thresh>``.
        shift: Not used by this function; ``--shift 0`` is always passed.
        fraglen: Value for ``macs2 callpeak --extsize``.
        cap_num_peak: Number of top lines kept by ``head -n``; also
            embedded in the output file name via ``human_readable_number``.
        ctl_subsample: When truthy (and ``ctl_ta`` is given) the control is
            first replaced by the result of ``subsample_ta_pe`` or
            ``subsample_ta_se`` called with this value.
        ctl_paired_end: Chooses ``subsample_ta_pe`` over
            ``subsample_ta_se`` for the control subsampling.
        mem_gb: Memory in GB; ``mem_gb * 1024**3`` is given to
            ``get_gnu_sort_param`` to build the sort options.
        out_dir: Directory in which every output path is built.

    Returns:
        The ``*.narrowPeak.gz`` path that was handed to ``bed_clip`` as its
        third argument (presumably the clipped output file).
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))
    basename_prefix = basename_ta
    if ctl_ta:
        if ctl_subsample:
            shared_kwargs = dict(non_mito=False,
                                 mito_chr_name=None,
                                 out_dir=out_dir)
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta, ctl_subsample,
                                         r1_only=False, **shared_kwargs)
            else:
                ctl_ta = subsample_ta_se(ctl_ta, ctl_subsample,
                                         **shared_kwargs)
        basename_prefix = '{}_x_{}'.format(
            basename_ta, os.path.basename(strip_ext_ta(ctl_ta)))
        # UNIX cannot have len(filename) > 255, so fall back to a short name
        if len(basename_prefix) > 200:
            basename_prefix = '{}_x_control'.format(basename_ta)

    prefix = os.path.join(out_dir, basename_prefix)
    npeak = '{}.pval{}.{}.narrowPeak.gz'.format(
        prefix, pval_thresh, human_readable_number(cap_num_peak))
    # intermediate files: sorted/renamed peaks, then the capped subset
    npeak_tmp, npeak_tmp2 = npeak + '.tmp', npeak + '.tmp2'

    ctl_param = '-c {}'.format(ctl_ta) if ctl_ta else ''
    run_shell_cmd(
        ' macs2 callpeak '
        '-t {} {} -f BED -n {} -g {} -p {} '
        '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'.format(
            ta, ctl_param, prefix, gensz, pval_thresh, 0, fraglen))

    # Sort on column 8 (descending), rename peaks to Peak_<rank>, clamp
    # negative start/end to 0 and fill in column 10 where it is -1.
    # NOTE(review): narrowPeak column 10 is conventionally an offset from
    # chromStart, yet the -1 fallback below adds $2 to the half-width --
    # confirm this is intended.
    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
    run_shell_cmd(
        'LC_COLLATE=C sort -k 8gr,8gr {} "{}_peaks.narrowPeak" | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {}'.format(
            sort_param, prefix, npeak_tmp))

    # keep only the top cap_num_peak lines
    run_shell_cmd('head -n {} {} > {}'.format(
        cap_num_peak, npeak_tmp, npeak_tmp2))

    # clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)

    rm_f([npeak_tmp, npeak_tmp2])

    # remove whatever is left that matches "<prefix>_*"
    rm_f(['{}_*'.format(prefix)])

    return npeak