def spp(ta, ctl_ta, fraglen, cap_num_peak, nth, out_dir):
    """Call peaks with SPP (run_spp.R) for a TAG-ALIGN against a control.

    Args:
        ta: Experiment TAG-ALIGN path (passed to run_spp.R as ``-c=``).
        ctl_ta: Control TAG-ALIGN path (passed as ``-i=``).
        fraglen: Value passed as ``-speak=``.
        cap_num_peak: Value passed as ``-npeak=``; also embedded in the
            output file name through human_readable_number().
        nth: Number of threads; ``-p=<nth>`` is added only when nth >= 2.
        out_dir: Output directory.

    Returns:
        Path of the gzipped regionPeak file written by this function.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))
    basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
    basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
    if len(basename_prefix) > 200:  # UNIX cannot have filename > 255
        basename_prefix = '{}_x_control'.format(basename_ta)
    # Request -p only for multi-threaded runs (nth >= 2). The previous
    # "nth < 2" test was inverted: it passed -p for single-threaded runs
    # and dropped it whenever several threads were actually available.
    nth_param = '-p={}'.format(nth) if nth >= 2 else ''
    prefix = os.path.join(out_dir, basename_prefix)
    rpeak = '{}.{}.regionPeak.gz'.format(
        prefix, human_readable_number(cap_num_peak))
    rpeak_tmp = '{}.tmp'.format(rpeak)
    rpeak_tmp_gz = '{}.tmp.gz'.format(rpeak)

    cmd0 = 'Rscript --max-ppsize=500000 $(which run_spp.R) -c={} -i={} '
    cmd0 += '-npeak={} -odir={} -speak={} -savr={} -rf {}'
    cmd0 = cmd0.format(
        ta, ctl_ta, cap_num_peak, os.path.abspath(out_dir),
        fraglen, rpeak_tmp, nth_param)
    run_shell_cmd(cmd0)

    # if we have scientific representation of chr coord. then convert it to int
    cmd1 = 'zcat -f {} | awk \'BEGIN{{OFS="\\t"}}'
    cmd1 += '{{if ($2<0) $2=0; '
    cmd1 += 'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}\' | '
    cmd1 += 'gzip -f -nc > {}'
    cmd1 = cmd1.format(rpeak_tmp, rpeak)
    run_shell_cmd(cmd1)

    # Both possible intermediate names are removed (plain and .gz).
    rm_f([rpeak_tmp, rpeak_tmp_gz])
    return rpeak
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, out_dir):
    """Call narrow peaks with ``macs2 callpeak``, optionally against a control.

    When ``ctl_ta`` is given and ``ctl_subsample`` is truthy, the control is
    first subsampled (paired-end or single-end variant depending on
    ``ctl_paired_end``). Peaks are sorted on column 8, renamed ``Peak_<rank>``,
    capped to ``cap_num_peak`` lines and passed through bed_clip().

    Note: ``shift`` is accepted but not used; ``--shift 0`` is always passed.

    Returns:
        Path of the resulting narrowPeak file.
    """
    treat_name = os.path.basename(strip_ext_ta(ta))
    if not ctl_ta:
        out_name = treat_name
    else:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(
                    ctl_ta, ctl_subsample, non_mito=False,
                    mito_chr_name=None, r1_only=False, out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(
                    ctl_ta, ctl_subsample, non_mito=False,
                    mito_chr_name=None, out_dir=out_dir)
        ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{treat}_x_{ctl}'.format(treat=treat_name, ctl=ctl_name)
        # UNIX cannot have len(filename) > 255
        if len(out_name) > 200:
            out_name = '{treat}_x_control'.format(treat=treat_name)
    prefix = os.path.join(out_dir, out_name)

    npeak = '{prefix}.pval{pval}.{cap}.narrowPeak.gz'.format(
        prefix=prefix,
        pval=pval_thresh,
        cap=human_readable_number(cap_num_peak))
    ranked_peaks = '{}.tmp'.format(npeak)
    capped_peaks = '{}.tmp2'.format(npeak)

    ctl_opt = '-c {}'.format(ctl_ta) if ctl_ta else ''
    run_shell_cmd(
        ' macs2 callpeak -t {ta} {ctl} -f BED -n {prefix} -g {gensz} '
        '-p {pval} --nomodel --shift {shiftsize} --extsize {extsize} '
        '--keep-dup all -B --SPMR'.format(
            ta=ta,
            ctl=ctl_opt,
            prefix=prefix,
            gensz=gensz,
            pval=pval_thresh,
            shiftsize=0,
            extsize=fraglen,
        ))

    # awk program: rename peaks by rank, floor negative coordinates at 0 and
    # fill a missing summit (-1) with the midpoint offset.
    rename_awk = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    run_shell_cmd(
        'LC_COLLATE=C sort -k 8gr,8gr "{prefix}"_peaks.narrowPeak | '
        'awk \'{awk}\' > {out}'.format(
            prefix=prefix, awk=rename_awk, out=ranked_peaks))

    run_shell_cmd('head -n {cap} {src} > {dst}'.format(
        cap=cap_num_peak, src=ranked_peaks, dst=capped_peaks))

    # clip peaks between 0-chromSize.
    bed_clip(capped_peaks, chrsz, npeak)
    rm_f([ranked_peaks, capped_peaks])

    # remove the remaining "<prefix>_*" temporary files
    rm_f(['{}_*'.format(prefix)])

    return npeak
def spp(ta, ctl_ta, chrsz, fraglen, cap_num_peak, fdr_thresh,
        ctl_subsample, ctl_paired_end, nth, out_dir):
    """Call peaks with SPP (run_spp.R) against a control TAG-ALIGN.

    The control is subsampled first when ``ctl_subsample`` is truthy. The
    run_spp.R output is rewritten with integer coordinates and then passed
    through bed_clip().

    Returns:
        Path of the resulting regionPeak file.
    """
    treat_name = os.path.basename(strip_ext_ta(ta))
    if ctl_subsample:
        if ctl_paired_end:
            ctl_ta = subsample_ta_pe(
                ctl_ta, ctl_subsample, non_mito=False,
                mito_chr_name=None, r1_only=False, out_dir=out_dir)
        else:
            ctl_ta = subsample_ta_se(
                ctl_ta, ctl_subsample, non_mito=False,
                mito_chr_name=None, out_dir=out_dir)
    ctl_name = os.path.basename(strip_ext_ta(ctl_ta))

    out_name = '{treat}_x_{ctl}'.format(treat=treat_name, ctl=ctl_name)
    # UNIX cannot have filename > 255
    if len(out_name) > 200:
        out_name = '{treat}_x_control'.format(treat=treat_name)

    if nth >= 2:
        thread_opt = '-p={}'.format(nth)
    else:
        thread_opt = ''

    prefix = os.path.join(out_dir, out_name)
    rpeak = '{}.{}.regionPeak.gz'.format(
        prefix, human_readable_number(cap_num_peak))
    spp_out_prefix = '{}.tmp'.format(rpeak)
    spp_out_gz = '{}.tmp.gz'.format(rpeak)
    int_coord_peaks = '{}.tmp2'.format(rpeak)

    run_shell_cmd(
        'Rscript --max-ppsize=500000 $(which run_spp.R) '
        '-c={ta} -i={ctl} -npeak={npeak} -odir={odir} -speak={speak} '
        '-savr={savr} -fdr={fdr} -rf {threads}'.format(
            ta=ta,
            ctl=ctl_ta,
            npeak=cap_num_peak,
            odir=os.path.abspath(out_dir),
            speak=fraglen,
            savr=spp_out_prefix,
            fdr=fdr_thresh,
            threads=thread_opt,
        ))

    # if we have scientific representation of chr coord. then convert it to int
    int_awk = (
        'BEGIN{OFS="\\t"}'
        '{if ($2<0) $2=0; '
        'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}'
    )
    run_shell_cmd("zcat -f {} | awk '{}' > {}".format(
        spp_out_gz, int_awk, int_coord_peaks))
    rm_f(spp_out_gz)

    # clip peaks between 0-chromSize.
    bed_clip(int_coord_peaks, chrsz, rpeak)
    rm_f(int_coord_peaks)

    return rpeak
def spr_se(ta, out_dir):
    """Split a single-end TAG-ALIGN into two shuffled pseudoreplicates.

    Lines are shuffled with ``shuf`` (random source keyed by the output of
    ``zcat -f <ta> | wc -c``), split into two chunks with ``split`` and each
    chunk is gzipped.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of the two gzipped pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # "split -d" names its chunks <prefix>.00 and <prefix>.01
    chunks = ('{}.00'.format(prefix), '{}.01'.format(prefix))
    pseudoreps = ('{}.pr1.tagAlign.gz'.format(prefix),
                  '{}.pr2.tagAlign.gz'.format(prefix))

    total_lines = get_num_lines(ta)
    nlines = int((total_lines + 1) / 2)

    # bash-only
    run_shell_cmd(
        'zcat {ta} | shuf --random-source=<(openssl enc -aes-256-ctr '
        '-pass pass:$(zcat -f {ta} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {nlines} - {prefix}.'.format(
            ta=ta, nlines=nlines, prefix=prefix))

    for chunk, pseudorep in zip(chunks, pseudoreps):
        run_shell_cmd('gzip -nc {src} > {dst}'.format(src=chunk, dst=pseudorep))

    rm_f(list(chunks))
    return pseudoreps[0], pseudoreps[1]
def count_signal_track(ta, chrsz, out_dir):
    """Build per-strand 5'-end count bigwigs from a TAG-ALIGN.

    For each strand, ``bedtools genomecov -5 -bg`` writes a bedgraph that is
    then converted with ``bedGraphToBigWig``. Bedgraphs are removed afterwards.

    Returns:
        Tuple ``(pos_bw, neg_bw)`` of bigwig paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # (strand symbol, temporary bedgraph, final bigwig)
    tracks = [
        ('+', prefix + '.positive.bedgraph', prefix + '.positive.bigwig'),
        ('-', prefix + '.negative.bedgraph', prefix + '.negative.bigwig'),
    ]

    for strand, bedgraph, _ in tracks:
        run_shell_cmd(
            'zcat -f {ta} | sort -k1,1 -k2,2n | '
            'bedtools genomecov -5 -bg -strand {strand} -g {chrsz} '
            '-i stdin > {out}'.format(
                ta=ta, strand=strand, chrsz=chrsz, out=bedgraph))

    for _, bedgraph, bigwig in tracks:
        run_shell_cmd('bedGraphToBigWig {src} {chrsz} {dst}'.format(
            src=bedgraph, chrsz=chrsz, dst=bigwig))

    # remove temporary files
    rm_f([bedgraph for _, bedgraph, _ in tracks])

    return tracks[0][2], tracks[1][2]
def subsample_ta_se(ta, subsample, non_mito, mito_chr_name, out_dir):
    """Filter and/or subsample a single-end TAG-ALIGN.

    Args:
        ta: Input TAG-ALIGN path.
        subsample: Number of lines to keep with ``shuf -n``; no subsampling
            when not greater than 0.
        non_mito: When truthy, drop lines starting with ``mito_chr_name``.
        mito_chr_name: Chromosome name used in the ``grep -v`` filter.
        out_dir: Output directory.

    Returns:
        Path of the gzipped output TAG-ALIGN.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    mito_tag = 'no_chrM.' if non_mito else ''
    if subsample > 0:
        count_tag = '{}.'.format(human_readable_number(subsample))
    else:
        count_tag = ''
    ta_subsampled = '{}.{}{}tagAlign.gz'.format(prefix, mito_tag, count_tag)

    # bash-only: assemble the pipeline stage by stage, formatting it once at
    # the end so the placeholders are filled exactly as in a single template.
    stages = ['zcat -f {}']
    values = [ta]
    if non_mito:
        stages.append("grep -v '^" + mito_chr_name + "\\b'")
    if subsample > 0:
        stages.append(
            'shuf -n {} --random-source=<(openssl enc -aes-256-ctr '
            '-pass pass:$(zcat -f {} | wc -c) -nosalt '
            '</dev/zero 2>/dev/null)')
        values.extend([subsample, ta])
    stages.append('gzip -nc > {}')
    values.append(ta_subsampled)

    run_shell_cmd(' | '.join(stages).format(*values))
    return ta_subsampled
def pool_ta(tas, out_dir):
    """Pool several TAG-ALIGN files into one gzipped file.

    With fewer than two inputs, the first one is hard-linked into ``out_dir``
    via make_hard_link() instead.

    Returns:
        Path of the pooled file, or the make_hard_link() result.
    """
    if len(tas) <= 1:
        return make_hard_link(tas[0], out_dir)

    # The pooled file is named after the first input.
    first_name = os.path.basename(strip_ext_ta(tas[0]))
    pooled_ta = '{}.pooled.tagAlign.gz'.format(
        os.path.join(out_dir, first_name))
    run_shell_cmd('zcat -f {inputs} | gzip -nc > {pooled}'.format(
        inputs=' '.join(tas), pooled=pooled_ta))
    return pooled_ta
def tn5_shift_ta(ta, out_dir):
    """Write a copy of a TAG-ALIGN with strand-dependent coordinate shifts.

    The awk program adds 4 to column 2 for "+" records and subtracts 5 from
    column 3 for "-" records; all records are printed.

    Returns:
        Path of the gzipped shifted TAG-ALIGN.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    shifted_ta = prefix + '.tn5.tagAlign.gz'

    shift_awk = (
        'BEGIN {OFS = "\\t"}'
        '{ if ($6 == "+") {$2 = $2 + 4} '
        'else if ($6 == "-") {$3 = $3 - 5} print $0}'
    )
    run_shell_cmd("zcat -f {src} | awk '{prog}' | gzip -nc > {dst}".format(
        src=ta, prog=shift_awk, dst=shifted_ta))

    return shifted_ta
def spr_pe(ta, pseudoreplication_random_seed, out_dir):
    """Split a paired-end TAG-ALIGN into two shuffled pseudoreplicates.

    Consecutive line pairs are joined into one line with ``sed``, shuffled,
    split into two chunks and then un-joined again with awk, so both lines
    of a pair end up in the same pseudoreplicate.

    Args:
        ta: Input TAG-ALIGN path.
        pseudoreplication_random_seed: Seed for the shuffle; when 0, the
            output of ``zcat -f <ta> | wc -c`` is used instead.
        out_dir: Output directory.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of gzipped pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # "split -d" names its chunks <prefix>.00 and <prefix>.01
    chunks = ['{}.00'.format(prefix), '{}.01'.format(prefix)]
    pseudoreps = ['{}.pr1.tagAlign.gz'.format(prefix),
                  '{}.pr2.tagAlign.gz'.format(prefix)]
    nlines = int((get_num_lines(ta) / 2 + 1) / 2)

    if pseudoreplication_random_seed != 0:
        random_seed = pseudoreplication_random_seed
        log.info(
            'Using a fixed integer {} as random seed for pseudoreplication.'
            .format(random_seed))
    else:
        random_seed = run_shell_cmd('zcat -f {} | wc -c'.format(ta))
        log.info(
            "Using input file's size {} as random seed for pseudoreplication."
            .format(random_seed))

    # bash-only
    run_shell_cmd(
        "zcat -f {} | sed 'N;s/\\n/\\t/' | "
        'shuf --random-source=<(openssl enc -aes-256-ctr '
        '-pass pass:{} -nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {} - {}.'.format(ta, random_seed, nlines, prefix))

    # Turn each joined 12-column line back into two 6-column lines.
    unpair_awk = (
        'BEGIN{OFS="\\t"} '
        '{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
        '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
        '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}'
    )
    for chunk, pseudorep in zip(chunks, pseudoreps):
        run_shell_cmd("zcat -f {} | awk '{}' | gzip -nc > {}".format(
            chunk, unpair_awk, pseudorep))

    rm_f(chunks)
    return pseudoreps[0], pseudoreps[1]
def main():
    """Compute fractions of reads in annotated regions and write a QC file.

    For every region file given on the command line (DNase, blacklist,
    promoter, enhancer) that is set and whose basename is not ``null``,
    get_fract_reads_in_regions() is evaluated and one tab-separated row is
    written to ``<out_dir>/<ta basename>.annot_enrich.qc``.
    """
    # read params
    args = parse_arguments()

    def _region_or_empty(path):
        # An unset argument or one whose basename is 'null' is ignored.
        if path and os.path.basename(path) != 'null':
            return path
        return ''

    final_bed = args.ta
    output_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_ta(final_bed)))

    # (region file, label of the QC row), in output order
    annotations = [
        (_region_or_empty(args.dnase),
         'fraction_of_reads_in_universal_DHS_regions'),
        (_region_or_empty(args.blacklist),
         'fraction_of_reads_in_blacklist_regions'),
        (_region_or_empty(args.prom),
         'fraction_of_reads_in_promoter_regions'),
        (_region_or_empty(args.enh),
         'fraction_of_reads_in_enhancer_regions'),
    ]

    rows = []
    for regions, label in annotations:
        if not regions:
            continue
        num_reads, fraction = get_fract_reads_in_regions(final_bed, regions)
        rows.append((label, str(num_reads), str(fraction)))

    annot_enrich_qc = output_prefix + '.annot_enrich.qc'
    with open(annot_enrich_qc, 'w') as fp:
        fp.writelines('\t'.join(row) + '\n' for row in rows)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, mem_gb,
          out_dir):
    """Call narrow peaks with ``macs2 callpeak --call-summits``.

    ``--extsize`` is ``smooth_win`` and ``--shift`` is minus half of it
    (rounded). Peaks are sorted on column 8 (sort options come from
    get_gnu_sort_param()), renamed ``Peak_<rank>``, capped to
    ``cap_num_peak`` lines and passed through bed_clip().

    Returns:
        Path of the resulting narrowPeak file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    npeak = '{prefix}.pval{pval}.{cap}.narrowPeak.gz'.format(
        prefix=prefix,
        pval=pval_thresh,
        cap=human_readable_number(cap_num_peak))
    # temporary files
    ranked_peaks = '{}.tmp'.format(npeak)
    capped_peaks = '{}.tmp2'.format(npeak)

    half_window = int(round(float(smooth_win) / 2.0))
    shiftsize = -half_window

    run_shell_cmd(
        'macs2 callpeak -t {} -f BED -n {} -g {} -p {} '
        '--shift {} --extsize {} '
        '--nomodel -B --SPMR --keep-dup all --call-summits'.format(
            ta, prefix, gensz, pval_thresh, shiftsize, smooth_win))

    # awk program: rename peaks by rank, floor negative coordinates at 0 and
    # fill a missing summit (-1) with the midpoint offset.
    rename_awk = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
    run_shell_cmd(
        'LC_COLLATE=C sort -k 8gr,8gr {} "{}"_peaks.narrowPeak | '
        'awk \'{}\' > {}'.format(sort_param, prefix, rename_awk, ranked_peaks))

    run_shell_cmd('head -n {} {} > {}'.format(
        cap_num_peak, ranked_peaks, capped_peaks))

    # clip peaks between 0-chromSize.
    bed_clip(capped_peaks, chrsz, npeak)
    rm_f([ranked_peaks, capped_peaks])

    # remove the remaining "<prefix>_*" temporary files
    rm_f(['{}_*'.format(prefix)])

    return npeak
def subsample_ta_pe(ta, subsample, non_mito, mito_chr_name, r1_only, out_dir):
    """Filter and/or subsample a paired-end TAG-ALIGN, keeping pairs together.

    Consecutive line pairs are joined into one line with ``sed`` before the
    optional ``shuf -n`` subsampling, then split back into two 6-column
    lines (or only the first 6 columns when ``r1_only`` is truthy).

    Args:
        ta: Input TAG-ALIGN path.
        subsample: Number of joined lines to keep; no subsampling when not
            greater than 0.
        non_mito: When truthy, drop lines starting with ``mito_chr_name``.
        mito_chr_name: Chromosome name used in the ``grep -v`` filter.
        r1_only: When truthy, only columns 1-6 of each joined line are kept.
        out_dir: Output directory.

    Returns:
        Path of the gzipped output TAG-ALIGN.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    mito_tag = 'no_chrM.' if non_mito else ''
    r1_tag = 'R1.' if r1_only else ''
    if subsample > 0:
        count_tag = '{}.'.format(human_readable_number(subsample))
    else:
        count_tag = ''
    ta_subsampled = '{}.{}{}{}tagAlign.gz'.format(
        prefix, mito_tag, r1_tag, count_tag)
    joined_tmp = '{}.tagAlign.tmp'.format(prefix)

    # First pipeline: assemble the stages, then format the whole template
    # once so placeholders are filled exactly as in a single template string.
    stages = ['zcat -f {}']
    values = [ta]
    if non_mito:
        stages.append("grep -v '^" + mito_chr_name + "\\b'")
    stages.append("sed 'N;s/\\n/\\t/'")
    if subsample > 0:
        stages.append(
            'shuf -n {} --random-source=<(openssl enc -aes-256-ctr '
            '-pass pass:$(zcat -f {} | wc -c) -nosalt '
            '</dev/zero 2>/dev/null)')
        values.extend([subsample, ta])
    values.append(joined_tmp)
    run_shell_cmd((' | '.join(stages) + ' > {}').format(*values))

    # Second pipeline: print one 6-column line per mate that is kept.
    mates = 1 if r1_only else 2
    line_fmt = '\\t'.join(['%s'] * 6) + '\\n'
    columns = ','.join('${}'.format(i) for i in range(1, 6 * mates + 1))
    unpair_awk = (
        'BEGIN{OFS="\\t"} {printf "' + line_fmt * mates + '",' + columns + '}'
    )
    run_shell_cmd("cat {} | awk '{}' | gzip -nc > {}".format(
        joined_tmp, unpair_awk, ta_subsampled))

    rm_f(joined_tmp)
    return ta_subsampled
def count_signal_track(ta, chrsz, mem_gb, out_dir):
    """Build per-strand 5'-end count bigwigs from a TAG-ALIGN.

    For each strand, the sorted TAG-ALIGN (sort options come from
    get_gnu_sort_param()) is fed to ``bedtools genomecov -5 -bg`` and the
    bedgraph is converted with ``bedGraphToBigWig``. Bedgraphs are removed
    afterwards.

    Returns:
        Tuple ``(pos_bw, neg_bw)`` of bigwig paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # (strand symbol, temporary bedgraph, final bigwig)
    tracks = [
        ('+', prefix + '.positive.bedgraph', prefix + '.positive.bigwig'),
        ('-', prefix + '.negative.bedgraph', prefix + '.negative.bigwig'),
    ]

    for strand, bedgraph, _ in tracks:
        sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
        run_shell_cmd(
            'zcat -f {} | sort -k1,1 -k2,2n {} | '
            'bedtools genomecov -5 -bg -strand {} -g {} -i stdin > {}'.format(
                ta, sort_param, strand, chrsz, bedgraph))

    for _, bedgraph, bigwig in tracks:
        run_shell_cmd('bedGraphToBigWig {} {} {}'.format(
            bedgraph, chrsz, bigwig))

    # remove temporary files
    rm_f([bedgraph for _, bedgraph, _ in tracks])

    return tracks[0][2], tracks[1][2]
def pool_ta(tas, col, basename_prefix, out_dir):
    """Pool several TAG-ALIGN (or BED) files into one gzipped file.

    Args:
        tas: List of input paths; at least two are required.
        col: When not None, only columns ``1-<col>`` are kept (``cut -f``).
        basename_prefix: Base name of the output; when None the name of the
            first input (with its extension stripped) is used.
        out_dir: Output directory.

    Returns:
        Path of the pooled file.

    Raises:
        ValueError: If fewer than two inputs are given.
    """
    if len(tas) <= 1:
        raise ValueError('Needs at least two TAs (or BEDs) to be pooled.')

    if basename_prefix is None:
        out_name = os.path.basename(strip_ext_ta(tas[0]))
    else:
        out_name = basename_prefix
    pooled_ta = '{}.pooled.tagAlign.gz'.format(os.path.join(out_dir, out_name))

    stages = ['zcat -f {}']
    if col is not None:
        stages.append('cut -f 1-{}'.format(col))
    stages.append('gzip -nc > {}')
    run_shell_cmd(' | '.join(stages).format(' '.join(tas), pooled_ta))

    return pooled_ta
def pool_ta(tas, col, basename_prefix, out_dir):
    """Pool several TAG-ALIGN (or BED) files into one gzipped file.

    Args:
        tas: List of input paths.
        col: When not None, only columns ``1-<col>`` are kept (``cut -f``).
        basename_prefix: Base name of the output; when None the name of the
            first input (with its extension stripped) is used.
        out_dir: Output directory.

    Returns:
        Path of the pooled file. With fewer than two inputs nothing is
        pooled and the result of make_hard_link() on the first input is
        returned instead.
    """
    if len(tas) > 1:
        if basename_prefix is not None:
            # Use the caller-supplied name. This used to join the literal
            # string 'basename_prefix', which ignored the argument and gave
            # every pooled file the same name.
            prefix = os.path.join(out_dir, basename_prefix)
        else:
            prefix = os.path.join(out_dir,
                                  os.path.basename(strip_ext_ta(tas[0])))
        pooled_ta = '{}.pooled.tagAlign.gz'.format(prefix)

        cmd = 'zcat -f {} | '
        if col is not None:
            cmd += 'cut -f 1-{} | '.format(col)
        cmd += 'gzip -nc > {}'
        cmd = cmd.format(
            ' '.join(tas),
            pooled_ta)
        run_shell_cmd(cmd)
        return pooled_ta
    else:
        return make_hard_link(tas[0], out_dir)
def xcor(ta, speak, mito_chr_name, nth, out_dir, chip_seq_type=None,
         exclusion_range_min=None, exclusion_range_max=None):
    """Run cross-correlation analysis with run_spp.R on a TAG-ALIGN.

    Args:
        ta: Input TAG-ALIGN path (``-c=``).
        speak: Passed as ``-speak=`` only when >= 0.
        mito_chr_name: Passed as ``-filtchr=``.
        nth: Passed as ``-p=``.
        out_dir: Output directory.
        chip_seq_type: With ``exclusion_range_min``, enables the ``-x=``
            option; also used to derive a missing ``exclusion_range_max``
            via get_exclusion_range_max().
        exclusion_range_min: Lower bound of the ``-x=min:max`` option.
        exclusion_range_max: Upper bound of the ``-x=min:max`` option.

    Returns:
        Tuple ``(xcor_plot_pdf, xcor_plot_png, xcor_score, fraglen_txt)``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    xcor_plot_pdf = prefix + '.cc.plot.pdf'
    xcor_score = prefix + '.cc.qc'
    fraglen_txt = prefix + '.cc.fraglen.txt'

    exclusion_opt = ''
    if chip_seq_type is not None and exclusion_range_min is not None:
        if exclusion_range_max is None:
            exclusion_range_max = get_exclusion_range_max(ta, chip_seq_type)
        exclusion_opt = ' -x={}:{}'.format(
            exclusion_range_min, exclusion_range_max)

    speak_opt = '-speak={}'.format(speak) if speak >= 0 else ''
    # The exclusion option is appended to the template before formatting.
    spp_template = (
        'Rscript --max-ppsize=500000 $(which run_spp.R) -rf -c={} -p={} '
        '-filtchr="{}" -savp={} -out={} {}'
    ) + exclusion_opt
    run_shell_cmd(spp_template.format(
        ta, nth, mito_chr_name, xcor_plot_pdf, xcor_score, speak_opt))

    # strip everything after a comma inside each tab-separated field, in place
    run_shell_cmd("sed -r 's/,[^\\t]+//g' -i {}".format(xcor_score))

    # parse xcor_score and write fraglen (3rd column) to file
    fraglen = parse_xcor_score(xcor_score)['estimated_fragment_len']
    run_shell_cmd('echo {} > {}'.format(fraglen, fraglen_txt))

    xcor_plot_png = pdf2png(xcor_plot_pdf, out_dir)
    return xcor_plot_pdf, xcor_plot_png, xcor_score, fraglen_txt
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, out_dir):
    """Call narrow peaks with ``macs2 callpeak --call-summits``.

    ``--extsize`` is ``smooth_win`` and ``--shift`` is minus half of it
    (rounded). Peaks are sorted on column 8, renamed ``Peak_<rank>``, capped
    to ``cap_num_peak`` lines and passed through bed_clip().

    Returns:
        Path of the resulting narrowPeak file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    npeak = '{prefix}.pval{pval}.{cap}.narrowPeak.gz'.format(
        prefix=prefix,
        pval=pval_thresh,
        cap=human_readable_number(cap_num_peak))
    # temporary files
    ranked_peaks = '{}.tmp'.format(npeak)
    capped_peaks = '{}.tmp2'.format(npeak)

    half_window = int(round(float(smooth_win) / 2.0))
    shiftsize = -half_window

    # NOTE: the command keeps its trailing space.
    run_shell_cmd(
        'macs2 callpeak -t {ta} -f BED -n {prefix} -g {gensz} -p {pval} '
        '--shift {shiftsize} --extsize {extsize} --nomodel -B --SPMR '
        '--keep-dup all --call-summits '.format(
            ta=ta,
            prefix=prefix,
            gensz=gensz,
            pval=pval_thresh,
            shiftsize=shiftsize,
            extsize=smooth_win,
        ))

    # awk program: rename peaks by rank, floor negative coordinates at 0 and
    # fill a missing summit (-1) with the midpoint offset.
    rename_awk = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    run_shell_cmd(
        'LC_COLLATE=C sort -k 8gr,8gr "{prefix}"_peaks.narrowPeak | '
        'awk \'{awk}\' > {out}'.format(
            prefix=prefix, awk=rename_awk, out=ranked_peaks))

    run_shell_cmd('head -n {cap} {src} > {dst}'.format(
        cap=cap_num_peak, src=ranked_peaks, dst=capped_peaks))

    # clip peaks between 0-chromSize.
    bed_clip(capped_peaks, chrsz, npeak)
    rm_f([ranked_peaks, capped_peaks])

    # remove the remaining "<prefix>_*" temporary files
    rm_f(['{}_*'.format(prefix)])

    return npeak
def spr_se(ta, pseudoreplication_random_seed, out_dir):
    """Split a single-end TAG-ALIGN into two shuffled pseudoreplicates.

    Args:
        ta: Input TAG-ALIGN path.
        pseudoreplication_random_seed: Seed for the shuffle; when 0, the
            output of ``zcat -f <ta> | wc -c`` is used instead.
        out_dir: Output directory.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of gzipped pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # "split -d" names its chunks <prefix>.00 and <prefix>.01
    chunks = ['{}.00'.format(prefix), '{}.01'.format(prefix)]
    pseudoreps = ['{}.pr1.tagAlign.gz'.format(prefix),
                  '{}.pr2.tagAlign.gz'.format(prefix)]
    nlines = int((get_num_lines(ta) + 1) / 2)

    if pseudoreplication_random_seed != 0:
        random_seed = pseudoreplication_random_seed
        log.info(
            'Using a fixed integer {} as random seed for pseudoreplication.'
            .format(random_seed))
    else:
        random_seed = run_shell_cmd('zcat -f {} | wc -c'.format(ta))
        log.info(
            "Using input file's size {} as random seed for pseudoreplication."
            .format(random_seed))

    # bash-only
    run_shell_cmd(
        'zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr '
        '-pass pass:{} -nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {} - {}.'.format(ta, random_seed, nlines, prefix))

    for chunk, pseudorep in zip(chunks, pseudoreps):
        run_shell_cmd('gzip -nc {} > {}'.format(chunk, pseudorep))

    rm_f(chunks)
    return pseudoreps[0], pseudoreps[1]
def spr_pe(ta, out_dir):
    """Split a paired-end TAG-ALIGN into two shuffled pseudoreplicates.

    Consecutive line pairs are joined into one line with ``sed``, shuffled
    (random source keyed by the output of ``zcat -f <ta> | wc -c``), split
    into two chunks and un-joined again with awk.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of gzipped pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # "split -d" names its chunks <prefix>.00 and <prefix>.01
    chunks = ['{}.00'.format(prefix), '{}.01'.format(prefix)]
    pseudoreps = ['{}.pr1.tagAlign.gz'.format(prefix),
                  '{}.pr2.tagAlign.gz'.format(prefix)]
    nlines = int((get_num_lines(ta) / 2 + 1) / 2)

    # bash-only
    run_shell_cmd(
        "zcat -f {ta} | sed 'N;s/\\n/\\t/' | "
        'shuf --random-source=<(openssl enc -aes-256-ctr '
        '-pass pass:$(zcat -f {ta} | wc -c) '
        '-nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {nlines} - {prefix}.'.format(
            ta=ta, nlines=nlines, prefix=prefix))

    # Turn each joined 12-column line back into two 6-column lines.
    line_fmt = '\\t'.join(['%s'] * 6) + '\\n'
    columns = ','.join('${}'.format(i) for i in range(1, 13))
    unpair_awk = (
        'BEGIN{OFS="\\t"} {printf "' + line_fmt * 2 + '",' + columns + '}'
    )
    for chunk, pseudorep in zip(chunks, pseudoreps):
        run_shell_cmd("zcat -f {src} | awk '{prog}' | gzip -nc > {dst}".format(
            src=chunk, prog=unpair_awk, dst=pseudorep))

    rm_f(chunks)
    return pseudoreps[0], pseudoreps[1]
def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, mem_gb,
                       out_dir):
    """Generate fold-enrichment and p-value signal bigwigs with MACS2.

    Runs ``macs2 callpeak -B --SPMR`` followed by ``macs2 bdgcmp`` with
    ``-m FE`` and ``-m ppois``. Each bedgraph is clipped, sorted (sort
    options come from get_gnu_sort_param()), stripped of overlapping
    regions and converted with ``bedGraphToBigWig``.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    shiftsize = -int(round(float(smooth_win) / 2.0))

    # keeps a line only if it does not start before the previous line's end
    # on the same chromosome
    dedup_awk = (
        'BEGIN{OFS="\\t"}{if (NR==1 || NR>1 && (prev_chr!=$1 '
        '|| prev_chr==$1 && prev_chr_e<=$2)) '
        '{print $0}; prev_chr=$1; prev_chr_e=$3;}'
    )

    def _bdg_to_bigwig(bdg_tag, bedgraph, bedgraph_srt, bigwig):
        # Clip "<prefix>_<bdg_tag>.bdg", sort it, drop overlaps, convert.
        run_shell_cmd(
            'bedtools slop -i "{0}"_{1}.bdg -g {2} -b 0 | '
            'bedClip stdin {2} {3}'.format(prefix, bdg_tag, chrsz, bedgraph))
        # sort and remove any overlapping regions in bedgraph by comparing
        # two lines in a row
        run_shell_cmd(
            'LC_COLLATE=C sort -k1,1 -k2,2n {} {} | '
            'awk \'{}\' > {}'.format(
                get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
                bedgraph, dedup_awk, bedgraph_srt))
        rm_f(bedgraph)
        run_shell_cmd('bedGraphToBigWig {} {} {}'.format(
            bedgraph_srt, chrsz, bigwig))
        rm_f(bedgraph_srt)

    # NOTE: the command keeps its trailing space.
    run_shell_cmd(
        'macs2 callpeak -t {} -f BED -n {} -g {} -p {} '
        '--shift {} --extsize {} --nomodel -B --SPMR '
        '--keep-dup all --call-summits '.format(
            ta, prefix, gensz, pval_thresh, shiftsize, smooth_win))

    run_shell_cmd(
        'macs2 bdgcmp -t "{0}"_treat_pileup.bdg '
        '-c "{0}"_control_lambda.bdg '
        '--o-prefix "{0}" -m FE '.format(prefix))
    _bdg_to_bigwig('FE', fc_bedgraph, fc_bedgraph_srt, fc_bigwig)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    run_shell_cmd(
        'macs2 bdgcmp -t "{0}"_treat_pileup.bdg '
        '-c "{0}"_control_lambda.bdg '
        '--o-prefix {0} -m ppois -S {1}'.format(prefix, sval))
    _bdg_to_bigwig('ppois', pval_bedgraph, pval_bedgraph_srt, pval_bigwig)

    # remove the remaining "<prefix>_*" temporary files
    rm_f(['{}_*'.format(prefix)])

    return fc_bigwig, pval_bigwig
def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, out_dir):
    """Generate fold-enrichment and p-value signal bigwigs with MACS2.

    Runs ``macs2 callpeak -B --SPMR`` followed by ``macs2 bdgcmp`` with
    ``-m FE`` and ``-m ppois``. Each bedgraph is clipped, sorted, stripped
    of overlapping regions and converted with ``bedGraphToBigWig``.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    shiftsize = -int(round(float(smooth_win) / 2.0))

    # keeps a line only if it does not start before the previous line's end
    # on the same chromosome
    dedup_awk = (
        'BEGIN{OFS="\\t"}{if (NR==1 || NR>1 && (prev_chr!=$1 '
        '|| prev_chr==$1 && prev_chr_e<=$2)) '
        '{print $0}; prev_chr=$1; prev_chr_e=$3;}'
    )

    def _bdg_to_bigwig(bdg_tag, bedgraph, bedgraph_srt, bigwig):
        # Clip "<prefix>_<bdg_tag>.bdg", sort it, drop overlaps, convert.
        run_shell_cmd(
            'bedtools slop -i "{prefix}"_{tag}.bdg -g {chrsz} -b 0 | '
            'bedClip stdin {chrsz} {out}'.format(
                prefix=prefix, tag=bdg_tag, chrsz=chrsz, out=bedgraph))
        # sort and remove any overlapping regions in bedgraph by comparing
        # two lines in a row
        run_shell_cmd(
            'LC_COLLATE=C sort -k1,1 -k2,2n {src} | '
            'awk \'{awk}\' > {dst}'.format(
                src=bedgraph, awk=dedup_awk, dst=bedgraph_srt))
        rm_f(bedgraph)
        run_shell_cmd('bedGraphToBigWig {src} {chrsz} {dst}'.format(
            src=bedgraph_srt, chrsz=chrsz, dst=bigwig))
        rm_f(bedgraph_srt)

    # NOTE: the command keeps its trailing space.
    run_shell_cmd(
        'macs2 callpeak -t {ta} -f BED -n {prefix} -g {gensz} -p {pval} '
        '--shift {shiftsize} --extsize {extsize} --nomodel -B --SPMR '
        '--keep-dup all --call-summits '.format(
            ta=ta,
            prefix=prefix,
            gensz=gensz,
            pval=pval_thresh,
            shiftsize=shiftsize,
            extsize=smooth_win,
        ))

    run_shell_cmd(
        'macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
        '-c "{prefix}"_control_lambda.bdg '
        '--o-prefix "{prefix}" -m FE '.format(prefix=prefix))
    _bdg_to_bigwig('FE', fc_bedgraph, fc_bedgraph_srt, fc_bigwig)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    run_shell_cmd(
        'macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
        '-c "{prefix}"_control_lambda.bdg '
        '--o-prefix {prefix} -m ppois -S {sval}'.format(
            prefix=prefix, sval=sval))
    _bdg_to_bigwig('ppois', pval_bedgraph, pval_bedgraph_srt, pval_bigwig)

    # remove the remaining "<prefix>_*" temporary files
    rm_f(['{}_*'.format(prefix)])

    return fc_bigwig, pval_bigwig
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    """Generate fold-enrichment and p-value signal bigwigs with MACS2.

    Optionally uses (and first subsamples) a control TAG-ALIGN. Runs
    ``macs2 callpeak -B --SPMR`` and ``macs2 bdgcmp`` with ``-m FE`` and
    ``-m ppois``. Each bedgraph is clipped, sorted (sort options come from
    get_gnu_sort_param()), stripped of overlapping regions and converted
    with ``bedGraphToBigWig``.

    Note: ``shift`` is accepted but not used; ``--shift 0`` is always passed.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)``.
    """
    treat_name = os.path.basename(strip_ext_ta(ta))
    if not ctl_ta:
        out_name = treat_name
    else:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(
                    ctl_ta, ctl_subsample, non_mito=False,
                    mito_chr_name=None, r1_only=False, out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(
                    ctl_ta, ctl_subsample, non_mito=False,
                    mito_chr_name=None, out_dir=out_dir)
        ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{}_x_{}'.format(treat_name, ctl_name)
        # UNIX cannot have len(filename) > 255
        if len(out_name) > 200:
            out_name = '{}_x_control'.format(treat_name)
    prefix = os.path.join(out_dir, out_name)

    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    # keeps a line only if it does not start before the previous line's end
    # on the same chromosome
    dedup_awk = (
        'BEGIN{OFS="\\t"}{if (NR==1 || NR>1 && (prev_chr!=$1 || '
        'prev_chr==$1 && prev_chr_e<=$2)) '
        '{print $0}; prev_chr=$1; prev_chr_e=$3;}'
    )

    def _bdg_to_bigwig(bdg_tag, bedgraph, bedgraph_srt, bigwig):
        # Clip "<prefix>_<bdg_tag>.bdg" (dropping lines whose third column
        # is -1), sort it, drop overlaps and convert to bigwig.
        run_shell_cmd(
            'bedtools slop -i "{0}_{1}.bdg" -g {2} -b 0 | '
            'awk \'{{if ($3 != -1) print $0}}\' |'
            'bedClip stdin {2} {3}'.format(prefix, bdg_tag, chrsz, bedgraph))
        # sort and remove any overlapping regions in bedgraph by comparing
        # two lines in a row
        run_shell_cmd(
            'LC_COLLATE=C sort -k1,1 -k2,2n {} {} | '
            'awk \'{}\' > {}'.format(
                get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
                bedgraph, dedup_awk, bedgraph_srt))
        rm_f(bedgraph)
        run_shell_cmd('bedGraphToBigWig {} {} {}'.format(
            bedgraph_srt, chrsz, bigwig))
        rm_f(bedgraph_srt)

    ctl_opt = '-c {}'.format(ctl_ta) if ctl_ta else ''
    run_shell_cmd(
        ' macs2 callpeak -t {} {} -f BED -n {} -g {} -p {} '
        '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'.format(
            ta, ctl_opt, prefix, gensz, pval_thresh, 0, fraglen))

    run_shell_cmd(
        'macs2 bdgcmp -t "{0}_treat_pileup.bdg" '
        '-c "{0}_control_lambda.bdg" '
        '--o-prefix "{0}" -m FE '.format(prefix))
    _bdg_to_bigwig('FE', fc_bedgraph, fc_bedgraph_srt, fc_bigwig)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    run_shell_cmd(
        'macs2 bdgcmp -t "{0}_treat_pileup.bdg" '
        '-c "{0}_control_lambda.bdg" '
        '--o-prefix {0} -m ppois -S {1}'.format(prefix, sval))
    _bdg_to_bigwig('ppois', pval_bedgraph, pval_bedgraph_srt, pval_bigwig)

    # remove the remaining "<prefix>_*" temporary files
    rm_f(['{}_*'.format(prefix)])

    return fc_bigwig, pval_bigwig
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, out_dir):
    """Generate fold-enrichment and p-value signal bigWigs with MACS2.

    Runs `macs2 callpeak -B --SPMR` to obtain the treatment pileup and
    control lambda bedGraphs, then for each of the `FE` and `ppois`
    comparison methods runs `macs2 bdgcmp`, clips the result to the
    chromosome sizes, sorts it, drops intervals that overlap the previous
    one, and converts it to bigWig.

    NOTE(review): an earlier definition of `macs2_signal_track` in this
    file takes an additional `mem_gb` argument; this later definition
    rebinds the name at import time -- confirm which one is intended.

    Args:
        ta: Treatment reads file, passed to `macs2 callpeak -t` as BED.
        ctl_ta: Control reads file, or a falsy value to run without `-c`.
        chrsz: Chromosome sizes file given to `bedtools slop -g`, `bedClip`
            and `bedGraphToBigWig`.
        gensz: Value for `macs2 callpeak -g`.
        pval_thresh: Value for `macs2 callpeak -p`.
        shift: Unused; `--shift` is always passed as 0. Kept so existing
            callers do not break.
        fraglen: Value for `macs2 callpeak --extsize`.
        ctl_subsample: When truthy and `ctl_ta` is given, the control is
            first subsampled with this value (presumably a read count --
            verify against `subsample_ta_pe` / `subsample_ta_se`).
        ctl_paired_end: Chooses `subsample_ta_pe` over `subsample_ta_se`.
        out_dir: Directory that receives all outputs.

    Returns:
        Tuple `(fc_bigwig, pval_bigwig)` of output paths.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(
                    ctl_ta, ctl_subsample,
                    non_mito=False, mito_chr_name=None, r1_only=False,
                    out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(
                    ctl_ta, ctl_subsample,
                    non_mito=False, mito_chr_name=None,
                    out_dir=out_dir)
        basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
        basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
        if len(basename_prefix) > 200:  # UNIX cannot have len(filename) > 255
            basename_prefix = '{}_x_control'.format(basename_ta)
    else:
        basename_prefix = basename_ta
    prefix = os.path.join(out_dir, basename_prefix)

    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)

    def _bdgcmp_to_bigwig(method, extra_args, label, bigwig):
        # Turn the callpeak bedGraphs into one bigWig using bdgcmp `method`.
        bedgraph = '{}.{}.signal.bedgraph'.format(prefix, label)
        bedgraph_srt = '{}.{}.signal.srt.bedgraph'.format(prefix, label)

        run_shell_cmd(
            'macs2 bdgcmp -t "{prefix}_treat_pileup.bdg" '
            '-c "{prefix}_control_lambda.bdg" '
            '--o-prefix "{prefix}" -m {method} {extra_args}'.format(
                prefix=prefix,
                method=method,
                extra_args=extra_args,
            ))

        # `slop -b 0` with the chrom sizes plus bedClip keeps intervals
        # inside chromosome bounds; the awk filter drops rows whose end
        # column is -1.
        run_shell_cmd(
            'bedtools slop -i "{prefix}_{method}.bdg" -g {chrsz} -b 0 | '
            'awk \'{{if ($3 != -1) print $0}}\' |'
            'bedClip stdin {chrsz} "{bedgraph}"'.format(
                prefix=prefix,
                method=method,
                chrsz=chrsz,
                bedgraph=bedgraph,
            ))

        # sort and remove any overlapping regions in bedgraph by comparing
        # two lines in a row
        run_shell_cmd(
            'LC_COLLATE=C sort -k1,1 -k2,2n "{bedgraph}" | '
            'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
            'prev_chr==$1 && prev_chr_e<=$2)) '
            '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > "{bedgraph_srt}"'
            .format(
                bedgraph=bedgraph,
                bedgraph_srt=bedgraph_srt,
            ))
        rm_f(bedgraph)

        run_shell_cmd(
            'bedGraphToBigWig "{bedgraph_srt}" {chrsz} "{bigwig}"'.format(
                bedgraph_srt=bedgraph_srt,
                chrsz=chrsz,
                bigwig=bigwig,
            ))
        rm_f(bedgraph_srt)

    run_shell_cmd(
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n "{prefix}" -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} --keep-dup all -B --SPMR'
        .format(
            ta=ta,
            ctl_param='-c {}'.format(ctl_ta) if ctl_ta else '',
            prefix=prefix,
            gensz=gensz,
            pval_thresh=pval_thresh,
            shiftsize=0,
            extsize=fraglen,
        ))

    # Fold-enrichment track (no extra bdgcmp arguments).
    _bdgcmp_to_bigwig('FE', '', 'fc', fc_bigwig)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0
    _bdgcmp_to_bigwig('ppois', '-S {}'.format(sval), 'pval', pval_bigwig)

    # remove temporary files; the glob is left unquoted so that it can be
    # expanded by whatever rm_f uses to delete files
    rm_f(['{}_*'.format(prefix)])

    return fc_bigwig, pval_bigwig
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    """Call narrow peaks with MACS2 and return a capped, clipped peak file.

    Runs `macs2 callpeak`, sorts the resulting narrowPeak rows by column 8
    in descending order, renames the peaks by rank, keeps the first
    `cap_num_peak` rows and clips them to the chromosome sizes.

    NOTE(review): an earlier definition of `macs2` without `mem_gb` exists
    in this file; this definition rebinds the name at import time --
    confirm the older one can be removed.

    Args:
        ta: Treatment reads file, passed to `macs2 callpeak -t` as BED.
        ctl_ta: Control reads file, or a falsy value to run without `-c`.
        chrsz: Chromosome sizes file handed to `bed_clip`.
        gensz: Value for `macs2 callpeak -g`.
        pval_thresh: Value for `macs2 callpeak -p`; also embedded in the
            output file name.
        shift: Unused; `--shift` is always passed as 0. Kept so existing
            callers do not break.
        fraglen: Value for `macs2 callpeak --extsize`.
        cap_num_peak: Number of top-ranked peaks to keep (`head -n`).
        ctl_subsample: When truthy and `ctl_ta` is given, the control is
            first subsampled with this value (presumably a read count --
            verify against `subsample_ta_pe` / `subsample_ta_se`).
        ctl_paired_end: Chooses `subsample_ta_pe` over `subsample_ta_se`.
        mem_gb: Memory budget in GB; `mem_gb * 1024**3` is handed to
            `get_gnu_sort_param` with `ratio=0.5` to build the sort options.
        out_dir: Directory that receives all outputs.

    Returns:
        Path of the `.narrowPeak.gz` file written by `bed_clip`.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(
                    ctl_ta, ctl_subsample,
                    non_mito=False, mito_chr_name=None, r1_only=False,
                    out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(
                    ctl_ta, ctl_subsample,
                    non_mito=False, mito_chr_name=None,
                    out_dir=out_dir)
        basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
        basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
        if len(basename_prefix) > 200:  # UNIX cannot have len(filename) > 255
            basename_prefix = '{}_x_control'.format(basename_ta)
    else:
        basename_prefix = basename_ta
    prefix = os.path.join(out_dir, basename_prefix)

    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix,
        'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)

    run_shell_cmd(
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n "{prefix}" -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} --keep-dup all -B --SPMR'
        .format(
            ta=ta,
            ctl_param='-c {ctl_ta}'.format(ctl_ta=ctl_ta) if ctl_ta else '',
            prefix=prefix,
            gensz=gensz,
            pval_thresh=pval_thresh,
            shiftsize=0,
            extsize=fraglen,
        ))

    # Rank by column 8 (descending, general-numeric), rename each peak to
    # Peak_<rank>, clamp negative start/end to 0 and, where column 10 is -1,
    # replace it with start + half of the peak length.
    run_shell_cmd(
        'LC_COLLATE=C sort -k 8gr,8gr {sort_param} "{prefix}_peaks.narrowPeak" | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > "{npeak_tmp}"'.format(
            sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
            prefix=prefix,
            npeak_tmp=npeak_tmp,
        ))

    # Keep only the top cap_num_peak rows of the ranked list.
    run_shell_cmd(
        'head -n {cap_num_peak} "{npeak_tmp}" > "{npeak_tmp2}"'.format(
            cap_num_peak=cap_num_peak,
            npeak_tmp=npeak_tmp,
            npeak_tmp2=npeak_tmp2,
        ))

    # clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)

    rm_f([npeak_tmp, npeak_tmp2])

    # remove temporary files; the glob is left unquoted so that it can be
    # expanded by whatever rm_f uses to delete files
    rm_f(['{}_*'.format(prefix)])

    return npeak