def main():
    """Subsample a TAGALIGN file and run cross-correlation analysis on it.

    All inputs come from ``parse_arguments()``. The subsampled TAGALIGN is
    treated as an intermediate: it is removed after ``xcor`` returns. The
    function returns nothing.
    """
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Subsampling TAGALIGN for xcor...')
    # NOTE(review): the positional ``True`` flags presumably map to non_mito
    # (both helpers) and r1_only (paired-end helper only) -- confirm against
    # the subsample_ta_* signatures.
    if args.paired_end:
        ta_subsampled = subsample_ta_pe(
            args.ta, args.subsample, True,
            args.mito_chr_name, True, args.out_dir)
    else:
        ta_subsampled = subsample_ta_se(
            args.ta, args.subsample, True,
            args.mito_chr_name, args.out_dir)

    # files to be deleted later at the end
    intermediates = [ta_subsampled]

    log.info('Cross-correlation analysis...')
    # xcor() is expected to return exactly four values; they are not used
    # any further in this function.
    plot_pdf, plot_png, score, fraglen = xcor(
        ta_subsampled,
        args.speak,
        args.mito_chr_name,
        args.nth,
        args.out_dir,
        args.chip_seq_type,
        args.exclusion_range_min,
        args.exclusion_range_max)

    log.info('Removing temporary files...')
    rm_f(intermediates)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, out_dir):
    """Call peaks with ``macs2 callpeak`` and produce a capped narrowPeak file.

    Args:
        ta: TAGALIGN passed to macs2 as treatment (``-t``).
        ctl_ta: Control TAGALIGN (``-c``); a falsy value means no control.
        chrsz: Chromosome sizes argument handed to ``bed_clip``.
        gensz: Value for macs2 ``-g``.
        pval_thresh: Value for macs2 ``-p``; also embedded in the output name.
        shift: Accepted but not used here; ``--shift`` is always 0.
        fraglen: Value for macs2 ``--extsize``.
        cap_num_peak: Number of top-sorted lines kept by ``head -n``.
        ctl_subsample: If truthy, the control is subsampled with this value
            before peak calling.
        ctl_paired_end: Selects the paired-end or single-end subsampler.
        out_dir: Directory used as the parent of every output path.

    Returns:
        The narrowPeak path given to ``bed_clip`` as its third argument
        (presumably the file it writes -- confirm against bed_clip).
    """
    ta_name = os.path.basename(strip_ext_ta(ta))

    if not ctl_ta:
        out_name = ta_name
    else:
        if ctl_subsample and ctl_paired_end:
            ctl_ta = subsample_ta_pe(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None, r1_only=False,
                out_dir=out_dir)
        elif ctl_subsample:
            ctl_ta = subsample_ta_se(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None,
                out_dir=out_dir)
        ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{}_x_{}'.format(ta_name, ctl_name)
        # UNIX cannot have len(filename) > 255
        if len(out_name) > 200:
            out_name = '{}_x_control'.format(ta_name)

    prefix = os.path.join(out_dir, out_name)
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix,
        'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)

    callpeak_cmd = (
        ' macs2 callpeak '
        '-t {} {} -f BED -n {} -g {} -p {} '
        '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'
    ).format(
        ta,
        '-c {}'.format(ctl_ta) if ctl_ta else '',
        prefix,
        gensz,
        pval_thresh,
        0,
        fraglen)
    run_shell_cmd(callpeak_cmd)

    # sort by column 8 (descending), rename peaks by rank, floor negative
    # coordinates at 0 and fill in a missing ($10 == -1) summit offset
    sort_cmd = (
        'LC_COLLATE=C sort -k 8gr,8gr "{}"_peaks.narrowPeak | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {}'
    ).format(prefix, npeak_tmp)
    run_shell_cmd(sort_cmd)

    # keep only the first cap_num_peak lines of the sorted file
    cap_cmd = 'head -n {} {} > {}'.format(cap_num_peak, npeak_tmp, npeak_tmp2)
    run_shell_cmd(cap_cmd)

    # clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)
    rm_f([npeak_tmp, npeak_tmp2])

    # remove temporary files
    rm_f(["{}_*".format(prefix)])

    return npeak
def main():
    """Subsample a TAGALIGN file and verify that the result is not empty.

    All inputs come from ``parse_arguments()``. Subsampling is done with the
    paired-end or single-end helper depending on ``args.paired_end``. The
    function returns nothing.
    """
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # keyword arguments common to both subsampling helpers
    shared_kwargs = dict(
        non_mito=False,
        mito_chr_name=None,
        out_dir=args.out_dir,
    )
    if args.paired_end:
        subsampled_ta = subsample_ta_pe(
            args.ta, args.subsample, r1_only=False, **shared_kwargs)
    else:
        subsampled_ta = subsample_ta_se(
            args.ta, args.subsample, **shared_kwargs)

    log.info('Checking if output is empty...')
    assert_file_not_empty(subsampled_ta)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def spp(ta, ctl_ta, chrsz, fraglen, cap_num_peak, fdr_thresh,
        ctl_subsample, ctl_paired_end, nth, out_dir):
    """Call peaks with ``run_spp.R`` and produce a clipped regionPeak file.

    The control is used unconditionally; there is no no-control branch.

    Args:
        ta: TAGALIGN passed to run_spp.R as ``-c=``.
        ctl_ta: Control TAGALIGN passed as ``-i=``.
        chrsz: Chromosome sizes argument handed to ``bed_clip``.
        fraglen: Value for ``-speak=``.
        cap_num_peak: Value for ``-npeak=``; also embedded in the output name.
        fdr_thresh: Value for ``-fdr=``.
        ctl_subsample: If truthy, the control is subsampled with this value
            before peak calling.
        ctl_paired_end: Selects the paired-end or single-end subsampler.
        nth: Thread count; ``-p=<nth>`` is only added when ``nth >= 2``.
        out_dir: Directory used as the parent of every output path.

    Returns:
        The regionPeak path given to ``bed_clip`` as its third argument
        (presumably the file it writes -- confirm against bed_clip).
    """
    ta_name = os.path.basename(strip_ext_ta(ta))

    if ctl_subsample and ctl_paired_end:
        ctl_ta = subsample_ta_pe(
            ctl_ta, ctl_subsample,
            non_mito=False, mito_chr_name=None, r1_only=False,
            out_dir=out_dir)
    elif ctl_subsample:
        ctl_ta = subsample_ta_se(
            ctl_ta, ctl_subsample,
            non_mito=False, mito_chr_name=None,
            out_dir=out_dir)

    ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
    out_name = '{}_x_{}'.format(ta_name, ctl_name)
    # UNIX cannot have filename > 255
    if len(out_name) > 200:
        out_name = '{}_x_control'.format(ta_name)

    if nth >= 2:
        nth_param = '-p={}'.format(nth)
    else:
        nth_param = ''

    prefix = os.path.join(out_dir, out_name)
    rpeak = '{}.{}.regionPeak.gz'.format(
        prefix, human_readable_number(cap_num_peak))
    # -savr receives the ".tmp" name while the next step reads ".tmp.gz";
    # presumably run_spp.R appends ".gz" itself -- confirm.
    rpeak_tmp_prefix = '{}.tmp'.format(rpeak)
    rpeak_tmp_gz = '{}.tmp.gz'.format(rpeak)
    rpeak_tmp2 = '{}.tmp2'.format(rpeak)

    spp_cmd = (
        'Rscript --max-ppsize=500000 $(which run_spp.R) -c={} -i={} '
        '-npeak={} -odir={} -speak={} -savr={} -fdr={} -rf {}'
    ).format(
        ta,
        ctl_ta,
        cap_num_peak,
        os.path.abspath(out_dir),
        fraglen,
        rpeak_tmp_prefix,
        fdr_thresh,
        nth_param)
    run_shell_cmd(spp_cmd)

    # if we have scientific representation of chr coord. then convert it to int
    int_coord_cmd = (
        'zcat -f {} | awk \'BEGIN{{OFS="\\t"}}'
        '{{if ($2<0) $2=0; '
        'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}\' > {}'
    ).format(rpeak_tmp_gz, rpeak_tmp2)
    run_shell_cmd(int_coord_cmd)
    rm_f(rpeak_tmp_gz)

    # clip peaks between 0-chromSize.
    bed_clip(rpeak_tmp2, chrsz, rpeak)
    rm_f(rpeak_tmp2)

    return rpeak
def main():
    """Convert a BAM to TAGALIGN, then optionally subsample and TN5-shift it.

    All inputs come from ``parse_arguments()``. Each stage that produces a
    new file marks its input for deletion, so only the final TAGALIGN is
    kept. The function returns nothing.
    """
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # files to be deleted later at the end
    intermediates = []

    log.info('Converting BAM to TAGALIGN...')
    if args.paired_end:
        ta = bam2ta_pe(args.bam, args.nth, args.out_dir)
    else:
        ta = bam2ta_se(args.bam, args.out_dir)

    if not args.subsample:
        subsampled_ta = ta
    else:
        log.info('Subsampling TAGALIGN...')
        if args.paired_end:
            subsampled_ta = subsample_ta_pe(
                ta, args.subsample, False,
                args.mito_chr_name, False, args.out_dir)
        else:
            subsampled_ta = subsample_ta_se(
                ta, args.subsample, False,
                args.mito_chr_name, args.out_dir)
        # the full TAGALIGN has been superseded by the subsampled one
        intermediates.append(ta)

    if not args.disable_tn5_shift:
        log.info("TN5-shifting TAGALIGN...")
        shifted_ta = tn5_shift_ta(subsampled_ta, args.out_dir)
        # the unshifted TAGALIGN has been superseded by the shifted one
        intermediates.append(subsampled_ta)
    else:
        shifted_ta = subsampled_ta

    log.info('Checking if output is empty...')
    assert_file_not_empty(shifted_ta)

    log.info('Removing temporary files...')
    rm_f(intermediates)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    """Call peaks with ``macs2 callpeak`` and produce a capped narrowPeak file.

    Args:
        ta: TAGALIGN passed to macs2 as treatment (``-t``).
        ctl_ta: Control TAGALIGN (``-c``); a falsy value means no control.
        chrsz: Chromosome sizes argument handed to ``bed_clip``.
        gensz: Value for macs2 ``-g``.
        pval_thresh: Value for macs2 ``-p``; also embedded in the output name.
        shift: Accepted but not used here; ``--shift`` is always 0.
        fraglen: Value for macs2 ``--extsize``.
        cap_num_peak: Number of top-sorted lines kept by ``head -n``.
        ctl_subsample: If truthy, the control is subsampled with this value
            before peak calling.
        ctl_paired_end: Selects the paired-end or single-end subsampler.
        mem_gb: Multiplied by ``1024**3`` and given to
            ``get_gnu_sort_param`` (with ``ratio=0.5``) to build extra
            ``sort`` options.
        out_dir: Directory used as the parent of every output path.

    Returns:
        The narrowPeak path given to ``bed_clip`` as its third argument
        (presumably the file it writes -- confirm against bed_clip).
    """
    ta_name = os.path.basename(strip_ext_ta(ta))

    if not ctl_ta:
        out_name = ta_name
    else:
        if ctl_subsample and ctl_paired_end:
            ctl_ta = subsample_ta_pe(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None, r1_only=False,
                out_dir=out_dir)
        elif ctl_subsample:
            ctl_ta = subsample_ta_se(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None,
                out_dir=out_dir)
        ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{}_x_{}'.format(ta_name, ctl_name)
        # UNIX cannot have len(filename) > 255
        if len(out_name) > 200:
            out_name = '{}_x_control'.format(ta_name)

    prefix = os.path.join(out_dir, out_name)
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix,
        'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)

    callpeak_tmpl = (
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} --keep-dup all -B --SPMR'
    )
    ctl_param = '-c {ctl_ta}'.format(ctl_ta=ctl_ta) if ctl_ta else ''
    run_shell_cmd(callpeak_tmpl.format(
        ta=ta,
        ctl_param=ctl_param,
        prefix=prefix,
        gensz=gensz,
        pval_thresh=pval_thresh,
        shiftsize=0,
        extsize=fraglen))

    # sort by column 8 (descending), rename peaks by rank, floor negative
    # coordinates at 0 and fill in a missing ($10 == -1) summit offset
    sort_tmpl = (
        'LC_COLLATE=C sort -k 8gr,8gr {sort_param} "{prefix}_peaks.narrowPeak" | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {npeak_tmp}'
    )
    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
    run_shell_cmd(sort_tmpl.format(
        sort_param=sort_param,
        prefix=prefix,
        npeak_tmp=npeak_tmp))

    # keep only the first cap_num_peak lines of the sorted file
    cap_tmpl = 'head -n {cap_num_peak} {npeak_tmp} > {npeak_tmp2}'
    run_shell_cmd(cap_tmpl.format(
        cap_num_peak=cap_num_peak,
        npeak_tmp=npeak_tmp,
        npeak_tmp2=npeak_tmp2))

    # clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)
    rm_f([npeak_tmp, npeak_tmp2])

    # remove temporary files
    rm_f(["{prefix}_*".format(prefix=prefix)])

    return npeak
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, out_dir):
    """Build fold-enrichment and p-value signal bigWigs with MACS2.

    Runs ``macs2 callpeak -B --SPMR``, then for each of ``bdgcmp -m FE`` and
    ``bdgcmp -m ppois``: clips the bedGraph to chromosome bounds, sorts it
    while dropping overlapping rows, and converts it with
    ``bedGraphToBigWig``.

    Args:
        ta: TAGALIGN passed to macs2 as treatment (``-t``); its line count
            also sets the ``-S`` scaling of the p-value track.
        ctl_ta: Control TAGALIGN (``-c``); a falsy value means no control.
        chrsz: Chromosome sizes file given to ``bedtools slop -g``,
            ``bedClip`` and ``bedGraphToBigWig``.
        gensz: Value for macs2 ``-g``.
        pval_thresh: Value for macs2 ``-p``.
        shift: Accepted but not used here; ``--shift`` is always 0.
        fraglen: Value for macs2 ``--extsize``.
        ctl_subsample: If truthy, the control is subsampled with this value
            before peak calling.
        ctl_paired_end: Selects the paired-end or single-end subsampler.
        out_dir: Directory used as the parent of every output path.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)`` of output paths.
    """
    ta_name = os.path.basename(strip_ext_ta(ta))

    if not ctl_ta:
        out_name = ta_name
    else:
        if ctl_subsample and ctl_paired_end:
            ctl_ta = subsample_ta_pe(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None, r1_only=False,
                out_dir=out_dir)
        elif ctl_subsample:
            ctl_ta = subsample_ta_se(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None,
                out_dir=out_dir)
        ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{}_x_{}'.format(ta_name, ctl_name)
        # UNIX cannot have len(filename) > 255
        if len(out_name) > 200:
            out_name = '{}_x_control'.format(ta_name)

    prefix = os.path.join(out_dir, out_name)

    # final outputs
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    # sort and remove any overlapping regions in bedgraph by comparing two
    # lines in a row (same template for both tracks)
    sort_dedup_tmpl = (
        'LC_COLLATE=C sort -k1,1 -k2,2n {} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
        'prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'
    )
    to_bigwig_tmpl = 'bedGraphToBigWig {} {} {}'

    callpeak_cmd = (
        ' macs2 callpeak '
        '-t {} {} -f BED -n {} -g {} -p {} '
        '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'
    ).format(
        ta,
        '-c {}'.format(ctl_ta) if ctl_ta else '',
        prefix,
        gensz,
        pval_thresh,
        0,
        fraglen)
    run_shell_cmd(callpeak_cmd)

    # ---- fold-enrichment track ----
    fe_cmp_cmd = (
        'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
        '-c "{}"_control_lambda.bdg '
        '--o-prefix "{}" -m FE '
    ).format(prefix, prefix, prefix)
    run_shell_cmd(fe_cmp_cmd)

    fe_clip_cmd = (
        'bedtools slop -i "{}"_FE.bdg -g {} -b 0 | '
        'awk \'{{if ($3 != -1) print $0}}\' |'
        'bedClip stdin {} {}'
    ).format(prefix, chrsz, chrsz, fc_bedgraph)
    run_shell_cmd(fe_clip_cmd)

    run_shell_cmd(sort_dedup_tmpl.format(fc_bedgraph, fc_bedgraph_srt))
    rm_f(fc_bedgraph)

    run_shell_cmd(to_bigwig_tmpl.format(fc_bedgraph_srt, chrsz, fc_bigwig))
    rm_f(fc_bedgraph_srt)

    # ---- p-value track ----
    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    pval_cmp_cmd = (
        'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
        '-c "{}"_control_lambda.bdg '
        '--o-prefix {} -m ppois -S {}'
    ).format(prefix, prefix, prefix, sval)
    run_shell_cmd(pval_cmp_cmd)

    pval_clip_cmd = (
        'bedtools slop -i "{}"_ppois.bdg -g {} -b 0 | '
        'awk \'{{if ($3 != -1) print $0}}\' |'
        'bedClip stdin {} {}'
    ).format(prefix, chrsz, chrsz, pval_bedgraph)
    run_shell_cmd(pval_clip_cmd)

    run_shell_cmd(sort_dedup_tmpl.format(pval_bedgraph, pval_bedgraph_srt))
    rm_f(pval_bedgraph)

    run_shell_cmd(to_bigwig_tmpl.format(pval_bedgraph_srt, chrsz, pval_bigwig))
    rm_f(pval_bedgraph_srt)

    # remove temporary files
    rm_f(["{}_*".format(prefix)])

    return fc_bigwig, pval_bigwig
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    """Build fold-enrichment and p-value signal bigWigs with MACS2.

    Runs ``macs2 callpeak -B --SPMR``, then for each of ``bdgcmp -m FE`` and
    ``bdgcmp -m ppois``: clips the bedGraph to chromosome bounds, sorts it
    while dropping overlapping rows, and converts it with
    ``bedGraphToBigWig``.

    Args:
        ta: TAGALIGN passed to macs2 as treatment (``-t``); its line count
            also sets the ``-S`` scaling of the p-value track.
        ctl_ta: Control TAGALIGN (``-c``); a falsy value means no control.
        chrsz: Chromosome sizes file given to ``bedtools slop -g``,
            ``bedClip`` and ``bedGraphToBigWig``.
        gensz: Value for macs2 ``-g``.
        pval_thresh: Value for macs2 ``-p``.
        shift: Accepted but not used here; ``--shift`` is always 0.
        fraglen: Value for macs2 ``--extsize``.
        ctl_subsample: If truthy, the control is subsampled with this value
            before peak calling.
        ctl_paired_end: Selects the paired-end or single-end subsampler.
        mem_gb: Multiplied by ``1024**3`` and given to
            ``get_gnu_sort_param`` (with ``ratio=0.5``) to build extra
            ``sort`` options.
        out_dir: Directory used as the parent of every output path.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)`` of output paths.
    """
    ta_name = os.path.basename(strip_ext_ta(ta))

    if not ctl_ta:
        out_name = ta_name
    else:
        if ctl_subsample and ctl_paired_end:
            ctl_ta = subsample_ta_pe(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None, r1_only=False,
                out_dir=out_dir)
        elif ctl_subsample:
            ctl_ta = subsample_ta_se(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None,
                out_dir=out_dir)
        ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{}_x_{}'.format(ta_name, ctl_name)
        # UNIX cannot have len(filename) > 255
        if len(out_name) > 200:
            out_name = '{}_x_control'.format(ta_name)

    prefix = os.path.join(out_dir, out_name)

    # final outputs
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    # values shared by most command templates; str.format ignores the
    # keywords a given template does not reference
    common = dict(
        prefix=prefix,
        chrsz=chrsz,
        fc_bedgraph=fc_bedgraph,
        fc_bedgraph_srt=fc_bedgraph_srt,
        fc_bigwig=fc_bigwig,
        pval_bedgraph=pval_bedgraph,
        pval_bedgraph_srt=pval_bedgraph_srt,
        pval_bigwig=pval_bigwig,
    )

    callpeak_tmpl = (
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} --keep-dup all -B --SPMR'
    )
    ctl_param = '-c {ctl_ta}'.format(ctl_ta=ctl_ta) if ctl_ta else ''
    run_shell_cmd(callpeak_tmpl.format(
        ta=ta,
        ctl_param=ctl_param,
        prefix=prefix,
        gensz=gensz,
        pval_thresh=pval_thresh,
        shiftsize=0,
        extsize=fraglen))

    # ---- fold-enrichment track ----
    fe_cmp_tmpl = (
        'macs2 bdgcmp -t "{prefix}_treat_pileup.bdg" '
        '-c "{prefix}_control_lambda.bdg" '
        '--o-prefix "{prefix}" -m FE '
    )
    run_shell_cmd(fe_cmp_tmpl.format(**common))

    fe_clip_tmpl = (
        'bedtools slop -i "{prefix}_FE.bdg" -g {chrsz} -b 0 | '
        'awk \'{{if ($3 != -1) print $0}}\' |'
        'bedClip stdin {chrsz} {fc_bedgraph}'
    )
    run_shell_cmd(fe_clip_tmpl.format(**common))

    # sort and remove any overlapping regions in bedgraph by comparing two
    # lines in a row
    fe_sort_tmpl = (
        'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {fc_bedgraph} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
        'prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {fc_bedgraph_srt}'
    )
    run_shell_cmd(fe_sort_tmpl.format(
        sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
        **common))
    rm_f(fc_bedgraph)

    fe_bigwig_tmpl = 'bedGraphToBigWig {fc_bedgraph_srt} {chrsz} {fc_bigwig}'
    run_shell_cmd(fe_bigwig_tmpl.format(**common))
    rm_f(fc_bedgraph_srt)

    # ---- p-value track ----
    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    pval_cmp_tmpl = (
        'macs2 bdgcmp -t "{prefix}_treat_pileup.bdg" '
        '-c "{prefix}_control_lambda.bdg" '
        '--o-prefix {prefix} -m ppois -S {sval}'
    )
    run_shell_cmd(pval_cmp_tmpl.format(sval=sval, **common))

    pval_clip_tmpl = (
        'bedtools slop -i "{prefix}_ppois.bdg" -g {chrsz} -b 0 | '
        'awk \'{{if ($3 != -1) print $0}}\' |'
        'bedClip stdin {chrsz} {pval_bedgraph}'
    )
    run_shell_cmd(pval_clip_tmpl.format(**common))

    # sort and remove any overlapping regions in bedgraph by comparing two
    # lines in a row
    pval_sort_tmpl = (
        'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {pval_bedgraph} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
        'prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {pval_bedgraph_srt}'
    )
    run_shell_cmd(pval_sort_tmpl.format(
        sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
        **common))
    rm_f(pval_bedgraph)

    pval_bigwig_tmpl = (
        'bedGraphToBigWig {pval_bedgraph_srt} {chrsz} {pval_bigwig}'
    )
    run_shell_cmd(pval_bigwig_tmpl.format(**common))
    rm_f(pval_bedgraph_srt)

    # remove temporary files
    rm_f(["{prefix}_*".format(prefix=prefix)])

    return fc_bigwig, pval_bigwig