def naive_overlap(basename_prefix, peak1, peak2, peak_pooled, peak_type,
                  nonamecheck, out_dir):
    """Keep pooled peaks that overlap a peak in both peak1 and peak2.

    A pooled peak survives a stage when the overlap length (column 21 of
    ``intersectBed -wo``) is at least half of either peak in the pair.
    The stage is applied against peak1 and then against peak2.

    Args:
        basename_prefix: Basename (without directory) of the output.
        peak1, peak2: Peak files of the two replicates.
        peak_pooled: Peak file called on pooled data.
        peak_type: Used as the output file extension.
        nonamecheck: If truthy, pass ``-nonamecheck`` to intersectBed.
        out_dir: Output directory.

    Returns:
        Path of the gzipped overlap peak file.
    """
    overlap_peak = '{}.overlap.{}.gz'.format(
        os.path.join(out_dir, basename_prefix), peak_type)
    name_check_flag = '-nonamecheck' if nonamecheck else ''

    # narrowpeak, regionpeak only
    frac_filter = (
        '{s1=$3-$2; s2=$13-$12; '
        'if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'
    )
    kept_cols = '1-10'

    # due to bedtools bug when .gz is given for -a and -b
    plain_rep1 = gunzip(peak1, 'tmp1', out_dir)
    plain_rep2 = gunzip(peak2, 'tmp2', out_dir)
    plain_pooled = gunzip(peak_pooled, 'tmp_pooled', out_dir)

    # Shared tail of each stage: fractional-overlap filter, keep the pooled
    # peak's own columns, de-duplicate.
    filter_stage = (
        'awk \'BEGIN{{FS="\\t";OFS="\\t"}} {awk}\' | '
        'cut -f {cols} | sort | uniq | '
    )
    pipeline = ''.join([
        'intersectBed {flag} -wo ',
        '-a {pooled} -b {rep1} | ',
        filter_stage,
        'intersectBed {flag} -wo ',
        '-a stdin -b {rep2} | ',
        filter_stage,
        'gzip -nc > {out}',
    ])
    run_shell_cmd(pipeline.format(
        flag=name_check_flag,
        pooled=plain_pooled,
        rep1=plain_rep1,
        rep2=plain_rep2,
        awk=frac_filter,
        cols=kept_cols,
        out=overlap_peak,
    ))

    rm_f([plain_rep1, plain_rep2, plain_pooled])
    return overlap_peak
def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a single-ended BAM with ``samtools view -F 1804`` and sort it.

    There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Without multimapping, reads are additionally filtered by
    ``-q mapq_thresh``. With multimapping, the BAM is name-sorted and piped
    through ``assign_multimappers.py -k multimapping`` first; ``mapq_thresh``
    is not applied in that branch.

    Returns:
        Path of the filtered BAM (``<prefix>.filt.bam``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    sort_stage = (
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'
    )

    if not multimapping:
        pipeline = (
            'samtools view -F 1804 -q {mapq_thresh} -u {bam} | ' + sort_stage
        )
        run_shell_cmd(pipeline.format(
            mapq_thresh=mapq_thresh,
            bam=bam,
            filt_bam=filt_bam,
            prefix=prefix,
            res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
        ))
        return filt_bam

    name_sorted_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)
    pipeline = (
        'samtools view -h {qname_sort_bam} | '
        '$(which assign_multimappers.py) -k {multimapping} | '
        'samtools view -F 1804 -Su /dev/stdin | ' + sort_stage
    )
    run_shell_cmd(pipeline.format(
        qname_sort_bam=name_sorted_bam,
        multimapping=multimapping,
        filt_bam=filt_bam,
        prefix=prefix,
        res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
    ))
    # the name-sorted BAM is only an intermediate
    rm_f(name_sorted_bam)
    return filt_bam
def idr(basename_prefix, peak1, peak2, peak_pooled, peak_type, thresh, rank,
        out_dir):
    """Run ``idr`` on two peak files and keep peaks passing the threshold.

    Steps, as shell commands:
      1. ``idr --samples peak1 peak2 --peak-list peak_pooled ...`` writing
         the unthresholded table and a log file.
      2. Keep rows whose column 12 is >= ``-log10(thresh)``, clamp negative
         starts to 0, de-duplicate and sort by the rank column (descending).
      3. Cut the result down to the first 10 columns.
      4. gzip the unthresholded table.

    Args:
        basename_prefix: Basename (without directory) for all outputs.
        peak1, peak2: Peak files to compare.
        peak_pooled: Peak list passed to ``--peak-list``.
        peak_type: Used as the extension of the thresholded peak file.
        thresh: IDR threshold; must be > 0 because ``-log10`` is taken.
        rank: Ranking column name passed to ``idr --rank``.
        out_dir: Output directory.

    Returns:
        Tuple ``(idr_peak, idr_plot, idr_out_gz, idr_stdout)`` of paths.
        ``idr_plot`` is only a constructed path here; it is presumably
        produced by ``idr --plot`` -- verify against the idr version in use.
    """
    prefix = os.path.join(out_dir, basename_prefix)
    prefix += '.idr{}'.format(thresh)
    idr_peak = '{}.{}.gz'.format(prefix, peak_type)
    idr_out_gz = '{}.unthresholded-peaks.txt.gz'.format(prefix)
    idr_plot = '{}.unthresholded-peaks.txt.png'.format(prefix)
    idr_stdout = '{}.log'.format(prefix)
    # temporary
    # Built from `prefix` (not `peak_type`) so the intermediate file lives in
    # out_dir and is unique per call instead of e.g. ./narrowPeak.12-col.bed.gz
    idr_12col_bed = '{}.12-col.bed.gz'.format(prefix)
    idr_out = '{}.unthresholded-peaks.txt'.format(prefix)

    cmd1 = 'idr --samples {} {} --peak-list {} --input-file-type narrowPeak '
    cmd1 += '--output-file {} --rank {} --soft-idr-threshold {} '
    cmd1 += '--plot --use-best-multisummit-IDR --log-output-file {}'
    cmd1 = cmd1.format(peak1, peak2, peak_pooled, idr_out, rank, thresh,
                       idr_stdout)
    run_shell_cmd(cmd1)

    col = get_npeak_col_by_rank(rank)
    neg_log10_thresh = -math.log10(thresh)
    # NOTE: the sorts below run under the ambient locale (LC_COLLATE is not
    # forced to C in this command).
    cmd2 = 'awk \'BEGIN{{OFS="\\t"}} $12>={} '
    cmd2 += '{{if ($2<0) $2=0; '
    cmd2 += 'print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' {} '
    cmd2 += '| sort | uniq | sort -grk{},{} | gzip -nc > {}'
    cmd2 = cmd2.format(neg_log10_thresh, idr_out, col, col, idr_12col_bed)
    run_shell_cmd(cmd2)

    cmd3 = 'zcat {} | '
    cmd3 += 'awk \'BEGIN{{OFS="\\t"}} '
    cmd3 += '{{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}}\' | '
    cmd3 += 'gzip -nc > {}'
    cmd3 = cmd3.format(idr_12col_bed, idr_peak)
    run_shell_cmd(cmd3)

    cmd4 = 'gzip -f {}'.format(idr_out)
    run_shell_cmd(cmd4)

    rm_f([idr_out, idr_12col_bed])
    rm_f('{}.*.noalternatesummitpeaks.png'.format(prefix))
    return idr_peak, idr_plot, idr_out_gz, idr_stdout
def subsample_ta_pe(ta, subsample, non_mito, mito_chr_name, r1_only, out_dir):
    """Subsample a paired-end tagAlign file, keeping read pairs together.

    Consecutive lines are joined pairwise (``sed 'N;s/\\n/\\t/'``) so each
    pair is one line, optionally shuffled down to ``subsample / 2`` pairs,
    then split back into one line per read (or R1 only).

    Args:
        ta: Input tagAlign (read with ``zcat -f``).
        subsample: Number of reads to keep; must be even. Values <= 0 skip
            the shuffle step.
        non_mito: If truthy, drop lines starting with ``mito_chr_name``.
        mito_chr_name: Chromosome name used by the ``grep -v`` filter.
        r1_only: If truthy, write only the first read of each pair.
        out_dir: Output directory.

    Returns:
        Path of the gzipped output tagAlign.

    Raises:
        ValueError: If ``subsample`` is odd.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    if subsample % 2:
        raise ValueError(
            'Number of reads to subsample should be an even number '
            'for paired end TAG-ALIGN (BED) file. n={n}'.format(n=subsample))

    name_tags = ''.join([
        'no_chrM.' if non_mito else '',
        'R1.' if r1_only else '',
        '{}.'.format(human_readable_number(subsample))
        if subsample > 0 else '',
    ])
    ta_subsampled = '{}.{}tagAlign.gz'.format(prefix, name_tags)
    paired_tmp = '{}.tagAlign.tmp'.format(prefix)

    # Stage 1: one line per pair, optionally filtered and shuffled.
    stages = ['zcat -f {} | ']
    if non_mito:
        stages.append('grep -v \'^' + mito_chr_name + '\\b\' | ')
    stages.append('sed \'N;s/\\n/\\t/\' ')
    if subsample > 0:
        # bash process substitution; the seed is the uncompressed byte count
        stages.append(
            '| shuf -n {} --random-source=<(openssl enc -aes-256-ctr '
            '-pass pass:$(zcat -f {} | wc -c) -nosalt '
            '</dev/zero 2>/dev/null) > {}')
        fill = (ta, int(subsample / 2), ta, paired_tmp)
    else:
        stages.append('> {}')
        fill = (ta, paired_tmp)
    run_shell_cmd(''.join(stages).format(*fill))

    # Stage 2: split each paired line back into per-read lines.
    if r1_only:
        awk_print = (
            '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
            '",$1,$2,$3,$4,$5,$6}}\' | '
        )
    else:
        awk_print = (
            '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
            '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
            '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
        )
    unpair_cmd = ''.join([
        'cat {} | ',
        'awk \'BEGIN{{OFS="\\t"}} ',
        awk_print,
        'gzip -nc > {}',
    ])
    run_shell_cmd(unpair_cmd.format(paired_tmp, ta_subsampled))

    rm_f(paired_tmp)
    return ta_subsampled
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          out_dir):
    """Call narrow peaks with ``macs2 callpeak``, optionally with a control.

    Peaks are sorted by column 8 (descending), renamed ``Peak_<rank>``,
    negative coordinates are clamped to 0, a missing summit (-1) is replaced
    by the peak midpoint, and the top ``cap_num_peak`` rows are kept.

    Note: ``chrsz`` and ``shift`` are accepted but not read by this function;
    ``--shift`` is passed the literal 0.

    Returns:
        Path of the gzipped narrowPeak file.
    """
    treat_name = os.path.basename(strip_ext_ta(ta))
    out_name = treat_name
    if ctl_ta:
        ctl_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{}_x_{}'.format(treat_name, ctl_name)
        if len(out_name) > 200:  # UNIX cannot have len(filename) > 255
            out_name = '{}_x_control'.format(treat_name)
    prefix = os.path.join(out_dir, out_name)

    npeak = '{}.pval{}.{}.narrowPeak.gz'.format(
        prefix, pval_thresh, human_readable_number(cap_num_peak))
    sorted_tmp = '{}.tmp'.format(npeak)

    control_opt = '-c {}'.format(ctl_ta) if ctl_ta else ''
    callpeak = (
        ' macs2 callpeak '
        '-t {} {} -f BED -n {} -g {} -p {} '
        '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'
    )
    run_shell_cmd(callpeak.format(
        ta, control_opt, prefix, gensz, pval_thresh, 0, fraglen))

    rank_and_fix = (
        'LC_COLLATE=C sort -k 8gr,8gr "{}"_peaks.narrowPeak | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {}'
    )
    run_shell_cmd(rank_and_fix.format(prefix, sorted_tmp))

    run_shell_cmd('head -n {} {} | gzip -nc > {}'.format(
        cap_num_peak, sorted_tmp, npeak))
    rm_f(sorted_tmp)

    # remove temporary files (everything macs2 wrote as "<prefix>_*")
    rm_f(["{}_*".format(prefix)])
    return npeak
def main():
    """Entry point: index the BAM, make the TSS enrichment plot, clean up."""
    # read params
    args = parse_arguments()

    chrom_sizes = args.chrsz
    if args.tss and os.path.basename(args.tss) != 'null':
        tss = args.tss
    else:
        tss = ''
    final_bam = args.nodup_bam
    output_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(final_bam)))

    samtools_index(final_bam)  # make an index first
    rg_free_final_bam = remove_read_group(final_bam)

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # Read length: log file takes precedence over the explicit argument.
    if args.read_len_log:
        with open(args.read_len_log, 'r') as fp:
            read_len = int(fp.read().strip())
    elif args.read_len:
        read_len = args.read_len
    else:
        read_len = None

    # Enrichments: V plot for enrichment
    # Use final to avoid duplicates
    tss_plot, tss_large_plot, tss_enrich_qc = make_tss_plot(
        final_bam, tss, output_prefix, chrom_sizes, read_len)

    # remove temporary files
    rm_f(rg_free_final_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def bowtie2_se(fastq, ref_index_prefix, multimapping, local, nth, mem_gb,
               out_dir):
    """Align a single-ended FASTQ with bowtie2 and return a sorted BAM.

    Args:
        fastq: Input FASTQ.
        ref_index_prefix: bowtie2 index prefix (``-x``).
        multimapping: If truthy, ``-k multimapping+1`` is passed.
        local: If truthy, ``--local`` is passed.
        nth: Thread count for bowtie2 and the sort.
        mem_gb: Memory hint forwarded to ``samtools_sort``.
        out_dir: Output directory.

    Returns:
        Path returned by ``samtools_sort`` for the aligned BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
    unsorted_bam = '{}.bam'.format(prefix)

    if multimapping:
        k_opt = '-k {mm}'.format(mm=multimapping + 1)
    else:
        k_opt = ''
    mode_opt = '--local ' if local else ''

    align_cmd = (
        'bowtie2 {multimapping} {mode_param} --mm --threads {nth} -x {ref} '
        '-U {fastq} | samtools view -1 -S /dev/stdin > {tmp_bam}'
    )
    run_shell_cmd(align_cmd.format(
        multimapping=k_opt,
        mode_param=mode_opt,
        nth=nth,
        ref=ref_index_prefix,
        fastq=fastq,
        tmp_bam=unsorted_bam,
    ))

    sorted_bam = samtools_sort(unsorted_bam, nth, mem_gb, out_dir)
    rm_f(unsorted_bam)
    return sorted_bam
def run_preseq(bam_w_dups, prefix, nth=1, mem_gb=None):
    '''
    Run ``preseq lc_extrap`` on a coordinate-sorted copy of the BAM.

    Look at preseq data output to get PBC/NRF.

    Returns:
        Tuple ``(preseq_data, preseq_log)`` of output paths; preseq's stderr
        is redirected into the log file.
    '''
    # First sort because this file no longer exists...
    sorted_bam = samtools_sort(bam_w_dups, nth, mem_gb)

    logging.info('Running preseq...')
    preseq_data = '{0}.preseq.dat'.format(prefix)
    preseq_log = '{0}.preseq.log'.format(prefix)

    preseq_cmd = (
        'preseq lc_extrap -P -B -o {preseq_data} {sort_bam} '
        '-seed 1 -v 2> {preseq_log}'
    )
    run_shell_cmd(preseq_cmd.format(
        preseq_data=preseq_data,
        sort_bam=sorted_bam,
        preseq_log=preseq_log,
    ))

    rm_f(sorted_bam)
    return preseq_data, preseq_log
def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, out_dir):
    """Filter a single-ended BAM with ``samtools view -F 1804`` and sort it.

    Without multimapping, reads are also filtered by ``-q mapq_thresh``.
    With multimapping, the BAM is name-sorted and piped through
    ``assign_multimappers.py -k multimapping`` before filtering;
    ``mapq_thresh`` is not applied in that branch.

    Returns:
        Path of the filtered BAM (``<prefix>.filt.bam``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    sort_stage = 'samtools sort /dev/stdin -o {} -T {} -@ {}'

    if not multimapping:
        pipeline = 'samtools view -F 1804 -q {} -u {} | ' + sort_stage
        run_shell_cmd(
            pipeline.format(mapq_thresh, bam, filt_bam, prefix, nth))
        return filt_bam

    name_sorted_bam = samtools_name_sort(bam, nth, out_dir)
    pipeline = ''.join([
        'samtools view -h {} | ',
        '$(which assign_multimappers.py) -k {} | ',
        'samtools view -F 1804 -Su /dev/stdin | ',
        sort_stage,
    ])
    run_shell_cmd(pipeline.format(
        name_sorted_bam, multimapping, filt_bam, prefix, nth))
    rm_f(name_sorted_bam)  # remove temporary files
    return filt_bam
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, out_dir):
    """Call narrow peaks with ``macs2 callpeak`` (no control) and clip them.

    ``--extsize`` is ``smooth_win`` and ``--shift`` is minus half of it
    (rounded). Peaks are sorted by column 8 (descending), renamed
    ``Peak_<rank>``, negative coordinates are clamped to 0, a missing summit
    (-1) is replaced by the peak midpoint, the top ``cap_num_peak`` rows are
    kept, and the result is passed through ``bed_clip`` with ``chrsz``.

    Returns:
        Path of the narrowPeak file written by ``bed_clip``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    npeak = '{}.pval{}.{}.narrowPeak.gz'.format(
        prefix, pval_thresh, human_readable_number(cap_num_peak))
    # temporary files
    sorted_tmp = '{}.tmp'.format(npeak)
    capped_tmp = '{}.tmp2'.format(npeak)

    shiftsize = -int(round(float(smooth_win) / 2.0))

    callpeak = ''.join([
        'macs2 callpeak ',
        '-t {} -f BED -n {} -g {} -p {} ',
        '--shift {} --extsize {} ',
        '--nomodel -B --SPMR ',
        '--keep-dup all --call-summits ',
    ])
    run_shell_cmd(callpeak.format(
        ta, prefix, gensz, pval_thresh, shiftsize, smooth_win))

    rank_and_fix = (
        'LC_COLLATE=C sort -k 8gr,8gr "{}"_peaks.narrowPeak | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {}'
    )
    run_shell_cmd(rank_and_fix.format(prefix, sorted_tmp))

    run_shell_cmd('head -n {} {} > {}'.format(
        cap_num_peak, sorted_tmp, capped_tmp))

    # clip peaks between 0-chromSize.
    bed_clip(capped_tmp, chrsz, npeak)
    rm_f([sorted_tmp, capped_tmp])

    # remove temporary files (everything macs2 wrote as "<prefix>_*")
    rm_f(["{}_*".format(prefix)])
    return npeak
def main():
    """Entry point: compute GC bias metrics on the BAM and plot them."""
    # read params
    args = parse_arguments()

    ref_fa = args.ref_fa
    final_bam = args.nodup_bam
    output_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(final_bam)))

    rg_free_final_bam = remove_read_group(final_bam)
    java_heap = args.picard_java_heap

    gc_out, gc_plot_pdf, gc_summary = get_gc(
        rg_free_final_bam, ref_fa, output_prefix, java_heap)
    # will generate PNG format from gc_out
    plot_gc(gc_out, output_prefix)

    # the read-group-free BAM is only an intermediate
    rm_f(rg_free_final_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def bowtie2_pe(fastq1, fastq2, ref_index_prefix, multimapping, nth, mem_gb,
               out_dir):
    """Align a paired-end FASTQ pair with bowtie2 and return a sorted BAM.

    Output names are derived from ``fastq1``. ``-X2000`` is always passed;
    ``-k multimapping+1`` is added when ``multimapping`` is truthy.

    Returns:
        Path returned by ``samtools_sort`` for the aligned BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    unsorted_bam = '{}.bam'.format(prefix)

    if multimapping:
        k_opt = '-k {mm}'.format(mm=multimapping + 1)
    else:
        k_opt = ''

    align_cmd = (
        'bowtie2 {multimapping} -X2000 --mm --threads {nth} -x {ref} '
        '-1 {fastq1} -2 {fastq2} | samtools view -1 -S /dev/stdin > {tmp_bam}'
    )
    run_shell_cmd(align_cmd.format(
        multimapping=k_opt,
        nth=nth,
        ref=ref_index_prefix,
        fastq1=fastq1,
        fastq2=fastq2,
        tmp_bam=unsorted_bam,
    ))

    sorted_bam = samtools_sort(unsorted_bam, nth, mem_gb, out_dir)
    rm_f(unsorted_bam)
    return sorted_bam
def pbc_qc_pe(bam, mito_chr_name, nth, out_dir):
    """Write library-complexity counts for a paired-end BAM.

    The BAM is name-sorted and converted to BEDPE; fragments are keyed by
    columns 1,2,4,6,9,10, lines starting with ``mito_chr_name`` are dropped,
    and identical keys are counted. The output is one tab-separated line:
    ``mt  m0  m1  m2  m0/mt  m1/m0  m1/m2`` where mt is the total count,
    m0 the number of distinct keys, m1/m2 the keys seen once/twice
    (``m1/m2`` is -1.0 when m2 is 0).

    Returns:
        Path of the ``.lib_complexity.qc`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    name_sorted_bam = samtools_name_sort(bam, nth, out_dir)

    summarise = (
        'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
        '($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n"'
        ',mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {out}'
    )
    pipeline = ' | '.join([
        'bedtools bamtobed -bedpe -i {bam}',
        'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}\'',
        'grep -v "^{mito}\\s"',
        'sort',
        'uniq -c',
        summarise,
    ])
    run_shell_cmd(pipeline.format(
        bam=name_sorted_bam, mito=mito_chr_name, out=pbc_qc))

    rm_f(name_sorted_bam)
    return pbc_qc
def main():
    """Entry point: insert-size distribution and fragment-length QC/plot."""
    # read params
    args = parse_arguments()

    final_bam = args.nodup_bam
    output_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(final_bam)))

    rg_free_final_bam = remove_read_group(final_bam)

    # Insert size distribution - CAN'T GET THIS FOR SE FILES
    insert_data, insert_plot = get_insert_distribution(
        rg_free_final_bam, output_prefix)

    # Also need to run n-nucleosome estimation
    histogram = read_picard_histogram(insert_data)
    fragment_length_qc(histogram, output_prefix)
    fragment_length_plot(insert_data, output_prefix)

    # the read-group-free BAM is only an intermediate
    rm_f(rg_free_final_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Compute the fraction of reads in peaks after shifting reads.

    Each read in ``ta`` is resized with ``bedtools slop -s`` by
    ``-half_fraglen`` on the left and ``+half_fraglen`` on the right, rows
    with invalid coordinates are dropped, and reads overlapping any peak are
    counted. The count divided by the number of lines in ``ta`` is written
    to ``<prefix>.frip.qc``.

    Args:
        ta: tagAlign file of reads.
        peak: Peak file; an empty file yields a FRiP of 0.0.
        chrsz: Chromosome sizes file for ``bedtools slop -g``.
        fraglen: Fragment length; half of it (rounded up) is the slop size.
        out_dir: Output directory.

    Returns:
        Path of the ``.frip.qc`` file.

    Raises:
        ZeroDivisionError: If ``ta`` has no lines.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    # Integer division: on Python 3 `(fraglen + 1) / 2` is a float such as
    # 100.5, but bedtools slop documents -l/-r as integers unless -pct is
    # given.
    half_fraglen = (int(fraglen) + 1) // 2

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(peak, 'tmp2', out_dir)

        cmd = 'bedtools slop -i {} -g {} '
        cmd += '-s -l {} -r {} | '
        cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
        cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
        cmd += '-wa -u | wc -l'
        cmd = cmd.format(
            ta,
            chrsz,
            -half_fraglen,
            half_fraglen,
            tmp2)  # peak
        val1 = run_shell_cmd(cmd)
        rm_f(tmp2)

    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc
def frip(ta, peak, out_dir):
    """Compute the fraction of reads in peaks and write it to a QC file.

    Reads in ``ta`` that overlap any peak (``bedtools intersect -wa -u``)
    are counted and divided by the number of lines in ``ta``. An empty peak
    file yields 0.0 without running bedtools.

    Returns:
        Path of the ``.frip.qc`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)

    scratch = []
    if get_num_lines(peak) == 0:
        reads_in_peaks = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        plain_ta = gunzip(ta, 'tmp1', out_dir)
        plain_peak = gunzip(peak, 'tmp2', out_dir)
        reads_in_peaks = run_shell_cmd(
            'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
            .format(plain_ta, plain_peak))
        scratch = [plain_ta, plain_peak]

    total_reads = get_num_lines(ta)
    write_txt(frip_qc, str(float(reads_in_peaks) / float(total_reads)))
    rm_f(scratch)
    return frip_qc
def spp(ta, ctl_ta, fraglen, cap_num_peak, fdr_thresh, nth, out_dir):
    """Call region peaks with ``run_spp.R`` and normalise the output.

    After SPP finishes, negative starts are clamped to 0 and start/end are
    forced to integers (SPP may emit scientific notation), then the result
    is gzipped.

    Args:
        ta: Treatment tagAlign (passed as ``-c``).
        ctl_ta: Control tagAlign (passed as ``-i``).
        fraglen: Fragment length (``-speak``).
        cap_num_peak: Maximum number of peaks (``-npeak``).
        fdr_thresh: FDR threshold (``-fdr``).
        nth: Number of threads; ``-p=<nth>`` is passed when ``nth >= 2``.
        out_dir: Output directory.

    Returns:
        Path of the gzipped regionPeak file.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))
    basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
    basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
    if len(basename_prefix) > 200:  # UNIX cannot have filename > 255
        basename_prefix = '{}_x_control'.format(basename_ta)
    # Request parallel nodes only when more than one thread is available.
    # The previous condition (`nth < 2`) was inverted: it passed -p=1 for a
    # single thread and dropped -p entirely for multi-threaded runs.
    nth_param = '-p={}'.format(nth) if nth >= 2 else ''
    prefix = os.path.join(out_dir, basename_prefix)
    rpeak = '{}.{}.regionPeak.gz'.format(
        prefix,
        human_readable_number(cap_num_peak))
    rpeak_tmp = '{}.tmp'.format(rpeak)
    rpeak_tmp_gz = '{}.tmp.gz'.format(rpeak)

    cmd0 = 'Rscript --max-ppsize=500000 $(which run_spp.R) -c={} -i={} '
    cmd0 += '-npeak={} -odir={} -speak={} -savr={} -fdr={} -rf {}'
    cmd0 = cmd0.format(
        ta,
        ctl_ta,
        cap_num_peak,
        os.path.abspath(out_dir),
        fraglen,
        rpeak_tmp,
        fdr_thresh,
        nth_param)
    run_shell_cmd(cmd0)

    # if we have scientific representation of chr coord. then convert it to int
    cmd1 = 'zcat -f {} | awk \'BEGIN{{OFS="\\t"}}'
    cmd1 += '{{if ($2<0) $2=0; '
    cmd1 += 'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}\' | '
    cmd1 += 'gzip -f -nc > {}'
    cmd1 = cmd1.format(
        rpeak_tmp,
        rpeak)
    run_shell_cmd(cmd1)
    # both spellings are removed; only rpeak_tmp is passed to run_spp.R above
    rm_f([rpeak_tmp, rpeak_tmp_gz])

    return rpeak
def bwa_se(fastq, ref_index_prefix, nth, mem_gb, out_dir):
    """Align a single-ended FASTQ with ``bwa aln``/``bwa samse``.

    The SAM stream is converted to BAM with ``samtools view`` and then
    sorted with ``samtools_sort``. The ``.sai`` file and the unsorted BAM
    are removed once they are no longer needed.

    Args:
        fastq: Input FASTQ.
        ref_index_prefix: bwa index prefix.
        nth: Thread count.
        mem_gb: Memory hint forwarded to ``samtools_sort``.
        out_dir: Output directory for all intermediate and final files.

    Returns:
        Path returned by ``samtools_sort`` for the aligned BAM.
    """
    basename = os.path.basename(strip_ext_fastq(fastq))
    prefix = os.path.join(out_dir, basename)
    tmp_bam = '{}.bam'.format(prefix)

    sai = bwa_aln(fastq, ref_index_prefix, nth, out_dir)

    run_shell_cmd(
        'bwa samse {ref} {sai} {fastq} | '
        'samtools view -bS /dev/stdin {res_param} > {tmp_bam}'.format(
            ref=ref_index_prefix,
            sai=sai,
            fastq=fastq,
            res_param=get_samtools_res_param('view', nth=nth),
            tmp_bam=tmp_bam,
        ))
    rm_f(sai)

    # Forward out_dir so the sorted BAM is written next to the other outputs,
    # matching the bowtie2_se/bowtie2_pe call pattern; previously out_dir was
    # not passed to samtools_sort.
    bam = samtools_sort(tmp_bam, nth, mem_gb, out_dir)
    rm_f(tmp_bam)

    return bam
def bed_clip(bed, chrsz, out_clipped_bed, no_gz=False):
    '''
    Clip a BED file between 0 and chromSize with bedClip.

    Make sure that bedClip (in UCSC tools) is installed.
    Chromosome sizes are taken from the 2-col chrsz file.
    bedClip exits with 255 if both start/end coordinates
    are out of valid range (0-chromSize). Otherwise, reads/peaks
    will be truncated.

    Args:
        bed: Input BED file.
        chrsz: 2-col chromosome sizes file.
        out_clipped_bed: Output path.
        no_gz:
            Do not gzip output (bedClip then writes the output directly).
    '''
    staging = out_clipped_bed + '.clip_tmp'
    clip_target = out_clipped_bed if no_gz else staging

    run_shell_cmd(
        'bedClip {bed} {chrsz} {tmp_out} -truncate -verbose=2'.format(
            bed=bed,
            chrsz=chrsz,
            tmp_out=clip_target))
    if no_gz:
        return

    # gzip the staged plain-text result, then drop the staging file
    run_shell_cmd(
        'cat {tmp_out} | gzip -nc > {out_clipped_bed}'.format(
            tmp_out=staging,
            out_clipped_bed=out_clipped_bed))
    rm_f(staging)
def pbc_qc_pe(bam, mito_chr_name, nth, mem_gb, out_dir):
    """Write library-complexity counts for a paired-end BAM.

    The BAM is name-sorted and converted to BEDPE; fragments are keyed by
    columns 1,2,4,6,9,10, lines starting with ``mito_chr_name`` are dropped,
    and identical keys are counted. The output is one tab-separated line:
    ``mt  m0  m1  m2  m0/mt  m1/m0  m1/m2`` where mt is the total count,
    m0 the number of distinct keys, m1/m2 the keys seen once/twice
    (``m1/m2`` is -1.0 when m2 is 0).

    ``mem_gb`` is multiplied by 1024**3 and handed to ``get_gnu_sort_param``
    with ``ratio=0.5`` to build the options for ``sort``.

    Returns:
        Path of the ``.lib_complexity.qc`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    name_sorted_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)

    summarise = (
        'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
        '($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n"'
        ',mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {pbc_qc}'
    )
    pipeline = ' | '.join([
        'bedtools bamtobed -bedpe -i {nmsrt_bam}',
        'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}\'',
        'grep -v "^{mito_chr_name}\\s"',
        'sort {sort_param}',
        'uniq -c',
        summarise,
    ])
    run_shell_cmd(pipeline.format(
        nmsrt_bam=name_sorted_bam,
        mito_chr_name=mito_chr_name,
        sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
        pbc_qc=pbc_qc,
    ))

    rm_f(name_sorted_bam)
    return pbc_qc
def spr_pe(ta, out_dir):
    """Split a paired-end tagAlign into two pseudoreplicates.

    Consecutive lines are joined pairwise so each read pair is one line,
    the pairs are shuffled (seeded by the uncompressed byte count of
    ``ta``), split into chunks ``<prefix>.00`` / ``<prefix>.01`` and each
    chunk is expanded back to one line per read and gzipped.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    chunk1 = '{}.00'.format(prefix)
    chunk2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    pairs_per_chunk = int((get_num_lines(ta) / 2 + 1) / 2)

    # bash-only
    shuffle_split = ''.join([
        'zcat -f {} | sed \'N;s/\\n/\\t/\' | ',
        'shuf --random-source=<(openssl enc -aes-256-ctr ',
        '-pass pass:$(zcat -f {} | wc -c) ',
        '-nosalt </dev/zero 2>/dev/null) | ',
        'split -d -l {} - {}.',
    ])
    run_shell_cmd(shuffle_split.format(ta, ta, pairs_per_chunk, prefix))

    # Same expansion for both chunks: one paired line -> two read lines.
    unpair = ''.join([
        'zcat -f {} | ',
        'awk \'BEGIN{{OFS="\\t"}} ',
        '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n',
        '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",',
        '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | ',
        'gzip -nc > {}',
    ])
    for chunk, pseudo_rep in ((chunk1, ta_pr1), (chunk2, ta_pr2)):
        run_shell_cmd(unpair.format(chunk, pseudo_rep))

    rm_f([chunk1, chunk2])
    return ta_pr1, ta_pr2
def spr_se(ta, pseudoreplication_random_seed, out_dir):
    """Split a single-ended tagAlign into two pseudoreplicates.

    Lines are shuffled with a seeded random source, split into chunks
    ``<prefix>.00`` / ``<prefix>.01`` and each chunk is gzipped.

    Args:
        ta: Input tagAlign.
        pseudoreplication_random_seed: Seed for the shuffle. 0 means "use
            the uncompressed byte count of ``ta``" as the seed.
        out_dir: Output directory.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` of pseudoreplicate paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    nlines = int((get_num_lines(ta) + 1) / 2)

    if pseudoreplication_random_seed == 0:
        random_seed = run_shell_cmd('zcat -f {ta} | wc -c'.format(ta=ta))
        log.info(
            'Using input file\'s size {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))
    else:
        random_seed = pseudoreplication_random_seed
        log.info(
            'Using a fixed integer {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))

    # bash-only (process substitution).
    # `zcat -f` matches the seed computation above, so an uncompressed
    # tagAlign is accepted here as well instead of failing on plain `zcat`.
    run_shell_cmd('zcat -f {ta} | shuf --random-source=<(openssl enc '
                  '-aes-256-ctr -pass pass:{random_seed} '
                  '-nosalt </dev/zero 2>/dev/null) | '
                  'split -d -l {nlines} - {prefix}.'.format(
                      ta=ta,
                      random_seed=random_seed,
                      nlines=nlines,
                      prefix=prefix,
                  ))
    run_shell_cmd('gzip -nc {tmp_pr1} > {ta_pr1}'.format(tmp_pr1=tmp_pr1,
                                                         ta_pr1=ta_pr1))
    run_shell_cmd('gzip -nc {tmp_pr2} > {ta_pr2}'.format(tmp_pr2=tmp_pr2,
                                                         ta_pr2=ta_pr2))
    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
def bam_to_pbam(bam, ref_fa, out_dir='.'):
    '''Convert BAM into pBAM.

    Requirements:
        - Python package `ptools_bin`
        - `samtools`

    A gzipped reference FASTA is first decompressed into ``out_dir`` (and
    removed afterwards). ``makepBAM_genome.sh`` is then run and
    ``<prefix>.sorted.p.bam`` is renamed to ``<prefix>.p.bam``.

    Returns:
        Path of the pBAM (``<prefix>.p.bam``).
    '''
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    script_output = '{}.sorted.p.bam'.format(prefix)
    pbam = '{}.p.bam'.format(prefix)

    scratch = []
    plain_ref_fa = ref_fa
    if ref_fa.endswith('.gz'):
        ref_stem = os.path.join(
            out_dir, os.path.basename(strip_ext_gz(ref_fa)))
        plain_ref_fa = '{}.fasta'.format(ref_stem)
        run_shell_cmd('zcat -f {ref_fa} > {gunzipped_ref_fa}'.format(
            ref_fa=ref_fa,
            gunzipped_ref_fa=plain_ref_fa,
        ))
        scratch.append(plain_ref_fa)

    run_shell_cmd('makepBAM_genome.sh {bam} {gunzipped_ref_fa}'.format(
        bam=bam,
        gunzipped_ref_fa=plain_ref_fa,
    ))
    run_shell_cmd('mv {pbam_tmp} {pbam}'.format(
        pbam_tmp=script_output,
        pbam=pbam,
    ))

    rm_f(scratch)
    return pbam
def bam2ta_pe(bam, nth, out_dir):
    """Convert a paired-end BAM into a gzipped tagAlign file.

    The BAM is name-sorted and converted to BEDPE (``-mate1``); each BEDPE
    row is then written as two 6-column lines (one per mate) with name
    ``N`` and score ``1000``, using BEDPE columns 9 and 10 as the strands.

    Returns:
        Path of the ``.tagAlign.gz`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    ta = '{}.tagAlign.gz'.format(prefix)
    # intermediate files
    bedpe = '{}.bedpe.gz'.format(prefix)
    name_sorted_bam = samtools_name_sort(bam, nth, out_dir)

    to_bedpe = (
        'LC_COLLATE=C bedtools bamtobed -bedpe -mate1 -i {} | '
        'gzip -nc > {}'
    )
    run_shell_cmd(to_bedpe.format(name_sorted_bam, bedpe))
    rm_f(name_sorted_bam)

    to_tagalign = ''.join([
        'zcat -f {} | ',
        'awk \'BEGIN{{OFS="\\t"}}',
        '{{printf "%s\\t%s\\t%s\\tN\\t1000\\t%s\\n',
        '%s\\t%s\\t%s\\tN\\t1000\\t%s\\n",',
        '$1,$2,$3,$9,$4,$5,$6,$10}}\' | ',
        'gzip -nc > {}',
    ])
    run_shell_cmd(to_tagalign.format(bedpe, ta))
    rm_f(bedpe)
    return ta
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
           bwa_mem_read_len_limit, rescue_reads_for_bwa_mem, out_dir):
    """Align a paired-end FASTQ pair with bwa and return a sorted BAM.

    ``bwa mem`` is used when ``use_bwa_mem_for_pe`` is truthy and the read
    length guessed from ``fastq1`` is at least ``bwa_mem_read_len_limit``;
    otherwise ``bwa aln`` is run on each FASTQ followed by ``bwa sampe``.
    Reads whose CIGAR-derived length disagrees with the sequence length are
    collected by name and filtered out before sorting.

    Note: the ``.sam`` path holds gzip-compressed SAM text.

    Returns:
        Path of the sorted BAM (``<prefix>.bam``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    sam = '{}.sam'.format(prefix)
    badcigar = '{}.badReads'.format(prefix)
    bam = '{}.bam'.format(prefix)
    scratch = []

    read_len = get_read_length(fastq1)
    log.info('Guessed read length of R1 FASTQ: {read_len}'.format(
        read_len=read_len,
    ))

    if use_bwa_mem_for_pe and read_len >= bwa_mem_read_len_limit:
        log.info('Use bwa mem.')
        align_cmd = 'bwa mem -M {extra_param} -t {nth} {ref_index_prefix} {fastq1} {fastq2} | gzip -nc > {sam}'.format(
            extra_param='-P' if rescue_reads_for_bwa_mem else '',
            nth=nth,
            ref_index_prefix=ref_index_prefix,
            fastq1=fastq1,
            fastq2=fastq2,
            sam=sam,
        )
        scratch.append(sam)
    else:
        log.info('Use bwa aln for each (R1 and R2) and then bwa sampe.')
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
        align_cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
            ref_index_prefix, sai1, sai2, fastq1, fastq2, sam)
        scratch.extend([sai1, sai2, sam])
    run_shell_cmd(align_cmd)

    # Collect names of reads whose CIGAR length (deletions ignored) does not
    # match the sequence length.
    find_bad_cigar = ''.join([
        'zcat -f {} | ',
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" ',
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); ',
        'n = split(cigar,vals,"[A-Z]"); s = 0; ',
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); ',
        'if (s!=seqlen) print $1"\\t"; }}\' | ',
        'sort | uniq > {}',
    ])
    run_shell_cmd(find_bad_cigar.format(sam, badcigar))

    # Remove bad CIGAR read pairs
    if get_num_lines(badcigar) > 0:
        to_sorted_bam = (
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su /dev/stdin | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
        )
    else:
        to_sorted_bam = (
            'samtools view -Su {sam} | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
        )
    # str.format ignores the unused `badcigar` keyword in the second template
    run_shell_cmd(to_sorted_bam.format(
        sam=sam,
        badcigar=badcigar,
        bam=bam,
        prefix=prefix,
        res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
    ))

    rm_f(scratch)
    return bam
def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
    """Convert a peak file into bigBed with ``bedToBigBed``.

    An autoSql (``.as``) description matching ``peak_type`` is written to a
    temporary file, the peaks are sorted and their score (column 5) clamped
    to 0-1000, clipped with ``bedClip`` and converted.

    Args:
        peak: Input peak file (read with ``zcat -f``).
        peak_type: narrowPeak/regionPeak, broadPeak or gappedPeak
            (case-insensitive).
        chrsz: Chromosome sizes file.
        out_dir: Output directory.

    Returns:
        Path of the bigBed file.

    Raises:
        Exception: If ``peak_type`` is not one of the supported types.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    bigbed = '{}.{}.bb'.format(prefix, peak_type)
    as_file = '{}.as'.format(prefix)
    chrsz_tmp = '{}.chrsz.tmp'.format(prefix)
    sorted_tmp = '{}.bb.tmp'.format(prefix)
    clipped_tmp = '{}.bb.tmp2'.format(prefix)

    narrow_as = (
        'table narrowPeak '
        '"BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data." '
        '( '
        'string chrom; "Reference sequence chromosome or scaffold" '
        'uint chromStart; "Start position in chromosome" '
        'uint chromEnd; "End position in chromosome" '
        'string name; "Name given to a region (preferably unique). Use . if no name is assigned" '
        'uint score; "Indicates how dark the peak will be displayed in the browser (0-1000) " '
        'char[1] strand; "+ or - or . for unknown" '
        'float signalValue; "Measurement of average enrichment for the region" '
        'float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." '
        'float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used." '
        'int peak; "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called." '
        ') '
    )
    broad_as = (
        'table broadPeak '
        '"BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data." '
        '( '
        'string chrom; "Reference sequence chromosome or scaffold" '
        'uint chromStart; "Start position in chromosome" '
        'uint chromEnd; "End position in chromosome" '
        'string name; "Name given to a region (preferably unique). Use . if no name is assigned." '
        'uint score; "Indicates how dark the peak will be displayed in the browser (0-1000)" '
        'char[1] strand; "+ or - or . for unknown" '
        'float signalValue; "Measurement of average enrichment for the region" '
        'float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." '
        'float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used." '
        ') '
    )
    gapped_as = (
        'table gappedPeak '
        '"This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format." '
        '( '
        'string chrom; "Reference sequence chromosome or scaffold" '
        'uint chromStart; "Pseudogene alignment start position" '
        'uint chromEnd; "Pseudogene alignment end position" '
        'string name; "Name of pseudogene" '
        'uint score; "Score of pseudogene with gene (0-1000)" '
        'char[1] strand; "+ or - or . for unknown" '
        'uint thickStart; "Start of where display should be thick (start codon)" '
        'uint thickEnd; "End of where display should be thick (stop codon)" '
        'uint reserved; "Always zero for now" '
        'int blockCount; "Number of blocks" '
        'int[blockCount] blockSizes; "Comma separated list of block sizes" '
        'int[blockCount] chromStarts; "Start positions relative to chromStart" '
        'float signalValue; "Measurement of average enrichment for the region" '
        'float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." '
        'float qValue; "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used." '
        ') '
    )
    # lower-cased peak type -> (autoSql text, bedToBigBed -type template)
    formats = {
        'narrowpeak': (narrow_as, '-type=bed6+4 -as={}'),
        'regionpeak': (narrow_as, '-type=bed6+4 -as={}'),
        'broadpeak': (broad_as, '-type=bed6+3 -as={}'),
        'gappedpeak': (gapped_as, '-type=bed12+3 -as={}'),
    }
    spec = formats.get(peak_type.lower())
    if spec is None:
        raise Exception('Unsupported peak file type {}!'.format(peak_type))
    as_file_contents, bed_param_template = spec
    bed_param = bed_param_template.format(as_file)

    # create temporary .as file
    with open(as_file, 'w') as fp:
        fp.write(as_file_contents)

    run_shell_cmd("cat {} > {}".format(chrsz, chrsz_tmp))

    sort_and_clamp = ''.join([
        "zcat -f {} | LC_COLLATE=C sort -k1,1 -k2,2n | ",
        'awk \'BEGIN{{OFS="\\t"}} {{if ($5>1000) $5=1000; ',
        'if ($5<0) $5=0; print $0}}\' > {}',
    ])
    run_shell_cmd(sort_and_clamp.format(peak, sorted_tmp))

    run_shell_cmd(
        "bedClip {} {} {}".format(sorted_tmp, chrsz_tmp, clipped_tmp))
    run_shell_cmd("bedToBigBed {} {} {} {}".format(
        bed_param, clipped_tmp, chrsz_tmp, bigbed))

    # remove temporary files
    rm_f([as_file, chrsz_tmp, sorted_tmp, clipped_tmp])
    return bigbed
def _hammock_record(fields, peak_type, record_id):
    """Format one peak row (list of column strings) as a hammock record."""
    if peak_type in ('narrowPeak', 'regionPeak'):
        parts = [
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},'
            '{0[8]}],id:{1},'.format(fields, record_id)]
        if len(fields[3]) > 1:
            parts.append('name:"' + fields[3] + '",')
        if fields[5] != '.':
            parts.append('strand:"' + fields[5] + '",')
        if fields[9] != '-1':
            parts.append('sbstroke:[' + fields[9] + ']')
    elif peak_type == 'gappedPeak':
        parts = [
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},'
            '{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],'
            'thick:['.format(fields, record_id)]
        chrom_start = int(fields[1])
        sizes = fields[10].split(',')
        starts = fields[11].split(',')
        # Index into `starts` so a row with fewer starts than sizes still
        # fails loudly instead of being silently truncated.
        for i, size in enumerate(sizes):
            block_start = chrom_start + int(starts[i])
            parts.append('[{0},{1}],'.format(
                block_start, block_start + int(size)))
        parts.append(']},')
        if len(fields[3]) > 1:
            parts.append('name:"' + fields[3] + '",')
        if fields[5] != '.':
            parts.append('strand:"' + fields[5] + '",')
    elif peak_type == 'broadPeak':
        parts = [
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],'
            'id:{1},'.format(fields, record_id)]
        if len(fields[3]) > 1:
            parts.append('name:"' + fields[3] + '",')
        if fields[5] != '.':
            parts.append('strand:"' + fields[5] + '",')
    else:
        # Report the offending type (the message used to print the file path).
        raise Exception("Unsupported peak_type {}".format(peak_type))
    return ''.join(parts)


def peak_to_hammock(peak, out_dir):
    """Convert a peak file into a bgzipped, tabix-indexed hammock file.

    The peak type is taken from ``get_peak_type(peak)``. Each peak becomes
    one hammock record with a 1-based ``id`` following the sorted order.
    For an empty peak file an empty gzip and an empty ``.tbi`` placeholder
    are created instead.

    Args:
        peak: Input peak file (read with ``zcat -f``).
        out_dir: Output directory.

    Returns:
        Tuple ``(hammock_gz, hammock_gz_tbi)`` of output paths.

    Raises:
        Exception: If the peak type is not narrowPeak, regionPeak,
            gappedPeak or broadPeak (raised while converting the first row).
    """
    peak_type = get_peak_type(peak)
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_peak(peak)))
    hammock = '{}.{}.hammock'.format(prefix, peak_type)
    hammock_tmp = '{}.tmp'.format(hammock)
    hammock_tmp2 = '{}.tmp2'.format(hammock)
    hammock_gz = '{}.gz'.format(hammock)
    hammock_gz_tbi = '{}.gz.tbi'.format(hammock)

    if get_num_lines(peak) == 0:
        run_shell_cmd('zcat -f {} | gzip -nc > {}'.format(peak, hammock_gz))
        # Actually create the placeholder index; the command used to be
        # built but never executed, so the returned .tbi path did not exist.
        run_shell_cmd('touch {}'.format(hammock_gz_tbi))
        return (hammock_gz, hammock_gz_tbi)

    cmd = "zcat -f {} | "
    cmd += "LC_COLLATE=C sort -k1,1V -k2,2n > {}"
    run_shell_cmd(cmd.format(peak, hammock_tmp))

    with open(hammock_tmp, 'r') as fin, open(hammock_tmp2, 'w') as fout:
        for record_id, line in enumerate(fin, start=1):
            fields = line.rstrip().split('\t')
            fout.write(_hammock_record(fields, peak_type, record_id))
            fout.write('\n')

    cmd2 = 'zcat -f {} | sort -k1,1 -k2,2n | bgzip -cf > {}'
    run_shell_cmd(cmd2.format(hammock_tmp2, hammock_gz))
    run_shell_cmd('tabix -f -p bed {}'.format(hammock_gz))

    rm_f([hammock, hammock_tmp, hammock_tmp2])
    return (hammock_gz, hammock_gz_tbi)
def _macs2_bdg_to_bigwig(prefix, bdg_suffix, chrsz, bedgraph, bedgraph_srt,
                         bigwig):
    """Clip, sort, de-overlap and convert one MACS2 bedGraph to bigWig.

    Reads ``<prefix>_<bdg_suffix>.bdg``, writes ``bigwig`` and removes the
    two intermediate bedGraph files (``bedgraph`` and ``bedgraph_srt``).
    """
    clip_cmd = 'bedtools slop -i "{}"_{}.bdg -g {} -b 0 | '
    clip_cmd += 'bedClip stdin {} {}'
    run_shell_cmd(clip_cmd.format(prefix, bdg_suffix, chrsz, chrsz, bedgraph))

    # sort and remove any overlapping regions in bedgraph by comparing two
    # lines in a row
    sort_cmd = 'LC_COLLATE=C sort -k1,1 -k2,2n {} | ' \
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '\
        '|| prev_chr==$1 && prev_chr_e<=$2)) ' \
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
            bedgraph, bedgraph_srt)
    run_shell_cmd(sort_cmd)
    rm_f(bedgraph)

    run_shell_cmd(
        'bedGraphToBigWig {} {} {}'.format(bedgraph_srt, chrsz, bigwig))
    rm_f(bedgraph_srt)


def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, out_dir):
    """Build fold-enrichment and p-value signal bigWigs with MACS2.

    Args:
        ta: Input file passed to ``macs2 callpeak -t`` with ``-f BED``.
        chrsz: Chromosome sizes file, used by ``bedtools slop``,
            ``bedClip`` and ``bedGraphToBigWig``.
        gensz: Value passed to ``macs2 callpeak -g``.
        pval_thresh: Value passed to ``macs2 callpeak -p``.
        smooth_win: Value passed as ``--extsize``; ``--shift`` is set to
            minus half of it (rounded).
        out_dir: Directory for outputs and temporary files.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)`` of output paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    shiftsize = -int(round(float(smooth_win) / 2.0))

    cmd0 = 'macs2 callpeak '
    cmd0 += '-t {} -f BED -n {} -g {} -p {} '
    cmd0 += '--shift {} --extsize {} '
    cmd0 += '--nomodel -B --SPMR '
    cmd0 += '--keep-dup all --call-summits '
    run_shell_cmd(cmd0.format(
        ta, prefix, gensz, pval_thresh, shiftsize, smooth_win))

    # Fold-enrichment track.
    cmd3 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd3 += '-c "{}"_control_lambda.bdg '
    cmd3 += '--o-prefix "{}" -m FE '
    run_shell_cmd(cmd3.format(prefix, prefix, prefix))
    _macs2_bdg_to_bigwig(
        prefix, 'FE', chrsz, fc_bedgraph, fc_bedgraph_srt, fc_bigwig)

    # P-value track.
    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0
    cmd7 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd7 += '-c "{}"_control_lambda.bdg '
    cmd7 += '--o-prefix {} -m ppois -S {}'
    run_shell_cmd(cmd7.format(prefix, prefix, prefix, sval))
    _macs2_bdg_to_bigwig(
        prefix, 'ppois', chrsz, pval_bedgraph, pval_bedgraph_srt, pval_bigwig)

    # remove temporary files (everything MACS2 wrote as "<prefix>_*")
    rm_f(["{}_*".format(prefix)])

    return fc_bigwig, pval_bigwig
def main():
    """Detect adapters if requested, trim them, and merge the trimmed FASTQs.

    Steps: parse arguments, create the output directory, optionally
    auto-detect adapters per merge group, trim each group, merge R1 (and R2
    for paired-end data), then delete the per-group trimmed files.
    """
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Detecting adapters...')
    for idx, fastqs in enumerate(args.fastqs):
        # one entry per fastq (pair) that will be merged later
        merge_id = idx + 1
        log.info('Detecting adapters for merge_id={}...'.format(merge_id))
        adapters = args.adapters[idx]

        # Auto-detection applies only when no global adapter was supplied
        # and the feature is switched on.
        if args.adapter or not args.auto_detect_adapter:
            continue

        if args.paired_end:
            if adapters[0] and adapters[1]:
                continue  # both mates already have an adapter
            adapters[0] = detect_most_likely_adapter(fastqs[0])
            adapters[1] = detect_most_likely_adapter(fastqs[1])
            log.info('Detected adapters for merge_id={}, '
                     'R1: {}, R2: {}'.format(merge_id,
                                             args.adapters[idx][0],
                                             args.adapters[idx][1]))
        else:
            if adapters[0]:
                continue  # adapter already given for this fastq
            adapters[0] = detect_most_likely_adapter(fastqs[0])
            log.info('Detected adapter for merge_id={}, R1: {}'.format(
                merge_id, args.adapters[idx][0]))

    log.info('Trimming adapters...')
    trimmed_fastqs_R1 = []
    trimmed_fastqs_R2 = []
    for idx, fastqs in enumerate(args.fastqs):
        adapters = args.adapters[idx]
        if not args.paired_end:
            trimmed_fastqs_R1.append(
                trim_adapter_se(fastqs[0], adapters[0], args.adapter,
                                args.cutadapt_param, args.out_dir))
            continue
        trimmed_pair = trim_adapter_pe(fastqs[0], fastqs[1],
                                       adapters[0], adapters[1],
                                       args.adapter, args.cutadapt_param,
                                       args.out_dir)
        trimmed_fastqs_R1.append(trimmed_pair[0])
        trimmed_fastqs_R2.append(trimmed_pair[1])

    log.info('Merging fastqs...')
    log.info('R1 to be merged: {}'.format(trimmed_fastqs_R1))
    merge_fastqs(trimmed_fastqs_R1, 'R1', args.out_dir)
    if args.paired_end:
        log.info('R2 to be merged: {}'.format(trimmed_fastqs_R2))
        merge_fastqs(trimmed_fastqs_R2, 'R2', args.out_dir)

    # The per-group trimmed files are only intermediates of the merge.
    log.info('Removing temporary files...')
    rm_f(trimmed_fastqs_R1 + trimmed_fastqs_R2)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def main():
    """Filter a BAM, mark/remove duplicates and produce QC outputs.

    Data flow:
        filt_bam -> dupmark_bam -> nodup_bam -> final_bam
                      (dup QC)      (PBC QC is computed from dupmark_bam)

    Raises:
        ValueError: If the filtered BAM or the final BAM contains no reads.
        argparse.ArgumentTypeError: If ``--dup-marker`` is neither
            'picard' nor 'sambamba'.
    """
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # intermediate files, deleted in one go at the very end
    to_remove = []

    log.info('Removing unmapped/low-quality reads...')
    if args.paired_end:
        filter_reads = rm_unmapped_lowq_reads_pe
    else:
        filter_reads = rm_unmapped_lowq_reads_se
    filt_bam = filter_reads(args.bam, args.multimapping, args.mapq_thresh,
                            args.nth, args.mem_gb, args.out_dir)

    log.info('Checking if filtered BAM file is empty...')
    if bam_is_empty(filt_bam, args.nth):
        msg_parts = [
            'No reads found in filtered BAM. ',
            'Low quality sample? ',
            'Or no reads passing criteria "samtools view -F 1804"? ',
            'Check samtools flags at ',
            'https://broadinstitute.github.io/picard/explain-flags.html. ',
        ]
        if args.paired_end:
            msg_parts.append('Or is this truely PE BAM? ')
            msg_parts.append(
                'All unpaired SE reads could be removed by '
                '"samtools view -f 2". ')
        raise ValueError(''.join(msg_parts))

    log.info('Marking dupes with {}...'.format(args.dup_marker))
    if args.dup_marker not in ('picard', 'sambamba'):
        raise argparse.ArgumentTypeError('Unsupported --dup-marker {}'.format(
            args.dup_marker))
    if args.dup_marker == 'sambamba':
        dupmark_bam, dup_qc = mark_dup_sambamba(
            filt_bam, args.nth, args.out_dir)
    else:
        dupmark_bam, dup_qc = mark_dup_picard(
            filt_bam, args.out_dir, args.picard_java_heap)

    if not args.no_dup_removal:
        # filt_bam is superseded by the deduplicated BAM
        to_remove.append(filt_bam)
        log.info('Removing dupes...')
        dedup = rm_dup_pe if args.paired_end else rm_dup_se
        nodup_bam = dedup(dupmark_bam, args.nth, args.out_dir)
        samtools_index(dupmark_bam)
        to_remove.append(dupmark_bam + '.bai')
    else:
        # keep duplicates: the filtered BAM is carried forward as-is
        nodup_bam = filt_bam
    to_remove.append(dupmark_bam)

    if len(args.filter_chrs) == 0:
        final_bam = nodup_bam
    else:
        final_bam = remove_chrs_from_bam(nodup_bam, args.filter_chrs,
                                         args.chrsz, args.nth, args.out_dir)
        to_remove.append(nodup_bam)

    log.info('Checking if final BAM file is empty...')
    if bam_is_empty(final_bam, args.nth):
        raise ValueError('No reads found in final (filtered/deduped) BAM. '
                         'Low quality sample? '
                         'Or BAM with duplicates only? ')

    log.info('samtools index (final_bam)...')
    samtools_index(final_bam, args.nth, args.out_dir)

    log.info('samstat...')
    samstat(final_bam, args.nth, args.mem_gb, args.out_dir)

    log.info('Generating PBC QC log...')
    if args.paired_end:
        pbc_qc_pe(dupmark_bam, args.mito_chr_name, args.nth, args.out_dir)
    else:
        pbc_qc_se(dupmark_bam, args.mito_chr_name, args.out_dir)

    # The raw BAM is copied into out_dir and indexed there; both the copy
    # and its index are queued for removal with the other intermediates.
    log.info('samtools index (raw bam)...')
    bam = copy_f_to_dir(args.bam, args.out_dir)
    bai = samtools_index(bam, args.nth, args.out_dir)
    to_remove.append(bam)
    to_remove.append(bai)

    log.info('Removing temporary files...')
    rm_f(to_remove)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')