def main():
    """Run the GC-bias step on a deduplicated BAM.

    Reads ``ref_fa``, ``nodup_bam``, ``out_dir`` and ``picard_java_heap``
    from the parsed arguments, runs ``get_gc`` on a read-group-free copy of
    the BAM, plots the result with ``plot_gc``, deletes the temporary copy
    and finally lists the output directory.
    """
    args = parse_arguments()

    ref_fa = args.ref_fa
    nodup_bam = args.nodup_bam
    out_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(nodup_bam)))

    # temporary BAM without read groups; deleted again below
    rg_free_bam = remove_read_group(nodup_bam)
    java_heap = args.picard_java_heap

    # only the first element of the result is used afterwards
    gc_out, _gc_plot_pdf, _gc_summary = get_gc(
        rg_free_bam, ref_fa, out_prefix, java_heap)
    # will generate PNG format from gc_out
    plot_gc(gc_out, out_prefix)

    rm_f(rg_free_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def sambamba_name_sort(bam, nth, out_dir):
    """Sort a BAM by read name with ``sambamba sort -n``.

    Args:
        bam: Path of the input BAM.
        nth: Value passed to sambamba's ``-t`` option.
        out_dir: Directory that receives the output.

    Returns:
        Path of the output BAM: ``out_dir`` joined with the basename of
        ``strip_ext_bam(bam)`` plus ``.nmsrt.bam``.
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    sorted_bam = os.path.join(out_dir, bam_stem) + '.nmsrt.bam'
    run_shell_cmd(
        'sambamba sort -n {src} -o {dst} -t {threads}'.format(
            src=bam, dst=sorted_bam, threads=nth))
    return sorted_bam
def main():
    """Run the library-complexity step on an aligned BAM.

    For paired-end data ``get_picard_complexity_metrics`` is run on a
    read-group-free copy of the BAM and its result is written to
    ``<prefix>.picard_est_lib_size.qc``. ``run_preseq`` and
    ``get_preseq_plot`` are run in every case. The temporary
    read-group-free BAM is deleted at the end.
    """
    args = parse_arguments()

    aligned_bam = args.bam
    out_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(aligned_bam)))
    rg_free_bam = remove_read_group(aligned_bam)
    java_heap = args.picard_java_heap

    # Library complexity: Preseq results, NRF, PBC1, PBC2
    est_lib_size = None
    if args.paired_end:
        est_lib_size = get_picard_complexity_metrics(
            rg_free_bam, out_prefix, java_heap)

    # preseq is given the original BAM (marked "SORTED BAM" by the
    # original author), not the read-group-free copy
    preseq_data, _preseq_log = run_preseq(aligned_bam, out_prefix)
    get_preseq_plot(preseq_data, out_prefix)

    # write picard_est_lib_size to file
    if est_lib_size is not None:
        qc_path = out_prefix + '.picard_est_lib_size.qc'
        with open(qc_path, 'w') as fp:
            fp.write(str(est_lib_size) + '\n')

    rm_f(rg_free_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def sambamba_flagstat(bam, nth, out_dir):
    """Write ``sambamba flagstat`` output for a BAM to a QC file.

    Args:
        bam: Path of the input BAM.
        nth: Value passed to sambamba's ``-t`` option.
        out_dir: Directory that receives the QC file.

    Returns:
        Path of the QC file (``<prefix>.flagstat.qc``) that the command's
        standard output is redirected to.
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    qc_path = os.path.join(out_dir, bam_stem) + '.flagstat.qc'
    run_shell_cmd(
        'sambamba flagstat {src} -t {threads} > {dst}'.format(
            src=bam, threads=nth, dst=qc_path))
    return qc_path
def remove_chrs_from_bam(bam, chrs, chrsz, nth=1, out_dir=''):
    """Write a copy of a BAM restricted to chromosomes not in ``chrs``.

    A temporary three-column region file (name, 0, size) is built from
    ``chrsz`` with every line whose first field matches one of ``chrs``
    dropped; it is then passed to ``samtools view -L``.

    Args:
        bam: Path of the input BAM.
        chrs: Non-empty sequence of chromosome names to remove.
        chrsz: Chromosome-sizes file (read through ``zcat -f``).
        nth: Forwarded to ``get_samtools_res_param('view', nth=...)``.
        out_dir: Directory that receives the output.

    Returns:
        Path of the output BAM (``<prefix>.no_<chrs joined by _>.bam``).

    Raises:
        ValueError: If ``chrs`` is empty.
    """
    if not len(chrs):
        raise ValueError('There must be at least one chromosome, zero found.')

    bam_stem = os.path.basename(strip_ext_bam(bam))
    tagged_prefix = os.path.join(out_dir, bam_stem) + '.no_' + '_'.join(chrs)
    final_bam = tagged_prefix + '.bam'
    tmp_chrsz = tagged_prefix + '.tmp.chrsz'

    # make a temp chrsz file
    run_shell_cmd(
        'zcat -f {chrsz} |'
        'grep -v -P \'^({chrs})\\s\' | '
        'awk \'BEGIN{{OFS="\\t"}} {{print $1,0,$2}}\' > {tmp_chrsz}'.format(
            chrsz=chrsz,
            chrs='|'.join(chrs),
            tmp_chrsz=tmp_chrsz,
        ))

    # remove chrs from BAM
    view_res_param = get_samtools_res_param('view', nth=nth)
    run_shell_cmd(
        'samtools view -b -L {tmp_chrsz} {bam} {res_param} > {final_bam}'
        .format(
            tmp_chrsz=tmp_chrsz,
            bam=bam,
            res_param=view_res_param,
            final_bam=final_bam,
        ))

    rm_f(tmp_chrsz)
    return final_bam
def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, out_dir):
    """Filter a single-end BAM and return a coordinate-sorted result.

    With ``multimapping`` set, the BAM is name-sorted, piped through
    ``assign_multimappers.py -k <multimapping>``, filtered with
    ``samtools view -F 1804`` and sorted; ``mapq_thresh`` is not used on
    this path. Otherwise reads are filtered with
    ``samtools view -F 1804 -q <mapq_thresh>`` and sorted.

    Args:
        bam: Path of the input BAM.
        multimapping: Value for ``assign_multimappers.py -k``; a falsy
            value selects the MAPQ-threshold path.
        mapq_thresh: Value for ``samtools view -q``.
        nth: Value for ``samtools sort -@``.
        out_dir: Directory that receives the output.

    Returns:
        Path of the filtered BAM (``<prefix>.filt.bam``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)

    if multimapping:
        # Pass nth/out_dir by keyword: samtools_name_sort may take an extra
        # ``mem_gb`` parameter between them, in which case a positional
        # out_dir would silently be bound to mem_gb.
        qname_sort_bam = samtools_name_sort(bam, nth=nth, out_dir=out_dir)

        run_shell_cmd(
            'samtools view -h {qname_sort_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} | '
            'samtools view -F 1804 -Su /dev/stdin | '
            'samtools sort /dev/stdin -o {filt_bam} -T {prefix} -@ {nth}'
            .format(
                qname_sort_bam=qname_sort_bam,
                multimapping=multimapping,
                filt_bam=filt_bam,
                prefix=prefix,
                nth=nth,
            ))
        rm_f(qname_sort_bam)  # remove temporary files
    else:
        run_shell_cmd(
            'samtools view -F 1804 -q {mapq_thresh} -u {bam} | '
            'samtools sort /dev/stdin -o {filt_bam} -T {prefix} -@ {nth}'
            .format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                filt_bam=filt_bam,
                prefix=prefix,
                nth=nth,
            ))

    return filt_bam
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, out_dir):
    """Filter a paired-end BAM and return a coordinate-sorted result.

    Steps: filter and name-sort into a temporary BAM, run
    ``samtools fixmate -r`` (after ``assign_multimappers.py`` when
    ``multimapping`` is set), then filter again with
    ``samtools view -F 1804 -f 2`` and sort. Both intermediate BAMs are
    deleted. ``mapq_thresh`` is only used when ``multimapping`` is falsy.

    Args:
        bam: Path of the input BAM.
        multimapping: Value for ``assign_multimappers.py -k``; a falsy
            value selects the MAPQ-threshold path.
        mapq_thresh: Value for ``samtools view -q``.
        nth: Value for the ``-@`` options.
        out_dir: Directory that receives the output.

    Returns:
        Path of the filtered BAM (``<prefix>.filt.bam``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = prefix + '.filt.bam'
    tmp_filt_bam = prefix + '.tmp_filt.bam'
    fixmate_bam = prefix + '.fixmate.bam'

    if multimapping:
        # note: this command line ends with a trailing space
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp} -T {prefix} -@ {nth} '
            .format(bam=bam, tmp=tmp_filt_bam, prefix=prefix, nth=nth))
        run_shell_cmd(
            'samtools view -h {tmp} -@ {nth} | '
            '$(which assign_multimappers.py) -k {k} --paired-end | '
            'samtools fixmate -r /dev/stdin {fixmate}'
            .format(tmp=tmp_filt_bam, nth=nth, k=multimapping,
                    fixmate=fixmate_bam))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {q} -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp} -T {prefix} -@ {nth}'
            .format(q=mapq_thresh, bam=bam, tmp=tmp_filt_bam,
                    prefix=prefix, nth=nth))
        run_shell_cmd(
            'samtools fixmate -r {tmp} {fixmate}'
            .format(tmp=tmp_filt_bam, fixmate=fixmate_bam))
    rm_f(tmp_filt_bam)

    # final filter on the fixmate output, then sort
    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {fixmate} | '
        'samtools sort /dev/stdin -o {filt} -T {prefix} -@ {nth}'
        .format(fixmate=fixmate_bam, filt=filt_bam, prefix=prefix, nth=nth))
    rm_f(fixmate_bam)

    return filt_bam
def mark_dup_picard(bam, out_dir, java_heap=None):  # shared by both se and pe
    """Mark (but do not remove) duplicates with Picard MarkDuplicates.

    Args:
        bam: Path of the input BAM.
        out_dir: Directory that receives the outputs.
        java_heap: Value appended to ``-Xmx``; ``-Xmx4G`` is used when
            this is None.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)``: the ``<prefix>.dupmark.bam`` and
        ``<prefix>.dup.qc`` paths passed to Picard as OUTPUT and
        METRICS_FILE.
    """
    raw_prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(raw_prefix, 'filt')
    dupmark_bam = prefix + '.dupmark.bam'
    dup_qc = prefix + '.dup.qc'

    heap_opt = '-Xmx4G' if java_heap is None else '-Xmx{}'.format(java_heap)

    picard_tokens = [
        'java {}'.format(heap_opt),
        '-XX:ParallelGCThreads=1',
        '-jar {}'.format(locate_picard()),
        'MarkDuplicates',
        'INPUT={}'.format(bam),
        'OUTPUT={}'.format(dupmark_bam),
        'METRICS_FILE={}'.format(dup_qc),
        'VALIDATION_STRINGENCY=LENIENT',
        'USE_JDK_DEFLATER=TRUE',
        'USE_JDK_INFLATER=TRUE',
        'ASSUME_SORTED=TRUE',
        'REMOVE_DUPLICATES=FALSE',
    ]
    # the command line ends with a single trailing space
    run_shell_cmd(' '.join(picard_tokens) + ' ')
    return dupmark_bam, dup_qc
def samtools_sort(bam, nth, out_dir):
    """Sort a BAM with ``samtools sort``.

    Args:
        bam: Path of the input BAM.
        nth: Value for ``samtools sort -@``.
        out_dir: Directory that receives the output; the output prefix is
            also used as the ``-T`` temporary-file prefix.

    Returns:
        Path of the sorted BAM (``<prefix>.srt.bam``).
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    prefix = os.path.join(out_dir, bam_stem)
    srt_bam = prefix + '.srt.bam'
    run_shell_cmd(
        'samtools sort {src} -o {dst} -T {tmp} -@ {threads}'.format(
            src=bam, dst=srt_bam, tmp=prefix, threads=nth))
    return srt_bam
def samstat(bam, nth=1, out_dir=''):
    """Run ``SAMstats`` on a name-sorted SAM stream of the BAM.

    The BAM is name-sorted with ``samtools sort -n -O sam`` and piped
    into ``SAMstats``.

    Args:
        bam: Path of the input BAM.
        nth: Accepted but not used by this implementation.
        out_dir: Directory that receives the QC file.

    Returns:
        Path of the QC file (``<prefix>.samstats.qc``).
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    prefix = os.path.join(out_dir, bam_stem)
    samstat_qc = prefix + '.samstats.qc'

    sort_stage = 'samtools sort -n {} -T {}.tmp -O sam'.format(bam, prefix)
    stats_stage = 'SAMstats --sorted_sam_file - --outf {}'.format(samstat_qc)
    run_shell_cmd(sort_stage + ' | ' + stats_stage)
    return samstat_qc
def bam2ta_se(bam, out_dir):
    """Convert a single-end BAM to a gzipped tagAlign file.

    ``bedtools bamtobed`` output is rewritten by awk so that column 4 is
    ``N`` and column 5 is ``1000``, then compressed with ``gzip -nc``.

    Args:
        bam: Path of the input BAM.
        out_dir: Directory that receives the output.

    Returns:
        Path of the tagAlign file (``<prefix>.tagAlign.gz``).
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    ta = os.path.join(out_dir, bam_stem) + '.tagAlign.gz'

    run_shell_cmd(
        'bedtools bamtobed -i {bam} | '
        'awk \'BEGIN{{OFS="\\t"}}{{$4="N";$5="1000";print $0}}\' | '
        'gzip -nc > {ta}'.format(bam=bam, ta=ta))
    return ta
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Write a filtered copy of a duplicate-marked paired-end BAM.

    Runs ``samtools view -F 1804 -f 2 -b`` on the input.

    Args:
        dupmark_bam: Path of the duplicate-marked BAM.
        nth: Value for ``samtools view -@``.
        out_dir: Directory that receives the output.

    Returns:
        Path of the output BAM (``<prefix>.nodup.bam``).
    """
    raw_prefix = os.path.join(
        out_dir, os.path.basename(strip_ext_bam(dupmark_bam)))
    # strip extension appended in the previous step
    nodup_bam = strip_ext(raw_prefix, 'dupmark') + '.nodup.bam'

    run_shell_cmd(
        'samtools view -@ {threads} -F 1804 -f 2 -b {src} > {dst}'.format(
            threads=nth, src=dupmark_bam, dst=nodup_bam))
    return nodup_bam
def samtools_name_sort(bam, nth=1, mem_gb=None, out_dir=''):
    """Sort a BAM by read name with ``samtools sort -n``.

    Args:
        bam: Path of the input BAM.
        nth: Forwarded to ``get_samtools_res_param`` as ``nth``.
        mem_gb: Forwarded to ``get_samtools_res_param`` as ``mem_gb``.
        out_dir: Directory that receives the output; the output prefix is
            also used as the ``-T`` temporary-file prefix.

    Returns:
        Path of the name-sorted BAM (``<prefix>.nmsrt.bam``).
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    prefix = os.path.join(out_dir, bam_stem)
    nmsrt_bam = prefix + '.nmsrt.bam'

    sort_res_param = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
    cmd = 'samtools sort -n {} -o {} -T {} {}'.format(
        bam, nmsrt_bam, prefix, sort_res_param)
    run_shell_cmd(cmd)
    return nmsrt_bam
def remove_read_group(bam, out_dir='.'):
    """Write a copy of a BAM with read-group information removed.

    The BAM is dumped as SAM with its header, lines starting with ``@RG``
    are dropped, the first tab-prefixed ``RG:Z:...`` field on each line is
    removed by sed, and the result is converted back to BAM.

    Args:
        bam: Path of the input BAM.
        out_dir: Directory that receives the output (default: current
            directory).

    Returns:
        Path of the new BAM (``<prefix>.no_rg.bam``).
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    new_bam = os.path.join(out_dir, bam_stem) + '.no_rg.bam'

    pipeline = ' | '.join([
        'samtools view -h {}'.format(bam),
        'grep -v "^@RG"',
        'sed "s/\\tRG:Z:[^\\t]*//"',
        'samtools view -bo {} -'.format(new_bam),
    ])
    run_shell_cmd(pipeline)
    return new_bam
def samstat(bam, nth=1, mem_gb=None, out_dir=''):
    """Run ``SAMstats`` on a name-sorted SAM stream of the BAM.

    The BAM is name-sorted with ``samtools sort -n -O sam`` and piped
    into ``SAMstats``.

    Args:
        bam: Path of the input BAM.
        nth: Forwarded to ``get_samtools_res_param`` as ``nth``.
        mem_gb: Forwarded to ``get_samtools_res_param`` as ``mem_gb``.
        out_dir: Directory that receives the QC file.

    Returns:
        Path of the QC file (``<prefix>.samstats.qc``).
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    prefix = os.path.join(out_dir, bam_stem)
    samstat_qc = prefix + '.samstats.qc'

    sort_res_param = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
    cmd = (
        'samtools sort -n {} -T {}.tmp {} -O sam | '
        'SAMstats --sorted_sam_file - --outf {}'
    ).format(bam, prefix, sort_res_param, samstat_qc)
    run_shell_cmd(cmd)
    return samstat_qc
def mark_dup_sambamba(bam, nth, out_dir):  # shared by both se and pe
    """Mark duplicates with ``sambamba markdup``.

    sambamba's standard error is redirected to the QC file.

    Args:
        bam: Path of the input BAM.
        nth: Value for sambamba's ``-t`` option.
        out_dir: Directory that receives the outputs.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)``: the ``<prefix>.dupmark.bam`` and
        ``<prefix>.dup.qc`` paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    # The prefix must be substituted here; an unformatted '{}.dup.qc' would
    # send stderr to a file literally named '{}.dup.qc' in the working
    # directory and return that bogus path to the caller.
    dup_qc = '{}.dup.qc'.format(prefix)

    cmd = (
        'sambamba markdup -t {nth} --hash-table-size=17592186044416 '
        '--overflow-list-size=20000000 '
        '--io-buffer-size=256 {bam} {dupmark_bam} 2> {dup_qc}'
    ).format(nth=nth, bam=bam, dupmark_bam=dupmark_bam, dup_qc=dup_qc)
    run_shell_cmd(cmd)
    return dupmark_bam, dup_qc
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Write a filtered copy of a duplicate-marked paired-end BAM.

    Runs ``samtools view -F 1804 -f 2 -b`` on the input, with resource
    options taken from ``get_samtools_res_param('view', nth=nth)``.

    Args:
        dupmark_bam: Path of the duplicate-marked BAM.
        nth: Forwarded to ``get_samtools_res_param`` as ``nth``.
        out_dir: Directory that receives the output.

    Returns:
        Path of the output BAM (``<prefix>.nodup.bam``).
    """
    raw_prefix = os.path.join(
        out_dir, os.path.basename(strip_ext_bam(dupmark_bam)))
    # strip extension appended in the previous step
    nodup_bam = strip_ext(raw_prefix, 'dupmark') + '.nodup.bam'

    view_res_param = get_samtools_res_param('view', nth=nth)
    cmd = 'samtools view -F 1804 -f 2 -b {} {} > {}'.format(
        dupmark_bam, view_res_param, nodup_bam)
    run_shell_cmd(cmd)
    return nodup_bam
def pbc_qc_pe(bam, mito_chr_name, nth, out_dir):
    """Compute library-complexity counts for a paired-end BAM.

    The BAM is name-sorted, converted with ``bedtools bamtobed -bedpe``,
    reduced to columns 1, 2, 4, 6, 9 and 10, stripped of lines starting
    with ``mito_chr_name``, and run through ``sort | uniq -c``. The final
    awk program prints one tab-separated line with:
    ``mt`` (sum of counts), ``m0`` (number of distinct lines), ``m1``
    (lines seen once), ``m2`` (lines seen twice), ``m0/mt``, ``m1/m0``
    and ``m1/m2`` (``-1.0`` when ``m2`` is 0).

    Args:
        bam: Path of the input BAM.
        mito_chr_name: Chromosome name whose lines are excluded.
        nth: Thread count forwarded to ``samtools_name_sort``.
        out_dir: Directory that receives the outputs.

    Returns:
        Path of the QC file (``<prefix>.lib_complexity.qc``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    # Pass nth/out_dir by keyword: samtools_name_sort may take an extra
    # ``mem_gb`` parameter between them, in which case a positional
    # out_dir would silently be bound to mem_gb.
    nmsrt_bam = samtools_name_sort(bam, nth=nth, out_dir=out_dir)

    run_shell_cmd(
        'bedtools bamtobed -bedpe -i {nmsrt_bam} | '
        'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}\' | '
        'grep -v "^{mito_chr_name}\\s" | sort | uniq -c | '
        'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
        '($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n"'
        ',mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {pbc_qc}'.format(
            nmsrt_bam=nmsrt_bam,
            mito_chr_name=mito_chr_name,
            pbc_qc=pbc_qc,
        ))
    rm_f(nmsrt_bam)  # temporary name-sorted BAM
    return pbc_qc
def blacklist_filter_bam(bam, blacklist, out_dir):
    """Remove alignments that overlap blacklisted regions from a BAM.

    When ``blacklist`` is the empty string or ``get_num_lines`` reports
    zero lines for it, the BAM is simply passed through
    ``zcat -f | gzip -nc``. Otherwise the blacklist is decompressed to a
    temporary file and ``bedtools intersect -v`` is applied.

    Args:
        bam: Path of the input BAM.
        blacklist: Path of the blacklist file, or ``''`` for none.
        out_dir: Directory that receives the output.

    Returns:
        Path of the output BAM (``<prefix>.bfilt.bam``).
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    filtered = os.path.join(out_dir, bam_stem) + '.bfilt.bam'

    if blacklist == '' or get_num_lines(blacklist) == 0:
        # nothing to filter against: re-compress the input unchanged
        run_shell_cmd(
            'zcat -f {src} | gzip -nc > {dst}'.format(src=bam, dst=filtered))
        return filtered

    # due to bedtools bug when .gz is given for -a and -b
    plain_blacklist = gunzip(blacklist, 'tmp2', out_dir)
    run_shell_cmd(
        'bedtools intersect -nonamecheck -v -abam {src} -b {bl} > {dst}'
        .format(src=bam, bl=plain_blacklist, dst=filtered))
    rm_f([plain_blacklist])
    return filtered
def pbc_qc_se(bam, mito_chr_name, out_dir):
    """Compute library-complexity counts for a single-end BAM.

    ``bedtools bamtobed`` output is reduced to columns 1, 2, 3 and 6,
    stripped of lines starting with ``mito_chr_name``, and run through
    ``sort | uniq -c``. The final awk program prints one tab-separated
    line with: ``mt`` (sum of counts), ``m0`` (number of distinct lines),
    ``m1`` (lines seen once), ``m2`` (lines seen twice), ``m0/mt``,
    ``m1/m0`` and ``m1/m2`` (``-1.0`` when ``m2`` is 0).

    Args:
        bam: Path of the input BAM.
        mito_chr_name: Chromosome name whose lines are excluded.
        out_dir: Directory that receives the output.

    Returns:
        Path of the QC file (``<prefix>.lib_complexity.qc``).
    """
    raw_prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    pbc_qc = strip_ext(raw_prefix, 'dupmark') + '.lib_complexity.qc'

    run_shell_cmd(
        'bedtools bamtobed -i {bam} | '
        'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$3,$6}}\' | '
        'grep -v "^{mito}\\s" | sort | uniq -c | '
        'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
        '($1==2){{m2=m2+1}} {{m0=m0+1}} '
        '{{mt=mt+$1}} END{{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",'
        'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {qc}'.format(
            bam=bam, mito=mito_chr_name, qc=pbc_qc))
    return pbc_qc
def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a single-end BAM and return a coordinate-sorted result.

    There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    With ``multimapping`` set, the BAM is name-sorted, piped through
    ``assign_multimappers.py``, filtered with ``samtools view -F 1804``
    and sorted; ``mapq_thresh`` is not used on this path. Otherwise reads
    are filtered with ``samtools view -F 1804 -q <mapq_thresh>`` and
    sorted.

    Returns:
        Path of the filtered BAM (``<prefix>.filt.bam``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = prefix + '.filt.bam'
    sort_stage = 'samtools sort /dev/stdin -o {} -T {} {}'

    if not multimapping:
        stages = [
            'samtools view -F 1804 -q {} -u {}'.format(mapq_thresh, bam),
            sort_stage.format(
                filt_bam, prefix,
                get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)),
        ]
        run_shell_cmd(' | '.join(stages))
        return filt_bam

    qname_sort_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)
    stages = [
        'samtools view -h {}'.format(qname_sort_bam),
        '$(which assign_multimappers.py) -k {}'.format(multimapping),
        'samtools view -F 1804 -Su /dev/stdin',
        sort_stage.format(
            filt_bam, prefix,
            get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)),
    ]
    run_shell_cmd(' | '.join(stages))
    rm_f(qname_sort_bam)  # remove temporary files
    return filt_bam
def main():
    """Run the TSS-enrichment step on a deduplicated BAM.

    Indexes the BAM, resolves the read length (from ``read_len_log`` if
    given, else ``read_len``, else None), calls ``make_tss_plot`` and
    lists the output directory.
    """
    args = parse_arguments()

    chrsz = args.chrsz
    # an unset TSS path, or one whose basename is 'null', becomes ''
    tss = ''
    if args.tss and os.path.basename(args.tss) != 'null':
        tss = args.tss
    nodup_bam = args.nodup_bam
    out_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(nodup_bam)))

    samtools_index(nodup_bam)  # make an index first
    # NOTE(review): this read-group-free copy is created and deleted below
    # but never passed to make_tss_plot -- confirm whether that is intended.
    rg_free_bam = remove_read_group(nodup_bam)

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # Also get read length
    read_len = None
    if args.read_len_log:
        with open(args.read_len_log, 'r') as fp:
            read_len = int(fp.read().strip())
    elif args.read_len:
        read_len = args.read_len

    # Enrichments: V plot for enrichment
    # Use final to avoid duplicates
    _tss_plot, _tss_large_plot, _tss_enrich_qc = make_tss_plot(
        nodup_bam, tss, out_prefix, chrsz, read_len)

    # remove temporary files
    rm_f(rg_free_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def main():
    """Run the fragment-length / insert-size step on a deduplicated BAM.

    Runs ``get_insert_distribution`` on a read-group-free copy of the
    BAM, then ``fragment_length_qc`` and ``fragment_length_plot`` on its
    data output, deletes the temporary copy and lists the output
    directory.
    """
    args = parse_arguments()

    nodup_bam = args.nodup_bam
    out_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(nodup_bam)))
    rg_free_bam = remove_read_group(nodup_bam)

    # Insert size distribution - CAN'T GET THIS FOR SE FILES
    insert_data, _insert_plot = get_insert_distribution(
        rg_free_bam, out_prefix)

    # Also need to run n-nucleosome estimation
    histogram = read_picard_histogram(insert_data)
    fragment_length_qc(histogram, out_prefix)
    fragment_length_plot(insert_data, out_prefix)

    rm_f(rg_free_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def mark_dup_picard(bam, out_dir):  # shared by both se and pe
    """Mark (but do not remove) duplicates with Picard MarkDuplicates.

    Java is started with a fixed ``-Xmx4G`` heap.

    Args:
        bam: Path of the input BAM.
        out_dir: Directory that receives the outputs.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)``: the ``<prefix>.dupmark.bam`` and
        ``<prefix>.dup.qc`` paths passed to Picard as OUTPUT and
        METRICS_FILE.
    """
    raw_prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(raw_prefix, 'filt')
    dupmark_bam = prefix + '.dupmark.bam'
    dup_qc = prefix + '.dup.qc'

    # The Picard path becomes part of the template before the
    # placeholders are filled in.
    template = ''.join([
        'java -Xmx4G -XX:ParallelGCThreads=1 -jar ',
        locate_picard(),
        ' MarkDuplicates ',
        'INPUT={} OUTPUT={} ',
        'METRICS_FILE={} VALIDATION_STRINGENCY=LENIENT ',
        'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE ',
        'ASSUME_SORTED=true REMOVE_DUPLICATES=false',
    ])
    run_shell_cmd(template.format(bam, dupmark_bam, dup_qc))
    return dupmark_bam, dup_qc
def pbc_qc_se(bam, mito_chr_name, mem_gb, out_dir):
    """Compute library-complexity counts for a single-end BAM.

    ``bedtools bamtobed`` output is reduced to columns 1, 2, 3 and 6,
    stripped of lines starting with ``mito_chr_name``, and run through
    ``sort | uniq -c``. The final awk program prints one tab-separated
    line with: ``mt`` (sum of counts), ``m0`` (number of distinct lines),
    ``m1`` (lines seen once), ``m2`` (lines seen twice), ``m0/mt``,
    ``m1/m0`` and ``m1/m2`` (``-1.0`` when ``m2`` is 0).

    Args:
        bam: Path of the input BAM.
        mito_chr_name: Chromosome name whose lines are excluded.
        mem_gb: Multiplied by ``1024**3`` and handed to
            ``get_gnu_sort_param`` (with ``ratio=0.5``) to build the
            options for ``sort``.
        out_dir: Directory that receives the output.

    Returns:
        Path of the QC file (``<prefix>.lib_complexity.qc``).
    """
    raw_prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    pbc_qc = strip_ext(raw_prefix, 'dupmark') + '.lib_complexity.qc'

    summarize = (
        'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
        '($1==2){{m2=m2+1}} {{m0=m0+1}} '
        '{{mt=mt+$1}} END{{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",'
        'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {pbc_qc}'
    )
    stages = [
        'bedtools bamtobed -i {bam}',
        'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$3,$6}}\'',
        'grep -v "^{mito_chr_name}\\s"',
        'sort {sort_param}',
        'uniq -c',
        summarize,
    ]
    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
    run_shell_cmd(' | '.join(stages).format(
        bam=bam,
        mito_chr_name=mito_chr_name,
        sort_param=sort_param,
        pbc_qc=pbc_qc,
    ))
    return pbc_qc
def pbc_qc_pe(bam, mito_chr_name, nth, mem_gb, out_dir):
    """Compute library-complexity counts for a paired-end BAM.

    The BAM is name-sorted, converted with ``bedtools bamtobed -bedpe``,
    reduced to columns 1, 2, 4, 6, 9 and 10, stripped of lines starting
    with ``mito_chr_name``, and run through ``sort | uniq -c``. The final
    awk program prints one tab-separated line with: ``mt`` (sum of
    counts), ``m0`` (number of distinct lines), ``m1`` (lines seen once),
    ``m2`` (lines seen twice), ``m0/mt``, ``m1/m0`` and ``m1/m2``
    (``-1.0`` when ``m2`` is 0).

    Args:
        bam: Path of the input BAM.
        mito_chr_name: Chromosome name whose lines are excluded.
        nth: Forwarded to ``samtools_name_sort``.
        mem_gb: Forwarded to ``samtools_name_sort``; also multiplied by
            ``1024**3`` and handed to ``get_gnu_sort_param`` (with
            ``ratio=0.5``) to build the options for ``sort``.
        out_dir: Directory that receives the outputs.

    Returns:
        Path of the QC file (``<prefix>.lib_complexity.qc``).
    """
    bam_stem = os.path.basename(strip_ext_bam(bam))
    pbc_qc = os.path.join(out_dir, bam_stem) + '.lib_complexity.qc'

    nmsrt_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)

    summarize = (
        'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
        '($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n"'
        ',mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {pbc_qc}'
    )
    stages = [
        'bedtools bamtobed -bedpe -i {nmsrt_bam}',
        'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}\'',
        'grep -v "^{mito_chr_name}\\s"',
        'sort {sort_param}',
        'uniq -c',
        summarize,
    ]
    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
    run_shell_cmd(' | '.join(stages).format(
        nmsrt_bam=nmsrt_bam,
        mito_chr_name=mito_chr_name,
        sort_param=sort_param,
        pbc_qc=pbc_qc,
    ))

    rm_f(nmsrt_bam)  # temporary name-sorted BAM
    return pbc_qc
def bam_to_pbam(bam, ref_fa, out_dir='.'):
    """Convert BAM into pBAM.

    Requirements:
        - Python package `ptools_bin`
        - `samtools`

    A reference whose name ends in ``.gz`` is first decompressed to a
    temporary ``.fasta`` file in ``out_dir``, which is deleted afterwards.
    ``makepBAM_genome.sh`` is then run and ``<prefix>.sorted.p.bam`` is
    moved to ``<prefix>.p.bam``.

    Args:
        bam: Path of the input BAM.
        ref_fa: Path of the reference FASTA (optionally gzipped).
        out_dir: Directory used for the output prefix.

    Returns:
        Path of the pBAM (``<prefix>.p.bam``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    pbam_tmp = prefix + '.sorted.p.bam'
    pbam = prefix + '.p.bam'

    plain_ref_fa = ref_fa
    temp_files = []
    if ref_fa.endswith('.gz'):
        ref_stem = os.path.basename(strip_ext_gz(ref_fa))
        plain_ref_fa = os.path.join(out_dir, ref_stem) + '.fasta'
        run_shell_cmd('zcat -f {} > {}'.format(ref_fa, plain_ref_fa))
        temp_files.append(plain_ref_fa)

    run_shell_cmd('makepBAM_genome.sh {} {}'.format(bam, plain_ref_fa))
    run_shell_cmd('mv {} {}'.format(pbam_tmp, pbam))

    rm_f(temp_files)
    return pbam
def bam2ta_pe(bam, nth, out_dir):
    """Convert a paired-end BAM to a gzipped tagAlign file.

    The BAM is name-sorted and converted to a gzipped BEDPE with
    ``bedtools bamtobed -bedpe -mate1``. Each BEDPE line is then split by
    awk into two tagAlign lines (columns 1-3 with column 9, and columns
    4-6 with column 10), using ``N`` as name and ``1000`` as score.
    Both intermediate files are deleted.

    Args:
        bam: Path of the input BAM.
        nth: Thread count forwarded to ``samtools_name_sort``.
        out_dir: Directory that receives the outputs.

    Returns:
        Path of the tagAlign file (``<prefix>.tagAlign.gz``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    ta = '{}.tagAlign.gz'.format(prefix)
    # intermediate files
    bedpe = '{}.bedpe.gz'.format(prefix)

    # Pass nth/out_dir by keyword: samtools_name_sort may take an extra
    # ``mem_gb`` parameter between them, in which case a positional
    # out_dir would silently be bound to mem_gb.
    nmsrt_bam = samtools_name_sort(bam, nth=nth, out_dir=out_dir)

    run_shell_cmd(
        'LC_COLLATE=C bedtools bamtobed -bedpe -mate1 -i {nmsrt_bam} | '
        'gzip -nc > {bedpe}'.format(nmsrt_bam=nmsrt_bam, bedpe=bedpe))
    rm_f(nmsrt_bam)

    run_shell_cmd(
        'zcat -f {bedpe} | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{printf "%s\\t%s\\t%s\\tN\\t1000\\t%s\\n'
        '%s\\t%s\\t%s\\tN\\t1000\\t%s\\n",'
        '$1,$2,$3,$9,$4,$5,$6,$10}}\' | '
        'gzip -nc > {ta}'.format(bedpe=bedpe, ta=ta))
    rm_f(bedpe)
    return ta
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a paired-end BAM and return a coordinate-sorted result.

    There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Steps: filter and name-sort into a temporary BAM, run
    ``samtools fixmate -r`` (after ``assign_multimappers.py`` when
    ``multimapping`` is set), then filter again with
    ``samtools view -F 1804 -f 2`` and sort. Both intermediate BAMs are
    deleted. ``mapq_thresh`` is only used when ``multimapping`` is falsy.

    Args:
        bam: Path of the input BAM.
        multimapping: Value for ``assign_multimappers.py -k``; a falsy
            value selects the MAPQ-threshold path.
        mapq_thresh: Value for ``samtools view -q``.
        nth: Forwarded to ``get_samtools_res_param`` and ``bam_is_empty``.
        mem_gb: Forwarded to ``get_samtools_res_param`` for sort commands.
        out_dir: Directory that receives the output.

    Returns:
        Path of the filtered BAM (``<prefix>.filt.bam``).

    Raises:
        ValueError: If ``bam_is_empty`` reports the filtered BAM as empty.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    tmp_filt_bam = '{}.tmp_filt.bam'.format(prefix)
    fixmate_bam = '{}.fixmate.bam'.format(prefix)

    if multimapping:
        # The resource options are attached to ``samtools sort``, so they
        # are requested for 'sort' (with mem_gb), matching the other sort
        # invocations in this function.
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param} '
            .format(
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort', nth=nth,
                                                 mem_gb=mem_gb),
            ))
        run_shell_cmd(
            'samtools view -h {tmp_filt_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} --paired-end | '
            'samtools fixmate -r /dev/stdin {fixmate_bam} {res_param}'.format(
                tmp_filt_bam=tmp_filt_bam,
                multimapping=multimapping,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {mapq_thresh} -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort', nth=nth,
                                                 mem_gb=mem_gb),
            ))
        run_shell_cmd(
            'samtools fixmate -r {tmp_filt_bam} {fixmate_bam} {res_param}'.
            format(
                tmp_filt_bam=tmp_filt_bam,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    rm_f(tmp_filt_bam)

    # final filter on the fixmate output, then sort
    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {fixmate_bam} | '
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'.
        format(
            fixmate_bam=fixmate_bam,
            filt_bam=filt_bam,
            prefix=prefix,
            res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
        ))
    rm_f(fixmate_bam)

    log.info('Checking if filtered (but not deduped) BAM is empty '
             'after filtering with "samtools view -F 1804 -f 2".')
    if bam_is_empty(filt_bam, nth):
        raise ValueError(
            'No reads found aftering filtering "samtools fixmate"d PE BAM with '
            '"samtools view -F 1804 -f 2". '
            'Reads are not properly paired even though mapping rate is good? ')

    return filt_bam
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a paired-end BAM and return a coordinate-sorted result.

    There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Steps: filter and name-sort into a temporary BAM, run
    ``samtools fixmate -r`` (after ``assign_multimappers.py`` when
    ``multimapping`` is set), then filter again with
    ``samtools view -F 1804 -f 2`` and sort. Both intermediate BAMs are
    deleted. ``mapq_thresh`` is only used when ``multimapping`` is falsy.

    Args:
        bam: Path of the input BAM.
        multimapping: Value for ``assign_multimappers.py -k``; a falsy
            value selects the MAPQ-threshold path.
        mapq_thresh: Value for ``samtools view -q``.
        nth: Forwarded to ``get_samtools_res_param``.
        mem_gb: Forwarded to ``get_samtools_res_param`` for sort commands.
        out_dir: Directory that receives the output.

    Returns:
        Path of the filtered BAM (``<prefix>.filt.bam``).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    tmp_filt_bam = '{}.tmp_filt.bam'.format(prefix)
    fixmate_bam = '{}.fixmate.bam'.format(prefix)

    if multimapping:
        # The resource options are attached to ``samtools sort``, so they
        # are requested for 'sort' (with mem_gb), matching the other sort
        # invocations in this function.
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param} '
            .format(
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort', nth=nth,
                                                 mem_gb=mem_gb),
            ))
        run_shell_cmd(
            'samtools view -h {tmp_filt_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} --paired-end | '
            'samtools fixmate -r /dev/stdin {fixmate_bam} {res_param}'.format(
                tmp_filt_bam=tmp_filt_bam,
                multimapping=multimapping,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {mapq_thresh} -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort', nth=nth,
                                                 mem_gb=mem_gb),
            ))
        run_shell_cmd(
            'samtools fixmate -r {tmp_filt_bam} {fixmate_bam} {res_param}'.
            format(
                tmp_filt_bam=tmp_filt_bam,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    rm_f(tmp_filt_bam)

    # final filter on the fixmate output, then sort
    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {fixmate_bam} | '
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'.
        format(
            fixmate_bam=fixmate_bam,
            filt_bam=filt_bam,
            prefix=prefix,
            res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
        ))
    rm_f(fixmate_bam)

    return filt_bam