def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a single-end BAM and write a sorted '<prefix>.filt.bam'.

    Reads carrying any SAM flag in 1804 are excluded. Without multimapping,
    reads below `mapq_thresh` are excluded as well. With multimapping, the
    BAM is name-sorted first and streamed through assign_multimappers.py
    (`-k {multimapping}`); no MAPQ threshold is applied in that mode.

    There are pipes with multiple samtools commands. For such pipes, use
    multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Args:
        bam: Input BAM path.
        multimapping: Value passed to assign_multimappers.py as `-k`; a
            falsy value selects the plain MAPQ-filtering path.
        mapq_thresh: Minimum MAPQ (`samtools view -q`), non-multimapping
            path only.
        nth: Thread count handed to get_samtools_res_param.
        mem_gb: Memory budget handed to get_samtools_res_param.
        out_dir: Directory receiving the output and temporary files.

    Returns:
        Path of the filtered, sorted BAM.
    """
    out_prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(out_prefix)
    sort_stage = (
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'
    )

    if not multimapping:
        pipeline = ' | '.join([
            'samtools view -F 1804 -q {mapq_thresh} -u {bam}',
            sort_stage,
        ])
        sort_res = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
        run_shell_cmd(pipeline.format(
            mapq_thresh=mapq_thresh,
            bam=bam,
            filt_bam=filt_bam,
            prefix=out_prefix,
            res_param=sort_res,
        ))
        return filt_bam

    # Multimapping path: the input is name-sorted before being piped
    # through assign_multimappers.py.
    name_sorted_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)
    pipeline = ' | '.join([
        'samtools view -h {qname_sort_bam}',
        '$(which assign_multimappers.py) -k {multimapping}',
        'samtools view -F 1804 -Su /dev/stdin',
        sort_stage,
    ])
    sort_res = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
    run_shell_cmd(pipeline.format(
        qname_sort_bam=name_sorted_bam,
        multimapping=multimapping,
        filt_bam=filt_bam,
        prefix=out_prefix,
        res_param=sort_res,
    ))
    # The name-sorted intermediate is only needed for the pipe above.
    rm_f(name_sorted_bam)
    return filt_bam
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Write '<prefix>.nodup.bam' from a duplicate-marked paired-end BAM.

    Runs `samtools view -F 1804 -f 2 -b`, i.e. keeps only reads that have
    the properly-paired flag set and none of the flags in 1804 (which
    includes the duplicate flag).

    Args:
        dupmark_bam: Duplicate-marked input BAM path.
        nth: Thread count handed to get_samtools_res_param for `view`.
        out_dir: Directory receiving the output BAM.

    Returns:
        Path of the de-duplicated BAM.
    """
    stem = os.path.basename(strip_ext_bam(dupmark_bam))
    # Strip the 'dupmark' extension appended in the previous step.
    out_prefix = strip_ext(os.path.join(out_dir, stem), 'dupmark')
    nodup_bam = '{}.nodup.bam'.format(out_prefix)

    view_res = get_samtools_res_param('view', nth=nth)
    template = (
        'samtools view -F 1804 -f 2 -b {dupmark_bam} {res_param} > {nodup_bam}'
    )
    run_shell_cmd(template.format(
        dupmark_bam=dupmark_bam,
        res_param=view_res,
        nodup_bam=nodup_bam,
    ))
    return nodup_bam
def bwa_se(fastq, ref_index_prefix, nth, mem_gb, out_dir):
    """Align a single-end FASTQ with `bwa aln` + `bwa samse`, then sort.

    Args:
        fastq: Input FASTQ path.
        ref_index_prefix: BWA index prefix passed to bwa_aln and `bwa samse`.
        nth: Thread count for bwa_aln and the samtools steps.
        mem_gb: Memory budget forwarded to samtools_sort.
        out_dir: Directory receiving the intermediate .sai and unsorted BAM.

    Returns:
        The path returned by samtools_sort for the aligned BAM.
    """
    out_prefix = os.path.join(
        out_dir, os.path.basename(strip_ext_fastq(fastq)))
    unsorted_bam = '{}.bam'.format(out_prefix)

    sai = bwa_aln(fastq, ref_index_prefix, nth, out_dir)

    pipeline = ' | '.join([
        'bwa samse {ref} {sai} {fastq}',
        'samtools view -bS /dev/stdin {res_param} > {tmp_bam}',
    ])
    view_res = get_samtools_res_param('view', nth=nth)
    run_shell_cmd(pipeline.format(
        ref=ref_index_prefix,
        sai=sai,
        fastq=fastq,
        res_param=view_res,
        tmp_bam=unsorted_bam,
    ))
    rm_f(sai)

    # NOTE(review): out_dir is not forwarded here, so the location of the
    # sorted BAM is whatever samtools_sort defaults to — confirm intended.
    sorted_bam = samtools_sort(unsorted_bam, nth, mem_gb)
    rm_f(unsorted_bam)
    return sorted_bam
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
           bwa_mem_read_len_limit, rescue_reads_for_bwa_mem, out_dir):
    """Align a paired-end FASTQ pair with BWA and return a sorted BAM.

    `bwa mem` is used when `use_bwa_mem_for_pe` is set and the read length
    guessed from R1 is at least `bwa_mem_read_len_limit`; otherwise each
    mate is aligned with `bwa aln` and the two are combined with
    `bwa sampe`. Reads whose CIGAR-derived length disagrees with their
    sequence length are then dropped before the final coordinate sort.

    Args:
        fastq1: R1 FASTQ path (also used to name outputs and guess length).
        fastq2: R2 FASTQ path.
        ref_index_prefix: BWA index prefix.
        nth: Thread count for bwa and for the samtools sort.
        mem_gb: Memory budget handed to get_samtools_res_param for sort.
        use_bwa_mem_for_pe: Allow `bwa mem` for sufficiently long reads.
        bwa_mem_read_len_limit: Minimum R1 read length for `bwa mem`.
        rescue_reads_for_bwa_mem: If falsy, `-P` is passed to `bwa mem`.
        out_dir: Directory receiving outputs and intermediates.

    Returns:
        Path of the sorted BAM ('<prefix>.bam').
    """
    out_prefix = os.path.join(
        out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    # Despite the extension, this file is written through gzip.
    gz_sam = '{}.sam'.format(out_prefix)
    bad_reads = '{}.badReads'.format(out_prefix)
    bam = '{}.bam'.format(out_prefix)

    read_len = get_read_length(fastq1)
    log.info('Guessed read length of R1 FASTQ: {read_len}'.format(
        read_len=read_len,
    ))

    if use_bwa_mem_for_pe and read_len >= bwa_mem_read_len_limit:
        log.info('Use bwa mem.')
        mem_flag = '-P' if rescue_reads_for_bwa_mem else ''
        align_cmd = (
            'bwa mem -M {extra_param} -t {nth} {ref_index_prefix} '
            '{fastq1} {fastq2} | gzip -nc > {sam}'
        ).format(
            extra_param=mem_flag,
            nth=nth,
            ref_index_prefix=ref_index_prefix,
            fastq1=fastq1,
            fastq2=fastq2,
            sam=gz_sam,
        )
        intermediates = [gz_sam]
    else:
        log.info('Use bwa aln for each (R1 and R2) and then bwa sampe.')
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
        align_cmd = (
            'bwa sampe {ref} {sai1} {sai2} {fastq1} {fastq2} '
            '| gzip -nc > {sam}'
        ).format(
            ref=ref_index_prefix,
            sai1=sai1,
            sai2=sai2,
            fastq1=fastq1,
            fastq2=fastq2,
            sam=gz_sam,
        )
        intermediates = [sai1, sai2, gz_sam]

    run_shell_cmd(align_cmd)

    # List read names whose CIGAR is inconsistent with the sequence: for
    # each non-header record with a CIGAR, deletions are stripped, the
    # remaining operation lengths are summed, and the name (followed by a
    # tab) is emitted when the sum differs from length(SEQ).
    find_bad_cmd = (
        'zcat -f {sam} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {badcigar}'
    ).format(sam=gz_sam, badcigar=bad_reads)
    run_shell_cmd(find_bad_cmd)

    # Remove bad CIGAR read pairs (only when any were found), then sort.
    if get_num_lines(bad_reads) > 0:
        to_bam = (
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su /dev/stdin'
        )
    else:
        to_bam = 'samtools view -Su {sam}'
    sort_cmd = (
        to_bam
        + ' | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
    )
    sort_res = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
    run_shell_cmd(sort_cmd.format(
        sam=gz_sam,
        badcigar=bad_reads,
        bam=bam,
        prefix=out_prefix,
        res_param=sort_res,
    ))

    # NOTE: the bad-read list itself is not among the removed files.
    rm_f(intermediates)
    return bam
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a paired-end BAM and write a sorted '<prefix>.filt.bam'.

    Steps:
      1. Keep properly paired reads (`-f 2`), drop unwanted flags, and
         name-sort into a temporary BAM. Without multimapping, flags in
         1804 and reads below `mapq_thresh` are dropped; with multimapping,
         only flags in 524 are dropped and no MAPQ threshold is applied.
      2. Run `samtools fixmate -r` (with multimapping, after streaming
         through `assign_multimappers.py -k {multimapping} --paired-end`).
      3. Re-filter with `-F 1804 -f 2` and coordinate-sort.

    There are pipes with multiple samtools commands. For such pipes, use
    multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Args:
        bam: Input BAM path.
        multimapping: Value passed to assign_multimappers.py as `-k`; a
            falsy value selects the plain MAPQ-filtering path.
        mapq_thresh: Minimum MAPQ (`samtools view -q`), non-multimapping
            path only.
        nth: Thread count handed to get_samtools_res_param.
        mem_gb: Memory budget handed to get_samtools_res_param for sorts.
        out_dir: Directory receiving the output and temporary files.

    Returns:
        Path of the filtered, coordinate-sorted BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    tmp_filt_bam = '{}.tmp_filt.bam'.format(prefix)
    fixmate_bam = '{}.fixmate.bam'.format(prefix)

    if multimapping:
        # {res_param} is attached to `samtools sort`, so it is built for
        # the 'sort' subcommand with the memory budget, exactly as the
        # equivalent name-sort in the non-multimapping branch does.
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param} '
            .format(
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param(
                    'sort', nth=nth, mem_gb=mem_gb),
            ))

        run_shell_cmd(
            'samtools view -h {tmp_filt_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} --paired-end | '
            'samtools fixmate -r /dev/stdin {fixmate_bam} {res_param}'.format(
                tmp_filt_bam=tmp_filt_bam,
                multimapping=multimapping,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {mapq_thresh} -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param(
                    'sort', nth=nth, mem_gb=mem_gb),
            ))

        run_shell_cmd(
            'samtools fixmate -r {tmp_filt_bam} {fixmate_bam} {res_param}'
            .format(
                tmp_filt_bam=tmp_filt_bam,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))

    # The name-sorted intermediate is no longer needed after fixmate.
    rm_f(tmp_filt_bam)

    # Filter again after fixmate and produce the coordinate-sorted output.
    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {fixmate_bam} | '
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'
        .format(
            fixmate_bam=fixmate_bam,
            filt_bam=filt_bam,
            prefix=prefix,
            res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
        ))

    rm_f(fixmate_bam)
    return filt_bam
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a paired-end BAM and write a sorted '<prefix>.filt.bam'.

    Steps:
      1. Keep properly paired reads (`-f 2`), drop unwanted flags, and
         name-sort into a temporary BAM. Without multimapping, flags in
         1804 and reads below `mapq_thresh` are dropped; with multimapping,
         only flags in 524 are dropped and no MAPQ threshold is applied.
      2. Run `samtools fixmate -r` (with multimapping, after streaming
         through `assign_multimappers.py -k {multimapping} --paired-end`).
      3. Re-filter with `-F 1804 -f 2` and coordinate-sort.
      4. Fail if the resulting BAM is empty.

    There are pipes with multiple samtools commands. For such pipes, use
    multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Args:
        bam: Input BAM path.
        multimapping: Value passed to assign_multimappers.py as `-k`; a
            falsy value selects the plain MAPQ-filtering path.
        mapq_thresh: Minimum MAPQ (`samtools view -q`), non-multimapping
            path only.
        nth: Thread count handed to get_samtools_res_param and
            bam_is_empty.
        mem_gb: Memory budget handed to get_samtools_res_param for sorts.
        out_dir: Directory receiving the output and temporary files.

    Returns:
        Path of the filtered, coordinate-sorted BAM.

    Raises:
        ValueError: If bam_is_empty reports the filtered BAM as empty.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    tmp_filt_bam = '{}.tmp_filt.bam'.format(prefix)
    fixmate_bam = '{}.fixmate.bam'.format(prefix)

    if multimapping:
        # {res_param} is attached to `samtools sort`, so it is built for
        # the 'sort' subcommand with the memory budget, exactly as the
        # equivalent name-sort in the non-multimapping branch does.
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param} '
            .format(
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param(
                    'sort', nth=nth, mem_gb=mem_gb),
            ))

        run_shell_cmd(
            'samtools view -h {tmp_filt_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} --paired-end | '
            'samtools fixmate -r /dev/stdin {fixmate_bam} {res_param}'.format(
                tmp_filt_bam=tmp_filt_bam,
                multimapping=multimapping,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {mapq_thresh} -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param(
                    'sort', nth=nth, mem_gb=mem_gb),
            ))

        run_shell_cmd(
            'samtools fixmate -r {tmp_filt_bam} {fixmate_bam} {res_param}'
            .format(
                tmp_filt_bam=tmp_filt_bam,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))

    # The name-sorted intermediate is no longer needed after fixmate.
    rm_f(tmp_filt_bam)

    # Filter again after fixmate and produce the coordinate-sorted output.
    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {fixmate_bam} | '
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'
        .format(
            fixmate_bam=fixmate_bam,
            filt_bam=filt_bam,
            prefix=prefix,
            res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
        ))

    rm_f(fixmate_bam)

    log.info('Checking if filtered (but not deduped) BAM is empty '
             'after filtering with "samtools view -F 1804 -f 2".')
    if bam_is_empty(filt_bam, nth):
        raise ValueError(
            'No reads found after filtering "samtools fixmate"d PE BAM with '
            '"samtools view -F 1804 -f 2". '
            'Reads are not properly paired even though mapping rate is good? ')

    return filt_bam