def main():
    """Run the GC step on the deduplicated BAM given on the command line.

    A read-group-free copy of the BAM is made with ``remove_read_group``,
    handed to ``get_gc``, and the resulting ``gc_out`` is plotted by
    ``plot_gc``. The temporary copy is deleted afterwards and the contents
    of the output directory are listed.
    """
    args = parse_arguments()

    ref_fa = args.ref_fa
    nodup_bam = args.nodup_bam
    out_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(nodup_bam)))
    # Temporary BAM without read groups; removed once get_gc has run.
    no_rg_bam = remove_read_group(nodup_bam)
    java_heap = args.picard_java_heap

    # get_gc hands back three values; only the first one is used here.
    gc_out, _gc_plot_pdf, _gc_summary = get_gc(
        no_rg_bam, ref_fa, out_prefix, java_heap)
    # will generate PNG format from gc_out
    plot_gc(gc_out, out_prefix)

    rm_f(no_rg_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
Beispiel #2
0
def sambamba_name_sort(bam, nth, out_dir):
    """Sort ``bam`` with ``sambamba sort -n``.

    Args:
        bam: Input BAM path.
        nth: Value passed to sambamba's ``-t`` option.
        out_dir: Directory that receives the sorted BAM.

    Returns:
        Path of the sorted BAM, ``<out_dir>/<basename>.nmsrt.bam``.
    """
    sorted_bam = '{}.nmsrt.bam'.format(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))))

    run_shell_cmd(
        'sambamba sort -n {src} -o {dst} -t {threads}'.format(
            src=bam, dst=sorted_bam, threads=nth))
    return sorted_bam
Beispiel #3
0
def main():
    """Run the library-complexity step on the aligned BAM.

    For paired-end input, ``get_picard_complexity_metrics`` is run on a
    read-group-free copy of the BAM and its result is written to
    ``<prefix>.picard_est_lib_size.qc``. ``run_preseq`` and
    ``get_preseq_plot`` are run in every case, on the original BAM.
    """
    args = parse_arguments()

    aligned_bam = args.bam
    out_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(aligned_bam)))
    # Temporary BAM without read groups; removed at the end.
    no_rg_bam = remove_read_group(aligned_bam)
    java_heap = args.picard_java_heap

    # Library complexity: Preseq results, NRF, PBC1, PBC2
    picard_est_lib_size = (
        get_picard_complexity_metrics(no_rg_bam, out_prefix, java_heap)
        if args.paired_end else None)

    # SORTED BAM: preseq gets the original BAM, not the RG-free copy
    preseq_data, _preseq_log = run_preseq(aligned_bam, out_prefix)
    get_preseq_plot(preseq_data, out_prefix)

    # write picard_est_lib_size to file
    if picard_est_lib_size is not None:
        qc_path = out_prefix + '.picard_est_lib_size.qc'
        with open(qc_path, 'w') as fp:
            fp.write(str(picard_est_lib_size) + '\n')

    rm_f(no_rg_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
Beispiel #4
0
def sambamba_flagstat(bam, nth, out_dir):
    """Redirect the output of ``sambamba flagstat`` for ``bam`` into a QC file.

    Args:
        bam: Input BAM path.
        nth: Value passed to sambamba's ``-t`` option.
        out_dir: Directory that receives the QC file.

    Returns:
        Path of the QC file, ``<out_dir>/<basename>.flagstat.qc``.
    """
    flagstat_qc = '{}.flagstat.qc'.format(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))))

    run_shell_cmd(
        'sambamba flagstat {src} -t {threads} > {dst}'.format(
            src=bam, threads=nth, dst=flagstat_qc))
    return flagstat_qc
Beispiel #5
0
def remove_chrs_from_bam(bam, chrs, chrsz, nth=1, out_dir=''):
    """Write a copy of ``bam`` restricted to chromosomes not named in ``chrs``.

    A temporary three-column region file (name, 0, size) is derived from
    ``chrsz`` with the unwanted chromosome names filtered out by ``grep``;
    ``samtools view -L`` then keeps only reads overlapping those regions.

    Args:
        bam: Input BAM path.
        chrs: Sequence of chromosome names to drop; must not be empty.
            The names are joined with ``|`` into a ``grep -P`` pattern.
        chrsz: Chromosome sizes file (read through ``zcat -f``).
        nth: Thread count forwarded to ``get_samtools_res_param``.
        out_dir: Directory for the output and the temporary region file.

    Returns:
        Path of the filtered BAM,
        ``<out_dir>/<basename>.no_<names joined by '_'>.bam``.

    Raises:
        ValueError: If ``chrs`` is empty.
    """
    if not len(chrs):
        raise ValueError('There must be at least one chromosome, zero found.')

    stem = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    tag = 'no_{}'.format('_'.join(chrs))
    final_bam = '{0}.{1}.bam'.format(stem, tag)
    tmp_chrsz = '{0}.{1}.tmp.chrsz'.format(stem, tag)

    # make a temp chrsz file listing only the chromosomes to keep
    make_regions_cmd = (
        'zcat -f {chrsz} |'
        'grep -v -P \'^({chrs})\\s\' | '
        'awk \'BEGIN{{OFS="\\t"}} {{print $1,0,$2}}\' > {tmp_chrsz}'
    ).format(chrsz=chrsz, chrs='|'.join(chrs), tmp_chrsz=tmp_chrsz)
    run_shell_cmd(make_regions_cmd)

    # remove chrs from BAM
    res_param = get_samtools_res_param('view', nth=nth)
    run_shell_cmd(
        'samtools view -b -L {} {} {} > {}'.format(
            tmp_chrsz, bam, res_param, final_bam))
    rm_f(tmp_chrsz)

    return final_bam
def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, out_dir):
    """Filter a single-ended BAM and write ``<prefix>.filt.bam``.

    With ``multimapping`` set, the BAM is name-sorted, piped through
    ``assign_multimappers.py -k <multimapping>``, filtered with
    ``samtools view -F 1804`` and sorted. Otherwise reads are filtered
    with ``samtools view -F 1804 -q <mapq_thresh>`` and sorted. Note that
    ``mapq_thresh`` is only applied in the non-multimapping branch.

    Args:
        bam: Input BAM path.
        multimapping: Falsy to disable multimapper handling; otherwise the
            value passed to ``assign_multimappers.py -k``.
        mapq_thresh: Value for ``samtools view -q``.
        nth: Value for ``samtools sort -@``.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the filtered BAM.
    """
    prefix = os.path.join(out_dir,
                          os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)

    if multimapping:
        # out_dir is passed by keyword on purpose: samtools_name_sort takes
        # an optional mem_gb before out_dir, so a third positional argument
        # would be interpreted as mem_gb and the output directory ignored.
        qname_sort_bam = samtools_name_sort(bam, nth, out_dir=out_dir)

        cmd2 = 'samtools view -h {} | '
        cmd2 += '$(which assign_multimappers.py) -k {} | '
        cmd2 += 'samtools view -F 1804 -Su /dev/stdin | '
        cmd2 += 'samtools sort /dev/stdin -o {} -T {} -@ {}'
        cmd2 = cmd2.format(
            qname_sort_bam,
            multimapping,
            filt_bam,
            prefix,
            nth)
        run_shell_cmd(cmd2)
        rm_f(qname_sort_bam)  # remove temporary files
    else:
        cmd = 'samtools view -F 1804 -q {} -u {} | '
        cmd += 'samtools sort /dev/stdin -o {} -T {} -@ {}'
        cmd = cmd.format(
            mapq_thresh,
            bam,
            filt_bam,
            prefix,
            nth)
        run_shell_cmd(cmd)

    return filt_bam
Beispiel #7
0
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, out_dir):
    """Filter a paired-end BAM and write ``<prefix>.filt.bam``.

    The work is done in three shell steps:

    1. filter with ``samtools view`` and name-sort into a temporary BAM;
    2. run ``samtools fixmate -r`` (with ``assign_multimappers.py`` in
       front of it when ``multimapping`` is set);
    3. filter again with ``samtools view -F 1804 -f 2`` and sort.

    Both intermediate BAMs are removed as soon as they have been consumed.

    Args:
        bam: Input BAM path.
        multimapping: Falsy to disable multimapper handling; otherwise the
            value passed to ``assign_multimappers.py -k``.
        mapq_thresh: Value for ``samtools view -q`` (non-multimapping
            branch only).
        nth: Value for the samtools ``-@`` option.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the filtered BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    tmp_filt_bam = '{}.tmp_filt.bam'.format(prefix)
    fixmate_bam = '{}.fixmate.bam'.format(prefix)

    if multimapping:
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {src} | '
            'samtools sort -n /dev/stdin -o {dst} -T {tmp} -@ {nth} '.format(
                src=bam, dst=tmp_filt_bam, tmp=prefix, nth=nth))

        run_shell_cmd(
            'samtools view -h {src} -@ {nth} | '
            '$(which assign_multimappers.py) -k {k} --paired-end | '
            'samtools fixmate -r /dev/stdin {dst}'.format(
                src=tmp_filt_bam, nth=nth, k=multimapping, dst=fixmate_bam))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {mapq} -u {src} | '
            'samtools sort -n /dev/stdin -o {dst} -T {tmp} -@ {nth}'.format(
                mapq=mapq_thresh, src=bam, dst=tmp_filt_bam, tmp=prefix,
                nth=nth))

        run_shell_cmd(
            'samtools fixmate -r {src} {dst}'.format(
                src=tmp_filt_bam, dst=fixmate_bam))
    rm_f(tmp_filt_bam)

    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {src} | '
        'samtools sort /dev/stdin -o {dst} -T {tmp} -@ {nth}'.format(
            src=fixmate_bam, dst=filt_bam, tmp=prefix, nth=nth))
    rm_f(fixmate_bam)
    return filt_bam
def mark_dup_picard(bam, out_dir, java_heap=None):  # shared by both se and pe
    """Run Picard ``MarkDuplicates`` on ``bam`` without removing duplicates.

    Args:
        bam: Input BAM path.
        out_dir: Directory for the output files.
        java_heap: Value appended to ``-Xmx``; ``4G`` is used when None.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)``: the duplicate-marked BAM and the
        Picard metrics file.
    """
    # strip extension appended in the previous step
    prefix = strip_ext(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))), 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    dup_qc = '{}.dup.qc'.format(prefix)

    heap = '4G' if java_heap is None else java_heap

    picard_args = [
        'java',
        '-Xmx{}'.format(heap),
        '-XX:ParallelGCThreads=1',
        '-jar',
        '{}'.format(locate_picard()),
        'MarkDuplicates',
        'INPUT={}'.format(bam),
        'OUTPUT={}'.format(dupmark_bam),
        'METRICS_FILE={}'.format(dup_qc),
        'VALIDATION_STRINGENCY=LENIENT',
        'USE_JDK_DEFLATER=TRUE',
        'USE_JDK_INFLATER=TRUE',
        'ASSUME_SORTED=TRUE',
        'REMOVE_DUPLICATES=FALSE',
    ]
    # The command line ends with a single trailing space.
    run_shell_cmd(' '.join(picard_args) + ' ')
    return dupmark_bam, dup_qc
Beispiel #9
0
def samtools_sort(bam, nth, out_dir):
    """Sort ``bam`` with ``samtools sort``.

    Args:
        bam: Input BAM path.
        nth: Value for ``samtools sort -@``.
        out_dir: Directory for the sorted BAM; the same prefix is used
            for samtools' temporary files (``-T``).

    Returns:
        Path of the sorted BAM, ``<out_dir>/<basename>.srt.bam``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    srt_bam = '{}.srt.bam'.format(prefix)

    run_shell_cmd(
        'samtools sort {src} -o {dst} -T {tmp} -@ {nth}'.format(
            src=bam, dst=srt_bam, tmp=prefix, nth=nth))
    return srt_bam
Beispiel #10
0
def samstat(bam, nth=1, out_dir=''):
    """Pipe a name-sorted SAM stream of ``bam`` into ``SAMstats``.

    Args:
        bam: Input BAM path.
        nth: Accepted but not used by this implementation.
        out_dir: Directory for the QC file and samtools' temporary files.

    Returns:
        Path of the QC file, ``<out_dir>/<basename>.samstats.qc``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    samstat_qc = '{}.samstats.qc'.format(prefix)

    sort_stage = 'samtools sort -n {} -T {}.tmp -O sam'.format(bam, prefix)
    stats_stage = 'SAMstats --sorted_sam_file - --outf {}'.format(samstat_qc)
    run_shell_cmd(sort_stage + ' | ' + stats_stage)
    return samstat_qc
Beispiel #11
0
def bam2ta_se(bam, out_dir):
    """Convert a single-ended BAM into a gzipped tagAlign file.

    ``bedtools bamtobed`` output is rewritten by awk so that column 4 is
    ``N`` and column 5 is ``1000``, then compressed with ``gzip -nc``.

    Args:
        bam: Input BAM path.
        out_dir: Directory for the tagAlign file.

    Returns:
        Path of the tagAlign file, ``<out_dir>/<basename>.tagAlign.gz``.
    """
    ta = '{}.tagAlign.gz'.format(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))))

    run_shell_cmd(
        'bedtools bamtobed -i {bam} | '
        'awk \'BEGIN{{OFS="\\t"}}{{$4="N";$5="1000";print $0}}\' | '
        'gzip -nc > {ta}'.format(bam=bam, ta=ta))
    return ta
Beispiel #12
0
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Write ``<prefix>.nodup.bam`` from a duplicate-marked paired-end BAM.

    Reads are selected with ``samtools view -F 1804 -f 2``.

    Args:
        dupmark_bam: Duplicate-marked BAM path.
        nth: Value for ``samtools view -@``.
        out_dir: Directory for the output BAM.

    Returns:
        Path of the filtered BAM.
    """
    # strip extension appended in the previous step
    prefix = strip_ext(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(dupmark_bam))),
        'dupmark')
    nodup_bam = '{}.nodup.bam'.format(prefix)

    run_shell_cmd(
        'samtools view -@ {nth} -F 1804 -f 2 -b {src} > {dst}'.format(
            nth=nth, src=dupmark_bam, dst=nodup_bam))
    return nodup_bam
Beispiel #13
0
def samtools_name_sort(bam, nth=1, mem_gb=None, out_dir=''):
    """Sort ``bam`` with ``samtools sort -n``.

    Args:
        bam: Input BAM path.
        nth: Thread count forwarded to ``get_samtools_res_param``.
        mem_gb: Memory value forwarded to ``get_samtools_res_param``.
        out_dir: Directory for the sorted BAM; the same prefix is used
            for samtools' temporary files (``-T``).

    Returns:
        Path of the sorted BAM, ``<out_dir>/<basename>.nmsrt.bam``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    nmsrt_bam = '{}.nmsrt.bam'.format(prefix)

    res_param = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
    cmd = 'samtools sort -n {} -o {} -T {} {}'.format(
        bam, nmsrt_bam, prefix, res_param)
    run_shell_cmd(cmd)
    return nmsrt_bam
Beispiel #14
0
def remove_read_group(bam, out_dir='.'):
    """Write a copy of ``bam`` with read-group information stripped.

    The BAM is dumped as SAM text with its header, lines starting with
    ``@RG`` are dropped, the first ``RG:Z:`` tag on each remaining line is
    removed by ``sed``, and the result is converted back to BAM.

    Args:
        bam: Input BAM path.
        out_dir: Directory for the new BAM (current directory by default).

    Returns:
        Path of the new BAM, ``<out_dir>/<basename>.no_rg.bam``.
    """
    new_bam = '{}.no_rg.bam'.format(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))))

    pipeline = ' | '.join([
        'samtools view -h {}'.format(bam),
        'grep -v "^@RG"',
        'sed "s/\\tRG:Z:[^\\t]*//"',
        'samtools view -bo {} -'.format(new_bam),
    ])
    run_shell_cmd(pipeline)

    return new_bam
Beispiel #15
0
def samstat(bam, nth=1, mem_gb=None, out_dir=''):
    """Pipe a name-sorted SAM stream of ``bam`` into ``SAMstats``.

    Args:
        bam: Input BAM path.
        nth: Thread count forwarded to ``get_samtools_res_param``.
        mem_gb: Memory value forwarded to ``get_samtools_res_param``.
        out_dir: Directory for the QC file and samtools' temporary files.

    Returns:
        Path of the QC file, ``<out_dir>/<basename>.samstats.qc``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    samstat_qc = '{}.samstats.qc'.format(prefix)

    res_param = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
    sort_stage = 'samtools sort -n {} -T {}.tmp {} -O sam'.format(
        bam, prefix, res_param)
    stats_stage = 'SAMstats --sorted_sam_file - --outf {}'.format(samstat_qc)
    run_shell_cmd(sort_stage + ' | ' + stats_stage)
    return samstat_qc
def mark_dup_sambamba(bam, nth, out_dir):  # shared by both se and pe
    """Mark duplicates in ``bam`` with ``sambamba markdup``.

    sambamba's stderr is redirected into the QC file.

    Args:
        bam: Input BAM path.
        nth: Value for sambamba's ``-t`` option.
        out_dir: Directory for the output files.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)``: the duplicate-marked BAM and the
        file holding sambamba's stderr output.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    # The template must be filled in with the prefix; left unformatted, the
    # shell would redirect stderr to a file literally named '{}.dup.qc' in
    # the working directory and that bogus path would be returned.
    dup_qc = '{}.dup.qc'.format(prefix)

    # NOTE(review): the hash table size is 2**44 entries; confirm this
    # value is intended.
    cmd = 'sambamba markdup -t {} --hash-table-size=17592186044416 '
    cmd += '--overflow-list-size=20000000 '
    cmd += '--io-buffer-size=256 {} {} 2> {}'
    cmd = cmd.format(nth, bam, dupmark_bam, dup_qc)
    run_shell_cmd(cmd)
    return dupmark_bam, dup_qc
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Write ``<prefix>.nodup.bam`` from a duplicate-marked paired-end BAM.

    Reads are selected with ``samtools view -F 1804 -f 2``.

    Args:
        dupmark_bam: Duplicate-marked BAM path.
        nth: Thread count forwarded to ``get_samtools_res_param``.
        out_dir: Directory for the output BAM.

    Returns:
        Path of the filtered BAM.
    """
    # strip extension appended in the previous step
    prefix = strip_ext(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(dupmark_bam))),
        'dupmark')
    nodup_bam = '{}.nodup.bam'.format(prefix)

    res_param = get_samtools_res_param('view', nth=nth)
    cmd = 'samtools view -F 1804 -f 2 -b {} {} > {}'.format(
        dupmark_bam, res_param, nodup_bam)
    run_shell_cmd(cmd)
    return nodup_bam
def pbc_qc_pe(bam, mito_chr_name, nth, out_dir):
    """Compute library-complexity counts for a paired-end BAM.

    The BAM is name-sorted, converted to BEDPE, reduced to columns
    1, 2, 4, 6, 9 and 10, stripped of lines starting with
    ``mito_chr_name``, and tallied with ``sort | uniq -c``. The final awk
    step writes one tab-separated line with seven fields:

        mt  m0  m1  m2  m0/mt  m1/m0  m1/m2

    where ``mt`` is the sum of all counts, ``m0`` the number of distinct
    lines, ``m1``/``m2`` the number of lines seen exactly once/twice.
    ``m1/m2`` is reported as -1.0 when ``m2`` is zero; the other ratios
    are 0 when their denominator is zero.

    Args:
        bam: Input BAM path.
        mito_chr_name: Chromosome name to exclude from the tally.
        nth: Thread count passed to ``samtools_name_sort``.
        out_dir: Directory for the QC file and the temporary BAM.

    Returns:
        Path of the QC file, ``<out_dir>/<basename>.lib_complexity.qc``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    # out_dir is passed by keyword on purpose: samtools_name_sort takes an
    # optional mem_gb before out_dir, so a third positional argument would
    # be interpreted as mem_gb and the output directory ignored.
    nmsrt_bam = samtools_name_sort(bam, nth, out_dir=out_dir)
    cmd3 = 'bedtools bamtobed -bedpe -i {} | '
    cmd3 += 'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}\' | '
    cmd3 += 'grep -v "^{}\\s" | sort | uniq -c | '
    cmd3 += 'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
    cmd3 += '($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; '
    cmd3 += 'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
    cmd3 += 'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
    cmd3 += 'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n"'
    cmd3 += ',mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {}'
    cmd3 = cmd3.format(nmsrt_bam, mito_chr_name, pbc_qc)
    run_shell_cmd(cmd3)
    rm_f(nmsrt_bam)  # the name-sorted BAM is only an intermediate
    return pbc_qc
Beispiel #19
0
def blacklist_filter_bam(bam, blacklist, out_dir):
    """Write ``<prefix>.bfilt.bam`` without reads overlapping ``blacklist``.

    When ``blacklist`` is an empty string or a file with zero lines, the
    input is simply piped through ``zcat -f | gzip -nc``. Otherwise the
    blacklist is gunzipped to a temporary file and
    ``bedtools intersect -v`` removes the overlapping reads.

    Args:
        bam: Input BAM path.
        blacklist: Blacklist file path, or ``''`` for none.
        out_dir: Directory for the output and the temporary file.

    Returns:
        Path of the output BAM.
    """
    filtered = '{}.bfilt.bam'.format(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))))

    nothing_to_filter = blacklist == '' or get_num_lines(blacklist) == 0
    if nothing_to_filter:
        run_shell_cmd(
            'zcat -f {src} | gzip -nc > {dst}'.format(src=bam, dst=filtered))
        return filtered

    # due to bedtools bug when .gz is given for -a and -b
    plain_blacklist = gunzip(blacklist, 'tmp2', out_dir)

    run_shell_cmd(
        'bedtools intersect -nonamecheck -v -abam {src} -b {bl} > {dst}'
        .format(src=bam, bl=plain_blacklist, dst=filtered))
    rm_f([plain_blacklist])
    return filtered
def pbc_qc_se(bam, mito_chr_name, out_dir):
    """Compute library-complexity counts for a single-ended BAM.

    ``bedtools bamtobed`` output is reduced to columns 1, 2, 3 and 6,
    stripped of lines starting with ``mito_chr_name``, and tallied with
    ``sort | uniq -c``. The final awk step writes one tab-separated line:

        mt  m0  m1  m2  m0/mt  m1/m0  m1/m2

    where ``mt`` is the sum of all counts, ``m0`` the number of distinct
    lines, ``m1``/``m2`` the number of lines seen exactly once/twice.
    ``m1/m2`` is -1.0 when ``m2`` is zero; the other ratios default to 0.

    Args:
        bam: Input BAM path.
        mito_chr_name: Chromosome name to exclude from the tally.
        out_dir: Directory for the QC file.

    Returns:
        Path of the QC file, ``<prefix>.lib_complexity.qc``.
    """
    # strip extension appended in the previous step
    prefix = strip_ext(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))),
        'dupmark')
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    run_shell_cmd(
        'bedtools bamtobed -i {bam} | '
        'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$3,$6}}\' | '
        'grep -v "^{mito}\\s" | sort | uniq -c | '
        'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
        '($1==2){{m2=m2+1}} {{m0=m0+1}} '
        '{{mt=mt+$1}} END{{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",'
        'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {out}'.format(
            bam=bam, mito=mito_chr_name, out=pbc_qc))
    return pbc_qc
def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a single-ended BAM and write ``<prefix>.filt.bam``.

    There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    With ``multimapping`` set, the BAM is name-sorted first and piped
    through ``assign_multimappers.py`` before the ``-F 1804`` filter;
    otherwise the ``-F 1804 -q <mapq_thresh>`` filter reads the input
    directly. Both variants end in the same ``samtools sort``.

    Args:
        bam: Input BAM path.
        multimapping: Falsy to disable multimapper handling; otherwise the
            value passed to ``assign_multimappers.py -k``.
        mapq_thresh: Value for ``samtools view -q`` (non-multimapping
            branch only).
        nth: Thread count forwarded to the samtools helpers.
        mem_gb: Memory value forwarded to the samtools helpers.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the filtered BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)

    # Build the part of the pipe that feeds the final sort.
    if multimapping:
        qname_sort_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)
        feed = (
            'samtools view -h {} | '
            '$(which assign_multimappers.py) -k {} | '
            'samtools view -F 1804 -Su /dev/stdin | '
        ).format(qname_sort_bam, multimapping)
    else:
        feed = 'samtools view -F 1804 -q {} -u {} | '.format(
            mapq_thresh, bam)

    # Only the sort stage gets the thread/memory flags.
    sort_stage = 'samtools sort /dev/stdin -o {} -T {} {}'.format(
        filt_bam,
        prefix,
        get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb))
    run_shell_cmd(feed + sort_stage)

    if multimapping:
        rm_f(qname_sort_bam)  # remove temporary files

    return filt_bam
def main():
    """Run the TSS enrichment step on the deduplicated BAM.

    The BAM is indexed, the output directory is created, the read length
    is taken from ``args.read_len_log`` (file content parsed as int) or
    ``args.read_len``, and ``make_tss_plot`` is called.

    NOTE(review): a read-group-free copy of the BAM is created and later
    removed, but ``make_tss_plot`` is given the original BAM, so the copy
    appears to be unused here.
    """
    args = parse_arguments()

    chrsz = args.chrsz
    # A TSS path whose basename is 'null' is treated as "no TSS file".
    tss = ''
    if args.tss and os.path.basename(args.tss) != 'null':
        tss = args.tss
    nodup_bam = args.nodup_bam
    out_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(nodup_bam)))
    samtools_index(nodup_bam)  # make an index first
    no_rg_bam = remove_read_group(nodup_bam)

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # Also get read length
    if args.read_len_log:
        with open(args.read_len_log, 'r') as fp:
            read_len = int(fp.read().strip())
    else:
        read_len = args.read_len or None

    # Enrichments: V plot for enrichment
    # Use final to avoid duplicates
    _tss_plot, _tss_large_plot, _tss_enrich_qc = make_tss_plot(
        nodup_bam, tss, out_prefix, chrsz, read_len)

    # remove temporary files
    rm_f(no_rg_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
Beispiel #23
0
def main():
    """Run the fragment-length step on the deduplicated BAM.

    ``get_insert_distribution`` is run on a read-group-free copy of the
    BAM; its data file feeds ``fragment_length_qc`` (through
    ``read_picard_histogram``) and ``fragment_length_plot``. The temporary
    copy is deleted afterwards.
    """
    args = parse_arguments()

    nodup_bam = args.nodup_bam
    out_prefix = os.path.join(
        args.out_dir, os.path.basename(strip_ext_bam(nodup_bam)))
    no_rg_bam = remove_read_group(nodup_bam)

    # Insert size distribution - CAN'T GET THIS FOR SE FILES
    insert_data, _insert_plot = get_insert_distribution(no_rg_bam, out_prefix)

    # Also need to run n-nucleosome estimation
    histogram = read_picard_histogram(insert_data)
    fragment_length_qc(histogram, out_prefix)
    fragment_length_plot(insert_data, out_prefix)

    rm_f(no_rg_bam)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def mark_dup_picard(bam, out_dir, java_heap=None):  # shared by both se and pe
    """Run Picard ``MarkDuplicates`` on ``bam`` without removing duplicates.

    Args:
        bam: Input BAM path.
        out_dir: Directory for the output files.
        java_heap: Value appended to ``-Xmx``. Defaults to None, in which
            case ``4G`` is used, so existing two-argument callers get the
            same command line as before.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)``: the duplicate-marked BAM and the
        Picard metrics file.
    """
    prefix = os.path.join(out_dir,
                          os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    dup_qc = '{}.dup.qc'.format(prefix)

    java_heap_param = '-Xmx{}'.format(
        '4G' if java_heap is None else java_heap)

    # The picard path is a format argument rather than part of the
    # template, so braces in the path cannot break str.format.
    cmd = (
        'java {java_heap_param} -XX:ParallelGCThreads=1 -jar '
        '{picard}'
        ' MarkDuplicates '
        'INPUT={bam} OUTPUT={dupmark_bam} '
        'METRICS_FILE={dup_qc} VALIDATION_STRINGENCY=LENIENT '
        'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
        'ASSUME_SORTED=true REMOVE_DUPLICATES=false'
    ).format(
        java_heap_param=java_heap_param,
        picard=locate_picard(),
        bam=bam,
        dupmark_bam=dupmark_bam,
        dup_qc=dup_qc,
    )
    run_shell_cmd(cmd)
    return dupmark_bam, dup_qc
Beispiel #25
0
def pbc_qc_se(bam, mito_chr_name, mem_gb, out_dir):
    """Compute library-complexity counts for a single-ended BAM.

    ``bedtools bamtobed`` output is reduced to columns 1, 2, 3 and 6,
    stripped of lines starting with ``mito_chr_name``, and tallied with
    ``sort | uniq -c``. The final awk step writes one tab-separated line:

        mt  m0  m1  m2  m0/mt  m1/m0  m1/m2

    where ``mt`` is the sum of all counts, ``m0`` the number of distinct
    lines, ``m1``/``m2`` the number of lines seen exactly once/twice.
    ``m1/m2`` is -1.0 when ``m2`` is zero; the other ratios default to 0.

    Args:
        bam: Input BAM path.
        mito_chr_name: Chromosome name to exclude from the tally.
        mem_gb: Memory in GB; converted to bytes and passed to
            ``get_gnu_sort_param`` with ``ratio=0.5``.
        out_dir: Directory for the QC file.

    Returns:
        Path of the QC file, ``<prefix>.lib_complexity.qc``.
    """
    # strip extension appended in the previous step
    prefix = strip_ext(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))),
        'dupmark')
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    # awk programs are plain literals (never passed through str.format),
    # hence the single braces.
    pick_columns = 'awk \'BEGIN{OFS="\\t"}{print $1,$2,$3,$6}\''
    tally = (
        'awk \'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} '
        '($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} '
        'END{m1_m2=-1.0; if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",'
        'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}\''
    )
    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)

    stages = [
        'bedtools bamtobed -i {}'.format(bam),
        pick_columns,
        'grep -v "^{}\\s"'.format(mito_chr_name),
        'sort {}'.format(sort_param),
        'uniq -c',
        tally,
    ]
    run_shell_cmd(' | '.join(stages) + ' > {}'.format(pbc_qc))
    return pbc_qc
Beispiel #26
0
def pbc_qc_pe(bam, mito_chr_name, nth, mem_gb, out_dir):
    """Compute library-complexity counts for a paired-end BAM.

    The BAM is name-sorted, converted to BEDPE, reduced to columns
    1, 2, 4, 6, 9 and 10, stripped of lines starting with
    ``mito_chr_name``, and tallied with ``sort | uniq -c``. The final awk
    step writes one tab-separated line:

        mt  m0  m1  m2  m0/mt  m1/m0  m1/m2

    where ``mt`` is the sum of all counts, ``m0`` the number of distinct
    lines, ``m1``/``m2`` the number of lines seen exactly once/twice.
    ``m1/m2`` is -1.0 when ``m2`` is zero; the other ratios default to 0.

    Args:
        bam: Input BAM path.
        mito_chr_name: Chromosome name to exclude from the tally.
        nth: Thread count passed to ``samtools_name_sort``.
        mem_gb: Memory in GB; passed to ``samtools_name_sort`` and,
            converted to bytes, to ``get_gnu_sort_param`` with
            ``ratio=0.5``.
        out_dir: Directory for the QC file and the temporary BAM.

    Returns:
        Path of the QC file, ``<out_dir>/<basename>.lib_complexity.qc``.
    """
    pbc_qc = '{}.lib_complexity.qc'.format(
        os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))))

    nmsrt_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)

    # awk programs are plain literals (never passed through str.format),
    # hence the single braces.
    pick_columns = 'awk \'BEGIN{OFS="\\t"}{print $1,$2,$4,$6,$9,$10}\''
    tally = (
        'awk \'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} '
        '($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} '
        'END{m1_m2=-1.0; if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",'
        'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}\''
    )
    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)

    stages = [
        'bedtools bamtobed -bedpe -i {}'.format(nmsrt_bam),
        pick_columns,
        'grep -v "^{}\\s"'.format(mito_chr_name),
        'sort {}'.format(sort_param),
        'uniq -c',
        tally,
    ]
    run_shell_cmd(' | '.join(stages) + ' > {}'.format(pbc_qc))
    rm_f(nmsrt_bam)  # the name-sorted BAM is only an intermediate
    return pbc_qc
def bam_to_pbam(bam, ref_fa, out_dir='.'):
    '''Convert BAM into pBAM.

    A reference ending in ``.gz`` is first decompressed into ``out_dir``
    (and removed again at the end). ``makepBAM_genome.sh`` is then run on
    the BAM and the plain reference, and ``<prefix>.sorted.p.bam`` is
    renamed to ``<prefix>.p.bam``.

    NOTE(review): assumes ``makepBAM_genome.sh`` leaves its result at
    ``<out_dir>/<basename>.sorted.p.bam`` -- confirm against the script.

    Requirements:
        - Python package `ptools_bin`
        - `samtools`

    Args:
        bam: Input BAM path.
        ref_fa: Reference FASTA path, optionally gzipped.
        out_dir: Directory for the pBAM and temporary files.

    Returns:
        Path of the pBAM, ``<out_dir>/<basename>.p.bam``.
    '''
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    pbam_tmp = '{}.sorted.p.bam'.format(prefix)
    pbam = '{}.p.bam'.format(prefix)

    temp_files = []

    # Use the reference as-is unless it needs decompressing first.
    plain_ref_fa = ref_fa
    if ref_fa.endswith('.gz'):
        plain_ref_fa = '{}.fasta'.format(
            os.path.join(out_dir, os.path.basename(strip_ext_gz(ref_fa))))
        run_shell_cmd('zcat -f {} > {}'.format(ref_fa, plain_ref_fa))
        temp_files.append(plain_ref_fa)

    run_shell_cmd('makepBAM_genome.sh {} {}'.format(bam, plain_ref_fa))
    run_shell_cmd('mv {} {}'.format(pbam_tmp, pbam))
    rm_f(temp_files)

    return pbam
Beispiel #28
0
def bam2ta_pe(bam, nth, out_dir):
    """Convert a paired-end BAM into a gzipped tagAlign file.

    The BAM is name-sorted and converted to a gzipped BEDPE file with
    ``bedtools bamtobed -bedpe -mate1``. Each BEDPE line is then split by
    awk into two tagAlign lines: columns 1-3 with strand column 9, and
    columns 4-6 with strand column 10, both with name ``N`` and score
    ``1000``. The name-sorted BAM and the BEDPE file are removed.

    Args:
        bam: Input BAM path.
        nth: Thread count passed to ``samtools_name_sort``.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the tagAlign file, ``<out_dir>/<basename>.tagAlign.gz``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    ta = '{}.tagAlign.gz'.format(prefix)
    # intermediate files
    bedpe = '{}.bedpe.gz'.format(prefix)
    # out_dir is passed by keyword on purpose: samtools_name_sort takes an
    # optional mem_gb before out_dir, so a third positional argument would
    # be interpreted as mem_gb and the output directory ignored.
    nmsrt_bam = samtools_name_sort(bam, nth, out_dir=out_dir)

    cmd1 = 'LC_COLLATE=C bedtools bamtobed -bedpe -mate1 -i {} | '
    cmd1 += 'gzip -nc > {}'
    cmd1 = cmd1.format(nmsrt_bam, bedpe)
    run_shell_cmd(cmd1)
    rm_f(nmsrt_bam)

    cmd2 = 'zcat -f {} | '
    cmd2 += 'awk \'BEGIN{{OFS="\\t"}}'
    cmd2 += '{{printf "%s\\t%s\\t%s\\tN\\t1000\\t%s\\n'
    cmd2 += '%s\\t%s\\t%s\\tN\\t1000\\t%s\\n",'
    cmd2 += '$1,$2,$3,$9,$4,$5,$6,$10}}\' | '
    cmd2 += 'gzip -nc > {}'
    cmd2 = cmd2.format(bedpe, ta)
    run_shell_cmd(cmd2)
    rm_f(bedpe)
    return ta
Beispiel #29
0
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a paired-end BAM and write ``<prefix>.filt.bam``.

    There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Steps:

    1. filter with ``samtools view`` and name-sort into a temporary BAM;
    2. run ``samtools fixmate -r`` (with ``assign_multimappers.py`` in
       front of it when ``multimapping`` is set);
    3. filter again with ``samtools view -F 1804 -f 2`` and sort;
    4. fail if the resulting BAM is empty.

    Args:
        bam: Input BAM path.
        multimapping: Falsy to disable multimapper handling; otherwise the
            value passed to ``assign_multimappers.py -k``.
        mapq_thresh: Value for ``samtools view -q`` (non-multimapping
            branch only).
        nth: Thread count forwarded to ``get_samtools_res_param``.
        mem_gb: Memory value forwarded to ``get_samtools_res_param`` for
            the sort stages.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the filtered BAM.

    Raises:
        ValueError: If ``bam_is_empty`` reports the filtered BAM as empty.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    tmp_filt_bam = '{}.tmp_filt.bam'.format(prefix)
    fixmate_bam = '{}.fixmate.bam'.format(prefix)

    if multimapping:
        # The resource flags are attached to `samtools sort`, so they are
        # requested for 'sort' (with mem_gb), as in the other branch.
        # Requesting them for 'view' silently dropped mem_gb here.
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))

        run_shell_cmd(
            'samtools view -h {tmp_filt_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} --paired-end | '
            'samtools fixmate -r /dev/stdin {fixmate_bam} {res_param}'.format(
                tmp_filt_bam=tmp_filt_bam,
                multimapping=multimapping,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {mapq_thresh} -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))
        run_shell_cmd(
            'samtools fixmate -r {tmp_filt_bam} {fixmate_bam} {res_param}'.
            format(
                tmp_filt_bam=tmp_filt_bam,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))

    # The name-sorted intermediate is no longer needed once fixmate ran.
    rm_f(tmp_filt_bam)

    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {fixmate_bam} | '
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'.
        format(
            fixmate_bam=fixmate_bam,
            filt_bam=filt_bam,
            prefix=prefix,
            res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
        ))

    rm_f(fixmate_bam)

    log.info('Checking if filtered (but not deduped) BAM is empty '
             'after filtering with "samtools view -F 1804 -f 2".')
    if bam_is_empty(filt_bam, nth):
        raise ValueError(
            'No reads found after filtering "samtools fixmate"d PE BAM with '
            '"samtools view -F 1804 -f 2". '
            'Reads are not properly paired even though mapping rate is good? ')

    return filt_bam
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a paired-end BAM and write ``<prefix>.filt.bam``.

    There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Steps:

    1. filter with ``samtools view`` and name-sort into a temporary BAM;
    2. run ``samtools fixmate -r`` (with ``assign_multimappers.py`` in
       front of it when ``multimapping`` is set);
    3. filter again with ``samtools view -F 1804 -f 2`` and sort.

    Args:
        bam: Input BAM path.
        multimapping: Falsy to disable multimapper handling; otherwise the
            value passed to ``assign_multimappers.py -k``.
        mapq_thresh: Value for ``samtools view -q`` (non-multimapping
            branch only).
        nth: Thread count forwarded to ``get_samtools_res_param``.
        mem_gb: Memory value forwarded to ``get_samtools_res_param`` for
            the sort stages.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the filtered BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    tmp_filt_bam = '{}.tmp_filt.bam'.format(prefix)
    fixmate_bam = '{}.fixmate.bam'.format(prefix)

    if multimapping:
        # The resource flags are attached to `samtools sort`, so they are
        # requested for 'sort' (with mem_gb), as in the other branch.
        # Requesting them for 'view' silently dropped mem_gb here.
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))

        run_shell_cmd(
            'samtools view -h {tmp_filt_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} --paired-end | '
            'samtools fixmate -r /dev/stdin {fixmate_bam} {res_param}'.format(
                tmp_filt_bam=tmp_filt_bam,
                multimapping=multimapping,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {mapq_thresh} -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))
        run_shell_cmd(
            'samtools fixmate -r {tmp_filt_bam} {fixmate_bam} {res_param}'.
            format(
                tmp_filt_bam=tmp_filt_bam,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))

    # The name-sorted intermediate is no longer needed once fixmate ran.
    rm_f(tmp_filt_bam)

    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {fixmate_bam} | '
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'.
        format(
            fixmate_bam=fixmate_bam,
            filt_bam=filt_bam,
            prefix=prefix,
            res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
        ))

    rm_f(fixmate_bam)
    return filt_bam