Ejemplo n.º 1
0
def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a single-ended BAM and write it, sorted, to `<prefix>.filt.bam`.

    `<prefix>` is `out_dir` joined with the basename of `bam` after
    `strip_ext_bam`.

    Two modes, selected by the truthiness of `multimapping`:

    - falsy: keep reads passing `samtools view -F 1804 -q mapq_thresh`
      and sort them.
    - truthy: name-sort the BAM, pipe it through `assign_multimappers.py
      -k multimapping`, drop reads matching `-F 1804` and sort. The
      temporary name-sorted BAM is removed afterwards. `mapq_thresh` is
      not used in this mode.

    The pipes chain several samtools commands; the resource parameters
    (threads/memory) are given to only one of them.
    Priority is on sort > index > fixmate > view.

    Returns:
        Path of the filtered BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    sort_template = 'samtools sort /dev/stdin -o {} -T {} {}'

    if not multimapping:
        stages = [
            'samtools view -F 1804 -q {} -u {}'.format(mapq_thresh, bam),
            sort_template.format(
                filt_bam, prefix,
                get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)),
        ]
        run_shell_cmd(' | '.join(stages))
        return filt_bam

    # assign_multimappers.py is fed a name-sorted BAM.
    qname_sort_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)
    stages = [
        'samtools view -h {}'.format(qname_sort_bam),
        '$(which assign_multimappers.py) -k {}'.format(multimapping),
        'samtools view -F 1804 -Su /dev/stdin',
        sort_template.format(
            filt_bam, prefix,
            get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)),
    ]
    run_shell_cmd(' | '.join(stages))
    rm_f(qname_sort_bam)  # the name-sorted BAM was only an intermediate

    return filt_bam
Ejemplo n.º 2
0
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Write a filtered copy of a duplicate-marked paired-end BAM.

    Runs `samtools view -F 1804 -f 2 -b` on `dupmark_bam` and redirects
    the result to `<prefix>.nodup.bam`, where `<prefix>` is `out_dir`
    joined with the basename of `dupmark_bam` with its BAM extension and
    the `dupmark` extension stripped.

    Returns:
        Path of the `.nodup.bam` file.
    """
    basename = os.path.basename(strip_ext_bam(dupmark_bam))
    # Drop the 'dupmark' extension that the previous step appended.
    prefix = strip_ext(os.path.join(out_dir, basename), 'dupmark')
    nodup_bam = '{}.nodup.bam'.format(prefix)

    view_res_param = get_samtools_res_param('view', nth=nth)
    cmd = 'samtools view -F 1804 -f 2 -b {src} {res} > {dst}'.format(
        src=dupmark_bam,
        res=view_res_param,
        dst=nodup_bam,
    )
    run_shell_cmd(cmd)
    return nodup_bam
Ejemplo n.º 3
0
def bwa_se(fastq, ref_index_prefix, nth, mem_gb, out_dir):
    """Align a single-ended FASTQ with `bwa aln` + `bwa samse`.

    Steps:
      1. `bwa_aln` produces a `.sai` file for `fastq`.
      2. `bwa samse` output is piped into `samtools view -bS` and written
         to `<out_dir>/<fastq basename>.bam`.
      3. That BAM is passed to `samtools_sort`; the `.sai` file and the
         unsorted BAM are removed.

    Returns:
        Whatever `samtools_sort` returns for the intermediate BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq)))
    unsorted_bam = '{}.bam'.format(prefix)

    sai = bwa_aln(fastq, ref_index_prefix, nth, out_dir)

    samse_stage = 'bwa samse {} {} {}'.format(ref_index_prefix, sai, fastq)
    to_bam_stage = 'samtools view -bS /dev/stdin {} > {}'.format(
        get_samtools_res_param('view', nth=nth), unsorted_bam)
    run_shell_cmd(' | '.join([samse_stage, to_bam_stage]))
    rm_f(sai)

    # NOTE(review): out_dir is not forwarded to samtools_sort here, so the
    # location of the sorted BAM depends on that helper's default -- confirm
    # this is intended.
    sorted_bam = samtools_sort(unsorted_bam, nth, mem_gb)
    rm_f(unsorted_bam)

    return sorted_bam
Ejemplo n.º 4
0
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
           bwa_mem_read_len_limit, rescue_reads_for_bwa_mem, out_dir):
    """Align a paired-end FASTQ pair with BWA and return a sorted BAM.

    All outputs share the prefix `<out_dir>/<fastq1 basename>`.

    Aligner selection:
      - `bwa mem -M` when `use_bwa_mem_for_pe` is truthy and the read
        length reported by `get_read_length(fastq1)` is at least
        `bwa_mem_read_len_limit`; `-P` is added when
        `rescue_reads_for_bwa_mem` is truthy.
      - otherwise `bwa aln` on each mate followed by `bwa sampe`.
    Either way the alignment is gzip-compressed into `<prefix>.sam`.

    Read names whose CIGAR-derived length disagrees with the sequence
    length are then collected into `<prefix>.badReads`; if there are any,
    lines matching them are dropped before conversion to BAM and sorting.

    The `.sai` files (if any) and the `.sam` file are removed at the end;
    the `.badReads` file is left in place.

    Returns:
        Path of the sorted BAM, `<prefix>.bam`.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    sam = '{}.sam'.format(prefix)
    badcigar = '{}.badReads'.format(prefix)
    bam = '{}.bam'.format(prefix)

    read_len = get_read_length(fastq1)
    log.info('Guessed read length of R1 FASTQ: {read_len}'.format(
        read_len=read_len))

    if use_bwa_mem_for_pe and read_len >= bwa_mem_read_len_limit:
        log.info('Use bwa mem.')

        if rescue_reads_for_bwa_mem:
            extra_param = '-P'
        else:
            extra_param = ''
        align_cmd = (
            'bwa mem -M {extra_param} -t {nth} {ref_index_prefix} '
            '{fastq1} {fastq2} | gzip -nc > {sam}'
        ).format(
            extra_param=extra_param,
            nth=nth,
            ref_index_prefix=ref_index_prefix,
            fastq1=fastq1,
            fastq2=fastq2,
            sam=sam,
        )
        temp_files = [sam]
    else:
        log.info('Use bwa aln for each (R1 and R2) and then bwa sampe.')
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)

        align_cmd = (
            'bwa sampe {ref} {sai1} {sai2} {fastq1} {fastq2} | '
            'gzip -nc > {sam}'
        ).format(
            ref=ref_index_prefix,
            sai1=sai1,
            sai2=sai2,
            fastq1=fastq1,
            fastq2=fastq2,
            sam=sam,
        )
        temp_files = [sai1, sai2, sam]
    run_shell_cmd(align_cmd)

    # For every non-header record whose CIGAR is not "*": strip deletion
    # operations from the CIGAR, sum the remaining operation lengths and
    # compare the sum with the length of the sequence column. On mismatch
    # the read name followed by a tab is emitted; names are de-duplicated.
    find_bad_cigar_cmd = (
        'zcat -f {sam} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {badcigar}'
    ).format(sam=sam, badcigar=badcigar)
    run_shell_cmd(find_bad_cigar_cmd)

    if get_num_lines(badcigar) > 0:
        # Remove bad CIGAR read pairs before converting to BAM.
        to_bam_stage = (
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su /dev/stdin'
        ).format(sam=sam, badcigar=badcigar)
    else:
        to_bam_stage = 'samtools view -Su {sam}'.format(sam=sam)

    sort_res_param = get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb)
    run_shell_cmd(
        '{to_bam} | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
        .format(
            to_bam=to_bam_stage,
            bam=bam,
            prefix=prefix,
            res_param=sort_res_param,
        ))

    rm_f(temp_files)
    return bam
Ejemplo n.º 5
0
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a paired-end BAM and write it, sorted, to `<prefix>.filt.bam`.

    `<prefix>` is `out_dir` joined with the basename of `bam` after
    `strip_ext_bam`.

    Steps:
      1. Pre-filter and name-sort into `<prefix>.tmp_filt.bam`:
         - `multimapping` truthy: `samtools view -F 524 -f 2`
           (`mapq_thresh` is not applied in this mode);
         - otherwise: `samtools view -F 1804 -f 2 -q mapq_thresh`.
      2. `samtools fixmate -r` into `<prefix>.fixmate.bam`. In multimapping
         mode the reads first go through
         `assign_multimappers.py -k multimapping --paired-end`.
      3. `samtools view -F 1804 -f 2` on the fixmate output, then a
         coordinate sort into `<prefix>.filt.bam`.
    Both intermediate BAMs are removed once they are no longer needed.

    There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Returns:
        Path of the filtered BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    tmp_filt_bam = '{}.tmp_filt.bam'.format(prefix)
    fixmate_bam = '{}.fixmate.bam'.format(prefix)

    if multimapping:
        # The resource string is consumed by `samtools sort`, so it must be
        # built for 'sort' (with mem_gb), exactly like the other sort calls
        # in this function. It was previously built for 'view', which left
        # mem_gb unused for this sort.
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param} '
            .format(
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))

        run_shell_cmd(
            'samtools view -h {tmp_filt_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} --paired-end | '
            'samtools fixmate -r /dev/stdin {fixmate_bam} {res_param}'.format(
                tmp_filt_bam=tmp_filt_bam,
                multimapping=multimapping,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {mapq_thresh} -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))
        run_shell_cmd(
            'samtools fixmate -r {tmp_filt_bam} {fixmate_bam} {res_param}'.
            format(
                tmp_filt_bam=tmp_filt_bam,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))

    rm_f(tmp_filt_bam)  # name-sorted intermediate is no longer needed

    # Filter again after fixmate, then coordinate-sort the final output.
    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {fixmate_bam} | '
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'.
        format(
            fixmate_bam=fixmate_bam,
            filt_bam=filt_bam,
            prefix=prefix,
            res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
        ))

    rm_f(fixmate_bam)
    return filt_bam
Ejemplo n.º 6
0
def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """Filter a paired-end BAM and write it, sorted, to `<prefix>.filt.bam`.

    `<prefix>` is `out_dir` joined with the basename of `bam` after
    `strip_ext_bam`.

    Steps:
      1. Pre-filter and name-sort into `<prefix>.tmp_filt.bam`:
         - `multimapping` truthy: `samtools view -F 524 -f 2`
           (`mapq_thresh` is not applied in this mode);
         - otherwise: `samtools view -F 1804 -f 2 -q mapq_thresh`.
      2. `samtools fixmate -r` into `<prefix>.fixmate.bam`. In multimapping
         mode the reads first go through
         `assign_multimappers.py -k multimapping --paired-end`.
      3. `samtools view -F 1804 -f 2` on the fixmate output, then a
         coordinate sort into `<prefix>.filt.bam`.
      4. Verify with `bam_is_empty` that the filtered BAM contains reads.
    Both intermediate BAMs are removed once they are no longer needed.

    There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.

    Returns:
        Path of the filtered BAM.

    Raises:
        ValueError: if `bam_is_empty` reports the filtered BAM as empty.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)
    tmp_filt_bam = '{}.tmp_filt.bam'.format(prefix)
    fixmate_bam = '{}.fixmate.bam'.format(prefix)

    if multimapping:
        # The resource string is consumed by `samtools sort`, so it must be
        # built for 'sort' (with mem_gb), exactly like the other sort calls
        # in this function. It was previously built for 'view', which left
        # mem_gb unused for this sort.
        run_shell_cmd(
            'samtools view -F 524 -f 2 -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param} '
            .format(
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))

        run_shell_cmd(
            'samtools view -h {tmp_filt_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} --paired-end | '
            'samtools fixmate -r /dev/stdin {fixmate_bam} {res_param}'.format(
                tmp_filt_bam=tmp_filt_bam,
                multimapping=multimapping,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))
    else:
        run_shell_cmd(
            'samtools view -F 1804 -f 2 -q {mapq_thresh} -u {bam} | '
            'samtools sort -n /dev/stdin -o {tmp_filt_bam} -T {prefix} {res_param}'
            .format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                tmp_filt_bam=tmp_filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))
        run_shell_cmd(
            'samtools fixmate -r {tmp_filt_bam} {fixmate_bam} {res_param}'.
            format(
                tmp_filt_bam=tmp_filt_bam,
                fixmate_bam=fixmate_bam,
                res_param=get_samtools_res_param('fixmate', nth=nth),
            ))

    rm_f(tmp_filt_bam)  # name-sorted intermediate is no longer needed

    # Filter again after fixmate, then coordinate-sort the final output.
    run_shell_cmd(
        'samtools view -F 1804 -f 2 -u {fixmate_bam} | '
        'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'.
        format(
            fixmate_bam=fixmate_bam,
            filt_bam=filt_bam,
            prefix=prefix,
            res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
        ))

    rm_f(fixmate_bam)

    # Fail loudly here rather than hand an empty BAM to later steps.
    log.info('Checking if filtered (but not deduped) BAM is empty '
             'after filtering with "samtools view -F 1804 -f 2".')
    if bam_is_empty(filt_bam, nth):
        raise ValueError(
            'No reads found aftering filtering "samtools fixmate"d PE BAM with '
            '"samtools view -F 1804 -f 2". '
            'Reads are not properly paired even though mapping rate is good? ')

    return filt_bam