Example 1
def naive_overlap(basename_prefix, peak1, peak2, peak_pooled, peak_type,
                  nonamecheck, out_dir):
    prefix = os.path.join(out_dir, basename_prefix)
    prefix += '.overlap'
    overlap_peak = '{}.{}.gz'.format(prefix, peak_type)

    nonamecheck_param = '-nonamecheck' if nonamecheck else ''
    # narrowpeak, regionpeak only
    awk_param = '{s1=$3-$2; s2=$13-$12; '
    awk_param += 'if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'
    cut_param = '1-10'

    # due to bedtools bug when .gz is given for -a and -b
    tmp1 = gunzip(peak1, 'tmp1', out_dir)
    tmp2 = gunzip(peak2, 'tmp2', out_dir)
    tmp_pooled = gunzip(peak_pooled, 'tmp_pooled', out_dir)

    # Find pooled peaks that overlap peak1 and peak2
    # where overlap is defined as the fractional overlap
    # wrt any one of the overlapping peak pairs >= 0.5
    cmd1 = 'intersectBed {} -wo '
    cmd1 += '-a {} -b {} | '
    cmd1 += 'awk \'BEGIN{{FS="\\t";OFS="\\t"}} {}\' | '
    cmd1 += 'cut -f {} | sort | uniq | '
    cmd1 += 'intersectBed {} -wo '
    cmd1 += '-a stdin -b {} | '
    cmd1 += 'awk \'BEGIN{{FS="\\t";OFS="\\t"}} {}\' | '
    cmd1 += 'cut -f {} | sort | uniq | gzip -nc > {}'
    cmd1 = cmd1.format(
        nonamecheck_param,
        tmp_pooled,  # peak_pooled
        tmp1,  # peak1
        awk_param,
        cut_param,
        nonamecheck_param,
        tmp2,  # peak2
        awk_param,
        cut_param,
        overlap_peak)
    run_shell_cmd(cmd1)
    rm_f([tmp1, tmp2, tmp_pooled])
    return overlap_peak
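For reference, here is a minimal, self-contained sketch (an illustrative helper, not part of the pipeline) of the overlap test that the awk expression above encodes: a pooled peak is kept when the overlap covers at least half of either peak in the pair.

def passes_naive_overlap(pooled_start, pooled_end, rep_start, rep_end, overlap_bp):
    # s1/s2 mirror the awk's $3-$2 and $13-$12 (pooled peak and replicate peak lengths)
    s1 = pooled_end - pooled_start
    s2 = rep_end - rep_start
    return overlap_bp / s1 >= 0.5 or overlap_bp / s2 >= 0.5

# A 200 bp pooled peak overlapping a 1000 bp replicate peak by 120 bp passes,
# since 120/200 >= 0.5 even though 120/1000 < 0.5.
assert passes_naive_overlap(100, 300, 0, 1000, 120)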
Example 2
def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, mem_gb,
                              out_dir):
    """There are pipes with multiple samtools commands.
    For such pipes, use multiple threads (-@) for only one of them.
    Priority is on sort > index > fixmate > view.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)

    if multimapping:
        qname_sort_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)

        run_shell_cmd(
            'samtools view -h {qname_sort_bam} | '
            '$(which assign_multimappers.py) -k {multimapping} | '
            'samtools view -F 1804 -Su /dev/stdin | '
            'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'.
            format(
                qname_sort_bam=qname_sort_bam,
                multimapping=multimapping,
                filt_bam=filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))
        rm_f(qname_sort_bam)  # remove temporary files
    else:
        run_shell_cmd(
            'samtools view -F 1804 -q {mapq_thresh} -u {bam} | '
            'samtools sort /dev/stdin -o {filt_bam} -T {prefix} {res_param}'.
            format(
                mapq_thresh=mapq_thresh,
                bam=bam,
                filt_bam=filt_bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))

    return filt_bam
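A hypothetical illustration (not the pipeline's actual get_samtools_res_param helper) of the rule stated in the docstring above: when a pipe contains several samtools commands, only the highest-priority one should receive the -@ threads option.

def pick_threaded_samtools_command(commands_in_pipe, priority=('sort', 'index', 'fixmate', 'view')):
    # Return the single samtools subcommand that should get '-@ <nth>'.
    for name in priority:
        if name in commands_in_pipe:
            return name
    return None

assert pick_threaded_samtools_command(['view', 'view', 'sort']) == 'sort'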
def idr(basename_prefix, peak1, peak2, peak_pooled, peak_type, thresh, rank,
        out_dir):
    prefix = os.path.join(out_dir, basename_prefix)
    prefix += '.idr{}'.format(thresh)
    idr_peak = '{}.{}.gz'.format(prefix, peak_type)
    idr_out_gz = '{}.unthresholded-peaks.txt.gz'.format(prefix)
    idr_plot = '{}.unthresholded-peaks.txt.png'.format(prefix)
    idr_stdout = '{}.log'.format(prefix)
    # temporary
    idr_12col_bed = '{}.12-col.bed.gz'.format(prefix)  # temp file kept under prefix/out_dir
    idr_out = '{}.unthresholded-peaks.txt'.format(prefix)

    cmd1 = 'idr --samples {} {} --peak-list {} --input-file-type narrowPeak '
    cmd1 += '--output-file {} --rank {} --soft-idr-threshold {} '
    cmd1 += '--plot --use-best-multisummit-IDR --log-output-file {}'
    cmd1 = cmd1.format(peak1, peak2, peak_pooled, idr_out, rank, thresh,
                       idr_stdout)
    run_shell_cmd(cmd1)

    col = get_npeak_col_by_rank(rank)
    neg_log10_thresh = -math.log10(thresh)
    # LC_COLLATE=C
    cmd2 = 'awk \'BEGIN{{OFS="\\t"}} $12>={} '
    cmd2 += '{{if ($2<0) $2=0; '
    cmd2 += 'print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' {} '
    cmd2 += '| sort | uniq | sort -grk{},{} | gzip -nc > {}'
    cmd2 = cmd2.format(neg_log10_thresh, idr_out, col, col, idr_12col_bed)
    run_shell_cmd(cmd2)

    cmd3 = 'zcat {} | '
    cmd3 += 'awk \'BEGIN{{OFS="\\t"}} '
    cmd3 += '{{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}}\' | '
    cmd3 += 'gzip -nc > {}'
    cmd3 = cmd3.format(idr_12col_bed, idr_peak)
    run_shell_cmd(cmd3)

    cmd4 = 'gzip -f {}'.format(idr_out)
    run_shell_cmd(cmd4)

    rm_f([idr_out, idr_12col_bed])
    rm_f('{}.*.noalternatesummitpeaks.png'.format(prefix))
    return idr_peak, idr_plot, idr_out_gz, idr_stdout
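A small self-contained sketch of the threshold conversion used in cmd2: the IDR tool reports column 12 (global IDR) on a -log10 scale, so filtering on $12 >= -log10(thresh) keeps peaks whose IDR is at most thresh.

import math

thresh = 0.05
neg_log10_thresh = -math.log10(thresh)
assert round(neg_log10_thresh, 3) == 1.301
# a peak whose global IDR column reads 2.0 (i.e. IDR = 0.01) passes this filter
assert 2.0 >= neg_log10_thresh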
def subsample_ta_pe(ta, subsample, non_mito, mito_chr_name, r1_only, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    if subsample % 2:
        raise ValueError(
            'Number of reads to subsample should be an even number '
            'for paired end TAG-ALIGN (BED) file. n={n}'.format(n=subsample))
    ta_subsampled = '{}.{}{}{}tagAlign.gz'.format(
        prefix,
        'no_chrM.' if non_mito else '', 'R1.' if r1_only else '', '{}.'.format(
            human_readable_number(subsample)) if subsample > 0 else '')
    ta_tmp = '{}.tagAlign.tmp'.format(prefix)

    cmd0 = 'zcat -f {} | '
    if non_mito:
        # cmd0 += 'awk \'{{if ($1!="'+mito_chr_name+'") print $0}}\' | '
        cmd0 += 'grep -v \'^' + mito_chr_name + '\\b\' | '
    cmd0 += 'sed \'N;s/\\n/\\t/\' '
    if subsample > 0:
        cmd0 += '| shuf -n {} --random-source=<(openssl enc -aes-256-ctr '
        cmd0 += '-pass pass:$(zcat -f {} | wc -c) -nosalt '
        cmd0 += '</dev/zero 2>/dev/null) > {}'
        cmd0 = cmd0.format(ta, int(subsample / 2), ta, ta_tmp)
    else:
        cmd0 += '> {}'
        cmd0 = cmd0.format(ta, ta_tmp)

    run_shell_cmd(cmd0)

    cmd = 'cat {} | '
    cmd += 'awk \'BEGIN{{OFS="\\t"}} '
    if r1_only:
        cmd += '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
        cmd += '",$1,$2,$3,$4,$5,$6}}\' | '
    else:
        cmd += '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
        cmd += '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
        cmd += '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
    cmd += 'gzip -nc > {}'
    cmd = cmd.format(ta_tmp, ta_subsampled)
    run_shell_cmd(cmd)
    rm_f(ta_tmp)
    return ta_subsampled
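A short sketch of why subsample must be even for paired-end data: sed 'N;s/\n/\t/' merges each mate pair (two tagAlign lines) into one line before shuf -n runs, so sampling subsample reads means drawing subsample/2 merged pair-lines.

def pair_lines_to_sample(subsample_reads):
    # mirrors the int(subsample / 2) passed to 'shuf -n' above
    if subsample_reads % 2:
        raise ValueError('subsample must be even for paired-end TAG-ALIGN')
    return subsample_reads // 2

assert pair_lines_to_sample(1000000) == 500000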
Example 5
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          out_dir):
    basename_ta = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
        basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
        if len(basename_prefix) > 200:  # UNIX cannot have len(filename) > 255
            basename_prefix = '{}_x_control'.format(basename_ta)
    else:
        basename_prefix = basename_ta
    prefix = os.path.join(out_dir, basename_prefix)
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix, 'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    npeak_tmp = '{}.tmp'.format(npeak)
    temp_files = []

    cmd0 = 'macs2 callpeak '
    cmd0 += '-t {} {} -f BED -n {} -g {} -p {} '
    cmd0 += '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'
    cmd0 = cmd0.format(ta, '-c {}'.format(ctl_ta) if ctl_ta else '', prefix,
                       gensz, pval_thresh, 0, fraglen)
    run_shell_cmd(cmd0)

    cmd1 = 'LC_COLLATE=C sort -k 8gr,8gr "{}"_peaks.narrowPeak | '
    cmd1 += 'awk \'BEGIN{{OFS="\\t"}}'
    cmd1 += '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
    cmd1 += '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {}'
    cmd1 = cmd1.format(prefix, npeak_tmp)
    run_shell_cmd(cmd1)

    cmd2 = 'head -n {} {} | gzip -nc > {}'.format(cap_num_peak, npeak_tmp,
                                                  npeak)
    run_shell_cmd(cmd2)
    rm_f(npeak_tmp)

    # remove temporary files
    temp_files.append("{}_*".format(prefix))
    rm_f(temp_files)

    return npeak
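A self-contained sketch of the summit handling inside cmd1's awk: when MACS2 reports no point-source summit (column 10 == -1), the awk substitutes $2 + int(($3 - $2 + 1) / 2), i.e. the interval midpoint.

def fill_missing_summit(chrom_start, chrom_end, summit):
    # mirrors: if ($10==-1) $10=$2+int(($3-$2+1)/2.0)
    if summit == -1:
        return chrom_start + int((chrom_end - chrom_start + 1) / 2)
    return summit

assert fill_missing_summit(100, 200, -1) == 150
assert fill_missing_summit(100, 200, 40) == 40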
def main():
    # read params
    args = parse_arguments()

    CHROMSIZES = args.chrsz
    TSS = args.tss if args.tss and os.path.basename(args.tss) != 'null' else ''
    FINAL_BAM = args.nodup_bam
    OUTPUT_PREFIX = os.path.join(args.out_dir,
                                 os.path.basename(strip_ext_bam(FINAL_BAM)))
    samtools_index(FINAL_BAM)  # make an index first
    RG_FREE_FINAL_BAM = remove_read_group(FINAL_BAM)

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # Also get read length
    # read_len = get_read_length(FASTQ)
    if args.read_len_log:
        with open(args.read_len_log, 'r') as fp:
            read_len = int(fp.read().strip())
    elif args.read_len:
        read_len = args.read_len
    else:
        read_len = None

    # Enrichments: V plot for enrichment
    # Use final to avoid duplicates
    tss_plot, tss_large_plot, tss_enrich_qc = \
        make_tss_plot(FINAL_BAM,
                      TSS,
                      OUTPUT_PREFIX,
                      CHROMSIZES,
                      read_len)

    # remove temporary files
    rm_f(RG_FREE_FINAL_BAM)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def bowtie2_se(fastq, ref_index_prefix, multimapping, local, nth, mem_gb,
               out_dir):
    basename = os.path.basename(strip_ext_fastq(fastq))
    prefix = os.path.join(out_dir, basename)
    tmp_bam = '{}.bam'.format(prefix)

    run_shell_cmd(
        'bowtie2 {multimapping} {mode_param} --mm --threads {nth} -x {ref} '
        '-U {fastq} | samtools view -1 -S /dev/stdin > {tmp_bam}'.format(
            multimapping='-k {mm}'.format(mm=multimapping +
                                          1) if multimapping else '',
            mode_param='--local ' if local else '',
            nth=nth,
            ref=ref_index_prefix,
            fastq=fastq,
            tmp_bam=tmp_bam,
        ))
    bam = samtools_sort(tmp_bam, nth, mem_gb, out_dir)
    rm_f(tmp_bam)

    return bam
Example 8
def run_preseq(bam_w_dups, prefix, nth=1, mem_gb=None):
    '''
    Runs preseq. Look at preseq data output to get PBC/NRF.
    '''
    # Coordinate-sort the BAM first, since a sorted copy no longer exists at this point

    sort_bam = samtools_sort(bam_w_dups, nth, mem_gb)

    logging.info('Running preseq...')
    preseq_data = '{0}.preseq.dat'.format(prefix)
    preseq_log = '{0}.preseq.log'.format(prefix)

    run_shell_cmd('preseq lc_extrap -P -B -o {preseq_data} {sort_bam} '
                  '-seed 1 -v 2> {preseq_log}'.format(
                      preseq_data=preseq_data,
                      sort_bam=sort_bam,
                      preseq_log=preseq_log,
                  ))
    rm_f(sort_bam)

    return preseq_data, preseq_log
Example 9
def rm_unmapped_lowq_reads_se(bam, multimapping, mapq_thresh, nth, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filt_bam = '{}.filt.bam'.format(prefix)

    if multimapping:
        qname_sort_bam = samtools_name_sort(bam, nth, out_dir)

        cmd2 = 'samtools view -h {} | '
        cmd2 += '$(which assign_multimappers.py) -k {} | '
        cmd2 += 'samtools view -F 1804 -Su /dev/stdin | '
        cmd2 += 'samtools sort /dev/stdin -o {} -T {} -@ {}'
        cmd2 = cmd2.format(qname_sort_bam, multimapping, filt_bam, prefix, nth)
        run_shell_cmd(cmd2)
        rm_f(qname_sort_bam)  # remove temporary files
    else:
        cmd = 'samtools view -F 1804 -q {} -u {} | '
        cmd += 'samtools sort /dev/stdin -o {} -T {} -@ {}'
        cmd = cmd.format(mapq_thresh, bam, filt_bam, prefix, nth)
        run_shell_cmd(cmd)

    return filt_bam
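For reference, a short sketch decoding the samtools '-F 1804' filter used in these filtering functions: 1804 is the sum of the SAM flag bits for unmapped, mate-unmapped, secondary, QC-fail and duplicate reads, all of which are excluded.

EXCLUDED_SAM_FLAGS = {
    4: 'read unmapped',
    8: 'mate unmapped',
    256: 'not primary alignment',
    512: 'fails platform/vendor QC',
    1024: 'PCR or optical duplicate',
}
assert sum(EXCLUDED_SAM_FLAGS) == 1804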
Example 10
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix, 'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    # temporary files
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)

    shiftsize = -int(round(float(smooth_win) / 2.0))
    temp_files = []

    cmd0 = 'macs2 callpeak '
    cmd0 += '-t {} -f BED -n {} -g {} -p {} '
    cmd0 += '--shift {} --extsize {} '
    cmd0 += '--nomodel -B --SPMR '
    cmd0 += '--keep-dup all --call-summits '
    cmd0 = cmd0.format(ta, prefix, gensz, pval_thresh, shiftsize, smooth_win)
    run_shell_cmd(cmd0)

    cmd1 = 'LC_COLLATE=C sort -k 8gr,8gr "{}"_peaks.narrowPeak | '
    cmd1 += 'awk \'BEGIN{{OFS="\\t"}}'
    cmd1 += '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
    cmd1 += '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {}'
    cmd1 = cmd1.format(prefix, npeak_tmp)
    run_shell_cmd(cmd1)

    cmd2 = 'head -n {} {} > {}'.format(cap_num_peak, npeak_tmp, npeak_tmp2)
    run_shell_cmd(cmd2)

    # clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)

    rm_f([npeak_tmp, npeak_tmp2])

    # remove temporary files
    temp_files.append("{}_*".format(prefix))
    rm_f(temp_files)

    return npeak
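A minimal sketch of the ATAC-seq smoothing parameters above: reads are shifted by -smooth_win/2 and extended to smooth_win, which centers a smoothing window of width smooth_win on each Tn5 cut site.

def macs2_shift_extsize(smooth_win):
    # mirrors: shiftsize = -int(round(float(smooth_win) / 2.0)); extsize = smooth_win
    return -int(round(float(smooth_win) / 2.0)), smooth_win

assert macs2_shift_extsize(150) == (-75, 150)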
Example 11
def main():
    # read params
    args = parse_arguments()

    REF = args.ref_fa
    FINAL_BAM = args.nodup_bam
    OUTPUT_PREFIX = os.path.join(args.out_dir,
                                 os.path.basename(strip_ext_bam(FINAL_BAM)))
    RG_FREE_FINAL_BAM = remove_read_group(FINAL_BAM)
    JAVA_HEAP = args.picard_java_heap

    gc_out, gc_plot_pdf, gc_summary = get_gc(RG_FREE_FINAL_BAM, REF,
                                             OUTPUT_PREFIX, JAVA_HEAP)
    # will generate PNG format from gc_out
    plot_gc(gc_out, OUTPUT_PREFIX)

    rm_f(RG_FREE_FINAL_BAM)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def bowtie2_pe(fastq1, fastq2, ref_index_prefix, multimapping, nth, mem_gb,
               out_dir):
    basename = os.path.basename(strip_ext_fastq(fastq1))
    prefix = os.path.join(out_dir, basename)
    tmp_bam = '{}.bam'.format(prefix)

    run_shell_cmd(
        'bowtie2 {multimapping} -X2000 --mm --threads {nth} -x {ref} '
        '-1 {fastq1} -2 {fastq2} | samtools view -1 -S /dev/stdin > {tmp_bam}'.
        format(
            multimapping='-k {mm}'.format(mm=multimapping +
                                          1) if multimapping else '',
            nth=nth,
            ref=ref_index_prefix,
            fastq1=fastq1,
            fastq2=fastq2,
            tmp_bam=tmp_bam,
        ))
    bam = samtools_sort(tmp_bam, nth, mem_gb, out_dir)
    rm_f(tmp_bam)

    return bam
def pbc_qc_pe(bam, mito_chr_name, nth, out_dir):
    prefix = os.path.join(out_dir,
                          os.path.basename(strip_ext_bam(bam)))
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    nmsrt_bam = samtools_name_sort(bam, nth, out_dir)
    cmd3 = 'bedtools bamtobed -bedpe -i {} | '
    cmd3 += 'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}\' | '
    cmd3 += 'grep -v "^{}\\s" | sort | uniq -c | '
    cmd3 += 'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
    cmd3 += '($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; '
    cmd3 += 'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
    cmd3 += 'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
    cmd3 += 'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n"'
    cmd3 += ',mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {}'
    cmd3 = cmd3.format(
        nmsrt_bam,
        mito_chr_name,
        pbc_qc)
    run_shell_cmd(cmd3)
    rm_f(nmsrt_bam)
    return pbc_qc
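A self-contained sketch of the library-complexity metrics emitted by the awk above (the awk's zero-denominator guards are omitted here for brevity): mt is the total number of read pairs, m0 the number of distinct locations, m1 and m2 the locations seen exactly once and twice; NRF = m0/mt, PBC1 = m1/m0, PBC2 = m1/m2.

def pbc_metrics(location_counts):
    # location_counts: how many times each distinct position combination was observed,
    # i.e. the 'uniq -c' counts in the pipe above
    mt = sum(location_counts)
    m0 = len(location_counts)
    m1 = sum(1 for c in location_counts if c == 1)
    m2 = sum(1 for c in location_counts if c == 2)
    return mt, m0, m1, m2, m0 / mt, m1 / m0, m1 / m2

assert pbc_metrics([1, 1, 2, 3]) == (7, 4, 2, 1, 4 / 7, 0.5, 2.0)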
Example 14
def main():
    # read params
    args = parse_arguments()

    FINAL_BAM = args.nodup_bam
    OUTPUT_PREFIX = os.path.join(args.out_dir,
                                 os.path.basename(strip_ext_bam(FINAL_BAM)))
    RG_FREE_FINAL_BAM = remove_read_group(FINAL_BAM)

    # Insert size distribution - CAN'T GET THIS FOR SE FILES
    insert_data, insert_plot = get_insert_distribution(RG_FREE_FINAL_BAM,
                                                       OUTPUT_PREFIX)
    # Also need to run n-nucleosome estimation
    fragment_length_qc(read_picard_histogram(insert_data), OUTPUT_PREFIX)
    fragment_length_plot(insert_data, OUTPUT_PREFIX)

    rm_f(RG_FREE_FINAL_BAM)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    half_fraglen = (fraglen + 1) // 2  # integer division; passed to 'bedtools slop -l/-r'

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(peak, 'tmp2', out_dir)

        cmd = 'bedtools slop -i {} -g {} '
        cmd += '-s -l {} -r {} | '
        cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
        cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
        cmd += '-wa -u | wc -l'
        cmd = cmd.format(ta, chrsz, -half_fraglen, half_fraglen, tmp2)  # peak
        val1 = run_shell_cmd(cmd)
        rm_f(tmp2)
    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc
def frip(ta, peak, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)

    if get_num_lines(peak) == 0:
        val1 = 0.0
        tmp_files = []
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp1 = gunzip(ta, 'tmp1', out_dir)
        tmp2 = gunzip(peak, 'tmp2', out_dir)

        cmd = 'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
        cmd = cmd.format(
            tmp1,  # ta
            tmp2)  # peak
        val1 = run_shell_cmd(cmd)
        tmp_files = [tmp1, tmp2]
    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    rm_f(tmp_files)
    return frip_qc
Example 17
def spp(ta, ctl_ta, fraglen, cap_num_peak, fdr_thresh, nth, out_dir):
    basename_ta = os.path.basename(strip_ext_ta(ta))
    basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
    basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
    if len(basename_prefix) > 200:  # UNIX cannot have filename > 255
        basename_prefix = '{}_x_control'.format(basename_ta)
    nth_param = '-p={}'.format(nth) if nth > 1 else ''  # parallelize run_spp.R only with multiple threads
    prefix = os.path.join(out_dir, basename_prefix)
    rpeak = '{}.{}.regionPeak.gz'.format(
        prefix,
        human_readable_number(cap_num_peak))
    rpeak_tmp = '{}.tmp'.format(rpeak)
    rpeak_tmp_gz = '{}.tmp.gz'.format(rpeak)

    cmd0 = 'Rscript --max-ppsize=500000 $(which run_spp.R) -c={} -i={} '
    cmd0 += '-npeak={} -odir={} -speak={} -savr={} -fdr={} -rf {}'
    cmd0 = cmd0.format(
        ta,
        ctl_ta,
        cap_num_peak,
        os.path.abspath(out_dir),
        fraglen,
        rpeak_tmp,
        fdr_thresh,
        nth_param)
    run_shell_cmd(cmd0)

    # if chromosome coordinates are in scientific notation, convert them to integers
    cmd1 = 'zcat -f {} | awk \'BEGIN{{OFS="\\t"}}'
    cmd1 += '{{if ($2<0) $2=0; '
    cmd1 += 'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}\' | '
    cmd1 += 'gzip -f -nc > {}'
    cmd1 = cmd1.format(
        rpeak_tmp,
        rpeak)
    run_shell_cmd(cmd1)
    rm_f([rpeak_tmp, rpeak_tmp_gz])

    return rpeak
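A short sketch of the filename-length guard used here and in macs2() above: most filesystems cap a single path component at 255 bytes, and the prefix still receives suffixes such as '.300K.regionPeak.gz' (hypothetical example), hence the conservative 200-character cutoff.

def safe_basename_prefix(basename_ta, basename_ctl_ta, limit=200):
    prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
    if len(prefix) > limit:
        prefix = '{}_x_control'.format(basename_ta)
    return prefix

assert safe_basename_prefix('a' * 150, 'b' * 150) == 'a' * 150 + '_x_control'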
def bwa_se(fastq, ref_index_prefix, nth, mem_gb, out_dir):
    basename = os.path.basename(strip_ext_fastq(fastq))
    prefix = os.path.join(out_dir, basename)
    tmp_bam = '{}.bam'.format(prefix)

    sai = bwa_aln(fastq, ref_index_prefix, nth, out_dir)

    run_shell_cmd(
        'bwa samse {ref} {sai} {fastq} | '
        'samtools view -bS /dev/stdin {res_param} > {tmp_bam}'.format(
            ref=ref_index_prefix,
            sai=sai,
            fastq=fastq,
            res_param=get_samtools_res_param('view', nth=nth),
            tmp_bam=tmp_bam,
        ))
    rm_f(sai)

    bam = samtools_sort(tmp_bam, nth, mem_gb)
    rm_f(tmp_bam)

    return bam
Example 19
def bed_clip(bed, chrsz, out_clipped_bed, no_gz=False):
    '''
    Make sure that bedClip (in UCSC tools) is installed.
    Clip a BED file between 0 and chromSize (taken from 2-col chrsz file).
    bedClip exits with 255 if both start/end coordinates are
    out of valid range (0-chromSize). Otherwise, reads/peaks will be truncated.

    Args:
        no_gz:
            Do not gzip output.
    '''
    tmp_out = out_clipped_bed + '.clip_tmp'
    cmd = 'bedClip {bed} {chrsz} {tmp_out} -truncate -verbose=2'.format(
        bed=bed, chrsz=chrsz, tmp_out=out_clipped_bed if no_gz else tmp_out)
    run_shell_cmd(cmd)

    if not no_gz:
        cmd2 = 'cat {tmp_out} | gzip -nc > {out_clipped_bed}'.format(
            tmp_out=tmp_out, out_clipped_bed=out_clipped_bed)
        run_shell_cmd(cmd2)

        rm_f(tmp_out)
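A self-contained sketch of what clipping to the 0-chromSize range means for a single interval; in the function above the actual work is done by the UCSC bedClip binary with -truncate.

def clip_interval(start, end, chrom_size):
    # truncate coordinates into [0, chrom_size]; drop intervals left empty
    start, end = max(0, start), min(end, chrom_size)
    return (start, end) if start < end else None

assert clip_interval(-50, 120, 1000) == (0, 120)
assert clip_interval(990, 1100, 1000) == (990, 1000)
assert clip_interval(1200, 1300, 1000) is None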
Example 20
def pbc_qc_pe(bam, mito_chr_name, nth, mem_gb, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    nmsrt_bam = samtools_name_sort(bam, nth, mem_gb, out_dir)

    run_shell_cmd(
        'bedtools bamtobed -bedpe -i {nmsrt_bam} | '
        'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}\' | '
        'grep -v "^{mito_chr_name}\\s" | sort {sort_param} | uniq -c | '
        'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
        '($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n"'
        ',mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {pbc_qc}'.format(
            nmsrt_bam=nmsrt_bam,
            mito_chr_name=mito_chr_name,
            sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
            pbc_qc=pbc_qc,
        ))
    rm_f(nmsrt_bam)
    return pbc_qc
def spr_pe(ta, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    nlines = int((get_num_lines(ta) / 2 + 1) / 2)

    # bash-only
    cmd1 = 'zcat -f {} | sed \'N;s/\\n/\\t/\' | '
    cmd1 += 'shuf --random-source=<(openssl enc -aes-256-ctr '
    cmd1 += '-pass pass:$(zcat -f {} | wc -c) '
    cmd1 += '-nosalt </dev/zero 2>/dev/null) | '
    cmd1 += 'split -d -l {} - {}.'
    cmd1 = cmd1.format(ta, ta, nlines, prefix)
    run_shell_cmd(cmd1)

    cmd2 = 'zcat -f {} | '
    cmd2 += 'awk \'BEGIN{{OFS="\\t"}} '
    cmd2 += '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
    cmd2 += '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
    cmd2 += '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
    cmd2 += 'gzip -nc > {}'
    cmd2 = cmd2.format(tmp_pr1, ta_pr1)
    run_shell_cmd(cmd2)

    cmd3 = 'zcat -f {} | '
    cmd3 += 'awk \'BEGIN{{OFS="\\t"}} '
    cmd3 += '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
    cmd3 += '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
    cmd3 += '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
    cmd3 += 'gzip -nc > {}'
    cmd3 = cmd3.format(tmp_pr2, ta_pr2)
    run_shell_cmd(cmd3)

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
Example 22
def spr_se(ta, pseudoreplication_random_seed, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    nlines = int((get_num_lines(ta) + 1) / 2)

    if pseudoreplication_random_seed == 0:
        random_seed = run_shell_cmd('zcat -f {ta} | wc -c'.format(ta=ta))
        log.info(
            'Using input file\'s size {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))
    else:
        random_seed = pseudoreplication_random_seed
        log.info(
            'Using a fixed integer {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))

    # bash-only
    run_shell_cmd('zcat {ta} | shuf --random-source=<(openssl enc '
                  '-aes-256-ctr -pass pass:{random_seed} '
                  '-nosalt </dev/zero 2>/dev/null) | '
                  'split -d -l {nlines} - {prefix}.'.format(
                      ta=ta,
                      random_seed=random_seed,
                      nlines=nlines,
                      prefix=prefix,
                  ))

    run_shell_cmd('gzip -nc {tmp_pr1} > {ta_pr1}'.format(tmp_pr1=tmp_pr1,
                                                         ta_pr1=ta_pr1))
    run_shell_cmd('gzip -nc {tmp_pr2} > {ta_pr2}'.format(tmp_pr2=tmp_pr2,
                                                         ta_pr2=ta_pr2))

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
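A minimal sketch of the pseudoreplicate split performed by spr_se() (and, on merged pair-lines, by spr_pe()): lines are shuffled with a deterministic seed, then 'split -d -l nlines' writes two halves, <prefix>.00 and <prefix>.01.

def pseudoreplicate_sizes(n_lines):
    # mirrors: nlines = int((get_num_lines(ta) + 1) / 2)
    first_half = (n_lines + 1) // 2
    return first_half, n_lines - first_half

assert pseudoreplicate_sizes(11) == (6, 5)
assert pseudoreplicate_sizes(10) == (5, 5)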
def bam_to_pbam(bam, ref_fa, out_dir='.'):
    '''Convert BAM into pBAM.

    Requirements:
        - Python package `ptools_bin`
        - `samtools`
    '''
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))

    pbam_tmp = '{}.sorted.p.bam'.format(prefix)
    pbam = '{}.p.bam'.format(prefix)

    temp_files = []

    if ref_fa.endswith('.gz'):
        gunzipped_ref_fa = '{}.fasta'.format(
            os.path.join(out_dir, os.path.basename(strip_ext_gz(ref_fa))))
        run_shell_cmd('zcat -f {ref_fa} > {gunzipped_ref_fa}'.format(
            ref_fa=ref_fa,
            gunzipped_ref_fa=gunzipped_ref_fa,
        ))
        temp_files.append(gunzipped_ref_fa)
    else:
        gunzipped_ref_fa = ref_fa

    run_shell_cmd('makepBAM_genome.sh {bam} {gunzipped_ref_fa}'.format(
        bam=bam,
        gunzipped_ref_fa=gunzipped_ref_fa,
    ))

    run_shell_cmd('mv {pbam_tmp} {pbam}'.format(
        pbam_tmp=pbam_tmp,
        pbam=pbam,
    ))
    rm_f(temp_files)

    return pbam
Example 24
def bam2ta_pe(bam, nth, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    ta = '{}.tagAlign.gz'.format(prefix)
    # intermediate files
    bedpe = '{}.bedpe.gz'.format(prefix)
    nmsrt_bam = samtools_name_sort(bam, nth, out_dir)

    cmd1 = 'LC_COLLATE=C bedtools bamtobed -bedpe -mate1 -i {} | '
    # cmd1 += 'sort -k1,1 -k2,2n -k3,3n | '
    cmd1 += 'gzip -nc > {}'
    cmd1 = cmd1.format(nmsrt_bam, bedpe)
    run_shell_cmd(cmd1)
    rm_f(nmsrt_bam)

    cmd2 = 'zcat -f {} | '
    cmd2 += 'awk \'BEGIN{{OFS="\\t"}}'
    cmd2 += '{{printf "%s\\t%s\\t%s\\tN\\t1000\\t%s\\n'
    cmd2 += '%s\\t%s\\t%s\\tN\\t1000\\t%s\\n",'
    cmd2 += '$1,$2,$3,$9,$4,$5,$6,$10}}\' | '
    cmd2 += 'gzip -nc > {}'
    cmd2 = cmd2.format(bedpe, ta)
    run_shell_cmd(cmd2)
    rm_f(bedpe)
    return ta
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
           bwa_mem_read_len_limit, rescue_reads_for_bwa_mem, out_dir):
    basename = os.path.basename(strip_ext_fastq(fastq1))
    prefix = os.path.join(out_dir, basename)
    sam = '{}.sam'.format(prefix)
    badcigar = '{}.badReads'.format(prefix)
    bam = '{}.bam'.format(prefix)

    temp_files = []
    read_len = get_read_length(fastq1)

    log.info('Guessed read length of R1 FASTQ: {read_len}'.format(
        read_len=read_len, ))
    if use_bwa_mem_for_pe and read_len >= bwa_mem_read_len_limit:
        log.info('Use bwa mem.')

        cmd = 'bwa mem -M {extra_param} -t {nth} {ref_index_prefix} {fastq1} {fastq2} | gzip -nc > {sam}'.format(
            extra_param='-P' if rescue_reads_for_bwa_mem else '',
            nth=nth,
            ref_index_prefix=ref_index_prefix,
            fastq1=fastq1,
            fastq2=fastq2,
            sam=sam,
        )
        temp_files.append(sam)

    else:
        log.info('Use bwa aln for each (R1 and R2) and then bwa sampe.')
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)

        cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
            ref_index_prefix, sai1, sai2, fastq1, fastq2, sam)
        temp_files.extend([sai1, sai2, sam])
    run_shell_cmd(cmd)

    cmd2 = 'zcat -f {} | '
    cmd2 += 'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
    cmd2 += '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
    cmd2 += 'n = split(cigar,vals,"[A-Z]"); s = 0; '
    cmd2 += 'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
    cmd2 += 'if (s!=seqlen) print $1"\\t"; }}\' | '
    cmd2 += 'sort | uniq > {}'
    cmd2 = cmd2.format(sam, badcigar)
    run_shell_cmd(cmd2)

    # Remove bad CIGAR read pairs
    if get_num_lines(badcigar) > 0:
        run_shell_cmd(
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su /dev/stdin | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
            .format(
                sam=sam,
                badcigar=badcigar,
                bam=bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))
    else:
        run_shell_cmd(
            'samtools view -Su {sam} | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
            .format(
                sam=sam,
                bam=bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))

    rm_f(temp_files)
    return bam
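A self-contained sketch of the bad-CIGAR test performed by cmd2 above: after dropping deletion operations, the operation lengths in a CIGAR string must sum to the sequence length; read names failing this check are collected in the badcigar file and their pairs are removed from the SAM.

import re

def has_bad_cigar(cigar, seq):
    cigar = re.sub(r'[0-9]+D', '', cigar)                 # like the awk's gsub("[0-9]+D","",cigar)
    total = sum(int(n) for n in re.findall(r'[0-9]+', cigar))
    return total != len(seq)

assert not has_bad_cigar('50M', 'A' * 50)
assert has_bad_cigar('30M5I10M', 'A' * 50)                # 45 != 50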
Example 26
def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    bigbed = '{}.{}.bb'.format(prefix, peak_type)
    as_file = '{}.as'.format(prefix)
    chrsz_tmp = '{}.chrsz.tmp'.format(prefix)
    bigbed_tmp = '{}.bb.tmp'.format(prefix)
    bigbed_tmp2 = '{}.bb.tmp2'.format(prefix)

    if peak_type.lower() == 'narrowpeak' or peak_type.lower() == 'regionpeak':
        as_file_contents = '''table narrowPeak
"BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data."
(
    string chrom;        "Reference sequence chromosome or scaffold"
    uint   chromStart;   "Start position in chromosome"
    uint   chromEnd;     "End position in chromosome"
    string name;     "Name given to a region (preferably unique). Use . if no name is assigned"
    uint   score;        "Indicates how dark the peak will be displayed in the browser (0-1000) "
    char[1]  strand;     "+ or - or . for unknown"
    float  signalValue;  "Measurement of average enrichment for the region"
    float  pValue;       "Statistical significance of signal value (-log10). Set to -1 if not used."
    float  qValue;       "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used."
    int   peak;         "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called."
)
'''
        bed_param = '-type=bed6+4 -as={}'.format(as_file)
    elif peak_type.lower() == 'broadpeak':
        as_file_contents = '''table broadPeak
"BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data."
(
    string chrom;        "Reference sequence chromosome or scaffold"
    uint   chromStart;   "Start position in chromosome"
    uint   chromEnd;     "End position in chromosome"
    string name;     "Name given to a region (preferably unique). Use . if no name is assigned."
    uint   score;        "Indicates how dark the peak will be displayed in the browser (0-1000)"
    char[1]   strand;     "+ or - or . for unknown"
    float  signalValue;  "Measurement of average enrichment for the region"
    float  pValue;       "Statistical significance of signal value (-log10). Set to -1 if not used."
    float  qValue;       "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used."
)
'''
        bed_param = '-type=bed6+3 -as={}'.format(as_file)
    elif peak_type.lower() == 'gappedpeak':
        as_file_contents = '''table gappedPeak
"This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format."
    (
    string chrom;   "Reference sequence chromosome or scaffold"
    uint chromStart;    "Pseudogene alignment start position"
    uint chromEnd;      "Pseudogene alignment end position"
    string name;        "Name of pseudogene"
    uint score;          "Score of pseudogene with gene (0-1000)"
    char[1] strand;     "+ or - or . for unknown"
    uint thickStart;    "Start of where display should be thick (start codon)"
    uint thickEnd;      "End of where display should be thick (stop codon)"
    uint reserved;      "Always zero for now"
    int blockCount;     "Number of blocks"
    int[blockCount] blockSizes; "Comma separated list of block sizes"
    int[blockCount] chromStarts; "Start positions relative to chromStart"
    float  signalValue;  "Measurement of average enrichment for the region"
    float  pValue;       "Statistical significance of signal value (-log10). Set to -1 if not used."
    float  qValue;       "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used."
)
'''
        bed_param = '-type=bed12+3 -as={}'.format(as_file)
    else:
        raise Exception('Unsupported peak file type {}!'.format(peak_type))

    # create temporary .as file
    with open(as_file, 'w') as fp:
        fp.write(as_file_contents)

    cmd1 = "cat {} > {}".format(chrsz, chrsz_tmp)
    run_shell_cmd(cmd1)
    cmd2 = "zcat -f {} | LC_COLLATE=C sort -k1,1 -k2,2n | "
    cmd2 += 'awk \'BEGIN{{OFS="\\t"}} {{if ($5>1000) $5=1000; '
    cmd2 += 'if ($5<0) $5=0; print $0}}\' > {}'
    cmd2 = cmd2.format(peak, bigbed_tmp)
    run_shell_cmd(cmd2)
    cmd3 = "bedClip {} {} {}".format(bigbed_tmp, chrsz_tmp, bigbed_tmp2)
    run_shell_cmd(cmd3)
    cmd4 = "bedToBigBed {} {} {} {}".format(bed_param, bigbed_tmp2, chrsz_tmp,
                                            bigbed)
    run_shell_cmd(cmd4)

    # remove temporary files
    rm_f([as_file, chrsz_tmp, bigbed_tmp, bigbed_tmp2])

    return bigbed
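A small sketch of the score clamping in cmd2 above: bedToBigBed requires the BED score column (column 5) to stay within 0-1000, so out-of-range MACS2/SPP scores are clipped first.

def clamp_score(score):
    # mirrors: if ($5>1000) $5=1000; if ($5<0) $5=0
    return min(max(score, 0), 1000)

assert clamp_score(1500) == 1000
assert clamp_score(-3) == 0
assert clamp_score(640) == 640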
Example 27
def peak_to_hammock(peak, out_dir):
    peak_type = get_peak_type(peak)
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_peak(peak)))
    hammock = '{}.{}.hammock'.format(prefix, peak_type)
    hammock_tmp = '{}.tmp'.format(hammock)
    hammock_tmp2 = '{}.tmp2'.format(hammock)
    hammock_gz = '{}.gz'.format(hammock)
    hammock_gz_tbi = '{}.gz.tbi'.format(hammock)

    if get_num_lines(peak) == 0:
        cmd = 'zcat -f {} | gzip -nc > {}'.format(peak, hammock_gz)
        run_shell_cmd(cmd)
        cmd2 = 'touch {}'.format(hammock_gz_tbi)
    else:
        cmd = "zcat -f {} | "
        cmd += "LC_COLLATE=C sort -k1,1V -k2,2n > {}"
        cmd = cmd.format(peak, hammock_tmp)
        run_shell_cmd(cmd)

        with open(hammock_tmp, 'r') as fin, open(hammock_tmp2, 'w') as fout:
            id = 1
            for line in fin:
                lst = line.rstrip().split('\t')

                if peak_type == 'narrowPeak' or peak_type == 'regionPeak':
                    fout.write(
                        '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},'
                        '{0[8]}],id:{1},'.format(lst, id))
                    if len(lst[3]) > 1:
                        fout.write('name:"' + lst[3] + '",')
                    if lst[5] != '.':
                        fout.write('strand:"' + lst[5] + '",')
                    if lst[9] != '-1':
                        fout.write('sbstroke:[' + lst[9] + ']')
                elif peak_type == 'gappedPeak':
                    fout.write(
                        '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},'
                        '{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],'
                        'thick:['.format(lst, id))
                    a = int(lst[1])
                    sizes = lst[10].split(',')
                    starts = lst[11].split(',')
                    for i in range(len(sizes)):
                        fout.write('[{0},{1}],'.format(
                            a + int(starts[i]),
                            a + int(starts[i]) + int(sizes[i])))
                    fout.write(']},')

                    if len(lst[3]) > 1:
                        fout.write('name:"' + lst[3] + '",')
                    if lst[5] != '.':
                        fout.write('strand:"' + lst[5] + '",')
                elif peak_type == 'broadPeak':
                    fout.write(
                        '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],'
                        'id:{1},'.format(lst, id))
                    if len(lst[3]) > 1:
                        fout.write('name:"' + lst[3] + '",')
                    if lst[5] != '.':
                        fout.write('strand:"' + lst[5] + '",')
                else:
                    raise Exception("Unsupported peak_type {}".format(peak_type))
                id += 1

                fout.write('\n')

        cmd2 = 'zcat -f {} | sort -k1,1 -k2,2n | bgzip -cf > {}'
        cmd2 = cmd2.format(hammock_tmp2, hammock_gz)
        run_shell_cmd(cmd2)
        cmd3 = 'tabix -f -p bed {}'.format(hammock_gz)
        run_shell_cmd(cmd3)

        rm_f([hammock, hammock_tmp, hammock_tmp2])
    return (hammock_gz, hammock_gz_tbi)
def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    shiftsize = -int(round(float(smooth_win) / 2.0))
    temp_files = []

    cmd0 = 'macs2 callpeak '
    cmd0 += '-t {} -f BED -n {} -g {} -p {} '
    cmd0 += '--shift {} --extsize {} '
    cmd0 += '--nomodel -B --SPMR '
    cmd0 += '--keep-dup all --call-summits '
    cmd0 = cmd0.format(ta, prefix, gensz, pval_thresh, shiftsize, smooth_win)
    run_shell_cmd(cmd0)

    cmd3 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd3 += '-c "{}"_control_lambda.bdg '
    cmd3 += '--o-prefix "{}" -m FE '
    cmd3 = cmd3.format(prefix, prefix, prefix)
    run_shell_cmd(cmd3)

    cmd4 = 'bedtools slop -i "{}"_FE.bdg -g {} -b 0 | '
    cmd4 += 'bedClip stdin {} {}'
    cmd4 = cmd4.format(prefix, chrsz, chrsz, fc_bedgraph)
    run_shell_cmd(cmd4)

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    cmd5 = 'LC_COLLATE=C sort -k1,1 -k2,2n {} | ' \
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '\
        '|| prev_chr==$1 && prev_chr_e<=$2)) ' \
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
            fc_bedgraph,
            fc_bedgraph_srt)
    run_shell_cmd(cmd5)
    rm_f(fc_bedgraph)

    cmd6 = 'bedGraphToBigWig {} {} {}'
    cmd6 = cmd6.format(fc_bedgraph_srt, chrsz, fc_bigwig)
    run_shell_cmd(cmd6)
    rm_f(fc_bedgraph_srt)

    # sval: number of reads in the tagAlign (in millions), used to scale the ppois track
    sval = float(get_num_lines(ta)) / 1000000.0

    cmd7 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd7 += '-c "{}"_control_lambda.bdg '
    cmd7 += '--o-prefix {} -m ppois -S {}'
    cmd7 = cmd7.format(prefix, prefix, prefix, sval)
    run_shell_cmd(cmd7)

    cmd8 = 'bedtools slop -i "{}"_ppois.bdg -g {} -b 0 | '
    cmd8 += 'bedClip stdin {} {}'
    cmd8 = cmd8.format(prefix, chrsz, chrsz, pval_bedgraph)
    run_shell_cmd(cmd8)

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    cmd9 = 'LC_COLLATE=C sort -k1,1 -k2,2n {} | ' \
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '\
        '|| prev_chr==$1 && prev_chr_e<=$2)) ' \
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
            pval_bedgraph,
            pval_bedgraph_srt)
    run_shell_cmd(cmd9)
    rm_f(pval_bedgraph)

    cmd10 = 'bedGraphToBigWig {} {} {}'
    cmd10 = cmd10.format(pval_bedgraph_srt, chrsz, pval_bigwig)
    run_shell_cmd(cmd10)
    rm_f(pval_bedgraph_srt)

    # remove temporary files
    temp_files.append("{}_*".format(prefix))
    rm_f(temp_files)

    return fc_bigwig, pval_bigwig
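A self-contained sketch of the overlap-removal awk in cmd5 and cmd9 above: after coordinate-sorting, a bedGraph row is kept only if it starts at or after the end of the previous row (bedGraphToBigWig rejects overlapping intervals).

def drop_overlapping_bedgraph_rows(rows):
    kept, prev_chrom, prev_end = [], None, None
    for chrom, start, end, value in rows:
        if prev_chrom != chrom or start >= prev_end:
            kept.append((chrom, start, end, value))
        prev_chrom, prev_end = chrom, end   # the awk updates prev_* for every row, kept or not
    return kept

rows = [('chr1', 0, 100, 1.0), ('chr1', 50, 150, 2.0), ('chr1', 150, 200, 1.5)]
assert drop_overlapping_bedgraph_rows(rows) == [('chr1', 0, 100, 1.0), ('chr1', 150, 200, 1.5)]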
Example 29
def main():
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # declare temp arrays
    temp_files = []  # files to be deleted at the end

    log.info('Detecting adapters...')
    for i in range(len(args.fastqs)):
        # for each fastq to be merged later
        log.info('Detecting adapters for merge_id={}...'.format(i + 1))
        fastqs = args.fastqs[i]  # R1 and R2
        adapters = args.adapters[i]
        if args.paired_end:
            if not args.adapter and args.auto_detect_adapter and \
                    not (adapters[0] and adapters[1]):
                args.adapters[i][0] = detect_most_likely_adapter(fastqs[0])
                args.adapters[i][1] = detect_most_likely_adapter(fastqs[1])
                log.info('Detected adapters for merge_id={}, '
                         'R1: {}, R2: {}'.format(i + 1, args.adapters[i][0],
                                                 args.adapters[i][1]))
        else:
            if not args.adapter and args.auto_detect_adapter and \
                    not adapters[0]:
                args.adapters[i][0] = detect_most_likely_adapter(fastqs[0])
                log.info('Detected adapter for merge_id={}, R1: {}'.format(
                    i + 1, args.adapters[i][0]))

    log.info('Trimming adapters...')
    trimmed_fastqs_R1 = []
    trimmed_fastqs_R2 = []
    for i in range(len(args.fastqs)):
        # for each fastq to be merged later
        fastqs = args.fastqs[i]  # R1 and R2
        adapters = args.adapters[i]
        if args.paired_end:
            fastqs = trim_adapter_pe(fastqs[0], fastqs[1], adapters[0],
                                     adapters[1], args.adapter,
                                     args.cutadapt_param, args.out_dir)
            trimmed_fastqs_R1.append(fastqs[0])
            trimmed_fastqs_R2.append(fastqs[1])
        else:
            fastq = trim_adapter_se(fastqs[0], adapters[0], args.adapter,
                                    args.cutadapt_param, args.out_dir)
            trimmed_fastqs_R1.append(fastq)

    log.info('Merging fastqs...')
    log.info('R1 to be merged: {}'.format(trimmed_fastqs_R1))
    merge_fastqs(trimmed_fastqs_R1, 'R1', args.out_dir)
    if args.paired_end:
        log.info('R2 to be merged: {}'.format(trimmed_fastqs_R2))
        merge_fastqs(trimmed_fastqs_R2, 'R2', args.out_dir)

    temp_files.extend(trimmed_fastqs_R1)
    temp_files.extend(trimmed_fastqs_R2)

    log.info('Removing temporary files...')
    rm_f(temp_files)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def main():
    # filt_bam - dupmark_bam - nodup_bam
    #          \ dup_qc      \ pbc_qc

    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # declare temp arrays
    temp_files = []  # files to be deleted at the end

    log.info('Removing unmapped/low-quality reads...')
    if args.paired_end:
        filt_bam = rm_unmapped_lowq_reads_pe(args.bam, args.multimapping,
                                             args.mapq_thresh, args.nth,
                                             args.mem_gb, args.out_dir)
    else:
        filt_bam = rm_unmapped_lowq_reads_se(args.bam, args.multimapping,
                                             args.mapq_thresh, args.nth,
                                             args.mem_gb, args.out_dir)

    log.info('Checking if filtered BAM file is empty...')

    if bam_is_empty(filt_bam, args.nth):
        help_msg = (
            'No reads found in filtered BAM. '
            'Low quality sample? '
            'Or no reads passing criteria "samtools view -F 1804"? '
            'Check samtools flags at '
            'https://broadinstitute.github.io/picard/explain-flags.html. ')
        if args.paired_end:
            help_msg += (
                'Or is this truly a PE BAM? '
                'All unpaired SE reads could be removed by "samtools view -f 2". '
            )
        raise ValueError(help_msg)

    log.info('Marking dupes with {}...'.format(args.dup_marker))
    if args.dup_marker == 'picard':
        dupmark_bam, dup_qc = mark_dup_picard(filt_bam, args.out_dir,
                                              args.picard_java_heap)
    elif args.dup_marker == 'sambamba':
        dupmark_bam, dup_qc = mark_dup_sambamba(filt_bam, args.nth,
                                                args.out_dir)
    else:
        raise argparse.ArgumentTypeError('Unsupported --dup-marker {}'.format(
            args.dup_marker))

    if args.no_dup_removal:
        nodup_bam = filt_bam
    else:
        temp_files.append(filt_bam)
        log.info('Removing dupes...')
        if args.paired_end:
            nodup_bam = rm_dup_pe(dupmark_bam, args.nth, args.out_dir)
        else:
            nodup_bam = rm_dup_se(dupmark_bam, args.nth, args.out_dir)
        samtools_index(dupmark_bam)
        temp_files.append(dupmark_bam + '.bai')
    temp_files.append(dupmark_bam)

    if len(args.filter_chrs) > 0:
        final_bam = remove_chrs_from_bam(nodup_bam, args.filter_chrs,
                                         args.chrsz, args.nth, args.out_dir)
        temp_files.append(nodup_bam)
    else:
        final_bam = nodup_bam

    log.info('Checking if final BAM file is empty...')
    if bam_is_empty(final_bam, args.nth):
        raise ValueError('No reads found in final (filtered/deduped) BAM. '
                         'Low quality sample? '
                         'Or BAM with duplicates only? ')

    log.info('samtools index (final_bam)...')
    samtools_index(final_bam, args.nth, args.out_dir)

    log.info('samstat...')
    samstat(final_bam, args.nth, args.mem_gb, args.out_dir)

    log.info('Generating PBC QC log...')
    if args.paired_end:
        pbc_qc_pe(dupmark_bam, args.mito_chr_name, args.nth, args.out_dir)
    else:
        pbc_qc_se(dupmark_bam, args.mito_chr_name, args.out_dir)

    log.info('samtools index (raw bam)...')
    bam = copy_f_to_dir(args.bam, args.out_dir)
    bai = samtools_index(bam, args.nth, args.out_dir)
    temp_files.extend([bam, bai])

    log.info('Removing temporary files...')
    rm_f(temp_files)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')