Esempio n. 1
0
def blacklist_filter(peak, blacklist, keep_irregular_chr, out_dir):
    """Remove peaks overlapping blacklisted regions and gzip the result.

    The output is written to
    ``<out_dir>/<basename of strip_ext(peak)>.bfilt.<get_ext(peak)>.gz``.

    Args:
        peak: Path to the peak file (read with ``zcat -f`` / ``gunzip``).
        blacklist: Path to the blacklist file, or ``''`` to skip filtering.
        keep_irregular_chr: When false, only lines matching the regex
            ``chr[\\dXY]+\\b`` are kept (applied with ``grep -P``).
        out_dir: Directory for the output and the temporary files.

    Returns:
        Path of the filtered, gzip-compressed peak file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    peak_ext = get_ext(peak)
    filtered = '{}.bfilt.{}.gz'.format(prefix, peak_ext)

    if get_num_lines(peak) == 0 or blacklist == '' \
            or get_num_lines(blacklist) == 0:
        # Nothing to filter (empty peak file or no/empty blacklist):
        # just re-compress the input under the output name.
        cmd = 'zcat -f {} | gzip -nc > {}'.format(peak, filtered)
        run_shell_cmd(cmd)
        return filtered

    # Track temporary files as they are created so that they are removed
    # even when a later step (second gunzip, bedtools pipeline) fails.
    tmp_files = []
    try:
        # due to bedtools bug when .gz is given for -a and -b
        # (gunzip presumably returns the path of an uncompressed temporary
        # copy -- confirm against the helper)
        tmp1 = gunzip(peak, 'tmp1', out_dir)
        tmp_files.append(tmp1)
        tmp2 = gunzip(blacklist, 'tmp2', out_dir)
        tmp_files.append(tmp2)

        # -v keeps only records of -a with no overlap in -b.
        cmd = 'bedtools intersect -nonamecheck -v -a {} -b {} | '
        # Cap column 5 at 1000.
        cmd += 'awk \'BEGIN{{OFS="\\t"}} '
        cmd += '{{if ($5>1000) $5=1000; print $0}}\' | '
        if not keep_irregular_chr:
            cmd += 'grep -P \'chr[\\dXY]+\\b\' | '
        cmd += 'gzip -nc > {}'
        cmd = cmd.format(
            tmp1,  # peak
            tmp2,  # blacklist
            filtered)
        run_shell_cmd(cmd)
    finally:
        if tmp_files:
            rm_f(tmp_files)
    return filtered
Esempio n. 2
0
def naive_overlap(basename_prefix, peak1, peak2, peak_pooled, peak_type,
                  nonamecheck, out_dir):
    """Keep pooled peaks that overlap peaks in both ``peak1`` and ``peak2``.

    A pooled peak is kept by each intersect step when the overlap size
    reported by ``intersectBed -wo`` is at least half the length of either
    of the two overlapping records. The result is written to
    ``<out_dir>/<basename_prefix>.overlap.<peak_type>.gz``.

    Args:
        basename_prefix: Basename (without directory) for the output file.
        peak1: Path to the first peak file.
        peak2: Path to the second peak file.
        peak_pooled: Path to the pooled peak file.
        peak_type: One of ``narrowpeak``, ``regionpeak``, ``broadpeak`` or
            ``gappedpeak`` (case-insensitive); selects the awk column
            layout and the number of leading columns kept by ``cut``.
        nonamecheck: When true, ``-nonamecheck`` is passed to intersectBed.
        out_dir: Directory for the output and the temporary files.

    Returns:
        Path of the gzip-compressed overlap peak file.

    Raises:
        ValueError: If ``peak_type`` is not one of the supported types.
    """
    prefix = os.path.join(out_dir, basename_prefix)
    prefix += '.overlap'
    overlap_peak = '{}.{}.gz'.format(prefix, peak_type)

    nonamecheck_param = '-nonamecheck' if nonamecheck else ''

    # In each awk program s1/s2 are the lengths (end - start) of the two
    # records joined by "-wo" and the last referenced column is the overlap
    # size appended by "-wo"; cut_param keeps the columns of the first
    # record only.
    peak_type_lc = peak_type.lower()
    if peak_type_lc in ('narrowpeak', 'regionpeak'):
        awk_param = '{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'
        cut_param = '1-10'
    elif peak_type_lc == 'broadpeak':
        awk_param = '{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'
        cut_param = '1-9'
    elif peak_type_lc == 'gappedpeak':
        awk_param = '{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'
        cut_param = '1-15'
    else:
        raise ValueError('Unsupported peak_type.')

    # Track temporary files as they are created so that they are removed
    # even when a later gunzip or the shell pipeline fails.
    tmp_files = []
    try:
        # due to bedtools bug when .gz is given for -a and -b
        tmp1 = gunzip(peak1, 'tmp1', out_dir)
        tmp_files.append(tmp1)
        tmp2 = gunzip(peak2, 'tmp2', out_dir)
        tmp_files.append(tmp2)
        tmp_pooled = gunzip(peak_pooled, 'tmp_pooled', out_dir)
        tmp_files.append(tmp_pooled)

        # Find pooled peaks that overlap peak1 and peak2
        # where overlap is defined as the fractional overlap
        # wrt any one of the overlapping peak pairs >= 0.5
        cmd1 = 'intersectBed {} -wo '
        cmd1 += '-a {} -b {} | '
        cmd1 += 'awk \'BEGIN{{FS="\\t";OFS="\\t"}} {}\' | '
        cmd1 += 'cut -f {} | sort | uniq | '
        cmd1 += 'intersectBed {} -wo '
        cmd1 += '-a stdin -b {} | '
        cmd1 += 'awk \'BEGIN{{FS="\\t";OFS="\\t"}} {}\' | '
        cmd1 += 'cut -f {} | sort | uniq | gzip -nc > {}'
        cmd1 = cmd1.format(
            nonamecheck_param,
            tmp_pooled,  # peak_pooled
            tmp1,  # peak1
            awk_param,
            cut_param,
            nonamecheck_param,
            tmp2,  # peak2
            awk_param,
            cut_param,
            overlap_peak)
        run_shell_cmd(cmd1)
    finally:
        if tmp_files:
            rm_f(tmp_files)
    return overlap_peak
def frip(ta, peak, out_dir):
    """Compute the fraction of ``ta`` records overlapping peaks and save it.

    The numerator is the line count of
    ``bedtools intersect -a ta -b peak -wa -u`` (0.0 when the peak file has
    no lines); the denominator is ``get_num_lines(ta)``. The ratio is
    written as text to
    ``<out_dir>/<basename of strip_ext(peak)>.frip.qc``.

    Args:
        ta: Path to the read (tag-align) file.
        peak: Path to the peak file.
        out_dir: Directory for the QC file and the temporary files.

    Returns:
        Path of the written QC file.

    Raises:
        ZeroDivisionError: If ``get_num_lines(ta)`` is 0.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)

    # Track temporary files as they are created so that they are removed
    # even when the intersect, the division or the write fails.
    tmp_files = []
    try:
        if get_num_lines(peak) == 0:
            val1 = 0.0
        else:
            # due to bedtools bug when .gz is given for -a and -b
            tmp1 = gunzip(ta, 'tmp1', out_dir)
            tmp_files.append(tmp1)
            tmp2 = gunzip(peak, 'tmp2', out_dir)
            tmp_files.append(tmp2)

            # -wa -u: report each -a record once if it overlaps any -b
            # record, so "wc -l" counts the overlapping reads.
            cmd = 'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
            cmd = cmd.format(
                tmp1,  # ta
                tmp2)  # peak
            # run_shell_cmd presumably returns the command's stdout
            # (here the count printed by wc) -- confirm against the helper.
            val1 = run_shell_cmd(cmd)
        val2 = get_num_lines(ta)
        write_txt(frip_qc, str(float(val1) / float(val2)))
    finally:
        rm_f(tmp_files)
    return frip_qc
Esempio n. 4
0
def blacklist_filter_bam(bam, blacklist, out_dir):
    """Remove BAM alignments that overlap blacklisted regions.

    The output is written to
    ``<out_dir>/<basename of strip_ext_bam(bam)>.bfilt.bam``.

    Args:
        bam: Path to the input BAM file.
        blacklist: Path to the blacklist file, or ``''`` to skip filtering.
        out_dir: Directory for the output and the temporary file.

    Returns:
        Path of the filtered BAM file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filtered = '{}.bfilt.bam'.format(prefix)

    if blacklist == '' or get_num_lines(blacklist) == 0:
        # No blacklist to apply: pass the input through unchanged in content.
        # NOTE(review): this re-compresses the BAM stream with plain gzip;
        # confirm downstream tools accept that in place of the original file.
        cmd = 'zcat -f {} | gzip -nc > {}'.format(bam, filtered)
        run_shell_cmd(cmd)
        return filtered

    # Remove the temporary blacklist copy even when bedtools fails.
    tmp_files = []
    try:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(blacklist, 'tmp2', out_dir)
        tmp_files.append(tmp2)

        # -v keeps only alignments with no overlap in the blacklist.
        cmd = 'bedtools intersect -nonamecheck -v -abam {} -b {} > {}'
        cmd = cmd.format(
            bam,
            tmp2,  # blacklist
            filtered)
        run_shell_cmd(cmd)
    finally:
        if tmp_files:
            rm_f(tmp_files)
    return filtered
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Compute the fraction of shifted ``ta`` records overlapping peaks.

    Each ``ta`` record is first adjusted with ``bedtools slop -s`` using
    ``-l -half_fraglen -r half_fraglen`` (strand-aware), records with
    negative coordinates or start > end are dropped, and the remaining
    records overlapping at least one peak are counted. The count divided by
    ``get_num_lines(ta)`` is written as text to
    ``<out_dir>/<basename of strip_ext(peak)>.frip.qc``.

    Args:
        ta: Path to the read (tag-align) file.
        peak: Path to the peak file.
        chrsz: Path to the genome/chromosome sizes file passed to ``-g``.
        fraglen: Fragment length; half of it (rounded up for odd integer
            values) is used as the shift.
        out_dir: Directory for the QC file and the temporary file.

    Returns:
        Path of the written QC file.

    Raises:
        ZeroDivisionError: If ``get_num_lines(ta)`` is 0.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    # Floor division keeps the value integral on Python 3; true division
    # would hand a fractional base-pair count (e.g. 100.5) to bedtools slop.
    half_fraglen = (fraglen + 1) // 2

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # Remove the temporary peak copy even when the pipeline fails.
        tmp_files = []
        try:
            # due to bedtools bug when .gz is given for -a and -b
            tmp2 = gunzip(peak, 'tmp2', out_dir)
            tmp_files.append(tmp2)

            cmd = 'bedtools slop -i {} -g {} '
            cmd += '-s -l {} -r {} | '
            # Drop records whose coordinates became invalid after slop.
            cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
            cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
            cmd += '-wa -u | wc -l'
            cmd = cmd.format(
                ta, chrsz, -half_fraglen, half_fraglen, tmp2)  # peak
            # run_shell_cmd presumably returns the command's stdout
            # (here the count printed by wc) -- confirm against the helper.
            val1 = run_shell_cmd(cmd)
        finally:
            # Pass a list, matching the other rm_f call sites in this file.
            if tmp_files:
                rm_f(tmp_files)
    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc