Example #1
def blacklist_filter(peak, blacklist, keep_irregular_chr, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    peak_ext = get_ext(peak)
    filtered = '{}.bfilt.{}.gz'.format(prefix, peak_ext)

    if get_num_lines(peak) == 0 or blacklist == '' \
            or get_num_lines(blacklist) == 0:
        cmd = 'zcat -f {} | gzip -nc > {}'.format(peak, filtered)
        run_shell_cmd(cmd)
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp1 = gunzip(peak, 'tmp1', out_dir)
        tmp2 = gunzip(blacklist, 'tmp2', out_dir)

        cmd = 'bedtools intersect -nonamecheck -v -a {} -b {} | '
        cmd += 'awk \'BEGIN{{OFS="\\t"}} '
        cmd += '{{if ($5>1000) $5=1000; print $0}}\' | '
        if not keep_irregular_chr:
            cmd += 'grep -P \'chr[\\dXY]+\\b\' | '
        cmd += 'gzip -nc > {}'
        cmd = cmd.format(
            tmp1,  # peak
            tmp2,  # blacklist
            filtered)
        run_shell_cmd(cmd)
        rm_f([tmp1, tmp2])
    return filtered
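
These examples rely on small helpers from the pipeline's common library (run_shell_cmd, get_num_lines, gunzip, rm_f, strip_ext and friends). A minimal sketch of two of them follows, assuming they behave roughly as their call sites suggest; the real implementations add logging and error handling.

import gzip
import subprocess


def run_shell_cmd(cmd):
    """Sketch: run a bash command string and return its stripped stdout.
    bash is required because some commands use process substitution."""
    return subprocess.run(
        cmd, shell=True, executable='/bin/bash',
        check=True, capture_output=True, text=True).stdout.strip()


def get_num_lines(path):
    """Sketch: count lines in a plain or gzipped text file."""
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rt') as fp:
        return sum(1 for _ in fp)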
def spr_se(ta, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    nlines = int((get_num_lines(ta) + 1) / 2)

    # bash-only
    cmd1 = 'zcat {} | shuf --random-source=<(openssl enc '
    cmd1 += '-aes-256-ctr -pass pass:$(zcat -f {} | wc -c) '
    cmd1 += '-nosalt </dev/zero 2>/dev/null) | '
    cmd1 += 'split -d -l {} - {}.'
    cmd1 = cmd1.format(ta, ta, nlines, prefix)
    run_shell_cmd(cmd1)

    cmd2 = 'gzip -nc {} > {}'
    cmd2 = cmd2.format(tmp_pr1, ta_pr1)
    run_shell_cmd(cmd2)

    cmd3 = 'gzip -nc {} > {}'
    cmd3 = cmd3.format(tmp_pr2, ta_pr2)
    run_shell_cmd(cmd3)

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
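
Hypothetical usage (the file name is a placeholder; zcat, shuf, split and openssl must be on PATH): shuffling the tagAlign and splitting it in half yields two pseudoreplicates of roughly equal depth.

ta_pr1, ta_pr2 = spr_se('rep1.tagAlign.gz', out_dir='.')
# ta_pr1 == './rep1.pr1.tagAlign.gz', ta_pr2 == './rep1.pr2.tagAlign.gz'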
Example #3
def get_num_peaks(peak_file, out_dir='.'):
    '''
    Count the number of peaks (lines) in the peak file, write the count to
    a .num_peak.qc log file and return the path of that log file.
    '''
    basename = os.path.basename(strip_ext_peak(peak_file))
    prefix = os.path.join(out_dir, basename)
    log = '{}.num_peak.qc'.format(prefix)

    with open(log, 'w') as fp:
        fp.write(str(get_num_lines(peak_file)) + '\n')
    return log
Example #4
def spr_pe(ta, pseudoreplication_random_seed, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    nlines = int((get_num_lines(ta) / 2 + 1) / 2)

    if pseudoreplication_random_seed == 0:
        random_seed = run_shell_cmd('zcat -f {ta} | wc -c'.format(ta=ta))
        log.info(
            'Using input file\'s size {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))
    else:
        random_seed = pseudoreplication_random_seed
        log.info(
            'Using a fixed integer {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))

    # bash-only
    run_shell_cmd('zcat -f {ta} | sed \'N;s/\\n/\\t/\' | '
                  'shuf --random-source=<(openssl enc -aes-256-ctr '
                  '-pass pass:{random_seed} -nosalt </dev/zero 2>/dev/null) | '
                  'split -d -l {nlines} - {prefix}.'.format(
                      ta=ta,
                      random_seed=random_seed,
                      nlines=nlines,
                      prefix=prefix,
                  ))

    run_shell_cmd('zcat -f {tmp_pr1} | '
                  'awk \'BEGIN{{OFS="\\t"}} '
                  '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
                  '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
                  '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
                  'gzip -nc > {ta_pr1}'.format(
                      tmp_pr1=tmp_pr1,
                      ta_pr1=ta_pr1,
                  ))

    run_shell_cmd('zcat -f {tmp_pr2} | '
                  'awk \'BEGIN{{OFS="\\t"}} '
                  '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
                  '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
                  '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
                  'gzip -nc > {ta_pr2}'.format(
                      tmp_pr2=tmp_pr2,
                      ta_pr2=ta_pr2,
                  ))

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
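
The paired-end variant first joins each mate pair onto one 12-column line (the sed 'N;s/\n/\t/' step) so that shuffling and splitting never separates mates; the awk printf afterwards restores the two 6-column tagAlign records. A rough pure-Python rendering of that pairing and unpairing step, assuming 6-column tagAlign input:

def pair_mates(lines):
    """Join consecutive R1/R2 tagAlign lines into one 12-column record
    (the role of the sed command above)."""
    it = iter(lines)
    for r1 in it:
        r2 = next(it)
        yield r1.rstrip('\n') + '\t' + r2.rstrip('\n')


def unpair_mates(paired_line):
    """Split a 12-column paired record back into two 6-column tagAlign
    lines (the role of the awk printf above)."""
    fields = paired_line.rstrip('\n').split('\t')
    return '\t'.join(fields[:6]), '\t'.join(fields[6:12])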
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    half_fraglen = (fraglen + 1) // 2  # integer division so bedtools slop gets an integer

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(peak, 'tmp2', out_dir)

        cmd = 'bedtools slop -i {} -g {} '
        cmd += '-s -l {} -r {} | '
        cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
        cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
        cmd += '-wa -u | wc -l'
        cmd = cmd.format(ta, chrsz, -half_fraglen, half_fraglen, tmp2)  # peak
        val1 = run_shell_cmd(cmd)
        rm_f(tmp2)
    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc
def frip(ta, peak, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)

    if get_num_lines(peak) == 0:
        val1 = 0.0
        tmp_files = []
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp1 = gunzip(ta, 'tmp1', out_dir)
        tmp2 = gunzip(peak, 'tmp2', out_dir)

        cmd = 'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
        cmd = cmd.format(
            tmp1,  # ta
            tmp2)  # peak
        val1 = run_shell_cmd(cmd)
        tmp_files = [tmp1, tmp2]
    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    rm_f(tmp_files)
    return frip_qc
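
FRiP (fraction of reads in peaks) is simply the overlap count divided by the total read count. A numeric illustration with made-up counts:

reads_in_peaks = 8000000      # wc -l of the bedtools intersect output (val1)
total_reads = 20000000        # get_num_lines(ta) (val2)
frip = reads_in_peaks / float(total_reads)   # 0.4, the value written to *.frip.qc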
def get_fract_reads_in_regions(reads_bed, regions_bed):
    """Function that takes in bed file of reads and bed file of regions and
    gets fraction of reads sitting in said regions
    """
    # uses new run_shell_cmd
    cmd = "bedtools sort -i {}  | "
    cmd += "bedtools merge -i stdin | "
    cmd += "bedtools intersect -u -nonamecheck -a {} -b stdin | "
    cmd += "wc -l"
    cmd = cmd.format(regions_bed, reads_bed)
    intersect_read_count = int(run_shell_cmd(cmd))
    total_read_count = get_num_lines(reads_bed)
    fract_reads = float(intersect_read_count) / total_read_count

    return intersect_read_count, fract_reads
Example #8
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, use_bwa_mem_for_pe, out_dir):
    basename = os.path.basename(strip_ext_fastq(fastq1))
    prefix = os.path.join(out_dir, basename)
    sam = '{}.sam'.format(prefix)
    badcigar = '{}.badReads'.format(prefix)
    bam = '{}.bam'.format(prefix)

    temp_files = []
    read_len = get_read_length(fastq1)
    if use_bwa_mem_for_pe and read_len >= 70:
        cmd = 'bwa mem -M -t {} {} {} {} | gzip -nc > {}'
        cmd = cmd.format(nth, ref_index_prefix, fastq1, fastq2, sam)
        temp_files.append(sam)
    else:
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)

        cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
            ref_index_prefix, sai1, sai2, fastq1, fastq2, sam)
        temp_files.extend([sai1, sai2, sam])
    run_shell_cmd(cmd)

    cmd2 = 'zcat -f {} | '
    cmd2 += 'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
    cmd2 += '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
    cmd2 += 'n = split(cigar,vals,"[A-Z]"); s = 0; '
    cmd2 += 'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
    cmd2 += 'if (s!=seqlen) print $1"\\t"; }}\' | '
    cmd2 += 'sort | uniq > {}'
    cmd2 = cmd2.format(sam, badcigar)
    run_shell_cmd(cmd2)

    # Remove bad CIGAR read pairs
    if get_num_lines(badcigar) > 0:
        cmd3 = 'zcat -f {} | grep -v -F -f {} | '
        cmd3 += 'samtools view -Su - | samtools sort - -o {} -T {}'
        cmd3 = cmd3.format(sam, badcigar, bam, prefix)
    else:
        cmd3 = 'samtools view -Su {} | samtools sort - -o {} -T {}'
        cmd3 = cmd3.format(sam, bam, prefix)
    run_shell_cmd(cmd3)

    rm_f(temp_files)
    return bam
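
The awk block above flags reads whose CIGAR string, with deletion operations removed, does not account for the full read length; those "bad CIGAR" pairs are then filtered out before sorting. A hedged pure-Python rendering of the same check:

import re


def has_bad_cigar(cigar, seq):
    """Sketch mirroring the awk filter: drop deletion operations, sum the
    remaining operation lengths and compare with the read length."""
    cigar_no_del = re.sub(r'[0-9]+D', '', cigar)
    total = sum(int(n) for n in re.findall(r'[0-9]+', cigar_no_del))
    return total != len(seq)

# has_bad_cigar('36M', 'A' * 36)       -> False (kept)
# has_bad_cigar('20M5I10M', 'A' * 36)  -> True  (35 != 36, read is flagged)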
Example #9
def blacklist_filter_bam(bam, blacklist, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filtered = '{}.bfilt.bam'.format(prefix)

    if blacklist == '' or get_num_lines(blacklist) == 0:
        cmd = 'zcat -f {} | gzip -nc > {}'.format(bam, filtered)
        run_shell_cmd(cmd)
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(blacklist, 'tmp2', out_dir)

        cmd = 'bedtools intersect -nonamecheck -v -abam {} -b {} > {}'
        cmd = cmd.format(
            bam,
            tmp2,  # blacklist
            filtered)
        run_shell_cmd(cmd)
        rm_f([tmp2])
    return filtered
Example #10
def spr_se(ta, pseudoreplication_random_seed, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    nlines = int((get_num_lines(ta) + 1) / 2)

    if pseudoreplication_random_seed == 0:
        random_seed = run_shell_cmd('zcat -f {ta} | wc -c'.format(ta=ta))
        log.info(
            'Using input file\'s size {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))
    else:
        random_seed = pseudoreplication_random_seed
        log.info(
            'Using a fixed integer {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed, ))

    # bash-only
    run_shell_cmd('zcat {ta} | shuf --random-source=<(openssl enc '
                  '-aes-256-ctr -pass pass:{random_seed} '
                  '-nosalt </dev/zero 2>/dev/null) | '
                  'split -d -l {nlines} - {prefix}.'.format(
                      ta=ta,
                      random_seed=random_seed,
                      nlines=nlines,
                      prefix=prefix,
                  ))

    run_shell_cmd('gzip -nc {tmp_pr1} > {ta_pr1}'.format(tmp_pr1=tmp_pr1,
                                                         ta_pr1=ta_pr1))
    run_shell_cmd('gzip -nc {tmp_pr2} > {ta_pr2}'.format(tmp_pr2=tmp_pr2,
                                                         ta_pr2=ta_pr2))

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
def spr_pe(ta, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    nlines = int((get_num_lines(ta) / 2 + 1) / 2)

    # bash-only
    cmd1 = 'zcat -f {} | sed \'N;s/\\n/\\t/\' | '
    cmd1 += 'shuf --random-source=<(openssl enc -aes-256-ctr '
    cmd1 += '-pass pass:$(zcat -f {} | wc -c) '
    cmd1 += '-nosalt </dev/zero 2>/dev/null) | '
    cmd1 += 'split -d -l {} - {}.'
    cmd1 = cmd1.format(ta, ta, nlines, prefix)
    run_shell_cmd(cmd1)

    cmd2 = 'zcat -f {} | '
    cmd2 += 'awk \'BEGIN{{OFS="\\t"}} '
    cmd2 += '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
    cmd2 += '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
    cmd2 += '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
    cmd2 += 'gzip -nc > {}'
    cmd2 = cmd2.format(tmp_pr1, ta_pr1)
    run_shell_cmd(cmd2)

    cmd3 = 'zcat -f {} | '
    cmd3 += 'awk \'BEGIN{{OFS="\\t"}} '
    cmd3 += '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
    cmd3 += '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
    cmd3 += '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
    cmd3 += 'gzip -nc > {}'
    cmd3 = cmd3.format(tmp_pr2, ta_pr2)
    run_shell_cmd(cmd3)

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
Example #12
def peak_to_hammock(peak, out_dir):
    peak_type = get_peak_type(peak)
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_peak(peak)))
    hammock = '{}.{}.hammock'.format(prefix, peak_type)
    hammock_tmp = '{}.tmp'.format(hammock)
    hammock_tmp2 = '{}.tmp2'.format(hammock)
    hammock_gz = '{}.gz'.format(hammock)
    hammock_gz_tbi = '{}.gz.tbi'.format(hammock)

    if get_num_lines(peak) == 0:
        cmd = 'zcat -f {} | gzip -nc > {}'.format(peak, hammock_gz)
        run_shell_cmd(cmd)
        cmd2 = 'touch {}'.format(hammock_gz_tbi)
        run_shell_cmd(cmd2)  # create an empty .tbi so the returned path exists
    else:
        cmd = "zcat -f {} | "
        cmd += "LC_COLLATE=C sort -k1,1V -k2,2n > {}"
        cmd = cmd.format(peak, hammock_tmp)
        run_shell_cmd(cmd)

        with open(hammock_tmp, 'r') as fin, open(hammock_tmp2, 'w') as fout:
            id = 1
            for line in fin:
                lst = line.rstrip().split('\t')

                if peak_type == 'narrowPeak' or peak_type == 'regionPeak':
                    fout.write(
                        '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},'
                        '{0[8]}],id:{1},'.format(lst, id))
                    if len(lst[3]) > 1:
                        fout.write('name:"' + lst[3] + '",')
                    if lst[5] != '.':
                        fout.write('strand:"' + lst[5] + '",')
                    if lst[9] != '-1':
                        fout.write('sbstroke:[' + lst[9] + ']')
                elif peak_type == 'gappedPeak':
                    fout.write(
                        '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},'
                        '{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],'
                        'thick:['.format(lst, id))
                    a = int(lst[1])
                    sizes = lst[10].split(',')
                    starts = lst[11].split(',')
                    for i in range(len(sizes)):
                        fout.write('[{0},{1}],'.format(
                            a + int(starts[i]),
                            a + int(starts[i]) + int(sizes[i])))
                    fout.write(']},')

                    if len(lst[3]) > 1:
                        fout.write('name:"' + lst[3] + '",')
                    if lst[5] != '.':
                        fout.write('strand:"' + lst[5] + '",')
                elif peak_type == 'broadPeak':
                    fout.write(
                        '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],'
                        'id:{1},'.format(lst, id))
                    if len(lst[3]) > 1:
                        fout.write('name:"' + lst[3] + '",')
                    if lst[5] != '.':
                        fout.write('strand:"' + lst[5] + '",')
                else:
                    raise Exception("Unsupported peak_type {}".format(peak))
                id += 1

                fout.write('\n')

        cmd2 = 'zcat -f {} | sort -k1,1 -k2,2n | bgzip -cf > {}'
        cmd2 = cmd2.format(hammock_tmp2, hammock_gz)
        run_shell_cmd(cmd2)
        cmd3 = 'tabix -f -p bed {}'.format(hammock_gz)
        run_shell_cmd(cmd3)

        rm_f([hammock, hammock_tmp, hammock_tmp2])
    return (hammock_gz, hammock_gz_tbi)
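
For orientation, a hedged trace of what the narrowPeak branch above emits for one made-up input line (tabs shown as <TAB>):

# narrowPeak fields: chrom start end name score strand signalValue pValue qValue summit
lst = ['chr1', '100', '500', 'peak1', '1000', '.', '5.2', '10.1', '8.3', '250']
# emitted hammock record (strand '.' is skipped; summit != -1 adds sbstroke):
# chr1<TAB>100<TAB>500<TAB>scorelst:[5.2,10.1,8.3],id:1,name:"peak1",sbstroke:[250]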
def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    shiftsize = -int(round(float(smooth_win) / 2.0))
    temp_files = []

    cmd0 = 'macs2 callpeak '
    cmd0 += '-t {} -f BED -n {} -g {} -p {} '
    cmd0 += '--shift {} --extsize {} '
    cmd0 += '--nomodel -B --SPMR '
    cmd0 += '--keep-dup all --call-summits '
    cmd0 = cmd0.format(ta, prefix, gensz, pval_thresh, shiftsize, smooth_win)
    run_shell_cmd(cmd0)

    cmd3 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd3 += '-c "{}"_control_lambda.bdg '
    cmd3 += '--o-prefix "{}" -m FE '
    cmd3 = cmd3.format(prefix, prefix, prefix)
    run_shell_cmd(cmd3)

    cmd4 = 'bedtools slop -i "{}"_FE.bdg -g {} -b 0 | '
    cmd4 += 'bedClip stdin {} {}'
    cmd4 = cmd4.format(prefix, chrsz, chrsz, fc_bedgraph)
    run_shell_cmd(cmd4)

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    cmd5 = 'LC_COLLATE=C sort -k1,1 -k2,2n {} | ' \
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '\
        '|| prev_chr==$1 && prev_chr_e<=$2)) ' \
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
            fc_bedgraph,
            fc_bedgraph_srt)
    run_shell_cmd(cmd5)
    rm_f(fc_bedgraph)

    cmd6 = 'bedGraphToBigWig {} {} {}'
    cmd6 = cmd6.format(fc_bedgraph_srt, chrsz, fc_bigwig)
    run_shell_cmd(cmd6)
    rm_f(fc_bedgraph_srt)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    cmd7 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd7 += '-c "{}"_control_lambda.bdg '
    cmd7 += '--o-prefix {} -m ppois -S {}'
    cmd7 = cmd7.format(prefix, prefix, prefix, sval)
    run_shell_cmd(cmd7)

    cmd8 = 'bedtools slop -i "{}"_ppois.bdg -g {} -b 0 | '
    cmd8 += 'bedClip stdin {} {}'
    cmd8 = cmd8.format(prefix, chrsz, chrsz, pval_bedgraph)
    run_shell_cmd(cmd8)

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    cmd9 = 'LC_COLLATE=C sort -k1,1 -k2,2n {} | ' \
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '\
        '|| prev_chr==$1 && prev_chr_e<=$2)) ' \
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
            pval_bedgraph,
            pval_bedgraph_srt)
    run_shell_cmd(cmd9)
    rm_f(pval_bedgraph)

    cmd10 = 'bedGraphToBigWig {} {} {}'
    cmd10 = cmd10.format(pval_bedgraph_srt, chrsz, pval_bigwig)
    run_shell_cmd(cmd10)
    rm_f(pval_bedgraph_srt)

    # remove temporary files
    temp_files.append("{}_*".format(prefix))
    rm_f(temp_files)

    return fc_bigwig, pval_bigwig
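
The awk one-liner applied after sorting keeps a bedGraph line only if it does not overlap the previously seen line; every variant of this function below uses the same trick. A rough pure-Python equivalent, assuming tab-separated bedGraph input:

def drop_overlapping_bedgraph_lines(lines):
    """Sketch of the awk filter: keep a line if it is the first one, starts
    a new chromosome, or begins at/after the end of the previous line."""
    prev_chrom, prev_end = None, None
    for line in lines:
        chrom, start, end = line.split('\t')[:3]
        if prev_chrom is None or chrom != prev_chrom or prev_end <= int(start):
            yield line
        # like the awk script, update state for every line, kept or not
        prev_chrom, prev_end = chrom, int(end)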
def main():
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Reproducibility QC...')
    # description for variables
    # N: list of number of peaks in peak files from pseudo replicates
    # Nt: top number of peaks in peak files
    #     from true replicates (rep-x_vs_rep-y)
    # Np: number of peaks in peak files from pooled pseudo replicate
    N = [get_num_lines(peak) for peak in args.peaks_pr]
    if len(args.peaks):
        # multiple replicate case
        num_rep = infer_n_from_nC2(len(args.peaks))
        num_peaks_tr = [get_num_lines(peak) for peak in args.peaks]

        Nt = max(num_peaks_tr)
        Np = get_num_lines(args.peak_ppr)
        rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
        self_consistency_ratio = float(max(N)) / float(min(N))

        Nt_idx = num_peaks_tr.index(Nt)
        label_tr = infer_pair_label_from_idx(num_rep, Nt_idx)

        conservative_set = label_tr
        conservative_peak = args.peaks[Nt_idx]
        N_conservative = Nt
        if Nt > Np:
            optimal_set = conservative_set
            optimal_peak = conservative_peak
            N_optimal = N_conservative
        else:
            optimal_set = "pooled-pr1_vs_pooled-pr2"
            optimal_peak = args.peak_ppr
            N_optimal = Np
    else:
        # single replicate case
        num_rep = 1

        Nt = 0
        Np = 0
        rescue_ratio = 0.0
        self_consistency_ratio = 1.0

        conservative_set = 'rep1-pr1_vs_rep1-pr2'
        conservative_peak = args.peaks_pr[0]
        N_conservative = N[0]
        optimal_set = conservative_set
        optimal_peak = conservative_peak
        N_optimal = N_conservative

    reproducibility = 'pass'
    if rescue_ratio > 2.0 or self_consistency_ratio > 2.0:
        reproducibility = 'borderline'
    if rescue_ratio > 2.0 and self_consistency_ratio > 2.0:
        reproducibility = 'fail'

    log.info('Writing optimal/conservative peak files...')
    optimal_peak_file = os.path.join(
        args.out_dir, '{}optimal_peak.{}.gz'.format(
            (args.prefix + '.') if args.prefix else '', args.peak_type))
    conservative_peak_file = os.path.join(
        args.out_dir, '{}conservative_peak.{}.gz'.format(
            (args.prefix + '.') if args.prefix else '', args.peak_type))
    copy_f_to_f(optimal_peak, optimal_peak_file)
    copy_f_to_f(conservative_peak, conservative_peak_file)

    if args.chrsz:
        log.info('Converting peak to bigbed...')
        peak_to_bigbed(optimal_peak_file, args.peak_type, args.chrsz,
                       args.out_dir)
        peak_to_bigbed(conservative_peak_file, args.peak_type, args.chrsz,
                       args.out_dir)

        log.info('Converting peak to starch...')
        peak_to_starch(optimal_peak_file, args.out_dir)
        peak_to_starch(conservative_peak_file, args.out_dir)

        log.info('Converting peak to hammock...')
        peak_to_hammock(optimal_peak_file, args.out_dir)
        peak_to_hammock(conservative_peak_file, args.out_dir)

    log.info('Writing reproducibility QC log...')
    if args.prefix:
        reproducibility_qc = '{}.reproducibility.qc'.format(args.prefix)
    else:
        reproducibility_qc = 'reproducibility.qc'
    reproducibility_qc = os.path.join(args.out_dir, reproducibility_qc)

    with open(reproducibility_qc, 'w') as fp:
        header = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            'Nt',
            '\t'.join(['N{}'.format(i + 1) for i in range(num_rep)]),
            'Np',
            'N_opt',
            'N_consv',
            'opt_set',
            'consv_set',
            'rescue_ratio',
            'self_consistency_ratio',
            'reproducibility',
        )
        line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            Nt, '\t'.join([str(i) for i in N]), Np, N_optimal, N_conservative,
            optimal_set, conservative_set, rescue_ratio,
            self_consistency_ratio, reproducibility)
        fp.write(header)
        fp.write(line)

    log.info('Calculating (optimal) peak region size QC/plot...')
    region_size_qc, region_size_plot = get_region_size_metrics(
        optimal_peak_file)

    log.info('Calculating number of peaks (optimal)...')
    get_num_peaks(optimal_peak_file)

    log.info('All done.')
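
A numeric illustration of the reproducibility thresholds applied above (all counts are made up):

Nt, Np = 45000, 30000     # best true-replicate pair vs. pooled pseudoreplicates
N = [28000, 52000]        # self-pseudoreplicate peak counts per replicate
rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))   # 1.5
self_consistency_ratio = float(max(N)) / float(min(N))   # ~1.86
# neither ratio exceeds 2.0, so reproducibility == 'pass';
# one ratio > 2.0 gives 'borderline', both > 2.0 give 'fail'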
Example #15
def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, mem_gb,
                       out_dir):
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    shiftsize = -int(round(float(smooth_win) / 2.0))
    temp_files = []

    run_shell_cmd('macs2 callpeak '
                  '-t {ta} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
                  '--shift {shiftsize} --extsize {extsize} '
                  '--nomodel -B --SPMR '
                  '--keep-dup all --call-summits '.format(
                      ta=ta,
                      prefix=prefix,
                      gensz=gensz,
                      pval_thresh=pval_thresh,
                      shiftsize=shiftsize,
                      extsize=smooth_win,
                  ))

    run_shell_cmd('macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
                  '-c "{prefix}"_control_lambda.bdg '
                  '--o-prefix "{prefix}" -m FE '.format(prefix=prefix, ))

    run_shell_cmd('bedtools slop -i "{prefix}"_FE.bdg -g {chrsz} -b 0 | '
                  'bedClip stdin {chrsz} {fc_bedgraph}'.format(
                      prefix=prefix,
                      chrsz=chrsz,
                      fc_bedgraph=fc_bedgraph,
                  ))

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    run_shell_cmd(
        'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {fc_bedgraph} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '
        '|| prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {fc_bedgraph_srt}'.
        format(sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
               fc_bedgraph=fc_bedgraph,
               fc_bedgraph_srt=fc_bedgraph_srt))
    rm_f(fc_bedgraph)

    run_shell_cmd(
        'bedGraphToBigWig {fc_bedgraph_srt} {chrsz} {fc_bigwig}'.format(
            fc_bedgraph_srt=fc_bedgraph_srt,
            chrsz=chrsz,
            fc_bigwig=fc_bigwig,
        ))
    rm_f(fc_bedgraph_srt)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    run_shell_cmd('macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
                  '-c "{prefix}"_control_lambda.bdg '
                  '--o-prefix {prefix} -m ppois -S {sval}'.format(
                      prefix=prefix,
                      sval=sval,
                  ))

    run_shell_cmd('bedtools slop -i "{prefix}"_ppois.bdg -g {chrsz} -b 0 | '
                  'bedClip stdin {chrsz} {pval_bedgraph}'.format(
                      prefix=prefix,
                      chrsz=chrsz,
                      pval_bedgraph=pval_bedgraph,
                  ))

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    run_shell_cmd(
        'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {pval_bedgraph} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '
        '|| prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {pval_bedgraph_srt}'.
        format(
            sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
            pval_bedgraph=pval_bedgraph,
            pval_bedgraph_srt=pval_bedgraph_srt,
        ))
    rm_f(pval_bedgraph)

    run_shell_cmd(
        'bedGraphToBigWig {pval_bedgraph_srt} {chrsz} {pval_bigwig}'.format(
            pval_bedgraph_srt=pval_bedgraph_srt,
            chrsz=chrsz,
            pval_bigwig=pval_bigwig,
        ))
    rm_f(pval_bedgraph_srt)

    # remove temporary files
    temp_files.append("{prefix}_*".format(prefix=prefix))
    rm_f(temp_files)

    return fc_bigwig, pval_bigwig
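
get_gnu_sort_param is assumed to translate the job's memory budget into a GNU sort buffer-size flag; a hypothetical sketch (not the pipeline's actual implementation):

def get_gnu_sort_param(max_mem_bytes, ratio=0.5):
    """Hypothetical: hand GNU sort a -S buffer sized to a fraction of the
    job's memory, e.g. 4 GiB * 0.5 -> '-S 2048M'."""
    mem_mb = int(max_mem_bytes * ratio / (1024 ** 2))
    return '-S {}M'.format(mem_mb) if mem_mb > 0 else ''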
def main():
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')

    # make out_dir (root of all outputs)
    mkdir_p(args.out_dir)

    # control selection for each IP replicate
    log.info('Choosing appropriate control for each IP replicate...')
    num_rep = len(args.tas)
    num_ctl = len(args.ctl_tas)

    # num lines in tagaligns
    depths = [get_num_lines(ta) for ta in args.tas]
    # num lines in control tagaligns
    depths_ctl = [get_num_lines(ctl_ta) for ctl_ta in args.ctl_tas]
    depth_rep_pooled = sum(depths)
    depth_ctl_pooled = sum(depths_ctl)

    # make them dicts including -1 key (meaning pooled one)
    depths = dict(enumerate(depths))
    depths_ctl = dict(enumerate(depths_ctl))

    depths[-1] = depth_rep_pooled
    depths_ctl[-1] = depth_ctl_pooled

    ctl_ta_idx = [0]*num_rep
    if num_ctl == 1:
        # if only one control, use it for all replicates
        pass
    elif args.always_use_pooled_ctl:
        # if --always-use-pooled-ctl, then always use pooled control
        ctl_ta_idx = [-1]*num_rep
    else:
        # if multiple controls,
        # check # of lines in replicate/control tagaligns and
        # apply ctl_depth_ratio

        # make depths dicts including pooled ones

        # check every num lines in every pair of control tagaligns
        # if ratio of two entries in any pair > ctl_depth_ratio then
        # use pooled control for all
        use_pooled_ctl = False
        for i in range(num_ctl):
            for j in range(i+1, num_ctl):
                if depths_ctl[i]/float(depths_ctl[j]) > \
                        args.ctl_depth_ratio or \
                        depths_ctl[j]/float(depths_ctl[i]) > \
                        args.ctl_depth_ratio:
                    use_pooled_ctl = True
                    log.info(
                        'Numbers of reads in controls differ by more than '
                        'a factor of {}. Using pooled controls.'.format(
                            args.ctl_depth_ratio))
                    break

        if use_pooled_ctl:
            # use pooled control for all exp replicates
            ctl_ta_idx = [-1]*num_rep
        else:
            for i in range(num_rep):
                if i > num_ctl-1:
                    ctl_ta_idx[i] = -1  # use pooled control
                elif depths_ctl[i] < depths[i]:
                    log.info(
                        'Fewer reads in control {} than experiment replicate '
                        '{}. Using pooled control for replicate {}.'.format(
                            i+1, i+1, i+1))
                    ctl_ta_idx[i] = -1  # use pooled control
                else:
                    ctl_ta_idx[i] = i

    ctl_ta_subsample = [0] * num_rep
    ctl_ta_subsampled_pooled = 0
    if args.exp_ctl_depth_ratio_limit or args.ctl_depth_limit:
        # subsampling chosen control for each replicate
        for rep in range(num_rep):
            chosen_ctl = ctl_ta_idx[rep]
            depth = depths[rep]
            depth_ctl = depths_ctl[chosen_ctl]
            limit = int(max(depth * args.exp_ctl_depth_ratio_limit, args.ctl_depth_limit))
            if depth_ctl > limit:
                ctl_ta_subsample[rep] = limit

        # subsampling pooled control for pooled replicate
        limit = int(max(depth_rep_pooled * args.exp_ctl_depth_ratio_limit, args.ctl_depth_limit))
        if depth_ctl_pooled > limit:
            ctl_ta_subsampled_pooled = limit

    # write the chosen control index for each replicate
    log.info('Writing idx.txt...')
    out_txt = os.path.join(args.out_dir, args.out_tsv_basename)
    write_txt(out_txt, ctl_ta_idx)

    log.info('Writing subsample txt...')
    out_subsample_txt = os.path.join(args.out_dir, args.out_tsv_subsample_basename)
    write_txt(out_subsample_txt, ctl_ta_subsample)

    log.info('Writing subsample_pooled txt...')
    out_subsample_pooled_txt = os.path.join(args.out_dir, args.out_txt_subsample_pooled_basename)
    write_txt(out_subsample_pooled_txt, ctl_ta_subsampled_pooled)

    log.info('Listing all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
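
A worked example of the control-selection and subsampling rules above, using made-up depths (the thresholds shown are illustrative assumptions, not necessarily the pipeline's defaults):

depths = {0: 30000000, 1: 25000000, -1: 55000000}        # IP replicates + pooled
depths_ctl = {0: 20000000, 1: 90000000, -1: 110000000}   # controls + pooled
# 90e6 / 20e6 = 4.5 exceeds a ctl_depth_ratio of, say, 1.2
# -> the pooled control is used for every replicate (ctl_ta_idx = [-1, -1])
limit = int(max(30000000 * 5.0, 200000000))   # exp_ctl_depth_ratio_limit=5.0, ctl_depth_limit=2e8
# the pooled control depth (110e6) is below this limit, so no subsampling is requested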
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    basename_ta = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         r1_only=False,
                                         out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         out_dir=out_dir)
        basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
        basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
        if len(basename_prefix) > 200:  # UNIX cannot have len(filename) > 255
            basename_prefix = '{}_x_control'.format(basename_ta)
    else:
        basename_prefix = basename_ta
    prefix = os.path.join(out_dir, basename_prefix)
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    temp_files = []

    run_shell_cmd(
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} --keep-dup all -B --SPMR'
        .format(
            ta=ta,
            ctl_param='-c {ctl_ta}'.format(ctl_ta=ctl_ta) if ctl_ta else '',
            prefix=prefix,
            gensz=gensz,
            pval_thresh=pval_thresh,
            shiftsize=0,
            extsize=fraglen,
        ))

    run_shell_cmd('macs2 bdgcmp -t "{prefix}_treat_pileup.bdg" '
                  '-c "{prefix}_control_lambda.bdg" '
                  '--o-prefix "{prefix}" -m FE '.format(prefix=prefix, ))

    run_shell_cmd('bedtools slop -i "{prefix}_FE.bdg" -g {chrsz} -b 0 | '
                  'awk \'{{if ($3 != -1) print $0}}\' |'
                  'bedClip stdin {chrsz} {fc_bedgraph}'.format(
                      prefix=prefix,
                      chrsz=chrsz,
                      fc_bedgraph=fc_bedgraph,
                  ))

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    run_shell_cmd(
        'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {fc_bedgraph} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
        'prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {fc_bedgraph_srt}'.
        format(sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
               fc_bedgraph=fc_bedgraph,
               fc_bedgraph_srt=fc_bedgraph_srt))
    rm_f(fc_bedgraph)

    run_shell_cmd(
        'bedGraphToBigWig {fc_bedgraph_srt} {chrsz} {fc_bigwig}'.format(
            fc_bedgraph_srt=fc_bedgraph_srt,
            chrsz=chrsz,
            fc_bigwig=fc_bigwig,
        ))
    rm_f(fc_bedgraph_srt)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    run_shell_cmd('macs2 bdgcmp -t "{prefix}_treat_pileup.bdg" '
                  '-c "{prefix}_control_lambda.bdg" '
                  '--o-prefix {prefix} -m ppois -S {sval}'.format(
                      prefix=prefix,
                      sval=sval,
                  ))

    run_shell_cmd('bedtools slop -i "{prefix}_ppois.bdg" -g {chrsz} -b 0 | '
                  'awk \'{{if ($3 != -1) print $0}}\' |'
                  'bedClip stdin {chrsz} {pval_bedgraph}'.format(
                      prefix=prefix,
                      chrsz=chrsz,
                      pval_bedgraph=pval_bedgraph,
                  ))

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    run_shell_cmd(
        'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {pval_bedgraph} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
        'prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {pval_bedgraph_srt}'.
        format(
            sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
            pval_bedgraph=pval_bedgraph,
            pval_bedgraph_srt=pval_bedgraph_srt,
        ))
    rm_f(pval_bedgraph)

    run_shell_cmd(
        'bedGraphToBigWig {pval_bedgraph_srt} {chrsz} {pval_bigwig}'.format(
            pval_bedgraph_srt=pval_bedgraph_srt,
            chrsz=chrsz,
            pval_bigwig=pval_bigwig))
    rm_f(pval_bedgraph_srt)

    # remove temporary files
    temp_files.append("{prefix}_*".format(prefix=prefix))
    rm_f(temp_files)

    return fc_bigwig, pval_bigwig
Example #18
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, out_dir):
    basename_ta = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         r1_only=False,
                                         out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         out_dir=out_dir)
        basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
        basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
        if len(basename_prefix) > 200:  # UNIX cannot have len(filename) > 255
            basename_prefix = '{}_x_control'.format(basename_ta)
    else:
        basename_prefix = basename_ta
    prefix = os.path.join(out_dir, basename_prefix)
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    temp_files = []

    cmd0 = ' macs2 callpeak '
    cmd0 += '-t {} {} -f BED -n {} -g {} -p {} '
    cmd0 += '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'
    cmd0 = cmd0.format(ta, '-c {}'.format(ctl_ta) if ctl_ta else '', prefix,
                       gensz, pval_thresh, 0, fraglen)
    run_shell_cmd(cmd0)

    cmd3 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd3 += '-c "{}"_control_lambda.bdg '
    cmd3 += '--o-prefix "{}" -m FE '
    cmd3 = cmd3.format(prefix, prefix, prefix)
    run_shell_cmd(cmd3)

    cmd4 = 'bedtools slop -i "{}"_FE.bdg -g {} -b 0 | '
    cmd4 += 'awk \'{{if ($3 != -1) print $0}}\' |'
    cmd4 += 'bedClip stdin {} {}'
    cmd4 = cmd4.format(prefix, chrsz, chrsz, fc_bedgraph)
    run_shell_cmd(cmd4)

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    cmd5 = 'LC_COLLATE=C sort -k1,1 -k2,2n {} | ' \
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '\
        'prev_chr==$1 && prev_chr_e<=$2)) ' \
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
            fc_bedgraph,
            fc_bedgraph_srt)
    run_shell_cmd(cmd5)
    rm_f(fc_bedgraph)

    cmd6 = 'bedGraphToBigWig {} {} {}'
    cmd6 = cmd6.format(fc_bedgraph_srt, chrsz, fc_bigwig)
    run_shell_cmd(cmd6)
    rm_f(fc_bedgraph_srt)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    cmd7 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd7 += '-c "{}"_control_lambda.bdg '
    cmd7 += '--o-prefix {} -m ppois -S {}'
    cmd7 = cmd7.format(prefix, prefix, prefix, sval)
    run_shell_cmd(cmd7)

    cmd8 = 'bedtools slop -i "{}"_ppois.bdg -g {} -b 0 | '
    cmd8 += 'awk \'{{if ($3 != -1) print $0}}\' |'
    cmd8 += 'bedClip stdin {} {}'
    cmd8 = cmd8.format(prefix, chrsz, chrsz, pval_bedgraph)
    run_shell_cmd(cmd8)

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    cmd9 = 'LC_COLLATE=C sort -k1,1 -k2,2n {} | ' \
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '\
        'prev_chr==$1 && prev_chr_e<=$2)) ' \
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
            pval_bedgraph,
            pval_bedgraph_srt)
    run_shell_cmd(cmd9)
    rm_f(pval_bedgraph)

    cmd10 = 'bedGraphToBigWig {} {} {}'
    cmd10 = cmd10.format(pval_bedgraph_srt, chrsz, pval_bigwig)
    run_shell_cmd(cmd10)
    rm_f(pval_bedgraph_srt)

    # remove temporary files
    temp_files.append("{}_*".format(prefix))
    rm_f(temp_files)

    return fc_bigwig, pval_bigwig
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
           bwa_mem_read_len_limit, rescue_reads_for_bwa_mem, out_dir):
    basename = os.path.basename(strip_ext_fastq(fastq1))
    prefix = os.path.join(out_dir, basename)
    sam = '{}.sam'.format(prefix)
    badcigar = '{}.badReads'.format(prefix)
    bam = '{}.bam'.format(prefix)

    temp_files = []
    read_len = get_read_length(fastq1)

    log.info('Guessed read length of R1 FASTQ: {read_len}'.format(
        read_len=read_len, ))
    if use_bwa_mem_for_pe and read_len >= bwa_mem_read_len_limit:
        log.info('Use bwa mem.')

        cmd = 'bwa mem -M {extra_param} -t {nth} {ref_index_prefix} {fastq1} {fastq2} | gzip -nc > {sam}'.format(
            extra_param='-P' if rescue_reads_for_bwa_mem else '',
            nth=nth,
            ref_index_prefix=ref_index_prefix,
            fastq1=fastq1,
            fastq2=fastq2,
            sam=sam,
        )
        temp_files.append(sam)

    else:
        log.info('Use bwa aln for each (R1 and R2) and then bwa sampe.')
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)

        cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
            ref_index_prefix, sai1, sai2, fastq1, fastq2, sam)
        temp_files.extend([sai1, sai2, sam])
    run_shell_cmd(cmd)

    cmd2 = 'zcat -f {} | '
    cmd2 += 'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
    cmd2 += '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
    cmd2 += 'n = split(cigar,vals,"[A-Z]"); s = 0; '
    cmd2 += 'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
    cmd2 += 'if (s!=seqlen) print $1"\\t"; }}\' | '
    cmd2 += 'sort | uniq > {}'
    cmd2 = cmd2.format(sam, badcigar)
    run_shell_cmd(cmd2)

    # Remove bad CIGAR read pairs
    if get_num_lines(badcigar) > 0:
        run_shell_cmd(
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su /dev/stdin | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
            .format(
                sam=sam,
                badcigar=badcigar,
                bam=bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))
    else:
        run_shell_cmd(
            'samtools view -Su {sam} | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
            .format(
                sam=sam,
                bam=bam,
                prefix=prefix,
                res_param=get_samtools_res_param('sort',
                                                 nth=nth,
                                                 mem_gb=mem_gb),
            ))

    rm_f(temp_files)
    return bam
def main():
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')

    # make out_dir (root of all outputs)
    mkdir_p(args.out_dir)

    # control selection for each IP replicate
    log.info('Choosing appropriate control for each IP replicate...')
    ctl_ta_idx = [0] * len(args.tas)
    if len(args.ctl_tas) == 1:
        # if only one control, use it for all replicates
        pass
    elif args.always_use_pooled_ctl:
        # if --always-use-pooled-ctl, then always use pooled control
        ctl_ta_idx = [-1] * len(args.tas)
    else:
        # if multiple controls,
        # check # of lines in replicate/control tagaligns and
        # apply ctl_depth_ratio

        # num lines in tagaligns
        nlines = [get_num_lines(ta) for ta in args.tas]
        # num lines in control tagaligns
        nlines_ctl = [get_num_lines(ctl_ta) for ctl_ta in args.ctl_tas]

        # check every num lines in every pair of control tagaligns
        # if ratio of two entries in any pair > ctl_depth_ratio then
        # use pooled control for all
        use_pooled_ctl = False
        for i in range(len(nlines_ctl)):
            for j in range(i + 1, len(nlines_ctl)):
                if nlines_ctl[i]/float(nlines_ctl[j]) > \
                        args.ctl_depth_ratio or \
                        nlines_ctl[j]/float(nlines_ctl[i]) > \
                        args.ctl_depth_ratio:
                    use_pooled_ctl = True
                    log.info(
                        'Numbers of reads in controls differ by more than a '
                        'factor of {}. Using pooled controls.'.format(
                            args.ctl_depth_ratio))
                    break

        if use_pooled_ctl:
            # use pooled control for all exp replicates
            ctl_ta_idx = [-1] * len(args.tas)
        else:
            for i in range(len(args.tas)):
                if i > len(args.ctl_tas) - 1:
                    ctl_ta_idx[i] = -1  # use pooled control
                elif nlines_ctl[i] < nlines[i]:
                    log.info(
                        'Fewer reads in control {} than experiment replicate '
                        '{}. Using pooled control for replicate {}.'.format(
                            i + 1, i + 1, i + 1))
                    ctl_ta_idx[i] = -1  # use pooled control
                else:
                    ctl_ta_idx[i] = i

    # log.info('Writing idx.txt...')
    # out_txt = os.path.join(args.out_dir, 'idx.txt')
    # write_txt(out_txt, ctl_ta_idx)
    log.info('Writing ctl_for_repN.tagAlign.gz files...')
    for i, ctl_id in enumerate(ctl_ta_idx):
        rep_id = i + 1
        dest = os.path.join(args.out_dir,
                            'ctl_for_rep{}.tagAlign.gz'.format(rep_id))
        if ctl_id == -1:
            src = args.ctl_ta_pooled[0]
        else:
            src = args.ctl_tas[ctl_id]
        copy_f_to_f(src, dest)

    log.info('Listing all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')