def test_xcor_fraglen(self):
    """Unit tests for common.xcor_fraglen.

    The function appears to read the fragment length from the third
    whitespace-delimited field of a cross-correlation scores file
    (TODO confirm against common.xcor_fraglen itself).
    """
    # Fewer than three fields: indexing the third column must fail.
    with patch('common.open', mock_open(read_data='1 2')) as _:
        self.assertRaises(IndexError, common.xcor_fraglen, 'foo')
    # Three numeric fields: the third field is returned as the fragment
    # length.  Use assertEqual, not the deprecated assertEquals alias.
    with patch('common.open', mock_open(read_data='1 2 3')) as _:
        self.assertEqual(common.xcor_fraglen('foo'), 3)
    # Non-numeric third field: numeric conversion must fail.
    with patch('common.open', mock_open(read_data='1 2 c')) as _:
        self.assertRaises(ValueError, common.xcor_fraglen, 'foo')
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor,
                                 paired_end, chrom_sizes, as_file, blacklist,
                                 rep1_signal, fragment_length=None):
    """Build the stable peak set and QC metrics for a simplicate IDR analysis.

    Downloads the rep1 self-pseudoreplicate IDR peaks, optionally filters
    them against a blacklist to produce the "stable" set, computes FRiP
    for rep1, converts the stable set to bigBed, and uploads the results.

    Args:
        experiment: accession/name string used to construct output filenames.
        r1pr_peaks: dxlink to the rep1 self-pseudoreplicate IDR peaks.
        rep1_ta: dxlink to the rep1 tagAlign file.
        rep1_xcor: dxlink to the rep1 cross-correlation scores file; only
            downloaded when fragment_length is not given.
        paired_end: passed through by the caller; not used in this function.
        chrom_sizes: dxlink to the chrom.sizes file.
        as_file: dxlink to the .as (autosql) file for bedToBigBed.
        blacklist: dxlink to a blacklist BED, or None to skip filtering.
        rep1_signal: optional signal file link passed through to the output.
        fragment_length: if given, overrides the cross-correlation estimate.

    Returns:
        dict of QC metrics and dxlinks suitable for an applet output spec.
    """
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    # If fragment_length is given, override appropriate values.
    # Calculate, or set the actually used fragment length value.
    # Set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    # Log the working-directory listing for debugging, consistent with the
    # sibling functions (the output was previously discarded).
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info(
            "%d peaks blacklisted from the stable set" % (N1-Nsb))
    else:
        # No blacklist: the stable set is just a copy of the pseudorep peaks.
        subprocess.check_output(shlex.split(
            'cp %s %s' % (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP (fraction of reads in peaks)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    r1pr_peaks_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update(
            {"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
def replicated_IDR(experiment,
                   reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
                   rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                   paired_end, chrom_sizes, as_file, blacklist,
                   rep1_signal, rep2_signal, pooled_signal,
                   fragment_length=None):
    """Build conservative and optimal IDR peak sets for a replicated
    experiment and compute the associated QC metrics.

    Downloads the true-replicate, self-pseudoreplicate and pooled
    pseudoreplicate IDR peak files, pools the two tagAligns via the
    'pool' applet, optionally runs a pooled cross-correlation subjob,
    applies the blacklist filter, computes rescue/self-consistency
    ratios and FRiP scores, converts the final sets to bigBed, and
    returns a dict of dxlinks and metrics.

    Args:
        experiment: accession/name string used in output filenames.
        reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks: dxlinks to
            IDR peaks from true reps, rep1/rep2 self-pseudoreps, and
            pooled pseudoreps.
        rep1_ta, rep2_ta: dxlinks to replicate tagAlign files.
        rep1_xcor, rep2_xcor: dxlinks to cross-correlation score files.
        paired_end: forwarded to the pooled cross-correlation subjob.
        chrom_sizes, as_file: dxlinks to chrom.sizes and the .as file.
        blacklist: dxlink to a blacklist BED, or None to skip filtering.
        rep1_signal, rep2_signal, pooled_signal: optional signal links
            passed through to the output for convenience.
        fragment_length: if given, overrides all cross-correlation
            fragment-length estimates and skips the pooled xcor subjob.

    Returns:
        dict of QC metrics and dxlinks suitable for an applet output spec.
    """

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    # Instantiate DNAnexus handlers for all file inputs.
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # Peak files may arrive gzipped; line counting below needs plain text.
    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Launch the 'pool' applet to merge the two replicate tagAligns.
    pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # NOTE(review): this wait blocks before the fragment-length branch;
    # deferring it until the pooled output is actually needed could save time.
    pool_replicates_subjob.wait_on_done()
    # If fragment_length is not given, calculate the fragment_length
    # using crosscorrelation. Else use the overridevalue. Set the
    # pool_xcor_filename to None to accommodate common.frip calls.
    # Calculate, or set, actually used fragment lengths for different
    # cases. Set the flag indicating whether the fragment length
    # was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        # Run cross-correlation on the pooled tagAlign to estimate the
        # pooled fragment length; per-rep estimates come from the
        # already-downloaded per-rep xcor files.
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    # Fetch the pooled tagAlign produced by the pool subjob.
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # Peak counts used for the reproducibility ratios below.
    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    # Standard IDR reproducibility criteria: both ratios must be <= 2
    # to pass; one ratio > 2 is borderline; both > 2 is a failure.
    rescue_ratio            = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio  = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # NOTE(review): these calls pass the raw fragment_length argument
    # (possibly None) rather than the fragment_length_used_* values
    # computed above; presumably common.frip falls back to the xcor file
    # when fraglen is None — confirm against common.frip.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads"           : rep1_n_reads,
        "rep1_frip_nreads_in_peaks"  : rep1_n_reads_in_peaks,
        "F1"            : rep1_frip_score,
        "rep2_frip_nreads"           : rep2_n_reads,
        "rep2_frip_nreads_in_peaks"  : rep2_n_reads_in_peaks,
        "F2"            : rep2_frip_score,
        "true_frip_nreads"           : true_n_reads,
        "true_frip_nreads_in_peaks"  : true_n_reads_in_peaks,
        "Ft"            : true_frip_score,
        "pr_frip_nreads"             : pr_n_reads,
        "pr_frip_nreads_in_peaks"    : pr_n_reads_in_peaks,
        "Fp"              : pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def internal_pseudoreplicate_overlap(rep1_peaks,
                                     rep2_peaks,
                                     pooled_peaks,
                                     rep1_ta,
                                     rep1_xcor,
                                     paired_end,
                                     chrom_sizes,
                                     as_file,
                                     peak_type,
                                     prefix,
                                     fragment_length=None):
    """Naive-overlap analysis for a simplicate (single-replicate) experiment.

    Intersects pooled peaks with the two pseudoreplicate peak sets,
    keeps pooled peaks with >50% fractional overlap against both,
    computes FRiP for rep1, builds bigBed tracks, and uploads results.

    Args:
        rep1_peaks, rep2_peaks: dxlinks to pseudoreplicate peak files.
        pooled_peaks: dxlink to the pooled peak file.
        rep1_ta: dxlink to the rep1 tagAlign file.
        rep1_xcor: dxlink to the rep1 cross-correlation scores file.
        paired_end: passed through by the caller; not used here.
        chrom_sizes, as_file: dxlinks to chrom.sizes and the .as file.
        peak_type: 'narrowPeak', 'gappedPeak' or 'broadPeak'.
        prefix: optional basename for output files; derived from the
            pooled peaks filename when falsy.
        fragment_length: if given, overrides the cross-correlation estimate.

    Returns:
        dict of QC metrics and dxlinks suitable for an applet output spec.
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions.  Raw string so
        # '\.' is a literal-dot regex escape; use the DXFile handler's
        # name (the raw pooled_peaks input is a dxlink with no .name).
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Unconditionally false here; raises AssertionError with a
        # descriptive message for an unrecognized peak_type.
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe(['cat %s' % (overlap_tr_fn), 'sort -u'],
                               overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlapping replicated peak
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn,
        rep1_xcor_fn,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def replicated_overlap(rep1_peaks,
                       rep2_peaks,
                       pooled_peaks,
                       pooledpr1_peaks,
                       pooledpr2_peaks,
                       rep1_ta,
                       rep1_xcor,
                       rep2_ta,
                       rep2_xcor,
                       paired_end,
                       chrom_sizes,
                       as_file,
                       peak_type,
                       prefix,
                       fragment_length=None):
    """Naive-overlap analysis for a replicated experiment.

    Keeps pooled peaks with >50% fractional overlap against both true
    replicates OR both pooled pseudoreplicates, pools the replicate
    tagAligns, computes FRiP on the pooled tagAlign, builds bigBed
    tracks, and uploads the results.

    Args:
        rep1_peaks, rep2_peaks: dxlinks to the per-replicate peak files.
        pooled_peaks: dxlink to the pooled peak file.
        pooledpr1_peaks, pooledpr2_peaks: dxlinks to pooled-pseudorep peaks.
        rep1_ta, rep2_ta: dxlinks to replicate tagAlign files.
        rep1_xcor, rep2_xcor: dxlinks to cross-correlation score files.
        paired_end: forwarded to the pooled cross-correlation subjob.
        chrom_sizes, as_file: dxlinks to chrom.sizes and the .as file.
        peak_type: 'narrowPeak', 'gappedPeak' or 'broadPeak'.
        prefix: optional basename for output files; derived from the
            pooled peaks filename when falsy.
        fragment_length: if given, overrides the cross-correlation
            estimate and skips the pooled xcor subjob.

    Returns:
        dict of QC metrics and dxlinks suitable for an applet output spec.
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions.  Raw string so
        # '\.' is a literal-dot regex escape; use the DXFile handler's
        # name (the raw pooled_peaks input is a dxlink with no .name).
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    # Launch the 'pool' applet to merge the two replicate tagAligns.
    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length from cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
        )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Unconditionally false here; raises AssertionError with a
        # descriptive message for an unrecognized peak_type.
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (pooledpr2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" %
          (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlapping replicated peak
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename,
        pool_xcor_filename,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
# Example 6
def internal_pseudoreplicate_IDR(experiment,
                                 r1pr_peaks,
                                 rep1_ta,
                                 rep1_xcor,
                                 paired_end,
                                 chrom_sizes,
                                 as_file,
                                 blacklist,
                                 rep1_signal,
                                 fragment_length=None):
    """Build the stable peak set for a simplicate (single-replicate) IDR run.

    Downloads the rep1 self-pseudoreplicate IDR peaks and the rep1 tagAlign,
    optionally blacklist-filters the peaks into a "stable set", computes FRiP
    for rep1 reads against that set, and uploads the resulting peak files
    (plus a bigBed for visualization) back to the platform.

    Args:
        experiment: accession string; used to name the stable-set file.
        r1pr_peaks: dxlink to IDR peaks from rep1 self-pseudoreplicates.
        rep1_ta: dxlink to the rep1 tagAlign file.
        rep1_xcor: dxlink to rep1 cross-correlation scores; only downloaded
            when fragment_length is None.
        paired_end: not used in this function's visible code; kept for
            interface parity with the replicated version.
        chrom_sizes: dxlink to the chrom.sizes file.
        as_file: dxlink to the .as file used by bedToBigBed.
        blacklist: dxlink to a (possibly compressed) blacklist BED, or None
            to skip filtering.
        rep1_signal: optional signal-file link passed through to the output.
        fragment_length: user-supplied fragment length override; when None
            the value is estimated from the rep1 xcor file.

    Returns:
        dict of DNAnexus output links and QC metrics (N1, Ns, F1,
        fragment_length_used_rep1, fragment_length_given_by_user, ...).
    """

    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    # If fragment_length is given, override appropriate values.
    # Calculate, or set the actually used fragment length value.
    # Set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    # Debugging aid: list the working directory (output is discarded here).
    subprocess.check_output('set -x; ls -l', shell=True)

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info("%d peaks blacklisted from the stable set" % (N1 - Nsb))
    else:
        # No blacklist: the stable set is just a copy of the r1pr peaks.
        subprocess.check_output(
            shlex.split('cp %s %s' %
                        (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP

    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
            dxpy.dxlink(
                dxpy.upload_local_file(common.compress(r1pr_peaks_filename)))
        })

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update({"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1":
        N1,
        "stable_set":
        dxpy.dxlink(
            dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns":
        Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
# Example #7
# 0
def replicated_IDR(experiment,
                   reps_peaks,
                   r1pr_peaks,
                   r2pr_peaks,
                   pooledpr_peaks,
                   rep1_ta,
                   rep1_xcor,
                   rep2_ta,
                   rep2_xcor,
                   paired_end,
                   chrom_sizes,
                   as_file,
                   blacklist,
                   rep1_signal,
                   rep2_signal,
                   pooled_signal,
                   fragment_length=None):
    """Build conservative and optimal IDR peak sets for a replicated experiment.

    Downloads IDR peaks from true replicates (Nt), each replicate's
    self-pseudoreplicates (N1, N2), and pooled pseudoreplicates (Np);
    pools the two tagAligns in a subjob; optionally blacklist-filters the
    conservative set (true-replicate peaks) and the optimal set (the longer
    of the true-replicate or pooled-pseudoreplicate peak lists); computes
    rescue and self-consistency ratios and a pass/borderline/fail
    reproducibility verdict; computes FRiP for rep1, rep2, true-replicate
    and pooled-pseudoreplicate comparisons; and uploads peak files plus
    bigBeds for visualization.

    Args:
        experiment: accession string; used to name the output peak files.
        reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks: dxlinks to the
            four IDR peak files described above.
        rep1_ta, rep2_ta: dxlinks to the replicate tagAlign files.
        rep1_xcor, rep2_xcor: dxlinks to cross-correlation score files.
        paired_end: forwarded to the pooled cross-correlation subjob.
        chrom_sizes: dxlink to the chrom.sizes file.
        as_file: dxlink to the .as file used by bedToBigBed.
        blacklist: dxlink to a (possibly compressed) blacklist BED, or None.
        rep1_signal, rep2_signal, pooled_signal: optional signal links
            passed through to the output.
        fragment_length: user-supplied fragment length override; when None,
            per-replicate and pooled values are estimated from xcor files.

    Returns:
        dict of DNAnexus output links and QC metrics (Nt, N1, N2, Np, No,
        Nc, F1, F2, Ft, Fp, rescue_ratio, self_consistency_ratio,
        reproducibility_test, ...).
    """

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # TODO(review): this wait could be deferred until the pooled output is
    # first needed (below), overlapping the subjob with the work above.
    pool_replicates_subjob.wait_on_done()
    # If fragment_length is not given, calculate the fragment_length
    # using crosscorrelation. Else use the overridevalue. Set the
    # pool_xcor_filename to None to accommodate common.frip calls.
    # Calculate, or set, actually used fragment lengths for different
    # cases. Set the flag indicating whether the fragment length
    # was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
        )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    # Debugging aid: log the working-directory contents.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info("%d peaks blacklisted from the conservative set" %
                    (Nt - Ncb))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' %
                        (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No - Nob))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' %
                        (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    # Standard ENCODE IDR reproducibility criteria: both ratios <= 2 passes,
    # one > 2 is borderline, both > 2 fails.
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # NOTE(review): these calls pass the raw fragment_length argument (which
    # is None when not user-supplied) rather than the computed
    # fragment_length_used_* values; presumably common.frip falls back to the
    # xcor file when given None — confirm against common.frip.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads": rep1_n_reads,
        "rep1_frip_nreads_in_peaks": rep1_n_reads_in_peaks,
        "F1": rep1_frip_score,
        "rep2_frip_nreads": rep2_n_reads,
        "rep2_frip_nreads_in_peaks": rep2_n_reads_in_peaks,
        "F2": rep2_frip_score,
        "true_frip_nreads": true_n_reads,
        "true_frip_nreads_in_peaks": true_n_reads_in_peaks,
        "Ft": true_frip_score,
        "pr_frip_nreads": pr_n_reads,
        "pr_frip_nreads_in_peaks": pr_n_reads_in_peaks,
        "Fp": pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
            dxpy.dxlink(
                dxpy.upload_local_file(common.compress(reps_peaks_filename))),
            "pre_bl_optimal_set":
            dxpy.dxlink(
                dxpy.upload_local_file(
                    common.compress(peaks_to_filter_filename)))
        })

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt":
        Nt,
        "N1":
        N1,
        "N2":
        N2,
        "Np":
        Np,
        "conservative_set":
        dxpy.dxlink(
            dxpy.upload_local_file(
                common.compress(conservative_set_filename))),
        "optimal_set":
        dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio":
        rescue_ratio,
        "self_consistency_ratio":
        self_consistency_ratio,
        "reproducibility_test":
        reproducibility,
        "No":
        Nob,
        "Nc":
        Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor,
                                     paired_end, chrom_sizes, as_file,
                                     peak_type, prefix, fragment_length=None):
    """Compute overlap-based replicated peaks for a simplicate experiment.

    rep1_peaks and rep2_peaks are peak calls on the two self-pseudoreplicates
    of the single replicate; pooled_peaks are calls on the pool.  Pooled peaks
    overlapping both pseudoreplicates (fractional overlap >= 0.5 with respect
    to either peak in an overlapping pair) are kept; the remainder are
    rejected.  FRiP is computed for rep1 reads against the kept peaks, and
    peak files plus bigBeds are uploaded back to the platform.

    Args:
        rep1_peaks, rep2_peaks, pooled_peaks: dxlinks to peak files.
        rep1_ta: dxlink to the rep1 tagAlign file.
        rep1_xcor: dxlink to rep1 cross-correlation scores (used to estimate
            the fragment length when fragment_length is None).
        paired_end: not used in this function's visible code; kept for
            interface parity with replicated_overlap.
        chrom_sizes: dxlink to the chrom.sizes file.
        as_file: dxlink to the .as file used by bedToBigBed.
        peak_type: one of 'narrowPeak', 'gappedPeak', 'broadPeak'.
        prefix: basename for output files; when falsy it is derived from
            the pooled peaks filename.
        fragment_length: user-supplied fragment length override.

    Returns:
        dict of DNAnexus output links and QC metrics (npeaks_in, npeaks_out,
        npeaks_rejected, frip_score, fragment_length_used, ...).
    """

    rep1_peaks_file      = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file      = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file    = dxpy.DXFile(pooled_peaks)
    rep1_ta_file         = dxpy.DXFile(rep1_ta)
    rep1_xcor_file       = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file     = dxpy.DXFile(chrom_sizes)
    as_file_file         = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn      = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn      = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn    = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn         = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn       = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn     = 'chrom.sizes'
    as_file_fn         = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # Bug fix: use the DXFile handle (pooled_peaks_file); the raw
        # pooled_peaks input is a dxlink and has no .name attribute.
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn    = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    # Debugging aid: log the working-directory contents.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe([
        'cat %s' % (overlap_tr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap support
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):
    """Compute overlap-based replicated peaks for a replicated experiment.

    Pooled peaks are kept if they overlap both true replicates OR both
    pooled pseudoreplicates, where overlap means fractional overlap >= 0.5
    with respect to either peak in an overlapping pair; the remainder are
    rejected.  The two tagAligns are pooled in a subjob, FRiP is computed
    for the pooled reads against the kept peaks, and peak files plus
    bigBeds are uploaded back to the platform.

    Args:
        rep1_peaks, rep2_peaks, pooled_peaks: dxlinks to peak calls on each
            true replicate and on the pool.
        pooledpr1_peaks, pooledpr2_peaks: dxlinks to peak calls on the two
            pooled pseudoreplicates.
        rep1_ta, rep2_ta: dxlinks to the replicate tagAlign files.
        rep1_xcor, rep2_xcor: dxlinks to cross-correlation score files.
        paired_end: forwarded to the pooled cross-correlation subjob.
        chrom_sizes: dxlink to the chrom.sizes file.
        as_file: dxlink to the .as file used by bedToBigBed.
        peak_type: one of 'narrowPeak', 'gappedPeak', 'broadPeak'.
        prefix: basename for output files; when falsy it is derived from
            the pooled peaks filename.
        fragment_length: user-supplied fragment length override; when None
            it is estimated from the pooled cross-correlation.

    Returns:
        dict of DNAnexus output links and QC metrics (npeaks_in, npeaks_out,
        npeaks_rejected, frip_score, fragment_length_used, ...).
    """

    rep1_peaks_file      = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file      = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file    = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file         = dxpy.DXFile(rep1_ta)
    rep2_ta_file         = dxpy.DXFile(rep2_ta)
    rep1_xcor_file       = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file       = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file     = dxpy.DXFile(chrom_sizes)
    as_file_file         = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn      = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn      = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn    = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn         = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn         = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn       = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn       = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn     = 'chrom.sizes'
    as_file_fn         = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # Bug fix: use the DXFile handle (pooled_peaks_file); the raw
        # pooled_peaks input is a dxlink and has no .name attribute.
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn    = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length from cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    # Debugging aid: log the working-directory contents.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print(
        "%d peaks overlap with both pooled pseudoreplicates"
        % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap support
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in        = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out       = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected  = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks       = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb    = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks          = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb       = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output