コード例 #1
0
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=None, dx=None):
    """Build trackDb stanza text for a peaks track, plus optional
    length-filtered sub-tracks.

    The primary stanza points at `url`.  For each cutoff in `lowpass`,
    the bigBed referenced by `dx` is downloaded, converted to BED,
    filtered to peaks strictly shorter than the cutoff, converted back
    to bigBed (against mm10.chrom.sizes), re-uploaded, and an extra
    stanza pointing at the new URL is appended.  Intermediate local
    files are removed after each cutoff.

    Parameters:
        accession -- accession string used to name the tracks
        url       -- bigDataUrl of the unfiltered bigBed
        name      -- shortLabel (truncated to 17 characters)
        n         -- track index, also used as the stanza priority
        tracktype -- 'bigBed 6 +' (narrowPeak) or 'bigBed 12 +' (gappedPeak)
        lowpass   -- int or list of ints: peak-length cutoffs in bp
        dx        -- dxpy file handle of the bigBed to filter; required
                     only when lowpass is non-empty

    Returns (stanza_text, n_stanzas).

    Raises ValueError when tracktype matches no known .as schema.
    """
    return_string = \
        "\t\ttrack %s%d\n" % (accession, n) + \
        "\t\tbigDataUrl %s\n" % (url) + \
        "\t\tshortLabel %s\n" % (name[:17]) + \
        "\t\tparent %sviewpeaks on\n" % (accession) + \
        "\t\ttype %s\n" % (tracktype) + \
        "\t\tvisibility dense\n" + \
        "\t\tview PK\n" + \
        "\t\tpriority %d\n\n" % (n)
    n_stanzas = 1
    # Normalize lowpass: None/falsy -> no extra stanzas; bare int -> one
    # cutoff.  (Default changed from a mutable [] to None to avoid the
    # shared-mutable-default pitfall; behavior is unchanged.)
    if not lowpass:
        lowpass = []
    if isinstance(lowpass, int):
        lowpass = [lowpass]
    for (i, cutoff) in enumerate(lowpass, start=1):
        fn = dx.get_id()
        if not os.path.isfile(fn):
            dxpy.download_dxfile(dx.get_id(), fn)
        cutoffstr = '-lt%d' % (cutoff)
        outfn = fn + cutoffstr
        # Log size and line count before filtering.
        print("%s %s %s" % (fn, os.path.getsize(fn),
                            subprocess.check_output('wc -l %s' % (fn), shell=True).split()[0]))
        bed_fn = fn + '.bed'
        common.block_on('bigBedToBed %s %s' % (fn, bed_fn))
        # Keep only peaks whose length ($3-$2) is below the cutoff.
        common.run_pipe([
            'cat %s' % (bed_fn),
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" % (cutoff)], outfn)
        # Log size and line count after filtering.
        print("%s %s %s" % (outfn, os.path.getsize(outfn),
                            subprocess.check_output('wc -l %s' % (outfn), shell=True).split()[0]))
        if tracktype == 'bigBed 6 +':
            as_file = 'narrowPeak.as'
        elif tracktype == 'bigBed 12 +':
            as_file = 'gappedPeak.as'
        else:
            # Fail fast: the original fell through and hit a NameError on
            # as_file at the bed2bb call below.
            raise ValueError("Cannot match tracktype %s to any .as file" % (tracktype))
        bb_fn = common.bed2bb(outfn, 'mm10.chrom.sizes', as_file)
        newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True)
        # sys.maxint exists only on Python 2; fall back to sys.maxsize.
        new_url, headers = newdx.get_download_url(
            duration=getattr(sys, 'maxint', sys.maxsize), preauthenticated=True)

        new_lines = [
            "\t\ttrack %s%d" % (accession, n + i),
            "\t\tbigDataUrl %s" % (new_url),
            "\t\tshortLabel %s" % (name[:17 - len(cutoffstr)] + cutoffstr),
            "\t\tparent %sviewpeaks on" % (accession),
            "\t\ttype %s" % (tracktype),
            "\t\tvisibility dense",
            "\t\tview PK",
            "\t\tpriority %d\n\n" % (n + i)]
        new_stanza = '\n'.join(new_lines)
        return_string += new_stanza
        n_stanzas += 1
        # Clean up local intermediates for this cutoff.
        os.remove(bed_fn)
        os.remove(bb_fn)
        os.remove(outfn)
        os.remove(fn)

    return (return_string, n_stanzas)
コード例 #2
0
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=None, dx=None):
    """Build trackDb stanza text for a peaks track, plus optional
    length-filtered sub-tracks.

    Variant that names extra tracks '%s%dp%d' (accession, n, i) and gives
    them fractional priorities 'n.i', so the filtered tracks sort under
    the parent track.  Unlike the sibling variant, intermediate local
    files are NOT removed after each cutoff.

    Parameters:
        accession -- accession string used to name the tracks
        url       -- bigDataUrl of the unfiltered bigBed
        name      -- shortLabel (truncated to 17 characters)
        n         -- track index, also used as the stanza priority
        tracktype -- 'bigBed 6 +' (narrowPeak) or 'bigBed 12 +' (gappedPeak)
        lowpass   -- int or list of ints: peak-length cutoffs in bp
        dx        -- dxpy file handle of the bigBed to filter; required
                     only when lowpass is non-empty

    Returns (stanza_text, n_stanzas).

    Raises ValueError when tracktype matches no known .as schema.
    """
    return_string = \
        "\t\ttrack %s%d\n" % (accession, n) + \
        "\t\tbigDataUrl %s\n" % (url) + \
        "\t\tshortLabel %s\n" % (name[:17]) + \
        "\t\tparent %sviewpeaks on\n" % (accession) + \
        "\t\ttype %s\n" % (tracktype) + \
        "\t\tvisibility dense\n" + \
        "\t\tview PK\n" + \
        "\t\tpriority %d\n\n" % (n)
    n_stanzas = 1
    # Normalize lowpass: None/falsy -> no extra stanzas; bare int -> one
    # cutoff.  (Default changed from a mutable [] to None to avoid the
    # shared-mutable-default pitfall; behavior is unchanged.)
    if not lowpass:
        lowpass = []
    if isinstance(lowpass, int):
        lowpass = [lowpass]
    for (i, cutoff) in enumerate(lowpass, start=1):
        fn = dx.get_id()
        if not os.path.isfile(fn):
            dxpy.download_dxfile(dx.get_id(), fn)
        cutoffstr = '-lt%d' % (cutoff)
        outfn = fn + cutoffstr
        # Log size and line count before filtering.
        print("%s %s %s" % (fn, os.path.getsize(fn),
                            subprocess.check_output('wc -l %s' % (fn), shell=True).split()[0]))
        bed_fn = fn + '.bed'
        common.block_on('bigBedToBed %s %s' % (fn, bed_fn))
        # Keep only peaks whose length ($3-$2) is below the cutoff.
        common.run_pipe([
            'cat %s' % (bed_fn),
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" % (cutoff)], outfn)
        # Log size and line count after filtering.
        print("%s %s %s" % (outfn, os.path.getsize(outfn),
                            subprocess.check_output('wc -l %s' % (outfn), shell=True).split()[0]))
        if tracktype == 'bigBed 6 +':
            as_file = 'narrowPeak.as'
        elif tracktype == 'bigBed 12 +':
            as_file = 'gappedPeak.as'
        else:
            # Fail fast: the original fell through and hit a NameError on
            # as_file at the bed2bb call below.
            raise ValueError("Cannot match tracktype %s to any .as file" % (tracktype))
        bb_fn = common.bed2bb(outfn, 'mm10.chrom.sizes', as_file)
        newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True)
        # sys.maxint exists only on Python 2; fall back to sys.maxsize.
        new_url, headers = newdx.get_download_url(
            duration=getattr(sys, 'maxint', sys.maxsize), preauthenticated=True)

        new_lines = [
            "\t\ttrack %s%dp%d" % (accession, n, i),
            "\t\tbigDataUrl %s" % (new_url),
            "\t\tshortLabel %s" % (name[:17 - len(cutoffstr)] + cutoffstr),
            "\t\tparent %sviewpeaks on" % (accession),
            "\t\ttype %s" % (tracktype),
            "\t\tvisibility dense",
            "\t\tview PK",
            "\t\tpriority %d.%d\n\n" % (n, i)]
        new_stanza = '\n'.join(new_lines)
        return_string += new_stanza
        n_stanzas += 1

    return (return_string, n_stanzas)
コード例 #3
0
ファイル: spp.py プロジェクト: weng-lab/chip-seq-pipeline
	def upload(self, uploader):
		"""Report stats on the called peaks, then upload results.

		Logs peak counts and a preview of the fixed peaks, uploads the
		bigBed (when self.bigbed is set), the gzipped fixed peaks, and
		the cross-correlation plot and scores via `uploader`.

		Side effects: gzips self.fixed_peaks_fn on disk; rebinds
		self.peaks_bb / self.peaks / self.xcor_plot / self.xcor_scores
		from filenames to whatever uploader.upload() returns.

		Note: the trailing lines of the original had broken mixed
		space/tab indentation (a TabError on Python 3); indentation is
		normalized to tabs here.
		"""
		# Information about called peaks
		n_spp_peaks = common.count_lines(self.peaks_fn)
		print("%s peaks called by spp" % n_spp_peaks)
		print("%s of those peaks removed due to bad coordinates" % (n_spp_peaks - common.count_lines(self.fixed_peaks_fn)))
		print("First 50 peaks")
		print(subprocess.check_output('head -50 %s' % self.fixed_peaks_fn, shell=True, stderr=subprocess.STDOUT))

		# Upload bigBed if applicable
		if self.bigbed:
			self.peaks_bb_fn = common.bed2bb(self.fixed_peaks_fn, self.chrom_sizes.name, self.as_file.name)
			if self.peaks_bb_fn:
				self.peaks_bb = uploader.upload(self.peaks_bb_fn)

		if not filecmp.cmp(self.peaks_fn, self.fixed_peaks_fn):
			print("Returning peaks with fixed coordinates")

		# Upload peaks (gzip in place, then upload the .gz)
		print(subprocess.check_output(shlex.split("gzip %s" % self.fixed_peaks_fn)))
		self.peaks = uploader.upload(self.fixed_peaks_fn + ".gz")

		# Upload cross-correlations
		self.xcor_plot   = uploader.upload(self.xcor_plot)
		self.xcor_scores = uploader.upload(self.xcor_scores)
コード例 #4
0
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor,
                                 paired_end, chrom_sizes, as_file, blacklist,
                                 rep1_signal, fragment_length=None):
    """Produce the IDR "stable set" for an experiment with one replicate.

    Downloads the rep1 self-pseudoreplicate IDR peaks and supporting
    files from the platform, optionally blacklist-filters the peaks,
    computes FRiP for the stable set, uploads the results, and returns
    a dict of dxlinks and QC metrics.

    Parameters (all file arguments are dxpy file links/IDs):
        experiment      -- accession string used to name output files
        r1pr_peaks      -- rep1 self-pseudoreplicate IDR peaks (may be gzipped)
        rep1_ta         -- rep1 tagAlign
        rep1_xcor       -- rep1 cross-correlation scores; only read when
                           fragment_length is None
        paired_end      -- NOTE(review): accepted but unused in this function
        chrom_sizes     -- chromosome sizes file
        as_file         -- .as schema for bedToBigBed
        blacklist       -- optional blacklist BED (None to skip filtering)
        rep1_signal     -- optional signal-track link passed through to output
        fragment_length -- optional user override; when given, the xcor file
                           is not downloaded and this value is used for FRiP

    Returns a dict with N1/Ns peak counts, FRiP metrics, fragment-length
    bookkeeping, and dxlinks to the uploaded stable set (plus its bigBed
    when bedToBigBed succeeds, and the pre-blacklist set when filtering).
    """

    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    # NOTE: rep1_ta is rebound from a link to a DXFile handle here.
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    # If fragment_length is given, override appropriate values.
    # Calculate, or set the actually used fragment length value.
    # Set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    # NOTE(review): output of this ls is discarded — presumably intended
    # only as a debugging aid in the job log via set -x.
    subprocess.check_output('set -x; ls -l', shell=True)

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info(
            "%d peaks blacklisted from the stable set" % (N1-Nsb))
    else:
        # No blacklist: the stable set is just a copy of the r1pr peaks.
        subprocess.check_output(shlex.split(
            'cp %s %s' % (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP

    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    r1pr_peaks_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update(
            {"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
コード例 #5
0
def replicated_IDR(experiment,
                   reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
                   rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                   paired_end, chrom_sizes, as_file, blacklist,
                   rep1_signal, rep2_signal, pooled_signal,
                   fragment_length=None):
    """Assemble conservative/optimal IDR peak sets for a two-replicate
    experiment and compute reproducibility QC.

    Downloads the four pre-computed IDR peak sets (true reps, each
    self-pseudoreplicate pair, pooled pseudoreplicates) plus supporting
    files, pools the replicate tagAligns via the 'pool' applet,
    optionally blacklist-filters the conservative (true-rep) and optimal
    (longer of true-rep vs pooled-pseudorep) sets, computes rescue and
    self-consistency ratios plus FRiP, uploads results, and returns a
    dict of dxlinks and QC metrics.

    Parameters (file arguments are dxpy file links/IDs):
        experiment       -- accession string used to name output files
        reps_peaks       -- IDR peaks from true replicates (Nt)
        r1pr_peaks       -- IDR peaks from rep1 self-pseudoreplicates (N1)
        r2pr_peaks       -- IDR peaks from rep2 self-pseudoreplicates (N2)
        pooledpr_peaks   -- IDR peaks from pooled pseudoreplicates (Np)
        rep1_ta/rep2_ta  -- replicate tagAligns (also pooled here)
        rep1_xcor/rep2_xcor -- cross-correlation score files
        paired_end       -- forwarded to xcor_only for the pooled tagAlign
        chrom_sizes      -- chromosome sizes file
        as_file          -- .as schema for bedToBigBed
        blacklist        -- optional blacklist BED (None to skip filtering)
        rep1_signal, rep2_signal, pooled_signal -- optional signal-track
                            links passed through to output
        fragment_length  -- optional user override; when given, pooled
                            cross-correlation is skipped

    Returns a dict with peak counts (Nt/N1/N2/Np/No/Nc), FRiP metrics,
    rescue/self-consistency ratios, the pass/borderline/fail
    reproducibility verdict, and dxlinks to the uploaded sets.
    """

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Launch the 'pool' applet to merge the two replicate tagAligns;
    # its pooled output feeds both xcor (below) and FRiP.
    pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # next call could be on 267 and save time?
    pool_replicates_subjob.wait_on_done()
    # If fragment_length is not given, calculate the fragment_length
    # using crosscorrelation. Else use the overridevalue. Set the
    # pool_xcor_filename to None to accommodate common.frip calls.
    # Calculate, or set, actually used fragment lengths for different
    # cases. Set the flag indicating whether the fragment length
    # was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    # Both ratios > 2 fails reproducibility; exactly one > 2 is borderline.
    rescue_ratio            = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio  = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # NOTE(review): these calls pass the raw fragment_length argument
    # (None when not supplied by the user) rather than the
    # fragment_length_used_* values computed above, unlike
    # internal_pseudoreplicate_IDR — confirm that common.frip derives
    # the length from the xcor file when passed None.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads"           : rep1_n_reads,
        "rep1_frip_nreads_in_peaks"  : rep1_n_reads_in_peaks,
        "F1"            : rep1_frip_score,
        "rep2_frip_nreads"           : rep2_n_reads,
        "rep2_frip_nreads_in_peaks"  : rep2_n_reads_in_peaks,
        "F2"            : rep2_frip_score,
        "true_frip_nreads"           : true_n_reads,
        "true_frip_nreads_in_peaks"  : true_n_reads_in_peaks,
        "Ft"            : true_frip_score,
        "pr_frip_nreads"             : pr_n_reads,
        "pr_frip_nreads_in_peaks"    : pr_n_reads_in_peaks,
        "Fp"              : pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
コード例 #6
0
def main(rep1_peaks,
         rep2_peaks,
         pooled_peaks,
         pooledpr1_peaks,
         pooledpr2_peaks,
         chrom_sizes,
         as_file,
         peak_type,
         prefix=None,
         rep1_signal=None,
         rep2_signal=None,
         pooled_signal=None):
    """Applet entry point: compute replicated and rejected peak sets.

    Downloads the two replicate peak sets, the pooled set, and the two
    pooled-pseudoreplicate sets; keeps pooled peaks that overlap both
    true replicates OR both pooled pseudoreplicates, where overlap
    requires >= 50% fractional overlap of either peak in the pair.
    Builds bigBeds for the replicated and rejected sets, uploads
    everything, and returns a dict of dxlinks and peak counts.

    Parameters (file arguments are dxpy file links/IDs):
        rep1_peaks, rep2_peaks -- per-replicate peak calls
        pooled_peaks           -- peaks called on the pooled replicates
        pooledpr1_peaks, pooledpr2_peaks -- peaks from pooled pseudoreplicates
        chrom_sizes            -- chromosome sizes file
        as_file                -- .as schema for bedToBigBed
        peak_type              -- 'narrowPeak', 'gappedPeak' or 'broadPeak'
        prefix                 -- optional basename for output files
                                  (defaults to pooled peaks name minus
                                  peak/compression extensions)
        rep1_signal, rep2_signal, pooled_signal -- optional signal-track
                                  links passed through to output

    Returns a dict with dxlinks to the (gzipped) replicated and rejected
    peak sets, their bigBeds, and npeaks_in/out/rejected counts.
    """

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances

    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    #Input filenames - necessary to define each explicitly because input files could have the same name, in which case subsequent
    #file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks.name)  #strip off the peak and compression extensions
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)
    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''
    #the only difference between the peak_types is how the extra columns are handled
    # For each type: s1/s2 are the lengths of the two overlapping peaks and
    # $21/$31/$19 is intersectBed's -wo overlap-width column; a pair counts
    # as overlapping when the overlap covers >= 50% of either peak.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # NOTE(review): assert is stripped under python -O; an explicit
        # raise would guarantee this check always runs.
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" % (
        common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (pooledpr2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" % (
        common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" % (
        common.count_lines(overlapping_peaks_fn))

    #rejected peaks
    # Pooled peaks with no entry (-v) in the replicated set are rejected.
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print "%d peaks were rejected" % (common.count_lines(rejected_peaks_fn))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    #make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn    = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        'npeaks_rejected': npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
コード例 #7
0
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor,
                                     paired_end, chrom_sizes, as_file,
                                     peak_type, prefix, fragment_length=None):
    """Overlap self-pseudoreplicate peaks for a simplicate analysis.

    Downloads the two pseudoreplicate peak sets and the pooled peak set,
    keeps pooled peaks that overlap both pseudoreplicates (overlap means
    >= 50% fractional overlap w.r.t. either peak of an overlapping
    pair), computes FRiP for the replicate's tagAlign, builds bigBed
    tracks, and uploads all outputs back to the platform.

    Args:
        rep1_peaks, rep2_peaks: dxlinks to the pseudoreplicate peak files.
        pooled_peaks: dxlink to the peaks called on the pooled reads.
        rep1_ta: dxlink to the replicate's tagAlign file (used for FRiP).
        rep1_xcor: dxlink to the cross-correlation scores file.
        paired_end: accepted for interface compatibility; not used here.
        chrom_sizes: dxlink to the chrom.sizes file.
        as_file: dxlink to the autoSql (.as) file for bedToBigBed.
        peak_type: one of 'narrowPeak', 'gappedPeak' or 'broadPeak'.
        prefix: basename for output files; derived from the pooled peaks
            filename when falsy.
        fragment_length: optional user-supplied fragment length that
            overrides the estimate read from the xcor scores file.

    Returns:
        dict mapping applet output field names to dxlinks and statistics.

    Raises:
        ValueError: if peak_type is not one of the recognized types.
    """

    rep1_peaks_file      = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file      = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file    = dxpy.DXFile(pooled_peaks)
    rep1_ta_file         = dxpy.DXFile(rep1_ta)
    rep1_xcor_file       = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file     = dxpy.DXFile(chrom_sizes)
    as_file_file         = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn      = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn      = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn    = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn         = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn       = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn     = 'chrom.sizes'
    as_file_fn         = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # Fix: use the DXFile handler's .name here -- the raw dxlink
        # input (pooled_peaks) has no .name attribute.
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn    = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Raise explicitly: an assert would be stripped under python -O
        # and the awk/cut/bed_type variables would be undefined below.
        raise ValueError(
            "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak."
            % (peak_type))

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe([
        'cat %s' % (overlap_tr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap in the replicated set
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
コード例 #8
0
        def process(self):
                '''
                #find pooled peaks that are in (rep1 AND rep2)
                out, err = common.run_pipe([
                        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
                        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
                        ], overlap_tr_fn)
                print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

                #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
                out, err = common.run_pipe([
                        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
                        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
                        ], overlap_pr_fn)
                print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

                #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
                out, err = common.run_pipe([
                        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
                        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
                        ], overlapping_peaks_fn)
                print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
                '''
                #the only difference between the peak_types is how the extra columns are handled
                if self.peak_type == "narrowPeak":
                        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
                        cut_command = 'cut -f 1-10'
                        bed_type = 'bed6+4'
                elif self.peak_type == "gappedPeak":
                        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
                        cut_command = 'cut -f 1-15'
                        bed_type = 'bed12+3'
                elif self.peak_type == "broadPeak":
                        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
                        cut_command = 'cut -f 1-9'
                        bed_type = 'bed6+3'
                else:
                        print "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak."
                        sys.exit()

                # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined 1bp
                out, err = common.run_pipe([
                        'intersectBed -wo -a %s -b %s' %(self.pooled_peaks_fn, self.rep1_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u',
                        'intersectBed -wo -a stdin -b %s' %(self.rep2_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u'
                        ], self.overlap_tr_fn)
                print "%d peaks overlap with both true replicates" %(common.count_lines(self.overlap_tr_fn))

                # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap is defined as 1bp
                out, err = common.run_pipe([
                        'intersectBed -wo -a %s -b %s' %(self.pooled_peaks_fn, self.pooledpr1_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u',
                        'intersectBed -wo -a stdin -b %s' %(self.pooledpr2_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u'
                        ], self.overlap_pr_fn)
                print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(self.overlap_pr_fn))

                # Combine peak lists
                out, err = common.run_pipe([
                        'cat %s %s' %(self.overlap_tr_fn, self.overlap_pr_fn),
                        'sort -u'
                        ], self.overlapping_peaks_fn)
                print "%d peaks overlap with true replicates or with pooled pseudorepliates" %(common.count_lines(self.overlapping_peaks_fn))

                #rejected peaks
                out, err = common.run_pipe([
                        'intersectBed -wa -v -a %s -b %s' %(self.pooled_peaks_fn, self.overlapping_peaks_fn)
                        ], self.rejected_peaks_fn)
                print "%d peaks were rejected" %(common.count_lines(self.rejected_peaks_fn))

                self.npeaks_in 		= common.count_lines(common.uncompress(self.pooled_peaks_fn))
                self.npeaks_out 		= common.count_lines(self.overlapping_peaks_fn)
                self.npeaks_rejected = common.count_lines(self.rejected_peaks_fn)

                #make bigBed files for visualization
                self.overlapping_peaks_bb_fn = common.bed2bb(self.overlapping_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type)
                self.rejected_peaks_bb_fn 	= common.bed2bb(self.rejected_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type)
コード例 #9
0
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None, prefix=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    final_peaks_filename = peaks_filename + '.gz' #spp adds .gz, so this is the file name that's actually created
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    fraglen_column = 3 # third column in the cross-correlation scores input file
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        print "Read fragment length: %d" %(fragment_length)

    #run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = '/phantompeakqualtools/spp_1.10.1.tar.gz'
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    #install spp
    subprocess.check_call('ls -l', shell=True)
    subprocess.check_call(shlex.split('R CMD INSTALL %s' %(spp_tarball)))
    spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" %(run_spp, cpu_count(), experiment_filename, control_filename, npeaks, fragment_length, peaks_filename, xcor_plot_filename, xcor_scores_filename)
    print spp_command
    # process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    # for line in iter(process.stdout.readline, ''):
    #     sys.stdout.write(line)
    subprocess.check_call(shlex.split(spp_command))

    #when one of the peak coordinates are an exact multiple of 10, spp (R) outputs the coordinate in scientific notation
    #this changes any such coodinates to decimal notation
    #this assumes 10-column output and that the 2nd and 3rd columns are coordinates
    #slopBed adjusts feature end coordinates that go off the end of the chromosome
    #bedClip removes any features that are still not within the boundaries of the chromosome

    fix_coordinate_peaks_filename = output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" %(final_peaks_filename),
        "tee %s" %(peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' %(chrom_sizes_filename),
        'bedClip stdin %s %s' %(chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    #These lines transfer the peaks files to the temporary workspace for debugging later
    #Only at the end are the final files uploaded that will be returned from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" %(n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" %(n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename))
    print "First 50 peaks"
    print subprocess.check_output('head -50 %s' %(fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename,fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split('gzip %s' %(fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
コード例 #10
0
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks, pooledpr2_peaks,
         chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances

    rep1_peaks      = dxpy.DXFile(rep1_peaks)
    rep2_peaks      = dxpy.DXFile(rep2_peaks)
    pooled_peaks    = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes     = dxpy.DXFile(chrom_sizes)
    as_file         = dxpy.DXFile(as_file)

    #Input filenames - necessary to define each explicitly because input files could have the same name, in which case subsequent
    #file would overwrite previous file
    rep1_peaks_fn       = 'rep1-%s' %(rep1_peaks.name)
    rep2_peaks_fn       = 'rep2-%s' %(rep2_peaks.name)
    pooled_peaks_fn     = 'pooled-%s' %(pooled_peaks.name)
    pooledpr1_peaks_fn  = 'pooledpr1-%s' %(pooledpr1_peaks.name)
    pooledpr2_peaks_fn  = 'pooledpr2-%s' %(pooledpr2_peaks.name)
    chrom_sizes_fn      = 'chrom.sizes'
    as_file_fn          = '%s.as' %(peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        m = re.match('(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' %(peak_type), pooled_peaks.name) #strip off the peak and compression extensions
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn    = '%s.replicated.%s' %(basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' %(basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn   = 'replicated_tr.%s' %(peak_type)
    overlap_pr_fn   = 'replicated_pr.%s' %(peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''
    #the only difference between the peak_types is how the extra columns are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' %(overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))

    #rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %(pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print "%d peaks were rejected" %(common.count_lines(rejected_peaks_fn))

    npeaks_in       = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out      = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    #make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn    = common.bed2bb(rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn    = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks       = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb    = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks          = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb       = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        'npeaks_rejected'       : npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
コード例 #11
0
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

        # The following line(s) download your file inputs to the local file system
        # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    output_filename_prefix = experiment_filename.rstrip(".gz").rstrip(".tagAlign")
    peaks_filename = output_filename_prefix + ".regionPeak"
    final_peaks_filename = peaks_filename + ".gz"  # spp adds .gz, so this is the file name that's actually created
    xcor_plot_filename = output_filename_prefix + ".pdf"
    xcor_scores_filename = output_filename_prefix + ".ccscores"

    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)

    fraglen_column = 3  # third column in the cross-correlation scores input file
    with open(xcor_scores_input_filename, "r") as f:
        line = f.readline()
        fragment_length = int(line.split("\t")[fraglen_column - 1])
        print "Read fragment length: %d" % (fragment_length)

        # run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = "/phantompeakqualtools/spp_1.10.1.tar.gz"
    if nodups:
        run_spp = "/phantompeakqualtools/run_spp_nodups.R"
    else:
        run_spp = "/phantompeakqualtools/run_spp.R"
        # install spp
    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)
    print subprocess.check_output(shlex.split("R CMD INSTALL %s" % (spp_tarball)), stderr=subprocess.STDOUT)
    spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" % (
        run_spp,
        cpu_count(),
        experiment_filename,
        control_filename,
        npeaks,
        fragment_length,
        peaks_filename,
        xcor_plot_filename,
        xcor_scores_filename,
    )
    print spp_command
    process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    for line in iter(process.stdout.readline, ""):
        sys.stdout.write(line)

        # when one of the peak coordinates are an exact multiple of 10, spp (R) outputs the coordinate in scientific notation
        # this changes any such coodinates to decimal notation
        # this assumes 10-column output and that the 2nd and 3rd columns are coordinates
        # slopBed adjusts feature end coordinates that go off the end of the chromosome
        # bedClip removes any features that are still not within the boundaries of the chromosome

    fix_coordinate_peaks_filename = output_filename_prefix + ".fixcoord.regionPeak"

    out, err = common.run_pipe(
        [
            "gzip -dc %s" % (final_peaks_filename),
            "tee %s" % (peaks_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
            "slopBed -i stdin -g %s -b 0" % (chrom_sizes_filename),
            "bedClip stdin %s %s" % (chrom_sizes_filename, fix_coordinate_peaks_filename),
        ]
    )

    # These lines transfer the peaks files to the temporary workspace for debugging later
    # Only at the end are the final files uploaded that will be returned from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" % (n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" % (
        n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)
    )
    print "First 50 peaks"
    print subprocess.check_output("head -50 %s" % (fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split("gzip %s" % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + ".gz"

    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
コード例 #12
0
def internal_pseudoreplicate_IDR(experiment,
                                 r1pr_peaks,
                                 rep1_ta,
                                 rep1_xcor,
                                 paired_end,
                                 chrom_sizes,
                                 as_file,
                                 blacklist,
                                 rep1_signal,
                                 fragment_length=None):
    """Build the IDR "stable set" for a single-replicate experiment.

    Downloads the rep1 self-pseudoreplicate IDR peaks, optionally filters
    them against a blacklist, computes FRiP over the stable set, and
    uploads the compressed stable set (plus a bigBed version, when
    bedToBigBed succeeds) back to the platform.

    Args:
        experiment: accession/name string used to name the output files.
        r1pr_peaks: dxlink to the rep1 self-pseudoreplicate IDR peaks
            (possibly gzipped narrowPeak).
        rep1_ta: dxlink to the rep1 tagAlign file (reads used for FRiP).
        rep1_xcor: dxlink to the rep1 cross-correlation scores; only
            downloaded when fragment_length is None.
        paired_end: not referenced in this function body; presumably kept
            for interface parity with replicated_IDR -- TODO confirm.
        chrom_sizes: dxlink to the chrom.sizes file.
        as_file: dxlink to the autoSql (.as) file for bedToBigBed.
        blacklist: dxlink to a (gzipped) blacklist BED, or None to skip
            blacklist filtering.
        rep1_signal: optional signal-track input passed through unchanged
            to the output for convenience.
        fragment_length: if not None, overrides the fragment length that
            would otherwise be estimated from the cross-correlation file.

    Returns:
        dict of applet outputs: peak counts (N1, Ns), FRiP numbers (F1 and
        read counts), fragment-length bookkeeping, dxlinks to the stable
        set (bed.gz and optional bigBed), and pass-through signal.
    """

    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    # NOTE: rep1_ta is rebound from a dxlink to a DXFile handler here.
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        # Blacklists are distributed gzipped; work with the plain BED.
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    # If fragment_length is given, override appropriate values.
    # Calculate, or set the actually used fragment length value.
    # Set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        # User override: skip downloading the xcor file entirely and pass
        # the explicit length through to common.frip below.
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    # NOTE(review): the return value is discarded here; replicated_IDR logs
    # the same listing -- presumably this relies on the trace reaching the
    # job log via stderr. Confirm before relying on it for debugging.
    subprocess.check_output('set -x; ls -l', shell=True)

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info("%d peaks blacklisted from the stable set" % (N1 - Nsb))
    else:
        # No blacklist: the stable set is just a copy of the r1pr peaks.
        subprocess.check_output(
            shlex.split('cp %s %s' %
                        (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP

    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
            dxpy.dxlink(
                dxpy.upload_local_file(common.compress(r1pr_peaks_filename)))
        })

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update({"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1":
        N1,
        "stable_set":
        dxpy.dxlink(
            dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns":
        Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
コード例 #13
0
def replicated_IDR(experiment,
                   reps_peaks,
                   r1pr_peaks,
                   r2pr_peaks,
                   pooledpr_peaks,
                   rep1_ta,
                   rep1_xcor,
                   rep2_ta,
                   rep2_xcor,
                   paired_end,
                   chrom_sizes,
                   as_file,
                   blacklist,
                   rep1_signal,
                   rep2_signal,
                   pooled_signal,
                   fragment_length=None):
    """Compute ENCODE IDR reproducibility outputs for a 2-replicate experiment.

    Consumes pre-computed IDR peak sets (true replicates, each replicate's
    self-pseudoreplicates, and the pooled pseudoreplicates), applies the
    optional blacklist to derive the conservative and optimal sets, computes
    rescue and self-consistency ratios plus the pass/borderline/fail verdict,
    calculates FRiP for the four comparisons, and uploads the resulting
    files back to the platform.

    Args:
        experiment: accession/name string used to name the output files.
        reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks: dxlinks to the
            (possibly gzipped) IDR peak files for the four comparisons.
        rep1_ta, rep2_ta: dxlinks to the replicate tagAlign files.
        rep1_xcor, rep2_xcor: dxlinks to the replicate cross-correlation
            score files.
        paired_end: forwarded to the xcor_only subjob for the pooled reads.
        chrom_sizes: dxlink to the chrom.sizes file.
        as_file: dxlink to the autoSql (.as) file for bedToBigBed.
        blacklist: dxlink to a (gzipped) blacklist BED, or None to skip
            blacklist filtering.
        rep1_signal, rep2_signal, pooled_signal: optional signal tracks
            passed through unchanged to the output.
        fragment_length: if not None, overrides the fragment lengths that
            would otherwise be estimated from the cross-correlation files.

    Returns:
        dict of applet outputs: peak counts (Nt, N1, N2, Np, No, Nc), FRiP
        results, ratios, the reproducibility verdict, dxlinks to the
        conservative/optimal sets (bed.gz and optional bigBed), and
        pass-through signals.
    """

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        # Blacklists are distributed gzipped; work with the plain BED.
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The downstream line counts and blacklist filtering need plain text.
    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Pool the two replicates' tagAligns in a subjob; the pooled reads are
    # needed for the true-rep and pooled-pseudorep FRiP calculations below.
    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # next call could be on 267 and save time?
    pool_replicates_subjob.wait_on_done()
    # If fragment_length is not given, calculate the fragment_length
    # using crosscorrelation. Else use the overridevalue. Set the
    # pool_xcor_filename to None to accommodate common.frip calls.
    # Calculate, or set, actually used fragment lengths for different
    # cases. Set the flag indicating whether the fragment length
    # was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        # Cross-correlate the pooled reads in a subjob to estimate the
        # pooled fragment length.
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
        )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info("%d peaks blacklisted from the conservative set" %
                    (Nt - Ncb))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' %
                        (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No - Nob))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' %
                        (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    # Reproducibility metrics: rescue ratio compares pooled-pseudorep vs
    # true-rep peak counts; self-consistency compares the two replicates'
    # self-pseudoreplicate counts.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # NOTE(review): these calls pass the raw fragment_length argument (None
    # unless the user overrode it), not the fragment_length_used_* values;
    # presumably common.frip falls back to the xcor file when it receives
    # None -- confirm against common.frip.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads": rep1_n_reads,
        "rep1_frip_nreads_in_peaks": rep1_n_reads_in_peaks,
        "F1": rep1_frip_score,
        "rep2_frip_nreads": rep2_n_reads,
        "rep2_frip_nreads_in_peaks": rep2_n_reads_in_peaks,
        "F2": rep2_frip_score,
        "true_frip_nreads": true_n_reads,
        "true_frip_nreads_in_peaks": true_n_reads_in_peaks,
        "Ft": true_frip_score,
        "pr_frip_nreads": pr_n_reads,
        "pr_frip_nreads_in_peaks": pr_n_reads_in_peaks,
        "Fp": pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
            dxpy.dxlink(
                dxpy.upload_local_file(common.compress(reps_peaks_filename))),
            "pre_bl_optimal_set":
            dxpy.dxlink(
                dxpy.upload_local_file(
                    common.compress(peaks_to_filter_filename)))
        })

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt":
        Nt,
        "N1":
        N1,
        "N2":
        N2,
        "Np":
        Np,
        "conservative_set":
        dxpy.dxlink(
            dxpy.upload_local_file(
                common.compress(conservative_set_filename))),
        "optimal_set":
        dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio":
        rescue_ratio,
        "self_consistency_ratio":
        self_consistency_ratio,
        "reproducibility_test":
        reproducibility,
        "No":
        Nob,
        "Nc":
        Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
コード例 #14
0
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, spp_version, as_file=None, prefix=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)

    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)

    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(
        xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = \
            experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(subprocess.check_output(
        'ls -l', shell=True, stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    fraglen_column = 3
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        logger.info("Read fragment length: %d" % (fragment_length))

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename, npeaks,
           fragment_length, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coodinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the ($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly chrM) and will cause slopBed
    # to halt at that line, truncating the output of the pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries of
    # the chromosome

    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates"
        % (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    subprocess.check_output(
        'head -50 %s' % (fix_coordinate_peaks_filename),
        shell=True)

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename,fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
コード例 #15
0
ファイル: spp.py プロジェクト: zozo123/chip-seq-pipeline
def main(experiment,
         control,
         xcor_scores_input,
         npeaks,
         nodups,
         bigbed,
         chrom_sizes,
         spp_version,
         as_file=None,
         prefix=None,
         fragment_length=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)

    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)

    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = \
            experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(
        subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    # if fragment_length is provided, use that. Else read
    # fragment length from xcor file
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % (fraglen))
    else:
        fraglen_column = 3
        with open(xcor_scores_input_filename, 'r') as f:
            line = f.readline()
            fraglen = line.split('\t')[fraglen_column - 1]
            logger.info("Read fragment length: %s" % (fraglen))

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    run_spp = '/phantompeakqualtools/run_spp.R'
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%s -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename, npeaks,
           fraglen, peaks_filename, xcor_plot_filename, xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coodinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the ($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly chrM) and will cause slopBed
    # to halt at that line, truncating the output of the pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries of
    # the chromosome

    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' %
        (chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates" %
        (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    subprocess.check_output('head -50 %s' % (fix_coordinate_peaks_filename),
                            shell=True)

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(
            shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
コード例 #16
0
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Build the conservative and optimal IDR peak sets for an experiment.

    Downloads the four IDR peak files (true replicates, each
    self-pseudoreplicate, and the pooled pseudoreplicates), optionally
    filters the final sets against a blacklist, computes the IDR rescue
    and self-consistency ratios, and uploads the compressed final peak
    sets plus bigBed versions where conversion succeeds.

    Args:
        experiment: accession string used to name the output files.
        reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks: dxlinks to
            IDR peak files from true replicates, rep1/rep2
            self-pseudoreplicates, and pooled pseudoreplicates.
        chrom_sizes: dxlink to the chrom.sizes file used by bed2bb.
        as_file: dxlink to the autoSql (.as) file used by bed2bb.
        blacklist: optional dxlink to a (possibly gzipped) blacklist bed.
        rep1_signal, rep2_signal, pooled_signal: optional dxlinks passed
            through unchanged so signal tracks are available alongside
            the peak outputs.

    Returns:
        dict of output names to dxlinks and QC values, including the peak
        counts (Nt, N1, N2, Np, No, Nc), both ratios, and the
        reproducibility verdict ('pass', 'borderline', or 'fail').
    """

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    subprocess.check_output('set -x; ls -l', shell=True)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Peak counts feed the IDR rescue and self-consistency ratios below.
    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        # Copy (rather than alias) so the pre-blacklist and final outputs
        # remain distinct files.
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    # Standard IDR QC: both ratios > 2 fails, one > 2 is borderline.
    rescue_ratio            = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio  = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    # Use the module logger (not the root logger via logging.info) so this
    # message honors the applet's configured handler/level like the rest of
    # the function's log calls.
    logger.info("Exiting with output: %s", output)
    return output
コード例 #17
0
def main(experiment,
         reps_peaks,
         r1pr_peaks,
         r2pr_peaks,
         pooledpr_peaks,
         chrom_sizes,
         as_file,
         blacklist=None):
    """Derive conservative and optimal IDR peak sets plus reproducibility QC.

    Downloads the four IDR peak files (true replicates, each
    self-pseudoreplicate, pooled pseudoreplicates), optionally applies a
    blacklist filter, computes the IDR rescue and self-consistency ratios,
    and uploads the final compressed peak sets (with bigBed versions when
    bedToBigBed succeeds).

    Args:
        experiment: accession string used to name the output files.
        reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks: dxlinks to the
            IDR peak files.
        chrom_sizes: dxlink to a chrom.sizes file for bed2bb.
        as_file: dxlink to an autoSql (.as) file for bed2bb.
        blacklist: optional dxlink to a (possibly gzipped) blacklist bed.

    Returns:
        dict mapping output names to dxlinks and QC values
        (Nt/N1/N2/Np counts, ratios, and the reproducibility verdict).
    """

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Download the file inputs to the local file system.

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    print subprocess.check_output('ls -l', shell=True)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Peak counts feed the IDR rescue and self-consistency ratios below.
    Nt = common.count_lines(reps_peaks_filename)
    print "%d peaks from true replicates" % (Nt)
    N1 = common.count_lines(r1pr_peaks_filename)
    print "%d peaks from rep1 self-pseudoreplicates" % (N1)
    N2 = common.count_lines(r2pr_peaks_filename)
    print "%d peaks from rep2 self-pseudoreplicates" % (N2)
    Np = common.count_lines(pooledpr_peaks_filename)
    print "%d peaks from pooled pseudoreplicates" % (Np)

    # Conservative set: IDR peaks from true replicates (blacklist-filtered
    # when a blacklist is supplied).
    conservative_set_filename = '%s_final_conservative.narrowPeak' % (
        experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
    else:
        # NOTE(review): with no blacklist this aliases the reps file rather
        # than copying it, so the "blacklisted" count below is always 0.
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print "%d peaks blacklisted from the conservative set" % (Nt - Ncb)

    # Optimal set: the larger of the true-replicate and pooled-pseudorep
    # IDR peak lists.
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np

    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
    else:
        # NOTE(review): same aliasing as the conservative branch above.
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print "%d peaks blacklisted from the optimal set" % (No - Nob)

    # Standard IDR QC: both ratios > 2 fails, exactly one > 2 is borderline.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(conservative_set_filename,
                                                 chrom_sizes_filename,
                                                 as_file_filename)
    optimal_set_bb_filename = common.bed2bb(optimal_set_filename,
                                            chrom_sizes_filename,
                                            as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = dxpy.upload_local_file(
            conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt":
        Nt,
        "N1":
        N1,
        "N2":
        N2,
        "Np":
        Np,
        "conservative_set":
        dxpy.dxlink(
            dxpy.upload_local_file(
                common.compress(conservative_set_filename))),
        "optimal_set":
        dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio":
        rescue_ratio,
        "self_consistency_ratio":
        self_consistency_ratio,
        "reproducibility_test":
        reproducibility
    })

    logging.info("Exiting with output: %s", output)
    return output
コード例 #18
0
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks, chrom_sizes, as_file, blacklist=None):
	"""Derive conservative and optimal IDR peak sets plus reproducibility QC.

	Downloads the four IDR peak files (true replicates, each
	self-pseudoreplicate, pooled pseudoreplicates), optionally applies a
	blacklist filter, computes the IDR rescue and self-consistency ratios,
	and uploads the final compressed peak sets (with bigBed versions when
	bedToBigBed succeeds).  Returns a dict of output names to dxlinks and
	QC values (Nt/N1/N2/Np/No/Nc counts, ratios, verdict).
	"""

	# TODO for now just taking the peak files.  This applet should actually
	# call IDR instead of putting that in the workflow populator script

	# Initialize the data object inputs on the platform into
	# dxpy.DXDataObject instances.

	reps_peaks_file = dxpy.DXFile(reps_peaks)
	r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
	r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
	pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
	chrom_sizes_file = dxpy.DXFile(chrom_sizes)
	as_file_file = dxpy.DXFile(as_file)
	if blacklist is not None:
		blacklist_file = dxpy.DXFile(blacklist)
		blacklist_filename = 'blacklist_%s' %(blacklist_file.name)
		dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
		blacklist_filename = common.uncompress(blacklist_filename)

	# Download the file inputs to the local file system.

	# Need to prepend something to ensure the local filenames will be unique
	reps_peaks_filename = 'true_%s' %(reps_peaks_file.name)
	r1pr_peaks_filename = 'r1pr_%s' %(r1pr_peaks_file.name)
	r2pr_peaks_filename = 'r2pr_%s' %(r2pr_peaks_file.name)
	pooledpr_peaks_filename = 'pooledpr_%s' %(pooledpr_peaks_file.name)
	chrom_sizes_filename = chrom_sizes_file.name
	as_file_filename = as_file_file.name

	dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
	dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
	dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
	dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
	dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
	dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

	print subprocess.check_output('ls -l', shell=True)

	reps_peaks_filename = common.uncompress(reps_peaks_filename)
	r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
	r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
	pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

	# Peak counts feed the IDR rescue and self-consistency ratios below.
	Nt = common.count_lines(reps_peaks_filename)
	print "%d peaks from true replicates" %(Nt)
	N1 = common.count_lines(r1pr_peaks_filename)
	print "%d peaks from rep1 self-pseudoreplicates" %(N1)
	N2 = common.count_lines(r2pr_peaks_filename)
	print "%d peaks from rep2 self-pseudoreplicates" %(N2)
	Np = common.count_lines(pooledpr_peaks_filename)
	print "%d peaks from pooled pseudoreplicates" %(Np)

	# Conservative set: true-replicate IDR peaks, blacklist-filtered if given.
	conservative_set_filename = '%s_final_conservative.narrowPeak' %(experiment)
	if blacklist is not None:
		blacklist_filter(reps_peaks_filename, conservative_set_filename, blacklist_filename)
	else:
		# NOTE(review): with no blacklist this aliases the reps file rather
		# than copying it, so the "blacklisted" count below is always 0.
		conservative_set_filename = reps_peaks_filename
	Ncb = common.count_lines(conservative_set_filename)
	print "%d peaks blacklisted from the conservative set" %(Nt-Ncb)

	# Optimal set: the larger of the true-replicate and pooled-pseudorep lists.
	if Nt >= Np:
		peaks_to_filter_filename = reps_peaks_filename
		No = Nt
	else:
		peaks_to_filter_filename = pooledpr_peaks_filename
		No = Np

	optimal_set_filename = '%s_final_optimal.narrowPeak' %(experiment)
	if blacklist is not None:
		blacklist_filter(peaks_to_filter_filename, optimal_set_filename, blacklist_filename)
	else:
		# NOTE(review): same aliasing as the conservative branch above.
		optimal_set_filename = peaks_to_filter_filename
	Nob = common.count_lines(optimal_set_filename)
	print "%d peaks blacklisted from the optimal set" %(No-Nob)

	# Standard IDR QC: both ratios > 2 fails, exactly one > 2 is borderline.
	rescue_ratio            = float(max(Np,Nt)) / float(min(Np,Nt))
	self_consistency_ratio  = float(max(N1,N2)) / float(min(N1,N2))

	if rescue_ratio > 2 and self_consistency_ratio > 2:
		reproducibility = 'fail'
	elif rescue_ratio > 2 or self_consistency_ratio > 2:
		reproducibility = 'borderline'
	else:
		reproducibility = 'pass'

	output = {}

	# bedtobigbed often fails, so skip creating the bb if it does
	conservative_set_bb_filename = common.bed2bb(conservative_set_filename, chrom_sizes_filename, as_file_filename)
	optimal_set_bb_filename = common.bed2bb(optimal_set_filename, chrom_sizes_filename, as_file_filename)
	if conservative_set_bb_filename:
		conservative_set_bb_output = dxpy.upload_local_file(conservative_set_bb_filename)
		output.update({"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
	if optimal_set_bb_filename:
		optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
		output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

	output.update({
		"Nt": Nt,
		"N1": N1,
		"N2": N2,
		"Np": Np,
		"conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
		"optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
		"rescue_ratio": rescue_ratio,
		"self_consistency_ratio": self_consistency_ratio,
		"reproducibility_test": reproducibility,
		"No": Nob,
		"Nc": Ncb
	})

	logging.info("Exiting with output: %s", output)
	return output
コード例 #19
0
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as,
         gappedpeak_as, broadpeak_as, genomesize):
    """Call narrow, broad, and gapped peaks with MACS2 and build signal tracks.

    Runs macs2 callpeak twice (default and --broad) on the experiment vs.
    control tagAligns, rescales peak scores into the 10-1000 range required
    by the .as formats, converts the peak beds to bigBed, and generates
    fold-enrichment and -log10(p-value) bigWig signal tracks.  The fragment
    length used for --extsize is read from the third column of the
    cross-correlation scores file.

    Args:
        experiment, control: dxlinks to gzipped tagAlign files.
        xcor_scores_input: dxlink to the cross-correlation scores file.
        chrom_sizes: dxlink to the chrom.sizes file.
        narrowpeak_as, gappedpeak_as, broadpeak_as: dxlinks to autoSql
            files describing the three peak flavors.
        genomesize: MACS2 -g effective genome size (e.g. 'hs' or 'mm').

    Returns:
        dict of dxlinks for the gzipped peak files, their bigBeds, and the
        fold-change and p-value signal bigWigs.
    """

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.

    experiment = dxpy.DXFile(experiment)
    control = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    narrowPeak_as = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as = dxpy.DXFile(gappedpeak_as)
    broadPeak_as = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.

    dxpy.download_dxfile(experiment.get_id(), experiment.name)
    dxpy.download_dxfile(control.get_id(), control.name)
    dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes.name)
    dxpy.download_dxfile(narrowPeak_as.get_id(), narrowPeak_as.name)
    dxpy.download_dxfile(gappedPeak_as.get_id(), gappedPeak_as.name)
    dxpy.download_dxfile(broadPeak_as.get_id(), broadPeak_as.name)

    # Define the output filenames, all rooted under peaks_dirname and
    # named after the experiment file (minus any .gz extension).

    peaks_dirname = 'peaks'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    broadPeak_bb_fn = "%s.bb" % (broadPeak_fn)
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    with open(xcor_scores_input.name, 'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2]  # third column
        print "Fraglen %s" % (fraglen)

    #===========================================
    # Generate narrow peaks and preliminary signal tracks
    #============================================

    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' %(experiment.name, control.name) + \
        '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as
    # format (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores('%s/%s_peaks.narrowPeak' %
                                                   (peaks_dirname, prefix),
                                                   scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in
    # Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (narrowPeak_fn), 'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # Generate Broad and Gapped Peaks
    #============================================

    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' %(experiment.name, control.name) + \
        '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as
    # format (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores('%s/%s_peaks.broadPeak' %
                                                  (peaks_dirname, prefix),
                                                  scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14 (for gappedPeak) in descending
    # order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (broadPeak_fn), 'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn))

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as
    # format (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores('%s/%s_peaks.gappedPeak' %
                                                   (peaks_dirname, prefix),
                                                   scores_col=5)

    pipe = [
        'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (gappedPeak_fn), 'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # For Fold enrichment signal tracks
    #============================================

    # This file is a tab delimited file with 2 columns Col1 (chromosome
    # name), Col2 (chromosome size in bp).

    command = 'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
        '-m FE'
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_FE.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes.name),
        'bedClip stdin %s %s/%s.fc.signal.bedgraph' %
        (chrom_sizes.name, peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \
        '%s ' %(chrom_sizes.name) + \
        '%s' %(fc_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    #===========================================
    # For -log10(p-value) signal tracks
    #============================================

    # Compute sval = min(no. of reads in ChIP, no. of reads in control)
    # / 1,000,000

    out, err = common.run_pipe(['gzip -dc %s' % (experiment.name), 'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe(['gzip -dc %s' % (control.name), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)

    print "chipReads = %s, controlReads = %s, sval = %s" % (chipReads,
                                                            controlReads, sval)

    returncode = common.block_on(
     'macs2 bdgcmp ' + \
     '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
     '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
     '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
     '-m ppois -S %s' %(sval))
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_ppois.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes.name),
        'bedClip stdin %s %s/%s.pval.signal.bedgraph' %
        (chrom_sizes.name, peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
        '%s ' %(chrom_sizes.name) + \
        '%s' %(pvalue_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    #===========================================
    # Generate bigWigs from beds to support trackhub visualization of peak files
    #============================================

    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn),
                                        chrom_sizes.name,
                                        narrowPeak_as.name,
                                        bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn),
                                        chrom_sizes.name,
                                        gappedPeak_as.name,
                                        bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn),
                                       chrom_sizes.name,
                                       broadPeak_as.name,
                                       bed_type='bed6+3')

    # Temporary during development to create empty files just to get the
    # applet to exit
    for fn in [
            narrowPeak_fn, gappedPeak_fn, broadPeak_fn, narrowPeak_bb_fn,
            gappedPeak_bb_fn, broadPeak_bb_fn, fc_signal_fn, pvalue_signal_fn
    ]:
        common.block_on('touch %s' % (fn))

    # Upload the file outputs
    # NOTE(review): uploads use the expected .bb names (*_bb_fn), not the
    # bed2bb return values (*_bb_fname, which are otherwise unused) -- if a
    # conversion failed, the empty touched file is uploaded.  Confirm this
    # is intended outside of the development placeholder above.

    narrowPeak = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn)
    broadPeak_bb = dxpy.upload_local_file(broadPeak_bb_fn)
    fc_signal = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.

    output = {
        "narrowpeaks": dxpy.dxlink(narrowPeak),
        "gappedpeaks": dxpy.dxlink(gappedPeak),
        "broadpeaks": dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb": dxpy.dxlink(broadPeak_bb),
        "fc_signal": dxpy.dxlink(fc_signal),
        "pvalue_signal": dxpy.dxlink(pvalue_signal)
    }

    return output
コード例 #20
0
def replicated_overlap(rep1_peaks,
                       rep2_peaks,
                       pooled_peaks,
                       pooledpr1_peaks,
                       pooledpr2_peaks,
                       rep1_ta,
                       rep1_xcor,
                       rep2_ta,
                       rep2_xcor,
                       paired_end,
                       chrom_sizes,
                       as_file,
                       peak_type,
                       prefix,
                       fragment_length=None):
    """Select pooled peaks supported by both true replicates or by both
    pooled pseudoreplicates, then compute FRiP and build bigBeds.

    All *_peaks, *_ta, *_xcor, chrom_sizes and as_file arguments are
    DNAnexus file links/IDs; paired_end is a bool passed through to the
    cross-correlation subjob; peak_type is one of narrowPeak, gappedPeak
    or broadPeak; prefix (if truthy) overrides the output basename;
    fragment_length (if given) skips the pooled cross-correlation subjob.

    Returns a dict of dxlinks to the overlapping/rejected peak files
    (gzipped beds and bigBeds) plus peak counts, FRiP statistics, the
    fragment length used and whether it was user-supplied.
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # NOTE: use the DXFile handler (pooled_peaks_file) - the raw
        # pooled_peaks input is a link/ID and has no .name attribute
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    # Pool the two replicate tagAligns on the platform; FRiP below is
    # computed against this pooled tagAlign.
    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length fron cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
        )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled; the awk column indices reflect each format's field count
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # deliberately-failing assert to surface the unrecognized value
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (pooledpr2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" %
          (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap support at all
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename,
        pool_xcor_filename,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
コード例 #21
0
def main(experiment,
         control,
         xcor_scores_input,
         chrom_sizes,
         narrowpeak_as,
         gappedpeak_as,
         broadpeak_as,
         genomesize,
         prefix=None,
         fragment_length=None):
    """Call peaks and build signal tracks with MACS2 on local files.

    experiment/control are local (gzipped) tagAlign BED filenames;
    xcor_scores_input is a cross-correlation scores file whose third
    column holds the fragment length estimate; chrom_sizes is a
    two-column chromosome-sizes file; *peak_as are autoSql files for
    bigBed conversion; genomesize is passed to MACS2 -g; prefix (if
    given) names the outputs, otherwise the experiment filename is used;
    fragment_length (if given) overrides the xcor estimate.

    Returns a dict of local output filenames: gzipped narrow/gapped/broad
    peaks, their bigBeds, and fold-enrichment / p-value signal bigWigs.
    Requires macs2, bedtools (slopBed/bedClip) and bedGraphToBigWig on
    PATH via the common helpers.
    """
    narrowPeak_as = narrowpeak_as
    gappedPeak_as = gappedpeak_as
    broadPeak_as = broadpeak_as

    # Define the output filenames

    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    if not prefix:
        prefix = experiment
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    # if the fragment_length argument is given, use that instead
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % fraglen)
    else:
        with open(xcor_scores_input, 'r') as fh:
            firstline = fh.readline()
            fraglen = firstline.split()[2]  # third column
            logger.info("Fraglen %s" % (fraglen))

    # ===========================================
    # Generate narrow peaks and preliminary signal tracks
    # ============================================

    # -B --SPMR emits the pileup/lambda bedgraphs reused for the signal
    # tracks below; fraglen drives --extsize with model building disabled
    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' % (experiment, control) + \
              '-f BED -n %s/%s ' % (peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genomesize, fraglen)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_narrowpeak_fn = common.slop_clip(
        '%s/%s_peaks.narrowPeak' % (peaks_dirname, prefix), chrom_sizes)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores(clipped_narrowpeak_fn,
                                                   scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4
    # with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (narrowPeak_fn), 'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # Generate Broad and Gapped Peaks
    # ============================================

    # second callpeak run with --broad produces both broadPeak and
    # gappedPeak outputs under the same prefix
    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' % (experiment, control) + \
              '-f BED -n %s/%s ' % (peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' % (genomesize, fraglen)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_broadpeak_fn = common.slop_clip(
        '%s/%s_peaks.broadPeak' % (peaks_dirname, prefix), chrom_sizes)

    # Rescale Col5 scores to range 10-1000 to conform to broadPeak.as format
    # (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores(clipped_broadpeak_fn,
                                                  scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak) in descending
    # order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (broadPeak_fn), 'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn))

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_gappedpeaks_fn = common.slop_clip('%s/%s_peaks.gappedPeak' %
                                              (peaks_dirname, prefix),
                                              chrom_sizes,
                                              bed_type='gappedPeak')

    # Rescale Col5 scores to range 10-1000 to conform to gappedPeak.as format
    # (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores(clipped_gappedpeaks_fn,
                                                   scores_col=5)

    # gappedPeak: p-value score lives in column 14
    pipe = [
        'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (gappedPeak_fn), 'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # For Fold enrichment signal tracks
    # ============================================

    # This file is a tab delimited file with 2 columns Col1 (chromosome name),
    # Col2 (chromosome size in bp).

    command = 'macs2 bdgcmp ' + \
              '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) + \
              '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) + \
              '--outdir %s -o %s_FE.bdg ' % (peaks_dirname, prefix) + \
              '-m FE'
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_FE.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes),
        'bedClip stdin %s %s/%s.fc.signal.bedgraph' %
        (chrom_sizes, peaks_dirname, prefix)
    ]
    out, err = common.run_pipe(pipe)

    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.fc.signal.bedgraph ' % (peaks_dirname, prefix) + \
              '%s ' % (chrom_sizes) + \
              '%s' % (fc_signal_fn)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    # drm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    # ===========================================
    # For -log10(p-value) signal tracks
    # ============================================

    # Compute sval =
    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    out, err = common.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)

    logger.info("chipReads = %s, controlReads = %s, sval = %s" %
                (chipReads, controlReads, sval))

    # -S scales the Poisson p-value comparison by million-read depth
    returncode = common.block_on('macs2 bdgcmp ' +
                                 '-t %s/%s_treat_pileup.bdg ' %
                                 (peaks_dirname, prefix) +
                                 '-c %s/%s_control_lambda.bdg ' %
                                 (peaks_dirname, prefix) +
                                 '--outdir %s -o %s_ppois.bdg ' %
                                 (peaks_dirname, prefix) + '-m ppois -S %s' %
                                 (sval))
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_ppois.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes),
        'bedClip stdin %s %s/%s.pval.signal.bedgraph' %
        (chrom_sizes, peaks_dirname, prefix)
    ]
    out, err = common.run_pipe(pipe)

    # rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.pval.signal.bedgraph ' % (peaks_dirname, prefix) + \
              '%s ' % (chrom_sizes) + \
              '%s' % (pvalue_signal_fn)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    # ===========================================
    # Generate bigWigs from beds to support trackhub visualization of peak files
    # ============================================

    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn),
                                        chrom_sizes,
                                        narrowPeak_as,
                                        bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn),
                                        chrom_sizes,
                                        gappedPeak_as,
                                        bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn),
                                       chrom_sizes,
                                       broadPeak_as,
                                       bed_type='bed6+3')

    # Temporary during development to create empty files just to get the applet
    # to exit
    # narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    # gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    # broadPeak_bb_fn  = "%s.bb" % (broadPeak_fn)

    # Outputs here are local filenames (not dxlinks); presumably the
    # caller handles any platform upload -- TODO confirm
    output = {
        "narrowpeaks": narrowPeak_gz_fn,
        "gappedpeaks": gappedPeak_gz_fn,
        "broadpeaks": broadPeak_gz_fn,
        "narrowpeaks_bb": narrowPeak_bb_fname,
        "gappedpeaks_bb": gappedPeak_bb_fname,
        "broadpeaks_bb": broadPeak_bb_fname,
        "fc_signal": fc_signal_fn,
        "pvalue_signal": pvalue_signal_fn
    }

    return output
コード例 #22
0
def internal_pseudoreplicate_overlap(rep1_peaks,
                                     rep2_peaks,
                                     pooled_peaks,
                                     rep1_ta,
                                     rep1_xcor,
                                     paired_end,
                                     chrom_sizes,
                                     as_file,
                                     peak_type,
                                     prefix,
                                     fragment_length=None):
    """Simplicate (single-replicate) overlap analysis: select pooled
    peaks supported by both self-pseudoreplicates, compute FRiP and
    build bigBeds.

    rep1_peaks/rep2_peaks are the two pseudoreplicate peak sets and
    pooled_peaks the peaks from the one true replicate; rep1_ta/rep1_xcor
    are that replicate's tagAlign and cross-correlation scores. All file
    arguments are DNAnexus links/IDs. peak_type is narrowPeak,
    gappedPeak or broadPeak; prefix (if truthy) overrides the output
    basename; fragment_length (if given) overrides the xcor estimate.
    paired_end is accepted for signature parity with replicated_overlap
    but not used here.

    Returns a dict of dxlinks to overlapping/rejected peak files plus
    peak counts, FRiP statistics, fragment length used and whether it
    was user-supplied.
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # NOTE: use the DXFile handler (pooled_peaks_file) - the raw
        # pooled_peaks input is a link/ID and has no .name attribute
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled; the awk column indices reflect each format's field count
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # deliberately-failing assert to surface the unrecognized value
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe(['cat %s' % (overlap_tr_fn), 'sort -u'],
                               overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no pseudoreplicate support
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn,
        rep1_xcor_fn,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
コード例 #23
0
ファイル: macs2.py プロジェクト: jmrinaldi/chip-seq-pipeline
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize):
    """Call ChIP-seq peaks with MACS2 and build browser signal tracks.

    Runs MACS2 callpeak twice on a tagAlign experiment/control pair --
    once for narrow peaks and once (--broad) for broad/gapped peaks --
    then clips off-chromosome features, rescales scores, renames peaks,
    converts peak beds to bigBed, and builds fold-enrichment and
    -log10(p-value) bigWig signal tracks via macs2 bdgcmp.

    All parameters except genomesize are DNAnexus file references
    (IDs/links) resolved to dxpy.DXFile handlers below; genomesize is
    the MACS2 -g effective-genome-size value (e.g. 'hs', 'mm').

    Returns a dict of dxlinks to the uploaded peak, bigBed, and bigWig
    outputs (the applet's output spec).
    """

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.

    experiment        = dxpy.DXFile(experiment)
    control           = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes       = dxpy.DXFile(chrom_sizes)
    narrowPeak_as     = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as     = dxpy.DXFile(gappedpeak_as)
    broadPeak_as      = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.

    dxpy.download_dxfile(experiment.get_id(),        experiment.name)
    dxpy.download_dxfile(control.get_id(),           control.name)
    dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name)
    dxpy.download_dxfile(chrom_sizes.get_id(),       chrom_sizes.name)
    dxpy.download_dxfile(narrowPeak_as.get_id(),     narrowPeak_as.name)
    dxpy.download_dxfile(gappedPeak_as.get_id(),     gappedPeak_as.name)
    dxpy.download_dxfile(broadPeak_as.get_id(),      broadPeak_as.name)

    #Define the output filenames

    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    # Output prefix is the experiment filename with any .gz suffix removed.
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn    = "%s/%s.narrowPeak" %(peaks_dirname, prefix)
    gappedPeak_fn    = "%s/%s.gappedPeak" %(peaks_dirname, prefix)
    broadPeak_fn     = "%s/%s.broadPeak"  %(peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn  = broadPeak_fn  + ".gz"
    narrowPeak_bb_fn = "%s.bb" %(narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" %(gappedPeak_fn)
    broadPeak_bb_fn  = "%s.bb" %(broadPeak_fn)
    fc_signal_fn     = "%s/%s.fc_signal.bw"     %(peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" %(peaks_dirname, prefix)

    #Extract the fragment length estimate from column 3 of the cross-correlation scores file
    # NOTE: fraglen is kept as a string; it is only ever interpolated into
    # shell command strings below (--extsize).
    with open(xcor_scores_input.name,'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2] #third column
        print "Fraglen %s" %(fraglen)

    #===========================================
    # Generate narrow peaks and preliminary signal tracks
    #============================================

    # -p 1e-2 is a deliberately relaxed threshold (downstream filtering
    # selects significant peaks); --extsize uses the cross-correlation
    # fragment length; -B --SPMR emits per-million-read signal bedgraphs
    # consumed by the bdgcmp steps below.
    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_narrowpeak_fn = common.slop_clip('%s/%s_peaks.narrowPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores(clipped_narrowpeak_fn, scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    # tee writes the plain-text peak file while gzip -c produces the
    # compressed copy in one pass.
    pipe = ['sort -k 8gr,8gr %s' %(rescaled_narrowpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(narrowPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(narrowPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # Generate Broad and Gapped Peaks
    #============================================

    # Same parameters as the narrow-peak run, plus --broad; this single
    # run emits both the broadPeak and gappedPeak files processed below.
    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_broadpeak_fn = common.slop_clip('%s/%s_peaks.broadPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores(clipped_broadpeak_fn, scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak)  in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = ['sort -k 8gr,8gr %s' %(rescaled_broadpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(broadPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(broadPeak_gz_fn))

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_gappedpeaks_fn = common.slop_clip('%s/%s_peaks.gappedPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores(clipped_gappedpeaks_fn, scores_col=5)

    # gappedPeak sorts on column 14 (its p-value column) rather than 8.
    pipe = ['sort -k 14gr,14gr %s' %(rescaled_gappedpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(gappedPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(gappedPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # For Fold enrichment signal tracks
    #============================================

    # This file is a tab delimited file with 2 columns Col1 (chromosome name), Col2 (chromosome size in bp).

    # Compare treatment pileup vs. control lambda from the -B --SPMR run
    # above to produce a fold-enrichment bedgraph.
    command = 'macs2 bdgcmp ' + \
              '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
              '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
              '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
              '-m FE'
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = ['slopBed -i %s/%s_FE.bdg -g %s -b 0' %(peaks_dirname, prefix, chrom_sizes.name),
            'bedClip stdin %s %s/%s.fc.signal.bedgraph' %(chrom_sizes.name, peaks_dirname, prefix)]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(fc_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" %(returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    #===========================================
    # For -log10(p-value) signal tracks
    #============================================

    # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    # Line counts are taken on the gunzipped tagAlign inputs (one read per line).

    out, err = common.run_pipe([
        'gzip -dc %s' %(experiment.name),
        'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe([
        'gzip -dc %s' %(control.name),
        'wc -l'])
    controlReads = out.strip()
    sval=str(min(float(chipReads), float(controlReads))/1000000)

    print "chipReads = %s, controlReads = %s, sval = %s" %(chipReads, controlReads, sval)

    # -m ppois -S sval: Poisson p-value track scaled to the smaller library.
    returncode = common.block_on(
        'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
        '-m ppois -S %s' %(sval))
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = ['slopBed -i %s/%s_ppois.bdg -g %s -b 0' %(peaks_dirname, prefix, chrom_sizes.name),
            'bedClip stdin %s %s/%s.pval.signal.bedgraph' %(chrom_sizes.name, peaks_dirname, prefix)]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(pvalue_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" %(returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    #===========================================
    # Generate bigWigs from beds to support trackhub visualization of peak files
    #============================================

    # NOTE(review): the filenames returned by bed2bb (*_fname) are never
    # used; the uploads below assume bed2bb writes to the precomputed
    # *_bb_fn paths ("<peak_fn>.bb") -- confirm against common.bed2bb.
    narrowPeak_bb_fname = common.bed2bb('%s' %(narrowPeak_fn), chrom_sizes.name, narrowPeak_as.name, bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' %(gappedPeak_fn), chrom_sizes.name, gappedPeak_as.name, bed_type='bed12+3')
    broadPeak_bb_fname =  common.bed2bb('%s' %(broadPeak_fn),  chrom_sizes.name, broadPeak_as.name,  bed_type='bed6+3')

    #Temporary during development to create empty files just to get the applet to exit 
    # for fn in [narrowPeak_fn, gappedPeak_fn, broadPeak_fn, narrowPeak_bb_fn, gappedPeak_bb_fn, broadPeak_bb_fn, fc_signal_fn, pvalue_signal_fn]:
    #     common.block_on('touch %s' %(fn))

    # Upload the file outputs

    narrowPeak    = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak    = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak     = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn)
    broadPeak_bb  = dxpy.upload_local_file(broadPeak_bb_fn)
    fc_signal     = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.

    output = {
        "narrowpeaks":    dxpy.dxlink(narrowPeak),
        "gappedpeaks":    dxpy.dxlink(gappedPeak),
        "broadpeaks":     dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb":  dxpy.dxlink(broadPeak_bb),
        "fc_signal":     dxpy.dxlink(fc_signal),
        "pvalue_signal": dxpy.dxlink(pvalue_signal)
    }

    return output
コード例 #24
0
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):
    """Select pooled peaks replicated in both true reps or both pooled
    pseudoreps, and compute FRiP/peak-count QC on the result.

    A pooled peak is kept if it overlaps a peak in BOTH true replicates,
    or in BOTH pooled pseudoreplicates, where "overlaps" means the
    overlap covers >= 50% of either peak in the pair (the naive overlap
    criterion).

    Parameters are DNAnexus file links/IDs except:
      paired_end      -- bool, forwarded to the xcor subjob
      peak_type       -- 'narrowPeak', 'gappedPeak' or 'broadPeak'
      prefix          -- output basename; if falsy, derived from the
                         pooled peaks filename
      fragment_length -- optional int; when given, the pooled
                         cross-correlation subjob is skipped and this
                         value is used directly

    Returns a dict of dxlinks (replicated/rejected peaks and bigBeds)
    plus peak counts, FRiP metrics, and the fragment length used.

    Raises AssertionError for an unrecognized peak_type, and on any
    failure of the external bedtools/bigBed commands run via common.
    """

    rep1_peaks_file      = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file      = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file    = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file         = dxpy.DXFile(rep1_ta)
    rep2_ta_file         = dxpy.DXFile(rep2_ta)
    rep1_xcor_file       = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file       = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file     = dxpy.DXFile(chrom_sizes)
    as_file_file         = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn      = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn      = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn    = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn         = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn         = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn       = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn       = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn     = 'chrom.sizes'
    as_file_fn         = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # BUGFIX: use the DXFile handler's .name -- the raw input
        # (pooled_peaks) is a dxlink/ID and has no .name attribute.
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn    = '%s.rejected.%s' % (basename, peak_type)
    # (the *_bb filenames are produced by common.bed2bb below)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    # Pool the two replicate tagAligns via the project's 'pool' applet;
    # FRiP is computed against the pooled reads.
    pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length fron cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled: the awk script reads the intersectBed -wo overlap column
    # (last field) and both peak widths, and keeps pairs where the overlap
    # covers >= 50% of either peak; cut then restores the original columns.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print(
        "%d peaks overlap with both pooled pseudoreplicates"
        % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists (union of the two criteria, deduplicated)
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap in the replicated set
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks) on the pooled tagAlign
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in        = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out       = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected  = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks       = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb    = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks          = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb       = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output