def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=[], dx=None): return_string = \ "\t\ttrack %s%d\n" %(accession,n) + \ "\t\tbigDataUrl %s\n" %(url) + \ "\t\tshortLabel %s\n" %(name[:17]) + \ "\t\tparent %sviewpeaks on\n" %(accession) + \ "\t\ttype %s\n" %(tracktype) + \ "\t\tvisibility dense\n" + \ "\t\tview PK\n" + \ "\t\tpriority %d\n\n" %(n) n_stanzas = 1 if not lowpass: lowpass = [] if isinstance(lowpass,int): lowpass = [lowpass] extra_stanza_count = 0 for (i, cutoff) in enumerate(lowpass,start=1): fn = dx.get_id() if not os.path.isfile(fn): dxpy.download_dxfile(dx.get_id(),fn) cutoffstr = '-lt%d' %(cutoff) outfn = fn + cutoffstr print fn, os.path.getsize(fn), subprocess.check_output('wc -l %s' %(fn), shell=True).split()[0] bed_fn = fn + '.bed' common.block_on('bigBedToBed %s %s' %(fn, bed_fn)) common.run_pipe([ 'cat %s' %(bed_fn), r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" %(cutoff)], outfn) print outfn, os.path.getsize(outfn), subprocess.check_output('wc -l %s' %(outfn), shell=True).split()[0] if tracktype =='bigBed 6 +': as_file = 'narrowPeak.as' elif tracktype == 'bigBed 12 +': as_file = 'gappedPeak.as' else: print "Cannot match tracktype %s to any .as file" %(tracktype) bb_fn = common.bed2bb(outfn,'mm10.chrom.sizes',as_file) newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True) new_url, headers = newdx.get_download_url(duration=sys.maxint, preauthenticated=True) new_lines = [ "\t\ttrack %s%d" %(accession,n+i), "\t\tbigDataUrl %s" %(new_url), "\t\tshortLabel %s" %(name[:17-len(cutoffstr)] + cutoffstr), "\t\tparent %sviewpeaks on" %(accession), "\t\ttype %s" %(tracktype), "\t\tvisibility dense", "\t\tview PK", "\t\tpriority %d\n\n" %(n+i)] new_stanza = '\n'.join(new_lines) return_string += new_stanza n_stanzas += 1 os.remove(bed_fn) os.remove(bb_fn) os.remove(outfn) os.remove(fn) return(return_string, n_stanzas)
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=[], dx=None): return_string = \ "\t\ttrack %s%d\n" %(accession,n) + \ "\t\tbigDataUrl %s\n" %(url) + \ "\t\tshortLabel %s\n" %(name[:17]) + \ "\t\tparent %sviewpeaks on\n" %(accession) + \ "\t\ttype %s\n" %(tracktype) + \ "\t\tvisibility dense\n" + \ "\t\tview PK\n" + \ "\t\tpriority %d\n\n" %(n) n_stanzas = 1 if not lowpass: lowpass = [] if isinstance(lowpass,int): lowpass = [lowpass] extra_stanza_count = 0 for (i, cutoff) in enumerate(lowpass,start=1): fn = dx.get_id() if not os.path.isfile(fn): dxpy.download_dxfile(dx.get_id(),fn) cutoffstr = '-lt%d' %(cutoff) outfn = fn + cutoffstr print fn, os.path.getsize(fn), subprocess.check_output('wc -l %s' %(fn), shell=True).split()[0] bed_fn = fn + '.bed' common.block_on('bigBedToBed %s %s' %(fn, bed_fn)) common.run_pipe([ 'cat %s' %(bed_fn), r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" %(cutoff)], outfn) print outfn, os.path.getsize(outfn), subprocess.check_output('wc -l %s' %(outfn), shell=True).split()[0] if tracktype =='bigBed 6 +': as_file = 'narrowPeak.as' elif tracktype == 'bigBed 12 +': as_file = 'gappedPeak.as' else: print "Cannot match tracktype %s to any .as file" %(tracktype) bb_fn = common.bed2bb(outfn,'mm10.chrom.sizes',as_file) newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True) new_url, headers = newdx.get_download_url(duration=sys.maxint, preauthenticated=True) new_lines = [ "\t\ttrack %s%dp%d" %(accession,n,i), "\t\tbigDataUrl %s" %(new_url), "\t\tshortLabel %s" %(name[:17-len(cutoffstr)] + cutoffstr), "\t\tparent %sviewpeaks on" %(accession), "\t\ttype %s" %(tracktype), "\t\tvisibility dense", "\t\tview PK", "\t\tpriority %d.%d\n\n" %(n,i)] new_stanza = '\n'.join(new_lines) return_string += new_stanza n_stanzas += 1 return(return_string, n_stanzas)
def upload(self, uploader):
    # Upload peak-calling results via the supplied uploader.
    # Reports peak counts, uploads the bigBed (if one was requested and
    # bed2bb succeeded), gzips and uploads the coordinate-fixed peaks, and
    # uploads the cross-correlation plot and scores.
    # Side effects: sets self.peaks_bb_fn / self.peaks_bb (when self.bigbed),
    # self.peaks, and rebinds self.xcor_plot / self.xcor_scores to the
    # uploader's return values.
    # Information about called peaks
    n_spp_peaks = common.count_lines(self.peaks_fn)
    print "%s peaks called by spp" % n_spp_peaks
    print "%s of those peaks removed due to bad coordinates" % (
        n_spp_peaks - common.count_lines(self.fixed_peaks_fn))
    print "First 50 peaks"
    print subprocess.check_output(
        'head -50 %s' % self.fixed_peaks_fn,
        shell=True, stderr=subprocess.STDOUT)
    # Upload bigBed if applicable; bed2bb returns a falsy value on failure,
    # in which case the bigBed upload is skipped.
    if self.bigbed:
        self.peaks_bb_fn = common.bed2bb(
            self.fixed_peaks_fn, self.chrom_sizes.name, self.as_file.name)
        if self.peaks_bb_fn:
            self.peaks_bb = uploader.upload(self.peaks_bb_fn)
    if not filecmp.cmp(self.peaks_fn, self.fixed_peaks_fn):
        print "Returning peaks with fixed coordinates"
    # Upload peaks (gzip in place, then upload the .gz)
    print subprocess.check_output(shlex.split("gzip %s" % self.fixed_peaks_fn))
    self.peaks = uploader.upload(self.fixed_peaks_fn + ".gz")
    # Upload cross-correlations
    self.xcor_plot = uploader.upload(self.xcor_plot)
    self.xcor_scores = uploader.upload(self.xcor_scores)
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor,
                                 paired_end, chrom_sizes, as_file, blacklist,
                                 rep1_signal, fragment_length=None):
    # IDR reproducibility analysis for a single-replicate ("simplicate")
    # experiment: the stable peak set comes from rep1's self-pseudoreplicate
    # IDR peaks, optionally blacklist-filtered, and FRiP is computed against
    # rep1's tagAlign.  Returns a dict of job outputs (counts, FRiP fields,
    # uploaded-file dxlinks, and fragment-length bookkeeping).
    #
    # r1pr_peaks / rep1_ta / rep1_xcor / chrom_sizes / as_file / blacklist
    # are dxlinks; paired_end is unused in this function's visible body.
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # If fragment_length is given, it overrides the cross-correlation
    # estimate (and the xcor file is neither downloaded nor used for FRiP).
    # Otherwise the fragment length is estimated from rep1's xcor scores.
    # fragment_length_given_by_user records which path was taken.
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    # Debug aid: show what was downloaded into the working directory.
    subprocess.check_output('set -x; ls -l', shell=True)

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info(
            "%d peaks blacklisted from the stable set" % (N1-Nsb))
    else:
        # No blacklist: the stable set is just a copy of the r1pr peaks.
        subprocess.check_output(shlex.split(
            'cp %s %s' % (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP (fraction of reads in peaks) against the stable set
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    r1pr_peaks_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update(
            {"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(
            stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
def replicated_IDR(experiment, reps_peaks, r1pr_peaks, r2pr_peaks,
                   pooledpr_peaks, rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                   paired_end, chrom_sizes, as_file, blacklist,
                   rep1_signal, rep2_signal, pooled_signal,
                   fragment_length=None):
    # IDR reproducibility analysis for a two-replicate experiment.
    # Builds the conservative set (IDR peaks from true replicates) and the
    # optimal set (the larger of the true-rep and pooled-pseudorep IDR peak
    # lists), optionally blacklist-filters both, computes rescue and
    # self-consistency ratios plus a pass/borderline/fail verdict, and
    # computes FRiP for each replicate, the true-rep set, and the
    # pooled-pseudorep set.  Returns a dict of job outputs.
    #
    # All *_peaks / *_ta / *_xcor / chrom_sizes / as_file / blacklist
    # parameters are dxlinks; paired_end is forwarded to xcor_only.
    # TODO for now just taking the peak files. This applet should actually
    # call IDR instead of putting that in the workflow populator script
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(),
                         pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Pool the two replicates' tagAligns via the 'pool' applet; the pooled
    # tagAlign is needed for the true-rep and pooled-pseudorep FRiP.
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # next call could be on 267 and save time?
    pool_replicates_subjob.wait_on_done()

    # If fragment_length is not given, calculate the fragment_length
    # using crosscorrelation. Else use the override value. Set the
    # pool_xcor_filename to None to accommodate common.frip calls.
    # Calculate, or set, actually used fragment lengths for different
    # cases. Set the flag indicating whether the fragment length
    # was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    # Debug aid: show what was downloaded into the working directory.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    # Standard ENCODE IDR reproducibility criteria: both ratios must be
    # <= 2 to pass; one over 2 is borderline; both over 2 fails.
    # NOTE(review): min(Np, Nt) == 0 or min(N1, N2) == 0 would raise
    # ZeroDivisionError here - presumably upstream guarantees non-empty
    # peak lists; confirm.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # NOTE(review): these frip calls pass the raw fragment_length argument
    # (None unless the user supplied one), not the computed
    # fragment_length_used_* values used by internal_pseudoreplicate_IDR -
    # presumably common.frip falls back to the xcor file when the fraglen
    # argument is None; confirm against common.frip.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads" : rep1_n_reads,
        "rep1_frip_nreads_in_peaks" : rep1_n_reads_in_peaks,
        "F1" : rep1_frip_score,
        "rep2_frip_nreads" : rep2_n_reads,
        "rep2_frip_nreads_in_peaks" : rep2_n_reads_in_peaks,
        "F2" : rep2_frip_score,
        "true_frip_nreads" : true_n_reads,
        "true_frip_nreads_in_peaks" : true_n_reads_in_peaks,
        "Ft" : true_frip_score,
        "pr_frip_nreads" : pr_n_reads,
        "pr_frip_nreads_in_peaks" : pr_n_reads_in_peaks,
        "Fp" : pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks,
         pooledpr2_peaks, chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    # Naive-overlap reproducibility analysis: keeps pooled peaks that
    # overlap (>=50% of either peak's length) both true replicates OR both
    # pooled pseudoreplicates; everything else is rejected.  Builds bigBeds
    # for both sets, uploads results, and returns the output dict.
    #
    # peak_type must be one of narrowPeak / gappedPeak / broadPeak; prefix
    # (optional) overrides the output basename otherwise derived from the
    # pooled peaks filename.
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances
    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    #Input filenames - necessary to define each explicitly because input files could have the same name, in which case subsequent
    #file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks.name)  #strip off the peak and compression extensions
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''

    #the only difference between the peak_types is how the extra columns are handled
    # The awk expressions read the intersectBed -wo overlap column (last
    # field) and keep pairs whose overlap covers >= 50% of either peak.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in [
            'narrowPeak',
            'gappedPeak',
            'broadPeak'
            ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
                peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" % (
        common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" % (
        common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
         'sort -u'],
        overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" % (
        common.count_lines(overlapping_peaks_fn))

    #rejected peaks: pooled peaks with no overlap in the replicated set
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print "%d peaks were rejected" % (common.count_lines(rejected_peaks_fn))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    #make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        'npeaks_rejected': npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor, paired_end,
                                     chrom_sizes, as_file, peak_type, prefix,
                                     fragment_length=None):
    # Naive-overlap analysis for a single-replicate ("simplicate")
    # experiment: rep1_peaks/rep2_peaks are the two self-pseudoreplicate
    # peak sets and pooled_peaks is the unsplit peak set.  Keeps pooled
    # peaks overlapping (>=50% of either peak's length) both
    # pseudoreplicates, computes FRiP against rep1's tagAlign, builds
    # bigBeds, uploads results, and returns the output dict.
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # BUG FIX: originally read pooled_peaks.name, but pooled_peaks is the
        # raw dxlink input; the DXFile handle is pooled_peaks_file.
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    # Debug aid: show what was downloaded into the working directory.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe([
        'cat %s' % (overlap_tr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap in the replicated set
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    # Use the user-defined fragment_length if given; otherwise extract the
    # estimate from the cross-correlation scores file.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def process(self): ''' #find pooled peaks that are in (rep1 AND rep2) out, err = common.run_pipe([ 'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn), 'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn) ], overlap_tr_fn) print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn)) #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2) out, err = common.run_pipe([ 'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn), 'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn) ], overlap_pr_fn) print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn)) #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2) out, err = common.run_pipe([ 'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn), 'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn) ], overlapping_peaks_fn) print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn)) ''' #the only difference between the peak_types is how the extra columns are handled if self.peak_type == "narrowPeak": awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'""" cut_command = 'cut -f 1-10' bed_type = 'bed6+4' elif self.peak_type == "gappedPeak": awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'""" cut_command = 'cut -f 1-15' bed_type = 'bed12+3' elif self.peak_type == "broadPeak": awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'""" cut_command = 'cut -f 1-9' bed_type = 'bed6+3' else: print "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." 
sys.exit() # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined 1bp out, err = common.run_pipe([ 'intersectBed -wo -a %s -b %s' %(self.pooled_peaks_fn, self.rep1_peaks_fn), awk_command, cut_command, 'sort -u', 'intersectBed -wo -a stdin -b %s' %(self.rep2_peaks_fn), awk_command, cut_command, 'sort -u' ], self.overlap_tr_fn) print "%d peaks overlap with both true replicates" %(common.count_lines(self.overlap_tr_fn)) # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap is defined as 1bp out, err = common.run_pipe([ 'intersectBed -wo -a %s -b %s' %(self.pooled_peaks_fn, self.pooledpr1_peaks_fn), awk_command, cut_command, 'sort -u', 'intersectBed -wo -a stdin -b %s' %(self.pooledpr2_peaks_fn), awk_command, cut_command, 'sort -u' ], self.overlap_pr_fn) print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(self.overlap_pr_fn)) # Combine peak lists out, err = common.run_pipe([ 'cat %s %s' %(self.overlap_tr_fn, self.overlap_pr_fn), 'sort -u' ], self.overlapping_peaks_fn) print "%d peaks overlap with true replicates or with pooled pseudorepliates" %(common.count_lines(self.overlapping_peaks_fn)) #rejected peaks out, err = common.run_pipe([ 'intersectBed -wa -v -a %s -b %s' %(self.pooled_peaks_fn, self.overlapping_peaks_fn) ], self.rejected_peaks_fn) print "%d peaks were rejected" %(common.count_lines(self.rejected_peaks_fn)) self.npeaks_in = common.count_lines(common.uncompress(self.pooled_peaks_fn)) self.npeaks_out = common.count_lines(self.overlapping_peaks_fn) self.npeaks_rejected = common.count_lines(self.rejected_peaks_fn) #make bigBed files for visualization self.overlapping_peaks_bb_fn = common.bed2bb(self.overlapping_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type) self.rejected_peaks_bb_fn = common.bed2bb(self.rejected_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type)
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None, prefix=None):
    """Call peaks with spp and return dxlinks to the peak outputs.

    Downloads the experiment/control tagAligns and cross-correlation scores,
    runs run_spp.R (or run_spp_nodups.R when nodups is set), repairs peak
    coordinates that spp emits in scientific notation or that fall outside
    chromosome bounds, optionally builds a bigBed, and uploads the results.

    Returns a dict with dxlinks: "peaks", "xcor_plot", "xcor_scores", and
    "peaks_bb" when bigbed is set and bigBed conversion succeeded.
    """
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    # the .as file is only needed to build the optional bigBed
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    # NOTE(review): rstrip strips a *character set*, not a suffix — e.g. a
    # name ending in "z" or "g" would also lose those letters; confirm input
    # names always end exactly in ".tagAlign.gz".
    if not prefix:
        output_filename_prefix = experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    final_peaks_filename = peaks_filename + '.gz' #spp adds .gz, so this is the file name that's actually created
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    fraglen_column = 3 # third column in the cross-correlation scores input file
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        print "Read fragment length: %d" %(fragment_length)

    #run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = '/phantompeakqualtools/spp_1.10.1.tar.gz'
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    #install spp
    subprocess.check_call('ls -l', shell=True)
    subprocess.check_call(shlex.split('R CMD INSTALL %s' %(spp_tarball)))
    spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" %(run_spp, cpu_count(), experiment_filename, control_filename, npeaks, fragment_length, peaks_filename, xcor_plot_filename, xcor_scores_filename)
    print spp_command
    # process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    # for line in iter(process.stdout.readline, ''):
    #     sys.stdout.write(line)
    subprocess.check_call(shlex.split(spp_command))

    #when one of the peak coordinates are an exact multiple of 10, spp (R) outputs the coordinate in scientific notation
    #this changes any such coodinates to decimal notation
    #this assumes 10-column output and that the 2nd and 3rd columns are coordinates
    #slopBed adjusts feature end coordinates that go off the end of the chromosome
    #bedClip removes any features that are still not within the boundaries of the chromosome
    fix_coordinate_peaks_filename = output_filename_prefix + '.fixcoord.regionPeak'
    out, err = common.run_pipe([
        "gzip -dc %s" %(final_peaks_filename),
        "tee %s" %(peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' %(chrom_sizes_filename),
        'bedClip stdin %s %s' %(chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    #These lines transfer the peaks files to the temporary workspace for debugging later
    #Only at the end are the final files uploaded that will be returned from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" %(n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" %(n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename))
    print "First 50 peaks"
    print subprocess.check_output('head -50 %s' %(fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    # Prefer the coordinate-fixed peaks file when the fix actually changed anything
    if not filecmp.cmp(peaks_filename,fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split('gzip %s' %(fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks, pooledpr2_peaks,
         chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Produce the replicated and rejected peak sets for a two-replicate
    experiment.

    A pooled peak is "replicated" when it overlaps (fractional overlap >= 0.5
    wrt either peak in a pair) a peak in both true replicates, OR a peak in
    both pooled pseudoreplicates.  All other pooled peaks are "rejected".
    Both sets are uploaded compressed and as bigBed, and peak counts are
    returned alongside optional pass-through signal links.
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances
    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    #Input filenames - necessary to define each explicitly because input files could have the same name, in which case subsequent
    #file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' %(rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' %(rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' %(pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' %(pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' %(pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' %(peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        m = re.match('(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' %(peak_type), pooled_peaks.name) #strip off the peak and compression extensions
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' %(basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' %(basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' %(peak_type)
    overlap_pr_fn = 'replicated_pr.%s' %(peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    # Superseded 50%-reciprocal-overlap implementation, kept for reference:
    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''

    #the only difference between the peak_types is how the extra columns are handled
    # (awk keeps intersectBed -wo pairs where the overlap covers >= 50% of
    # either peak; cut restores the original pooled-peak columns)
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # NOTE(review): this assert is only reached when peak_type is already
        # known to be none of the above, so it always fires — by design it
        # serves as the error exit for unrecognized peak types.
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' %(overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))

    #rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %(pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print "%d peaks were rejected" %(common.count_lines(rejected_peaks_fn))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    #make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        'npeaks_rejected'       : npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None):
    """Call peaks with spp and return dxlinks to the peak outputs.

    Variant of the spp applet without prefix support: downloads inputs, runs
    run_spp.R (or run_spp_nodups.R), fixes scientific-notation/out-of-bounds
    peak coordinates, optionally builds a bigBed, and uploads the results.

    Returns a dict with dxlinks: "peaks", "xcor_plot", "xcor_scores", and
    "peaks_bb" when bigbed is set and bigBed conversion succeeded.
    """
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    # the .as file is only needed to build the optional bigBed
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    # NOTE(review): rstrip strips a *character set*, not a suffix; assumes
    # input names end exactly in ".tagAlign.gz" — confirm against callers.
    output_filename_prefix = experiment_filename.rstrip(".gz").rstrip(".tagAlign")
    peaks_filename = output_filename_prefix + ".regionPeak"
    final_peaks_filename = peaks_filename + ".gz"  # spp adds .gz, so this is the file name that's actually created
    xcor_plot_filename = output_filename_prefix + ".pdf"
    xcor_scores_filename = output_filename_prefix + ".ccscores"

    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)

    fraglen_column = 3  # third column in the cross-correlation scores input file
    with open(xcor_scores_input_filename, "r") as f:
        line = f.readline()
        fragment_length = int(line.split("\t")[fraglen_column - 1])
        print "Read fragment length: %d" % (fragment_length)

    # run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = "/phantompeakqualtools/spp_1.10.1.tar.gz"
    if nodups:
        run_spp = "/phantompeakqualtools/run_spp_nodups.R"
    else:
        run_spp = "/phantompeakqualtools/run_spp.R"
    # install spp
    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)
    print subprocess.check_output(shlex.split("R CMD INSTALL %s" % (spp_tarball)), stderr=subprocess.STDOUT)
    spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" % (
        run_spp,
        cpu_count(),
        experiment_filename,
        control_filename,
        npeaks,
        fragment_length,
        peaks_filename,
        xcor_plot_filename,
        xcor_scores_filename,
    )
    print spp_command
    # Stream spp's combined stdout/stderr to our stdout as it runs.
    # NOTE(review): unlike the sibling applet that uses check_call, this
    # never waits on the process or checks its return code — a failed spp
    # run would go undetected here; confirm whether that is intentional.
    process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    for line in iter(process.stdout.readline, ""):
        sys.stdout.write(line)

    # when one of the peak coordinates are an exact multiple of 10, spp (R) outputs the coordinate in scientific notation
    # this changes any such coodinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are coordinates
    # slopBed adjusts feature end coordinates that go off the end of the chromosome
    # bedClip removes any features that are still not within the boundaries of the chromosome
    fix_coordinate_peaks_filename = output_filename_prefix + ".fixcoord.regionPeak"
    out, err = common.run_pipe(
        [
            "gzip -dc %s" % (final_peaks_filename),
            "tee %s" % (peaks_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
            "slopBed -i stdin -g %s -b 0" % (chrom_sizes_filename),
            "bedClip stdin %s %s" % (chrom_sizes_filename, fix_coordinate_peaks_filename),
        ]
    )

    # These lines transfer the peaks files to the temporary workspace for debugging later
    # Only at the end are the final files uploaded that will be returned from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" % (n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" % (
        n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)
    )
    print "First 50 peaks"
    print subprocess.check_output("head -50 %s" % (fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    # Prefer the coordinate-fixed peaks file when the fix actually changed anything
    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split("gzip %s" % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + ".gz"

    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor,
                                 paired_end, chrom_sizes, as_file, blacklist,
                                 rep1_signal, fragment_length=None):
    """Build the stable peak set for a simplicate (single-replicate)
    experiment from its self-pseudoreplicate IDR peaks.

    Optionally blacklist-filters the peaks, computes FRiP on the stable set,
    builds a bigBed, and returns counts (N1, Ns), FRiP values, the fragment
    length actually used, and dxlinks to the uploaded peak files.
    """
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # If fragment_length is given, override appropriate values.
    # Calculate, or set the actually used fragment length value.
    # Set the fragment_length_given_by_user flag appropriately.
    # (rep1_xcor is only dereferenced/downloaded when needed; setting the
    # filename to None lets common.frip take the explicit fragment length.)
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    subprocess.check_output('set -x; ls -l', shell=True)

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info("%d peaks blacklisted from the stable set" % (N1 - Nsb))
    else:
        # no blacklist: the stable set is just a copy of the pseudorep peaks
        subprocess.check_output(
            shlex.split('cp %s %s' % (r1pr_peaks_filename,
                                      stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
                dxpy.dxlink(
                    dxpy.upload_local_file(common.compress(
                        r1pr_peaks_filename)))
            })

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update({"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
def replicated_IDR(experiment, reps_peaks, r1pr_peaks, r2pr_peaks,
                   pooledpr_peaks, rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                   paired_end, chrom_sizes, as_file, blacklist,
                   rep1_signal, rep2_signal, pooled_signal,
                   fragment_length=None):
    """Compute IDR reproducibility metrics and final peak sets for a
    two-replicate experiment.

    Takes pre-computed IDR peak lists (true reps, each rep's
    self-pseudoreplicates, pooled pseudoreplicates), builds the conservative
    set (true-rep peaks) and optimal set (larger of true-rep / pooled-
    pseudorep peaks), applies the optional blacklist, computes rescue and
    self-consistency ratios with a pass/borderline/fail verdict, computes
    FRiP for each comparison, builds bigBeds, and returns counts, metrics,
    and dxlinks.
    """
    # TODO for now just taking the peak files. This applet should actually
    # call IDR instead of putting that in the workflow populator script
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Pool the two replicate tagAligns via the platform 'pool' applet;
    # the pooled tagAlign is needed for the true-rep and pooled-pseudorep
    # FRiP calculations below.
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # next call could be on 267 and save time?
    pool_replicates_subjob.wait_on_done()

    # If fragment_length is not given, calculate the fragment_length
    # using crosscorrelation. Else use the overridevalue. Set the
    # pool_xcor_filename to None to accommodate common.frip calls.
    # Calculate, or set, actually used fragment lengths for different
    # cases. Set the flag indicating whether the fragment length
    # was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
            )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info("%d peaks blacklisted from the conservative set"
                    % (Nt - Ncb))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' % (reps_peaks_filename,
                                      conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No - Nob))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' % (peaks_to_filter_filename,
                                      optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    # ENCODE reproducibility criteria: both ratios > 2 => fail,
    # exactly one > 2 => borderline, otherwise pass.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # NOTE(review): these calls pass the raw fragment_length argument (None
    # unless given by the user) rather than the fragment_length_used_*
    # values computed above; presumably common.frip falls back to the xcor
    # file when fragment_length is None — confirm against common.frip.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads": rep1_n_reads,
        "rep1_frip_nreads_in_peaks": rep1_n_reads_in_peaks,
        "F1": rep1_frip_score,
        "rep2_frip_nreads": rep2_n_reads,
        "rep2_frip_nreads_in_peaks": rep2_n_reads_in_peaks,
        "F2": rep2_frip_score,
        "true_frip_nreads": true_n_reads,
        "true_frip_nreads_in_peaks": true_n_reads_in_peaks,
        "Ft": true_frip_score,
        "pr_frip_nreads": pr_n_reads,
        "pr_frip_nreads_in_peaks": pr_n_reads_in_peaks,
        "Fp": pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(
                    dxpy.upload_local_file(common.compress(
                        reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(
                    dxpy.upload_local_file(
                        common.compress(peaks_to_filter_filename)))
            })

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(
            dxpy.upload_local_file(
                common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, spp_version, as_file=None, prefix=None):
    """Call peaks with phantompeakqualtools (spp) and fix peak coordinates.

    Downloads the experiment/control tagAligns and the cross-correlation
    scores file, runs run_spp.R (or run_spp_nodups.R when nodups is set),
    repairs out-of-range/scientific-notation coordinates in the spp output,
    and uploads the resulting peaks, cross-correlation plot and scores.

    Returns a dict of dxlinks: "peaks", "xcor_plot", "xcor_scores" and,
    when bigbed is set and conversion succeeds, "peaks_bb".
    """
    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # Download the file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(
        xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    if not prefix:
        # BUGFIX: str.rstrip() strips a *character set*, not a suffix, so
        # the original .rstrip('.gz').rstrip('.tagAlign') could eat
        # legitimate trailing characters of the basename (e.g.
        # 'chipnat.tagAlign.gz' -> 'chip').  Remove the literal suffixes.
        output_filename_prefix = experiment_filename
        for suffix in ('.gz', '.tagAlign'):
            if output_filename_prefix.endswith(suffix):
                output_filename_prefix = output_filename_prefix[:-len(suffix)]
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(subprocess.check_output(
        'ls -l', shell=True, stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    fraglen_column = 3
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        logger.info("Read fragment length: %d" % (fragment_length))

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename,
           npeaks, fragment_length, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coodinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the ($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly chrM) and will cause slopBed
    # to halt at that line, truncating the output of the pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries of
    # the chromosome
    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'
    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (
            chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates"
        % (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    subprocess.check_output(
        'head -50 %s' % (fix_coordinate_peaks_filename), shell=True)

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename,
                          chrom_sizes_filename,
                          as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(
            shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, spp_version, as_file=None, prefix=None,
         fragment_length=None):
    """Call peaks with phantompeakqualtools (spp), fragment length optional.

    Variant of the spp applet that accepts a user-supplied fragment_length;
    when it is None the fragment length is read from the third column of
    the cross-correlation scores file.  Note: in this variant nodups and
    spp_version are retained only for interface compatibility — run_spp.R
    is hard-wired and the tarball install is commented out.

    Returns a dict of dxlinks: "peaks", "xcor_plot", "xcor_scores" and,
    when bigbed is set and conversion succeeds, "peaks_bb".
    """
    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # Download the file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    if not prefix:
        # BUGFIX: str.rstrip() strips a *character set*, not a suffix, so
        # the original .rstrip('.gz').rstrip('.tagAlign') could eat
        # legitimate trailing characters of the basename (e.g.
        # 'chipnat.tagAlign.gz' -> 'chip').  Remove the literal suffixes.
        output_filename_prefix = experiment_filename
        for suffix in ('.gz', '.tagAlign'):
            if output_filename_prefix.endswith(suffix):
                output_filename_prefix = output_filename_prefix[:-len(suffix)]
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(
        subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    # if fragment_length is provided, use that. Else read
    # fragment length from xcor file
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % (fraglen))
    else:
        fraglen_column = 3
        with open(xcor_scores_input_filename, 'r') as f:
            line = f.readline()
            fraglen = line.split('\t')[fraglen_column - 1]
            logger.info("Read fragment length: %s" % (fraglen))

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    run_spp = '/phantompeakqualtools/run_spp.R'
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%s -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename, npeaks,
           fraglen, peaks_filename, xcor_plot_filename, xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coodinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the ($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly chrM) and will cause slopBed
    # to halt at that line, truncating the output of the pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries of
    # the chromosome
    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'
    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (
            chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates"
        % (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    subprocess.check_output('head -50 %s' % (fix_coordinate_peaks_filename),
                            shell=True)

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename,
                          chrom_sizes_filename,
                          as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(
            shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Assemble the IDR reproducibility report and final peak sets.

    Counts IDR peaks from true replicates (Nt), each self-pseudoreplicate
    (N1, N2) and the pooled pseudoreplicates (Np); builds the conservative
    set (true-replicate peaks) and the optimal set (larger of Nt/Np),
    optionally blacklist-filtered; computes rescue and self-consistency
    ratios and the pass/borderline/fail reproducibility verdict.

    Consistency fixes vs. the previous version: blacklist presence is
    always tested with `is not None` (it was `if blacklist:` in one spot),
    and the final log line uses the module `logger` like the rest of the
    function (it called root `logging.info` before).

    Returns a dict of counts, ratios, the verdict, and dxlinks to the
    compressed (and, when bedToBigBed succeeds, bigBed) peak sets.
    """
    # TODO for now just taking the peak files. This applet should actually
    # call IDR instead of putting that in the workflow populator script
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    subprocess.check_output('set -x; ls -l', shell=True)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist is not None:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename,
                      chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename,
                      chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set":
            dxpy.dxlink(dxpy.upload_local_file(common.compress(
                conservative_set_filename))),
        "optimal_set":
            dxpy.dxlink(dxpy.upload_local_file(common.compress(
                optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    logger.info("Exiting with output: %s" % (output))
    return output
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None):
    """Compute the IDR reproducibility report (older variant, no No/Nc output).

    Counts IDR peaks from true replicates (Nt), the two self-pseudoreplicate
    sets (N1, N2) and the pooled pseudoreplicates (Np), derives the
    conservative and optimal peak sets (optionally blacklist-filtered),
    computes rescue/self-consistency ratios and a pass/borderline/fail
    verdict, and uploads the compressed (and bigBed, when conversion
    succeeds) peak sets.  Returns the counts, ratios, verdict and dxlinks.
    """
    #TODO for now just taking the peak files. This applet should actually call IDR instead of
    #putting that in the workflow populator script

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Download the file inputs to the local file system.
    #Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    print subprocess.check_output('ls -l', shell=True)

    # Peak inputs arrive compressed; uncompress in place and use the
    # resulting plain-text filenames from here on.
    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Peak counts feeding the IDR rescue/self-consistency ratios.
    Nt = common.count_lines(reps_peaks_filename)
    print "%d peaks from true replicates" % (Nt)
    N1 = common.count_lines(r1pr_peaks_filename)
    print "%d peaks from rep1 self-pseudoreplicates" % (N1)
    N2 = common.count_lines(r2pr_peaks_filename)
    print "%d peaks from rep2 self-pseudoreplicates" % (N2)
    Np = common.count_lines(pooledpr_peaks_filename)
    print "%d peaks from pooled pseudoreplicates" % (Np)

    # Conservative set: true-replicate IDR peaks, blacklist-filtered when a
    # blacklist is supplied.  NOTE(review): with no blacklist this prints
    # "0 peaks blacklisted" because Ncb == Nt by construction.
    conservative_set_filename = '%s_final_conservative.narrowPeak' % (
        experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
    else:
        # No filtering: the conservative set *is* the true-replicate file.
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print "%d peaks blacklisted from the conservative set" % (Nt - Ncb)

    # Optimal set: the larger of the true-replicate and pooled-pseudorep
    # peak lists.
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np

    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
    else:
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print "%d peaks blacklisted from the optimal set" % (No - Nob)

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    # Reproducibility verdict: fail when both ratios exceed 2, borderline
    # when exactly one does, pass otherwise.
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    #bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(conservative_set_filename,
                                                 chrom_sizes_filename,
                                                 as_file_filename)
    optimal_set_bb_filename = common.bed2bb(optimal_set_filename,
                                            chrom_sizes_filename,
                                            as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = dxpy.upload_local_file(
            conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(
            dxpy.upload_local_file(
                common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility
    })

    logging.info("Exiting with output: %s", output)
    return output
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None):
    """Compute the IDR reproducibility report (variant reporting No/Nc).

    Near-duplicate of the preceding applet main(): counts Nt/N1/N2/Np,
    derives conservative and optimal peak sets (optionally
    blacklist-filtered), computes rescue/self-consistency ratios and a
    pass/borderline/fail verdict.  This variant additionally returns the
    post-blacklist counts "No" (optimal) and "Nc" (conservative).
    """
    #TODO for now just taking the peak files. This applet should actually call IDR instead of
    #putting that in the workflow populator script

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' %(blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Download the file inputs to the local file system.
    #Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' %(reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' %(r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' %(r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' %(pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    print subprocess.check_output('ls -l', shell=True)

    # Peak inputs arrive compressed; uncompress and keep the plain filenames.
    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Peak counts feeding the IDR rescue/self-consistency ratios.
    Nt = common.count_lines(reps_peaks_filename)
    print "%d peaks from true replicates" %(Nt)
    N1 = common.count_lines(r1pr_peaks_filename)
    print "%d peaks from rep1 self-pseudoreplicates" %(N1)
    N2 = common.count_lines(r2pr_peaks_filename)
    print "%d peaks from rep2 self-pseudoreplicates" %(N2)
    Np = common.count_lines(pooledpr_peaks_filename)
    print "%d peaks from pooled pseudoreplicates" %(Np)

    # Conservative set: true-replicate IDR peaks, blacklist-filtered when a
    # blacklist is supplied.  NOTE(review): with no blacklist this prints
    # "0 peaks blacklisted" because Ncb == Nt by construction.
    conservative_set_filename = '%s_final_conservative.narrowPeak' %(experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
    else:
        # No filtering: the conservative set *is* the true-replicate file.
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print "%d peaks blacklisted from the conservative set" %(Nt-Ncb)

    # Optimal set: the larger of the true-replicate and pooled-pseudorep
    # peak lists.
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np

    optimal_set_filename = '%s_final_optimal.narrowPeak' %(experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
    else:
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print "%d peaks blacklisted from the optimal set" %(No-Nob)

    rescue_ratio = float(max(Np,Nt)) / float(min(Np,Nt))
    self_consistency_ratio = float(max(N1,N2)) / float(min(N1,N2))

    # Reproducibility verdict: fail when both ratios exceed 2, borderline
    # when exactly one does, pass otherwise.
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    #bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(conservative_set_filename,
                                                 chrom_sizes_filename,
                                                 as_file_filename)
    optimal_set_bb_filename = common.bed2bb(optimal_set_filename,
                                            chrom_sizes_filename,
                                            as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = dxpy.upload_local_file(conservative_set_bb_filename)
        output.update({"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    logging.info("Exiting with output: %s", output)
    return output
def main(experiment, control, xcor_scores_input, chrom_sizes,
         narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize):
    """Call narrow/broad/gapped peaks with MACS2 and build signal tracks.

    Runs `macs2 callpeak` twice (narrow, then --broad), rescales and
    rank-renames the peak files, runs `macs2 bdgcmp` to produce fold
    enrichment and -log10(p-value) bedgraphs, clips them to chromosome
    bounds, converts to bigWig, converts the peak beds to bigBed, and
    uploads everything.  Returns a dict of dxlinks for the three peak
    sets, their bigBeds, and the two signal bigWigs.
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.
    experiment = dxpy.DXFile(experiment)
    control = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    narrowPeak_as = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as = dxpy.DXFile(gappedpeak_as)
    broadPeak_as = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.
    dxpy.download_dxfile(experiment.get_id(), experiment.name)
    dxpy.download_dxfile(control.get_id(), control.name)
    dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes.name)
    dxpy.download_dxfile(narrowPeak_as.get_id(), narrowPeak_as.name)
    dxpy.download_dxfile(gappedPeak_as.get_id(), gappedPeak_as.name)
    dxpy.download_dxfile(broadPeak_as.get_id(), broadPeak_as.name)

    #Define the output filenames
    peaks_dirname = 'peaks'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    broadPeak_bb_fn = "%s.bb" % (broadPeak_fn)
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    #Extract the fragment length estimate from column 3 of the cross-correlation scores file
    with open(xcor_scores_input.name, 'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2]  #third column
    print "Fraglen %s" % (fraglen)

    #===========================================
    # Generate narrow peaks and preliminary signal tracks
    #============================================
    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores(
        '%s/%s_peaks.narrowPeak' % (peaks_dirname, prefix), scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (narrowPeak_fn),
        'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # Generate Broad and Gapped Peaks
    #============================================
    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores(
        '%s/%s_peaks.broadPeak' % (peaks_dirname, prefix), scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak) in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (broadPeak_fn),
        'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn))

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores(
        '%s/%s_peaks.gappedPeak' % (peaks_dirname, prefix), scores_col=5)

    pipe = [
        'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (gappedPeak_fn),
        'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # For Fold enrichment signal tracks
    #============================================
    # This file is a tab delimited file with 2 columns Col1 (chromosome name), Col2 (chromosome size in bp).
    command = 'macs2 bdgcmp ' + \
              '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
              '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
              '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
              '-m FE'
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_FE.bdg -g %s -b 0' % (peaks_dirname, prefix,
                                                chrom_sizes.name),
        'bedClip stdin %s %s/%s.fc.signal.bedgraph' % (chrom_sizes.name,
                                                       peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(fc_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    #===========================================
    # For -log10(p-value) signal tracks
    #============================================
    # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    out, err = common.run_pipe(['gzip -dc %s' % (experiment.name), 'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe(['gzip -dc %s' % (control.name), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)
    print "chipReads = %s, controlReads = %s, sval = %s" % (chipReads,
                                                            controlReads, sval)

    returncode = common.block_on(
        'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
        '-m ppois -S %s' %(sval))
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_ppois.bdg -g %s -b 0' % (peaks_dirname, prefix,
                                                   chrom_sizes.name),
        'bedClip stdin %s %s/%s.pval.signal.bedgraph' % (chrom_sizes.name,
                                                         peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)
    #rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(pvalue_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    #===========================================
    # Generate bigWigs from beds to support trackhub visualization of peak files
    #============================================
    # NOTE(review): the *_bb_fname return values are never used below — the
    # uploads use the precomputed *_bb_fn names (touched into existence by
    # the dev loop that follows); confirm before removing either.
    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn),
                                        chrom_sizes.name, narrowPeak_as.name,
                                        bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn),
                                        chrom_sizes.name, gappedPeak_as.name,
                                        bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn),
                                       chrom_sizes.name, broadPeak_as.name,
                                       bed_type='bed6+3')

    #Temporary during development to create empty files just to get the applet to exit
    for fn in [narrowPeak_fn, gappedPeak_fn, broadPeak_fn,
               narrowPeak_bb_fn, gappedPeak_bb_fn, broadPeak_bb_fn,
               fc_signal_fn, pvalue_signal_fn]:
        common.block_on('touch %s' % (fn))

    # Upload the file outputs
    narrowPeak = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn)
    broadPeak_bb = dxpy.upload_local_file(broadPeak_bb_fn)
    fc_signal = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.
    output = {
        "narrowpeaks": dxpy.dxlink(narrowPeak),
        "gappedpeaks": dxpy.dxlink(gappedPeak),
        "broadpeaks": dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb": dxpy.dxlink(broadPeak_bb),
        "fc_signal": dxpy.dxlink(fc_signal),
        "pvalue_signal": dxpy.dxlink(pvalue_signal)
    }

    return output
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):
    """Compute replicated peaks for a two-replicate experiment.

    A pooled peak is kept ("replicated") if it overlaps, by >=50% of the
    length of either peak in the pair, a peak in both true replicates OR a
    peak in both pooled pseudoreplicates.  The two replicate tagAligns are
    pooled via the 'pool' applet; fragment length is taken from the pooled
    cross-correlation subjob unless fragment_length is supplied, in which
    case that subjob is skipped entirely.

    All *_peaks/*_ta/*_xcor/chrom_sizes/as_file arguments are DNAnexus file
    inputs (dxlinks or DXFile handlers).  paired_end is forwarded to the
    cross-correlation subjob.  peak_type is one of 'narrowPeak',
    'gappedPeak' or 'broadPeak'.  prefix, if truthy, overrides the output
    basename otherwise derived from the pooled peaks filename.

    Returns a dict of dxlinks to the replicated and rejected peak files
    (plain and bigBed) plus npeaks_* counts and frip_* metrics.
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent file would
    # overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # BUGFIX: read .name from the DXFile handler, not from the raw
        # pooled_peaks input -- entry-point inputs are dxlinks (dicts),
        # which have no .name attribute.
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    # Pool the two replicate tagAligns on the platform
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')

    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user.  Otherwise, run the subjob
    # to be able to extract the fragment length from cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
            )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns
    # are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs
    # > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" %
          (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
         'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks not replicated by either criterion
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn,
                                             overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def main(experiment, control, xcor_scores_input, chrom_sizes,
         narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize,
         prefix=None, fragment_length=None):
    """Call peaks with MACS2 and build signal tracks for one ChIP sample.

    experiment and control are local gzipped tagAlign filenames (passed to
    'macs2 callpeak -f BED' and to 'gzip -dc'); xcor_scores_input is a
    local cross-correlation scores file whose third column holds the
    fragment length estimate; chrom_sizes and the three *_as arguments are
    local filenames forwarded to the shell tools / common.bed2bb;
    genomesize is the MACS2 -g value.  If fragment_length is given it is
    used directly and the scores file is not read.

    Returns a dict of local output filenames: narrow/gapped/broad peak
    .gz files, their bigBed versions, and fold-enrichment / -log10(p-value)
    signal bigWigs.  Raises AssertionError if any external tool exits
    non-zero.
    """
    # Keep mixed-case aliases consistent with the filenames used below.
    narrowPeak_as = narrowpeak_as
    gappedPeak_as = gappedpeak_as
    broadPeak_as = broadpeak_as

    # Define the output filenames
    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    if not prefix:
        prefix = experiment
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    # if the fragment_length argument is given, use that instead
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % fraglen)
    else:
        with open(xcor_scores_input, 'r') as fh:
            firstline = fh.readline()
            fraglen = firstline.split()[2]  # third column
            logger.info("Fraglen %s" % (fraglen))

    # ===========================================
    # Generate narrow peaks and preliminary signal tracks
    # ============================================

    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' % (experiment, control) + \
        '-f BED -n %s/%s ' % (peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genomesize, fraglen)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_narrowpeak_fn = common.slop_clip(
        '%s/%s_peaks.narrowPeak' % (peaks_dirname, prefix),
        chrom_sizes)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores(clipped_narrowpeak_fn,
                                                   scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4
    # with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (narrowPeak_fn),
        'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # Generate Broad and Gapped Peaks
    # ============================================

    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' % (experiment, control) + \
        '-f BED -n %s/%s ' % (peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' % (genomesize, fraglen)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_broadpeak_fn = common.slop_clip(
        '%s/%s_peaks.broadPeak' % (peaks_dirname, prefix),
        chrom_sizes)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores(clipped_broadpeak_fn,
                                                  scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak) in descending
    # order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (broadPeak_fn),
        'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn))

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_gappedpeaks_fn = common.slop_clip('%s/%s_peaks.gappedPeak'
                                              % (peaks_dirname, prefix),
                                              chrom_sizes,
                                              bed_type='gappedPeak')

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores(clipped_gappedpeaks_fn,
                                                   scores_col=5)

    # gappedPeak sorts on column 14 (its score column) rather than 8
    pipe = [
        'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (gappedPeak_fn),
        'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # For Fold enrichment signal tracks
    # ============================================
    # This file is a tab delimited file with 2 columns Col1 (chromosome name),
    # Col2 (chromosome size in bp).
    command = 'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) + \
        '--outdir %s -o %s_FE.bdg ' % (peaks_dirname, prefix) + \
        '-m FE'
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_FE.bdg -g %s -b 0' % (peaks_dirname, prefix,
                                                chrom_sizes),
        'bedClip stdin %s %s/%s.fc.signal.bedgraph' % (chrom_sizes,
                                                       peaks_dirname, prefix)
    ]
    out, err = common.run_pipe(pipe)
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.fc.signal.bedgraph ' % (peaks_dirname, prefix) + \
        '%s ' % (chrom_sizes) + \
        '%s' % (fc_signal_fn)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    # drm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph
    # NOTE(review): "drm -f" above looks like a typo for "rm -f" in this
    # commented-out cleanup line.

    # ===========================================
    # For -log10(p-value) signal tracks
    # ============================================

    # Compute sval =
    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    out, err = common.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)
    logger.info("chipReads = %s, controlReads = %s, sval = %s" %
                (chipReads, controlReads, sval))

    returncode = common.block_on('macs2 bdgcmp ' +
                                 '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) +
                                 '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) +
                                 '--outdir %s -o %s_ppois.bdg ' % (peaks_dirname, prefix) +
                                 '-m ppois -S %s' % (sval))
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_ppois.bdg -g %s -b 0' % (peaks_dirname, prefix,
                                                   chrom_sizes),
        'bedClip stdin %s %s/%s.pval.signal.bedgraph' % (chrom_sizes,
                                                         peaks_dirname,
                                                         prefix)
    ]
    out, err = common.run_pipe(pipe)
    # rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.pval.signal.bedgraph ' % (peaks_dirname, prefix) + \
        '%s ' % (chrom_sizes) + \
        '%s' % (pvalue_signal_fn)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    # ===========================================
    # Generate bigWigs from beds to support trackhub visualization of peak
    # files
    # ============================================

    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn), chrom_sizes,
                                        narrowPeak_as, bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn), chrom_sizes,
                                        gappedPeak_as, bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn), chrom_sizes,
                                       broadPeak_as, bed_type='bed6+3')

    # Temporary during development to create empty files just to get the
    # applet to exit
    # narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    # gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    # broadPeak_bb_fn = "%s.bb" % (broadPeak_fn)

    # Build the output structure: plain local filenames, not dxlinks.
    output = {
        "narrowpeaks": narrowPeak_gz_fn,
        "gappedpeaks": gappedPeak_gz_fn,
        "broadpeaks": broadPeak_gz_fn,
        "narrowpeaks_bb": narrowPeak_bb_fname,
        "gappedpeaks_bb": gappedPeak_bb_fname,
        "broadpeaks_bb": broadPeak_bb_fname,
        "fc_signal": fc_signal_fn,
        "pvalue_signal": pvalue_signal_fn
    }

    return output
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor,
                                     paired_end, chrom_sizes, as_file,
                                     peak_type, prefix,
                                     fragment_length=None):
    """Compute replicated peaks for a simplicate (single-replicate) analysis.

    Here rep1_peaks and rep2_peaks are the peak calls from the two
    self-pseudoreplicates of the single replicate, and pooled_peaks are the
    peaks called on the replicate itself; a pooled peak is kept if it
    overlaps (>=50% of either peak's length) a peak in both
    pseudoreplicates.  Fragment length comes from fragment_length if given,
    otherwise from the replicate's cross-correlation scores file.

    All file arguments are DNAnexus file inputs; peak_type is one of
    'narrowPeak', 'gappedPeak' or 'broadPeak'; prefix, if truthy, overrides
    the basename derived from the pooled peaks filename.  paired_end is
    accepted for signature parity with replicated_overlap but is not used
    here.

    Returns a dict of dxlinks to the replicated and rejected peak files
    (plain and bigBed) plus npeaks_* counts and frip_* metrics.
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent file would
    # overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # BUGFIX: read .name from the DXFile handler, not from the raw
        # pooled_peaks input -- entry-point inputs are dxlinks (dicts),
        # which have no .name attribute.
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    # kept for parity with replicated_overlap; unused in this simplicate path
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns
    # are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs
    # > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe(['cat %s' % (overlap_tr_fn), 'sort -u'],
                               overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks not replicated by the pseudoreps
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn,
                                             overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined fragment_length
    # if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def main(experiment, control, xcor_scores_input, chrom_sizes,
         narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize):
    """DNAnexus entry point: call MACS2 peaks and build signal tracks.

    All file arguments arrive as DNAnexus file inputs and are wrapped in
    DXFile handlers, then downloaded under their own names.  experiment and
    control are gzipped tagAligns (fed to 'macs2 callpeak -f BED' and
    'gzip -dc'); xcor_scores_input's third column holds the fragment length
    estimate; genomesize is the MACS2 -g value.

    Returns a dict of dxlinks to the uploaded peak .gz files, their bigBed
    versions, and the fold-enrichment / -log10(p-value) signal bigWigs.
    Raises AssertionError if any external tool exits non-zero.
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.

    experiment = dxpy.DXFile(experiment)
    control = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    narrowPeak_as = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as = dxpy.DXFile(gappedpeak_as)
    broadPeak_as = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.

    dxpy.download_dxfile(experiment.get_id(), experiment.name)
    dxpy.download_dxfile(control.get_id(), control.name)
    dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes.name)
    dxpy.download_dxfile(narrowPeak_as.get_id(), narrowPeak_as.name)
    dxpy.download_dxfile(gappedPeak_as.get_id(), gappedPeak_as.name)
    dxpy.download_dxfile(broadPeak_as.get_id(), broadPeak_as.name)

    #Define the output filenames

    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" %(peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" %(peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" %(peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    narrowPeak_bb_fn = "%s.bb" %(narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" %(gappedPeak_fn)
    broadPeak_bb_fn = "%s.bb" %(broadPeak_fn)
    fc_signal_fn = "%s/%s.fc_signal.bw" %(peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" %(peaks_dirname, prefix)

    #Extract the fragment length estimate from column 3 of the cross-correlation scores file
    with open(xcor_scores_input.name,'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2] #third column
    print "Fraglen %s" %(fraglen)

    #===========================================
    # Generate narrow peaks and preliminary signal tracks
    #============================================

    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_narrowpeak_fn = common.slop_clip('%s/%s_peaks.narrowPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores(clipped_narrowpeak_fn, scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = ['sort -k 8gr,8gr %s' %(rescaled_narrowpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(narrowPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(narrowPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # Generate Broad and Gapped Peaks
    #============================================

    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_broadpeak_fn = common.slop_clip('%s/%s_peaks.broadPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores(clipped_broadpeak_fn, scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak) in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = ['sort -k 8gr,8gr %s' %(rescaled_broadpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(broadPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(broadPeak_gz_fn))

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_gappedpeaks_fn = common.slop_clip('%s/%s_peaks.gappedPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores(clipped_gappedpeaks_fn, scores_col=5)

    # gappedPeak sorts on column 14 (its score column) rather than 8
    pipe = ['sort -k 14gr,14gr %s' %(rescaled_gappedpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(gappedPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(gappedPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # For Fold enrichment signal tracks
    #============================================
    # This file is a tab delimited file with 2 columns Col1 (chromosome name), Col2 (chromosome size in bp).

    command = 'macs2 bdgcmp ' + \
              '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
              '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
              '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
              '-m FE'
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = ['slopBed -i %s/%s_FE.bdg -g %s -b 0' %(peaks_dirname, prefix, chrom_sizes.name),
            'bedClip stdin %s %s/%s.fc.signal.bedgraph' %(chrom_sizes.name, peaks_dirname, prefix)]
    print pipe
    out, err = common.run_pipe(pipe)
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(fc_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" %(returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    #===========================================
    # For -log10(p-value) signal tracks
    #============================================

    # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    out, err = common.run_pipe([
        'gzip -dc %s' %(experiment.name),
        'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe([
        'gzip -dc %s' %(control.name),
        'wc -l'])
    controlReads = out.strip()
    sval=str(min(float(chipReads), float(controlReads))/1000000)
    print "chipReads = %s, controlReads = %s, sval = %s" %(chipReads, controlReads, sval)

    returncode = common.block_on(
        'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
        '-m ppois -S %s' %(sval))
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = ['slopBed -i %s/%s_ppois.bdg -g %s -b 0' %(peaks_dirname, prefix, chrom_sizes.name),
            'bedClip stdin %s %s/%s.pval.signal.bedgraph' %(chrom_sizes.name, peaks_dirname, prefix)]
    print pipe
    out, err = common.run_pipe(pipe)
    #rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(pvalue_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" %(returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    #===========================================
    # Generate bigWigs from beds to support trackhub visualization of peak files
    #============================================

    narrowPeak_bb_fname = common.bed2bb('%s' %(narrowPeak_fn), chrom_sizes.name, narrowPeak_as.name, bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' %(gappedPeak_fn), chrom_sizes.name, gappedPeak_as.name, bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' %(broadPeak_fn), chrom_sizes.name, broadPeak_as.name, bed_type='bed6+3')

    #Temporary during development to create empty files just to get the applet to exit
    # for fn in [narrowPeak_fn, gappedPeak_fn, broadPeak_fn, narrowPeak_bb_fn, gappedPeak_bb_fn, broadPeak_bb_fn, fc_signal_fn, pvalue_signal_fn]:
    #     common.block_on('touch %s' %(fn))

    # Upload the file outputs
    # NOTE(review): the bigBed uploads use the precomputed *_bb_fn names,
    # not the *_bb_fname values returned by bed2bb just above -- these two
    # must refer to the same files; confirm bed2bb writes "<peak_fn>.bb".

    narrowPeak = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn)
    broadPeak_bb = dxpy.upload_local_file(broadPeak_bb_fn)
    fc_signal = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.
    output = {
        "narrowpeaks": dxpy.dxlink(narrowPeak),
        "gappedpeaks": dxpy.dxlink(gappedPeak),
        "broadpeaks": dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb": dxpy.dxlink(broadPeak_bb),
        "fc_signal": dxpy.dxlink(fc_signal),
        "pvalue_signal": dxpy.dxlink(pvalue_signal)
    }

    return output
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):
    """Select pooled peaks replicated across true or pseudo replicates.

    A pooled peak is "replicated" when it overlaps a peak in BOTH true
    replicates, or a peak in BOTH pooled pseudoreplicates, where overlap
    means the fractional overlap relative to either member of the pair
    is >= 0.5.  Replicated and rejected peak sets are written, converted
    to bigBed for visualization, uploaded, and QC metrics (peak counts,
    FRiP, fragment length) are computed.

    Args:
        rep1_peaks, rep2_peaks: dxlinks to each true replicate's peaks.
        pooled_peaks: dxlink to peaks called on pooled replicates.
        pooledpr1_peaks, pooledpr2_peaks: dxlinks to peaks called on the
            two pooled pseudoreplicates.
        rep1_ta, rep2_ta: dxlinks to the replicate tagAlign files; these
            are pooled for the FRiP calculation.
        rep1_xcor, rep2_xcor: dxlinks to per-replicate cross-correlation
            score files (downloaded locally; not otherwise used in this
            function's visible logic).
        paired_end: passed through to the pooled cross-correlation subjob.
        chrom_sizes: dxlink to the chromosome sizes file.
        as_file: dxlink to the autosql (.as) file matching peak_type.
        peak_type: one of 'narrowPeak', 'gappedPeak', 'broadPeak'.
        prefix: output basename; if falsy it is derived from the pooled
            peaks filename by stripping peak-type/compression extensions.
        fragment_length: if given, the pooled cross-correlation subjob is
            skipped and this value is used directly.

    Returns:
        dict mapping output names to dxlinks (replicated/rejected peaks
        and their bigBeds) and QC scalars (npeaks_*, frip_*,
        fragment_length_used, fragment_length_given_by_user).

    Raises:
        AssertionError: if peak_type is unrecognized.
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent file would
    # overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # BUGFIX: use the DXFile handler's .name; the raw dxlink input
        # (pooled_peaks) has no .name attribute.
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length fron cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns
    # are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs
    # > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print(
        "%d peaks overlap with both pooled pseudoreplicates"
        % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s'
        % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output