def upload(self, uploader):
    """Upload this object's peak-call output files via *uploader* and
    keep the handles returned for each upload.

    The plain-text peak files are gzipped (``common.compress``) before
    upload; the bigBed companions are uploaded unmodified.
    """
    # Text peak lists: compress first, then upload.
    self.overlapping_peaks = uploader.upload(
        common.compress(self.overlapping_peaks_fn))
    self.rejected_peaks = uploader.upload(
        common.compress(self.rejected_peaks_fn))
    # Binary bigBed tracks: upload as-is.
    self.overlapping_peaks_bb = uploader.upload(self.overlapping_peaks_bb_fn)
    self.rejected_peaks_bb = uploader.upload(self.rejected_peaks_bb_fn)
# Sort the termID-docID pairs by termID term_doc_list.sort(key=lambda tuple: tuple[0]) combined_term_doc_list = {} for termID, docID in term_doc_list: combined_term_doc_list.setdefault(termID, []).append(docID) print >> sys.stderr, 'print posting list to disc for dir:' + dir terms = combined_term_doc_list.keys() # Since sorted() takes much longer than .sort() terms.sort() for termID in terms: # write the posting lists to block_pl for this current block combined_term_doc_list[termID].sort() numPostings = len(combined_term_doc_list[termID]) common.writePostingsList( block_dict, block_pl, termID, common.compress(combined_term_doc_list[termID]), numPostings) block_pl.close() block_dict.close() print >> sys.stderr, '######\nposting list construction finished!\n##########' print >> sys.stderr, '\nMerging postings...' while True: if len(block_q) <= 1: break b1 = block_q.popleft() b2 = block_q.popleft() print >> sys.stderr, 'merging %s and %s' % (b1, b2) b1_f = open(out_dir + '/' + b1 + '.postings', 'rb') b2_f = open(out_dir + '/' + b2 + '.postings', 'rb') b1_dict_file = open(out_dir + '/' + b1 + '.dict', 'rb')
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks, chrom_sizes, as_file, blacklist=None):
    """Produce the final IDR "conservative" and "optimal" peak sets and
    the rescue/self-consistency reproducibility verdict.

    Parameters (all file arguments are DNAnexus file links/IDs):
      experiment      -- accession/name used to build output filenames
      reps_peaks      -- IDR peaks from the true replicates
      r1pr_peaks      -- IDR peaks from rep1 self-pseudoreplicates
      r2pr_peaks      -- IDR peaks from rep2 self-pseudoreplicates
      pooledpr_peaks  -- IDR peaks from pooled pseudoreplicates
      chrom_sizes     -- chromosome sizes file (for bedToBigBed)
      as_file         -- .as autosql schema file (for bedToBigBed)
      blacklist       -- optional blacklist regions to filter out

    Returns a dict of job outputs: peak counts (Nt/N1/N2/Np), dxlinks to
    the compressed conservative/optimal sets (plus bigBeds when the
    conversion succeeds), the two ratios, and the reproducibility verdict.
    """
    #TODO for now just taking the peak files. This applet should actually call IDR instead of
    #putting that in the workflow populator script
    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)
    # Download the file inputs to the local file system.
    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    # Debug aid: show what landed in the working directory.
    print subprocess.check_output('ls -l', shell=True)
    # uncompress returns the (possibly renamed) local filename.
    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)
    # Peak counts feed the reproducibility ratios below.
    Nt = common.count_lines(reps_peaks_filename)
    print "%d peaks from true replicates" % (Nt)
    N1 = common.count_lines(r1pr_peaks_filename)
    print "%d peaks from rep1 self-pseudoreplicates" % (N1)
    N2 = common.count_lines(r2pr_peaks_filename)
    print "%d peaks from rep2 self-pseudoreplicates" % (N2)
    Np = common.count_lines(pooledpr_peaks_filename)
    print "%d peaks from pooled pseudoreplicates" % (Np)
    # The conservative set is always the true-replicate IDR peaks,
    # blacklist-filtered when a blacklist was supplied.
    conservative_set_filename = '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename, blacklist_filename)
    else:
        # No blacklist: the conservative set IS the true-replicate set.
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print "%d peaks blacklisted from the conservative set" % (Nt - Ncb)
    # The optimal set is the larger of the true-replicate and
    # pooled-pseudoreplicate IDR peak lists.
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename, blacklist_filename)
    else:
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print "%d peaks blacklisted from the optimal set" % (No - Nob)
    # Standard ENCODE IDR thresholds: both ratios > 2 -> fail,
    # exactly one > 2 -> borderline, neither -> pass.
    # NOTE(review): raises ZeroDivisionError if any peak list is empty.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'
    output = {}
    #bedtobigbed often fails, so skip creating the bb if it does
    # (bed2bb returns a falsy value on failure, per its use here)
    conservative_set_bb_filename = common.bed2bb(conservative_set_filename, chrom_sizes_filename, as_file_filename)
    optimal_set_bb_filename = common.bed2bb(optimal_set_filename, chrom_sizes_filename, as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = dxpy.upload_local_file(conservative_set_bb_filename)
        output.update({"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})
    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility
    })
    logging.info("Exiting with output: %s", output)
    return output
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor, paired_end, chrom_sizes, as_file, blacklist, rep1_signal, fragment_length=None):
    """Finalize IDR results for a simplicate (single-replicate) experiment
    using the rep1 self-pseudoreplicate peaks as the "stable" set.

    Parameters (file arguments are DNAnexus links/IDs):
      experiment      -- accession/name used to build output filenames
      r1pr_peaks      -- IDR peaks from rep1 self-pseudoreplicates
      rep1_ta         -- rep1 tagAlign reads (for FRiP)
      rep1_xcor       -- rep1 cross-correlation scores; only downloaded
                         when fragment_length is not given
      paired_end      -- unused here; kept for interface parity with
                         replicated_IDR
      chrom_sizes     -- chromosome sizes file (for bedToBigBed)
      as_file         -- .as autosql schema (for bedToBigBed)
      blacklist       -- optional blacklist regions to filter out
      rep1_signal     -- optional signal link passed through to output
      fragment_length -- optional user override of the estimated fragment
                         length

    Returns a dict of job outputs: FRiP numbers, fragment-length
    bookkeeping, N1/Ns counts, and dxlinks to the stable set (plus its
    bigBed and the pre-blacklist set when applicable).
    """
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    # NOTE: rebinds the input parameter to its DXFile handler.
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)
    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    # If fragment_length is given, override appropriate values.
    # Calculate, or set the actually used fragment length value.
    # Set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        # User override: no xcor file is needed (or downloaded).
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        # Estimate the fragment length from the cross-correlation scores.
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False
    # Debug aid: show the working directory contents.
    subprocess.check_output('set -x; ls -l', shell=True)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    # The stable set is the (optionally blacklist-filtered) r1pr peaks.
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename, blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info(
            "%d peaks blacklisted from the stable set" % (N1-Nsb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")
    # calculate FRiP (fraction of reads in peaks) over the stable set
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)
    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }
    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    r1pr_peaks_filename)))}
        )
    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename, as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update(
            {"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})
    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns": Nsb
    })
    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    return output
def replicated_IDR(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks, rep1_ta, rep1_xcor, rep2_ta, rep2_xcor, paired_end, chrom_sizes, as_file, blacklist, rep1_signal, rep2_signal, pooled_signal, fragment_length=None):
    """Finalize IDR results for a two-replicate experiment: build the
    conservative and optimal peak sets, compute reproducibility ratios,
    and compute FRiP over each peak list.

    Parameters (file arguments are DNAnexus links/IDs):
      experiment       -- accession/name used to build output filenames
      reps_peaks       -- IDR peaks from the true replicates
      r1pr_peaks/r2pr_peaks -- IDR peaks from each rep's self-pseudoreps
      pooledpr_peaks   -- IDR peaks from pooled pseudoreplicates
      rep1_ta/rep2_ta  -- per-rep tagAlign reads (also pooled on-platform)
      rep1_xcor/rep2_xcor -- per-rep cross-correlation scores
      paired_end       -- forwarded to the pooled xcor subjob
      chrom_sizes, as_file -- bedToBigBed inputs
      blacklist        -- optional blacklist regions to filter out
      rep?_signal, pooled_signal -- optional links passed through
      fragment_length  -- optional user override of estimated fragment
                          lengths (applied to rep1, rep2 and the pool)

    Returns a dict of job outputs: FRiP numbers for rep1/rep2/true/pooled
    comparisons, fragment-length bookkeeping, peak counts, dxlinks to the
    conservative/optimal sets (and bigBeds when conversion succeeds), the
    two ratios, and the reproducibility verdict.
    """
    # TODO for now just taking the peak files. This applet should actually
    # call IDR instead of putting that in the workflow populator script
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)
    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)
    # Pool the two replicates' tagAligns on-platform; the pooled reads
    # are needed for the "true" and "pooled pseudorep" FRiP comparisons.
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # next call could be on 267 and save time?
    pool_replicates_subjob.wait_on_done()
    # If fragment_length is not given, calculate the fragment_length
    # using crosscorrelation. Else use the overridevalue. Set the
    # pool_xcor_filename to None to accommodate common.frip calls.
    # Calculate, or set, actually used fragment lengths for different
    # cases. Set the flag indicating whether the fragment length
    # was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        # Cross-correlate the pooled reads to estimate the pool's
        # fragment length; per-rep estimates come from the per-rep
        # xcor files already downloaded above.
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)
    # Debug aid: show the working directory contents.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))
    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))
    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")
    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")
    # Standard ENCODE IDR thresholds: both ratios > 2 -> fail,
    # exactly one > 2 -> borderline, neither -> pass.
    # NOTE(review): raises ZeroDivisionError if any peak list is empty.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'
    # FRiP (fraction reads in peaks)
    # NOTE(review): these calls pass the raw `fragment_length` argument
    # (None unless the user supplied one) instead of the computed
    # fragment_length_used_* values used elsewhere (cf.
    # internal_pseudoreplicate_IDR). Presumably common.frip falls back to
    # the xcor file when fraglen is None — confirm before changing.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    output = {
        "rep1_frip_nreads" : rep1_n_reads,
        "rep1_frip_nreads_in_peaks" : rep1_n_reads_in_peaks,
        "F1" : rep1_frip_score,
        "rep2_frip_nreads" : rep2_n_reads,
        "rep2_frip_nreads_in_peaks" : rep2_n_reads_in_peaks,
        "F2" : rep2_frip_score,
        "true_frip_nreads" : true_n_reads,
        "true_frip_nreads_in_peaks" : true_n_reads_in_peaks,
        "Ft" : true_frip_score,
        "pr_frip_nreads" : pr_n_reads,
        "pr_frip_nreads_in_peaks" : pr_n_reads_in_peaks,
        "Fp" : pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }
    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )
    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename,
                      chrom_sizes_filename, as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename,
                      chrom_sizes_filename, as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})
    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })
    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})
    return output
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks, pooledpr2_peaks, chrom_sizes, as_file, peak_type, prefix=None, rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Naive-overlap analysis: keep pooled peaks supported by both true
    replicates OR by both pooled pseudoreplicates (>=50% fractional
    overlap with respect to either peak in the pair).

    Parameters (file arguments are DNAnexus links/IDs):
      rep1_peaks/rep2_peaks       -- per-replicate peak calls
      pooled_peaks                -- peaks called on the pooled reads
      pooledpr1_peaks/pooledpr2_peaks -- peaks from pooled pseudoreps
      chrom_sizes, as_file        -- bedToBigBed inputs
      peak_type                   -- 'narrowPeak', 'gappedPeak' or
                                     'broadPeak'; selects awk/cut/bed_type
      prefix                      -- optional output-name stem; otherwise
                                     derived from the pooled peaks name
      rep?_signal, pooled_signal  -- optional links passed through

    Returns a dict of job outputs: dxlinks to the replicated and rejected
    peak sets (compressed) plus their bigBeds, and peak counts.
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances
    # (NOTE: the input parameters are rebound to their DXFile handlers)
    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)
    #Input filenames - necessary to define each explicitly because input files could have the same name, in which case subsequent
    #file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)
    # Output filenames
    if prefix:
        basename = prefix
    else:
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks.name)  #strip off the peak and compression extensions
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name
    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'
    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)
    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)
    # Superseded simple-overlap pipeline, kept for reference as dead text.
    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''
    #the only difference between the peak_types is how the extra columns are handled
    # The awk filters keep an overlap pair when the intersection length
    # (last intersectBed -wo column) is >= 50% of either peak's length;
    # cut trims back to the original column count of that peak_type.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Always-false assert used to abort with a descriptive message.
        assert peak_type in [
            'narrowPeak',
            'gappedPeak',
            'broadPeak'
            ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
                peak_type)
    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" % (
        common.count_lines(overlap_tr_fn))
    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" % (
        common.count_lines(overlap_pr_fn))
    # Combine peak lists (union of the two overlap sets)
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
         'sort -u'], overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" % (
        common.count_lines(overlapping_peaks_fn))
    #rejected peaks: pooled peaks with no match in the replicated set
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print "%d peaks were rejected" % (common.count_lines(rejected_peaks_fn))
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)
    #make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)
    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        'npeaks_rejected': npeaks_rejected
    }
    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})
    return output
print >> sys.stderr, 'sorting term doc list for dir:' + dir term_doc_list = list(term_doc_list) # Sort the termID-docID pairs by termID term_doc_list.sort(key=lambda tuple:tuple[0]) combined_term_doc_list = {} for termID,docID in term_doc_list: combined_term_doc_list.setdefault(termID,[]).append(docID) print >> sys.stderr, 'print posting list to disc for dir:' + dir terms = combined_term_doc_list.keys() # Since sorted() takes much longer than .sort() terms.sort() for termID in terms: # write the posting lists to block_pl for this current block combined_term_doc_list[termID].sort() numPostings = len(combined_term_doc_list[termID]) common.writePostingsList( block_dict, block_pl, termID, common.compress(combined_term_doc_list[termID]), numPostings) block_pl.close() block_dict.close() print >> sys.stderr, '######\nposting list construction finished!\n##########' print >> sys.stderr, '\nMerging postings...' while True: if len(block_q) <= 1: break b1 = block_q.popleft() b2 = block_q.popleft() print >> sys.stderr, 'merging %s and %s' % (b1, b2)
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks, rep1_ta, rep1_xcor, paired_end, chrom_sizes, as_file, peak_type, prefix, fragment_length=None):
    """Naive-overlap analysis for a simplicate experiment: keep pooled
    peaks supported by both pseudoreplicates of the single replicate,
    then compute FRiP over the replicated set.

    Parameters (file arguments are DNAnexus links/IDs):
      rep1_peaks/rep2_peaks -- the two pseudoreplicate peak calls
      pooled_peaks          -- peaks called on the pooled pseudoreps
      rep1_ta               -- rep1 tagAlign reads (for FRiP)
      rep1_xcor             -- rep1 cross-correlation scores
      paired_end            -- unused here; kept for interface parity
                               with replicated_overlap
      chrom_sizes, as_file  -- bedToBigBed inputs
      peak_type             -- 'narrowPeak', 'gappedPeak' or 'broadPeak'
      prefix                -- optional output-name stem; otherwise
                               derived from the pooled peaks name
      fragment_length       -- optional user override of the estimated
                               fragment length

    Returns a dict of job outputs: dxlinks to the replicated/rejected
    peak sets and their bigBeds, peak counts, FRiP numbers, and
    fragment-length bookkeeping.

    Fix vs. previous revision: the basename fallback read
    ``pooled_peaks.name`` although ``pooled_peaks`` is the raw input link
    here (it is never rebound to a handler, unlike in ``main``), which
    raised AttributeError whenever ``prefix`` was falsy. It now uses
    ``pooled_peaks_file.name``. The unused ``overlap_pr_fn`` local was
    also removed (this simplicate path has no second overlap set).
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)
    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # (fixed: use the DXFile handler, not the raw input link)
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name
    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'
    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)
    # Debug aid: show the working directory contents.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))
    # the only difference between the peak_types is how the extra columns are
    # handled: the awk filters keep an intersectBed -wo pair when the
    # intersection length is >= 50% of either peak's length; cut trims
    # back to the original column count of that peak_type.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Always-false assert used to abort with a descriptive message.
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)
    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates" %
        (common.count_lines(overlap_tr_fn)))
    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe([
        'cat %s' % (overlap_tr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap" %
        (common.count_lines(overlapping_peaks_fn)))
    # rejected peaks: pooled peaks with no match in the replicated set
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))
    # calculate FRiP (Fraction of Reads in Peaks)
    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False
    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)
    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)
    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)
    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }
    return output
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):
    """Compute replicated (naive-overlap) peaks for a two-replicate set.

    Pooled peaks are kept if they overlap (>=50% fractional overlap with
    respect to either peak) BOTH true replicates, or BOTH pooled
    pseudoreplicates.  Also pools the replicate tagAligns via the 'pool'
    applet, derives the fragment length (from a cross-correlation subjob
    unless ``fragment_length`` is given), computes FRiP on the pooled
    tagAlign, builds bigBeds, and uploads all outputs to the platform.

    Returns a dict of dxlinks and QC metrics (peak counts, FRiP, fragment
    length used/source flag) forming the applet's output.
    """
    # Wrap the platform inputs in dxpy file handlers.
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match('(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    # Launch a subjob that pools the two replicate tagAligns; its output is
    # needed for FRiP (and for fragment-length estimation when no override).
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')

    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user.  Otherwise, run the subjob
    # to be able to extract the fragment length fron cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
            )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    # Block until the pooled tagAlign is ready, then fetch it locally.
    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # condition is always False here; assert raises with the message
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], \
            "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print(
        "%d peaks overlap with both pooled pseudoreplicates"
        % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
         'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s'
        % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None):
    """Derive final conservative/optimal IDR peak sets and QC metrics.

    Downloads the four IDR peak lists (true replicates, each replicate's
    self-pseudoreplicates, pooled pseudoreplicates), optionally applies a
    blacklist filter, picks the conservative set (true replicates) and the
    optimal set (the longer of true-replicate vs pooled-pseudoreplicate
    peaks), computes rescue and self-consistency ratios, and returns the
    reproducibility call plus dxlinks to the uploaded peak sets.
    """
    # TODO for now just taking the peak files. This applet should actually
    # call IDR instead of putting that in the workflow populator script

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Download the file inputs to the local file system.
    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    print(subprocess.check_output('ls -l', shell=True))

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    print("%d peaks from true replicates" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    print("%d peaks from rep1 self-pseudoreplicates" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    print("%d peaks from rep2 self-pseudoreplicates" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    print("%d peaks from pooled pseudoreplicates" % (Np))

    # The conservative set is always derived from the true-replicate peaks.
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
    else:
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print("%d peaks blacklisted from the conservative set" % (Nt - Ncb))

    # The optimal set comes from whichever list is longer: true-replicate
    # peaks or pooled-pseudoreplicate peaks.
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
    else:
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print("%d peaks blacklisted from the optimal set" % (No - Nob))

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    # Standard ENCODE reproducibility call: both ratios > 2 fails, one > 2
    # is borderline, otherwise pass.
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(
        conservative_set_filename, chrom_sizes_filename, as_file_filename)
    optimal_set_bb_filename = common.bed2bb(
        optimal_set_filename, chrom_sizes_filename, as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    logging.info("Exiting with output: %s", output)
    return output
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Derive final conservative/optimal IDR peak sets and QC metrics.

    Like the basic IDR applet, but additionally exposes pre-blacklist peak
    sets when a blacklist is supplied and passes optional signal files
    through to the output so tracks are available in one place.
    """
    # TODO for now just taking the peak files. This applet should actually
    # call IDR instead of putting that in the workflow populator script
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    subprocess.check_output('set -x; ls -l', shell=True)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    # Standard ENCODE reproducibility call: both ratios > 2 fails, one > 2
    # is borderline, otherwise pass.
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(
            conservative_set_filename, chrom_sizes_filename, as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(
            optimal_set_filename, chrom_sizes_filename, as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    # Use the module-level logger (was logging.info, which hits the root
    # logger and bypasses this module's logger configuration).
    logger.info("Exiting with output: %s", output)
    return output
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks,
         pooledpr2_peaks, chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Compute replicated (naive-overlap) peaks for a two-replicate set.

    Pooled peaks are kept if they overlap (>=50% fractional overlap with
    respect to either peak) BOTH true replicates, or BOTH pooled
    pseudoreplicates.  Builds bigBeds for the accepted and rejected sets,
    uploads everything, and returns dxlinks plus peak counts.  Optional
    signal files are passed through unchanged.
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances
    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent file would
    # overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name
    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    # the only difference between the peak_types is how the extra columns
    # are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'""" 
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # condition is always False here; assert raises with the message
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], \
            "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs
    # > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates"
          % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap
    # is defined as the fractional overlap wrt any one of the overlapping
    # peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates"
          % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print("%d peaks overlap with true replicates or with pooled pseudoreplicates"
          % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s'
        % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor,
                                 paired_end, chrom_sizes, as_file, blacklist,
                                 rep1_signal, fragment_length=None):
    """Produce the stable peak set for a simplicate (single-replicate) run.

    The stable set is the rep1 self-pseudoreplicate IDR peaks, optionally
    blacklist-filtered.  Computes FRiP on the rep1 tagAlign using either the
    user-supplied ``fragment_length`` or the estimate from the rep1
    cross-correlation file, builds a bigBed, uploads the outputs, and
    returns a dict of dxlinks and QC metrics.

    NOTE(review): ``paired_end`` is not referenced in this body — presumably
    kept for interface symmetry with the replicated path; confirm.
    """
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # If fragment_length is given, override appropriate values.
    # Calculate, or set the actually used fragment length value.
    # Set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        # xcor file not needed when the fragment length is user-supplied;
        # None is passed through to common.frip below.
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    # Debug aid: log the working directory contents.
    subprocess.check_output('set -x; ls -l', shell=True)

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info("%d peaks blacklisted from the stable set" % (N1 - Nsb))
    else:
        # No blacklist: the stable set is just a copy of the pseudorep peaks.
        subprocess.check_output(
            shlex.split('cp %s %s'
                        % (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
                dxpy.dxlink(
                    dxpy.upload_local_file(common.compress(
                        r1pr_peaks_filename)))})

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(
            stable_set_filename, chrom_sizes_filename, as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update({"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
def replicated_IDR(experiment, reps_peaks, r1pr_peaks, r2pr_peaks,
                   pooledpr_peaks, rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                   paired_end, chrom_sizes, as_file, blacklist,
                   rep1_signal, rep2_signal, pooled_signal,
                   fragment_length=None):
    """Assess reproducibility of IDR peaks called from two true replicates.

    Downloads the four IDR peak sets (true reps, each rep's self-
    pseudoreplicates, pooled pseudoreplicates), builds the conservative and
    optimal peak sets (optionally blacklist-filtered), computes the rescue
    and self-consistency ratios plus a pass/borderline/fail reproducibility
    verdict, and computes FRiP scores against the replicate and pooled
    tagAligns.

    Args:
        experiment: experiment accession string used to name output files.
        reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks: dxlinks to the
            IDR peak files (gzipped narrowPeak).
        rep1_ta, rep2_ta: dxlinks to the replicate tagAlign files.
        rep1_xcor, rep2_xcor: dxlinks to the cross-correlation score files.
        paired_end: passed through to the pooled cross-correlation subjob.
        chrom_sizes, as_file: dxlinks to chrom.sizes and the .as schema.
        blacklist: dxlink to a blacklist BED, or None to skip filtering.
        rep1_signal, rep2_signal, pooled_signal: optional signal dxlinks,
            passed through to the output unchanged.
        fragment_length: if given, overrides the cross-correlation fragment
            length estimate for all FRiP calculations.

    Returns:
        dict of job outputs (peak counts, ratios, FRiP stats, uploaded
        conservative/optimal sets and bigBeds, pass-through signals).
    """
    # TODO for now just taking the peak files. This applet should actually
    # call IDR instead of putting that in the workflow populator script
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # NOTE: we deliberately do NOT wait on the pool subjob here.  The xcor
    # subjob below consumes a job output *ref*, which needs no wait; the
    # explicit wait happens just before the pooled tagAlign is downloaded.
    # This matches replicated_overlap() and lets both subjobs overlap with
    # this job's own work, resolving the old "could save time?" TODO.

    # If fragment_length is not given, calculate the fragment_length
    # using crosscorrelation. Else use the override value. Set the
    # pool_xcor_filename to None to accommodate common.frip calls.
    # Calculate, or set, actually used fragment lengths for different
    # cases. Set the flag indicating whether the fragment length
    # was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
            )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info("%d peaks blacklisted from the conservative set"
                    % (Nt - Ncb))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' % (reps_peaks_filename,
                                      conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No - Nob))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' % (peaks_to_filter_filename,
                                      optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    # ENCODE reproducibility criteria: both ratios > 2 fails, one > 2 is
    # borderline, otherwise pass.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # FIX: pass the *actually used* fragment lengths computed above, not the
    # raw fragment_length argument (which is None when not user-supplied).
    # This matches internal_pseudoreplicate_IDR and replicated_overlap, and
    # makes the reported fragment_length_used_* outputs agree with the
    # values the FRiP calculations really used.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length_used_rep1)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length_used_rep2)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length_used_pool)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length_used_pool)

    output = {
        "rep1_frip_nreads": rep1_n_reads,
        "rep1_frip_nreads_in_peaks": rep1_n_reads_in_peaks,
        "F1": rep1_frip_score,
        "rep2_frip_nreads": rep2_n_reads,
        "rep2_frip_nreads_in_peaks": rep2_n_reads_in_peaks,
        "F2": rep2_frip_score,
        "true_frip_nreads": true_n_reads,
        "true_frip_nreads_in_peaks": true_n_reads_in_peaks,
        "Ft": true_frip_score,
        "pr_frip_nreads": pr_n_reads,
        "pr_frip_nreads_in_peaks": pr_n_reads_in_peaks,
        "Fp": pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(
                    dxpy.upload_local_file(
                        common.compress(reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(
                    dxpy.upload_local_file(
                        common.compress(peaks_to_filter_filename)))
        })

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set":
            dxpy.dxlink(
                dxpy.upload_local_file(
                    common.compress(conservative_set_filename))),
        "optimal_set":
            dxpy.dxlink(
                dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor, paired_end,
                                     chrom_sizes, as_file, peak_type, prefix,
                                     fragment_length=None):
    """Overlap analysis for a simplicate experiment's internal pseudoreps.

    Finds pooled peaks that overlap BOTH pseudoreplicate peak sets (>=50%
    fractional overlap with respect to either peak of a pair), computes
    FRiP against the single replicate's tagAlign, and uploads the
    replicated/rejected peak sets plus bigBed versions.

    Args:
        rep1_peaks, rep2_peaks: dxlinks to the two pseudoreplicate peak
            files.
        pooled_peaks: dxlink to the pooled-peaks file.
        rep1_ta: dxlink to the replicate tagAlign used for FRiP.
        rep1_xcor: dxlink to the cross-correlation scores file.
        paired_end: unused here; kept for signature parity with
            replicated_overlap (TODO confirm).
        chrom_sizes, as_file: dxlinks to chrom.sizes and the .as schema.
        peak_type: 'narrowPeak', 'gappedPeak' or 'broadPeak'.
        prefix: basename override for output files; if falsy, derived from
            the pooled peaks filename.
        fragment_length: user override for the xcor fragment length.

    Returns:
        dict of job outputs (uploaded peak files, counts, FRiP stats).
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match('(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name
    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    # NOTE(review): overlap_pr_fn is defined but never used in this
    # simplicate path — only overlap_tr_fn feeds the final set.
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled.  The awk program keeps an intersectBed -wo pair when the
    # overlap (last column) covers >= 50% of either peak's length; cut then
    # trims back to the original peak columns of the first (-a) file.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # always-false assert: reached only for an unrecognized peak_type
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn), awk_command,
        cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe(['cat %s' % (overlap_tr_fn), 'sort -u'],
                               overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap in the replicated set
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn,
                                             overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn,
        rep1_xcor_fn,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # The output structure of the applet: uploaded files as dxlinks plus
    # scalar counts and FRiP statistics.
    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks,
                       pooledpr2_peaks, rep1_ta, rep1_xcor, rep2_ta,
                       rep2_xcor, paired_end, chrom_sizes, as_file, peak_type,
                       prefix, fragment_length=None):
    """Naive-overlap analysis for a replicated (two-rep) experiment.

    A pooled peak is kept if it overlaps (>=50% fractional overlap with
    respect to either member of the pair) both true replicates' peaks OR
    both pooled-pseudoreplicate peak sets.  Pools the replicate tagAligns
    via the 'pool' applet, optionally runs a pooled cross-correlation
    subjob to estimate fragment length, computes FRiP on the pooled
    tagAlign, and uploads the replicated/rejected peak sets plus bigBeds.

    Args:
        rep1_peaks, rep2_peaks: dxlinks to the true-replicate peak files.
        pooled_peaks: dxlink to peaks called on the pooled tagAlign.
        pooledpr1_peaks, pooledpr2_peaks: dxlinks to peaks called on the
            two pooled pseudoreplicates.
        rep1_ta, rep2_ta: dxlinks to replicate tagAligns (pooled here).
        rep1_xcor, rep2_xcor: dxlinks to cross-correlation score files.
        paired_end: forwarded to the pooled cross-correlation subjob.
        chrom_sizes, as_file: dxlinks to chrom.sizes and the .as schema.
        peak_type: 'narrowPeak', 'gappedPeak' or 'broadPeak'.
        prefix: basename override for output files; if falsy, derived from
            the pooled peaks filename.
        fragment_length: user override for the xcor fragment length.

    Returns:
        dict of job outputs (uploaded peak files, counts, FRiP stats).
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name
    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    # Launch the pooling subjob; we only wait on it later, right before the
    # pooled tagAlign is actually needed.
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')

    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length fron cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled.  The awk program keeps an intersectBed -wo pair when the
    # overlap (last column) covers >= 50% of either peak's length; cut then
    # trims back to the original peak columns of the first (-a) file.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # always-false assert: reached only for an unrecognized peak_type
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates" %
        (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print(
        "%d peaks overlap with both pooled pseudoreplicates" %
        (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap in the replicated set
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks) on the pooled tagAlign
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # The output structure of the applet: uploaded files as dxlinks plus
    # scalar counts and FRiP statistics.
    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output