def __init__(self, ref_prefix, max_n):
    self.max_n = max_n  # use up to max_n gram
    # reference files
    self.files = self.get_ref_files(ref_prefix)
    # number of references per file
    self.nref = count_lines(self.files[0])
    for filename in self.files:
        n = count_lines(filename)
        assert self.nref == n, '%s has %s lines' % (filename, n)
    # counters for ngrams
    self.counters = [RefCounter(max_n) for i in range(self.nref)]
    self.load()
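# Every snippet in this section leans on a count_lines() helper that is not
# shown here. A minimal sketch of what such a helper might look like,
# assuming it simply counts newline-delimited records in a text file (the
# real helper may instead shell out to `wc -l` or handle gzip):

def count_lines(filename):
    """Return the number of lines in a text file."""
    n = 0
    with open(filename) as f:
        for _ in f:
            n += 1
    return n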
def __init__(self, ffilename, efilename, afilename, outputdir, alpha,
             threshold, length_factor=False, lexical_weighter=None,
             maximize_derivation=False):
    self.ffilename = ffilename
    self.efilename = efilename
    self.afilename = afilename
    self.outputdir = outputdir
    self.alpha = alpha
    self.threshold = threshold
    self.length_factor = length_factor
    self.lexical_weighter = lexical_weighter
    self.maximize_derivation = maximize_derivation
    self.counter = RuleCounter()
    self.corpus_size = count_lines(ffilename)
    system('rm -rf %s' % outputdir)
    system('mkdir %s' % outputdir)
def read_table(json_table_path):
    with open(json_table_path) as table_file:
        tables = {}
        for line in tqdm(table_file, total=count_lines(json_table_path)):
            d = json.loads(line)
            tables[d['id']] = d
    return tables
def __init__(self, total=-1, input='', file=stdout):
    assert total == -1 or input == '', \
        "user should specify either 'total' or 'input'"
    if total != -1:
        self.total = total
    elif input:
        self.total = count_lines(input)
    else:
        assert False, "please specify either 'total' or 'input'"
    self.percent = 0
    self.file = file
def upload(self, uploader):
    # Information about called peaks
    n_spp_peaks = common.count_lines(self.peaks_fn)
    print "%s peaks called by spp" % n_spp_peaks
    print "%s of those peaks removed due to bad coordinates" \
        % (n_spp_peaks - common.count_lines(self.fixed_peaks_fn))
    print "First 50 peaks"
    print subprocess.check_output('head -50 %s' % self.fixed_peaks_fn,
                                  shell=True, stderr=subprocess.STDOUT)

    # Upload bigBed if applicable
    if self.bigbed:
        self.peaks_bb_fn = common.bed2bb(self.fixed_peaks_fn,
                                         self.chrom_sizes.name,
                                         self.as_file.name)
        if self.peaks_bb_fn:
            self.peaks_bb = uploader.upload(self.peaks_bb_fn)

    if not filecmp.cmp(self.peaks_fn, self.fixed_peaks_fn):
        print "Returning peaks with fixed coordinates"

    # Upload peaks
    print subprocess.check_output(shlex.split("gzip %s" % self.fixed_peaks_fn))
    self.peaks = uploader.upload(self.fixed_peaks_fn + ".gz")

    # Upload cross-correlations
    self.xcor_plot = uploader.upload(self.xcor_plot)
    self.xcor_scores = uploader.upload(self.xcor_scores)
def rescale_scores(fn, scores_col, new_min=10, new_max=1000):
    n_peaks = common.count_lines(fn)
    sorted_fn = 'sorted-%s' % (fn)
    rescaled_fn = 'rescaled-%s' % (fn)

    out, err = common.run_pipe([
        'sort -k %dgr,%dgr %s' % (scores_col, scores_col, fn),
        r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""],
        sorted_fn)

    out, err = common.run_pipe(
        ['head -n 1 %s' % (sorted_fn), 'cut -f %s' % (scores_col)])
    max_score = float(out.strip())
    out, err = common.run_pipe(
        ['tail -n 1 %s' % (sorted_fn), 'cut -f %s' % (scores_col)])
    min_score = float(out.strip())

    out, err = common.run_pipe([
        'cat %s' % (sorted_fn),
        r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}"""
        % (scores_col, min_score, max_score, new_min, new_max) +
        r"""{$%d=int(((n-a)*(y-x)/(b-a))+x) ; print $0}'""" % (scores_col)],
        rescaled_fn)

    return rescaled_fn
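# rescale_scores() maps the score column linearly from its observed range
# [min_score, max_score] onto [new_min, new_max] inside awk. The same mapping
# in plain Python, as a sanity check of the formula
# int(((n - a) * (y - x) / (b - a)) + x):

def rescale_one(n, a, b, x=10, y=1000):
    """Linearly map score n from [a, b] onto [x, y] and truncate to int."""
    return int(((n - a) * (y - x) / float(b - a)) + x)

# e.g. rescale_one(5.0, a=0.0, b=10.0) -> 505, the midpoint of [10, 1000]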
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor, paired_end,
                                     chrom_sizes, as_file, peak_type, prefix,
                                     fragment_length=None):

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case a subsequent file would
    # overwrite a previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns
    # are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], \
            "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates"
          % (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe(['cat %s' % (overlap_tr_fn), 'sort -u'],
                               overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s'
        % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file, or use the user-defined fragment_length
    # if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
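# The awk filters above keep an intersectBed -wo pair when the reported
# intersection covers at least half of either peak. The same predicate in
# plain Python, assuming half-open [start, end) intervals as in BED; this is
# an illustrative sketch, not part of the pipeline:

def passes_overlap(start_a, end_a, start_b, end_b, min_frac=0.5):
    """True if the intersection covers >= min_frac of either interval."""
    overlap = min(end_a, end_b) - max(start_a, start_b)
    if overlap <= 0:
        return False
    return (overlap / float(end_a - start_a) >= min_frac or
            overlap / float(end_b - start_b) >= min_frac)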
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case a subsequent file would
    # overwrite a previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')

    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length from cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
            )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns
    # are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], \
            "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates"
          % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates"
          % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s'
        % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
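# common.frip() returns (n_reads, n_reads_in_peaks, frip_score). The score
# itself is just the fraction of tags that fall inside the peak set; a
# minimal conceptual sketch of that final ratio (the real helper also
# handles tag extension by the fragment length via bedtools):

def frip_fraction(n_reads, n_reads_in_peaks):
    """FRiP = fraction of reads (tags) falling in peaks."""
    return float(n_reads_in_peaks) / n_reads if n_reads else 0.0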
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, spp_version, as_file=None, prefix=None,
         fragment_length=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = \
            experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(
        subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT))

    # If fragment_length is provided, use that. Else read the fragment length
    # from the third column of the cross-correlation scores input file.
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % (fraglen))
    else:
        fraglen_column = 3
        with open(xcor_scores_input_filename, 'r') as f:
            line = f.readline()
            fraglen = line.split('\t')[fraglen_column - 1]
            logger.info("Read fragment length: %s" % (fraglen))

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    run_spp = '/phantompeakqualtools/run_spp.R'
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%s -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename,
           npeaks, fraglen, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates is an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coordinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the (($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly chrM) and will cause slopBed
    # to halt at that line, truncating the output of the pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries
    # of the chromosome
    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (chrom_sizes_filename,
                                 fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates"
        % (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    subprocess.check_output('head -50 %s' % (fix_coordinate_peaks_filename),
                            shell=True)

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename,
                          chrom_sizes_filename,
                          as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(
            shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
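# Why the sprintf("%i", ...) pass above is needed: R writes coordinates that
# are exact multiples of 10 in scientific notation (e.g. 1e+06), which BED
# consumers reject; the awk also clamps negative start coordinates to 0.
# A quick illustration of the same repair in plain Python (sketch only):

def fix_coord(field):
    """Render a possibly scientific-notation coordinate as a plain integer,
    clamping negative values (e.g. chrM starts) to 0."""
    value = int(float(field))
    return max(value, 0)

# fix_coord('1e+06') -> 1000000; fix_coord('-75') -> 0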
def read_and_write_query(query_question_path, tables, question_output_path,
                         sql_output_path, table_path, valid_file_counter,
                         debug=False, do_append=False):
    sql_parser = SQLParser()
    if do_append:
        sql_writer = open(sql_output_path, 'a')
        question_writer = open(question_output_path, 'a')
        table_writer = open(table_path, 'a')
    else:
        sql_writer = open(sql_output_path, 'w')
        question_writer = open(question_output_path, 'w')
        table_writer = open(table_path, 'w')
    num_of_unicode_error = 0
    num_of_non_parsable_error = 0
    with open(query_question_path) as qq_file:
        queries = []
        questions = []
        counter = 0
        for line in tqdm(qq_file, total=count_lines(query_question_path)):
            data = json.loads(line)
            question = data['question']
            table_id = data['table_id']
            table = tables[table_id]
            column_names = table["header"]
            # print(column_names)
            sql = data['sql']
            select_col = table["header"][int(sql["sel"])]
            agg = agg_ops[int(sql["agg"])]
            conditions = sql["conds"]
            use_column_name = True
            query = Query(int(sql["sel"]), int(sql["agg"]), column_names,
                          use_column_name, conditions)
            # print("select col: " + select_col)
            # print("agg: " + agg)
            # print(question)
            # print(sql)
            # print(col_names)
            hasError = False
            try:
                sql_query = query.__repr__()
                col_names = " COL_END COL_START ".join(
                    str(x) for x in column_names)
            except:
                if debug:
                    print("ERROR in line unicode" + str(counter))
                hasError = True
                num_of_unicode_error += 1
            if not hasError:
                try:
                    # new_query, orig_table_name = fix_table_name(query)
                    parse_tree, rule_list = sql_parser.parse(sql_query,
                                                             get_rules=True)
                    sql_writer.write(sql_query + "\n")
                    question_writer.write(question + " COL_START " +
                                          col_names + " COL_END\n")
                    valid_file_counter += 1
                except:
                    if debug:
                        print("ERROR in line " + str(counter) + " :" +
                              str(sql_query))
                    num_of_non_parsable_error += 1
            counter += 1
            # if counter == 10:
            #     break
    print("Unicode error: " + str(num_of_unicode_error))
    print("Nonparsable error: " + str(num_of_non_parsable_error))
    return valid_file_counter
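# The snippet above indexes into agg_ops, which is not defined here. Assuming
# the data follows the WikiSQL convention (this corpus reads WikiSQL-style
# records with "sel", "agg", and "conds" fields), the aggregation and
# comparison vocabularies would be:

agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG']
cond_ops = ['=', '>', '<', 'OP']  # operators referenced by "conds" entries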
def main(rep1_ta, ctl1_ta, rep1_xcor, rep1_paired_end, npeaks, nodups,
         chrom_sizes, spp_version, rep2_ta=None, ctl2_ta=None, rep2_xcor=None,
         rep2_paired_end=None, as_file=None, idr_peaks=False,
         fragment_length=None, spp_instance=None):

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    rep1_ta_filename = rep1_ta_file.name
    ntags_rep1 = common.count_lines(rep1_ta_filename)

    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info(
            "No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info(
            "Rep1 and rep2 tags specified so processing as a replicated experiment.")

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = dxpy.DXFile(rep2_ta)
        dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
        rep2_ta_filename = rep2_ta_file.name
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end

    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    ctl1_ta_filename = ctl1_ta_file.name

    if not unary_control:
        ctl2_ta_file = dxpy.DXFile(ctl2_ta)
        dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
        ctl2_ta_filename = ctl2_ta_file.name
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file.name

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)

    rep1_control = ctl1_ta  # default. May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default. May be changed later.
    rep2_ctl_msg = "control rep2"

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend([(ntags_ctl1, 'control 1', ctl1_ta_filename),
                     (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    if not simplicate_experiment:
        pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
        pool_replicates_subjob = \
            pool_applet.run(
                {"inputs": [rep1_ta, rep2_ta],
                 "prefix": 'pooled_reps'},
                name='Pool replicates')
        pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pooled_replicates,
                paired_end,
                spp_version,
                name='Pool cross-correlation')

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info(
                "Using one control for both replicate 1 and 2 and for the pool.")
        rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = pool_applet.run(
            {"inputs": [ctl1_ta, ctl2_ta],
             "prefix": "PL_ctls"},
            name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of
        # rep to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. Using pooled controls."
                % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta  # default. May be changed later.
                rep2_control = ctl2_ta  # default. May be changed later.
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    common_args = {
        'chrom_sizes': chrom_sizes,
        'spp_version': spp_version,
        'as_file': as_file,
        'spp_instance': spp_instance
    }
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})

    rep1_peaks_subjob = spp(rep1_ta,
                            rep1_control,
                            rep1_xcor,
                            bigbed=True,
                            name='Rep1 peaks vs %s' % (rep1_ctl_msg),
                            prefix='R1',
                            **common_args)

    if not simplicate_experiment:
        rep2_peaks_subjob = spp(rep2_ta,
                                rep2_control,
                                rep2_xcor,
                                bigbed=True,
                                name='Rep2 peaks vs %s' % (rep2_ctl_msg),
                                prefix='R2',
                                **common_args)

        pooled_peaks_subjob = spp(
            pooled_replicates,
            control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            bigbed=True,
            name='Pooled peaks vs %s' % (pool_ctl_msg),
            prefix='PL',
            **common_args)

    output = {
        'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"),
        'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"),
        'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"),
        'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores")
    }

    if not simplicate_experiment:
        output.update({
            'rep2_peaks': rep2_peaks_subjob.get_output_ref("peaks"),
            'rep2_peaks_bb': rep2_peaks_subjob.get_output_ref("peaks_bb"),
            'rep2_xcor_plot': rep2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"),
            'pooled_peaks': pooled_peaks_subjob.get_output_ref("peaks"),
            'pooled_peaks_bb': pooled_peaks_subjob.get_output_ref("peaks_bb"),
            'pooled_xcor_plot': pooled_peaks_subjob.get_output_ref("xcor_plot"),
            'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores")
        })

    if idr_peaks:
        # also call peaks on pseudoreplicates for IDR
        pseudoreplicator_applet = \
            dxpy.find_one_data_object(
                classname='applet',
                name='pseudoreplicator',
                project=dxpy.PROJECT_CONTEXT_ID,
                zero_ok=False,
                more_ok=False,
                return_handler=True)

        rep1_pr_subjob = \
            pseudoreplicator_applet.run(
                {"input_tags": rep1_ta,
                 "prefix": 'R1PR'},
                name='Pseudoreplicate rep1 -> R1PR1,2')

        rep1pr1_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep1_control,
            rep1_xcor,
            bigbed=False,
            name='R1PR1 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR1',
            **common_args)

        rep1pr2_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep1_control,
            rep1_xcor,
            bigbed=False,
            name='R1PR2 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR2',
            **common_args)

        output.update({
            'rep1pr1_peaks': rep1pr1_peaks_subjob.get_output_ref("peaks"),
            'rep1pr2_peaks': rep1pr2_peaks_subjob.get_output_ref("peaks")
        })

        if not simplicate_experiment:
            rep2_pr_subjob = \
                pseudoreplicator_applet.run(
                    {"input_tags": rep2_ta,
                     "prefix": 'R2PR'},
                    name='Pseudoreplicate rep2 -> R2PR1,2')

            pool_pr1_subjob = pool_applet.run(
                {"inputs": [
                    rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
                 "prefix": 'PPR1'},
                name='Pool R1PR1+R2PR1 -> PPR1')

            pool_pr2_subjob = pool_applet.run(
                {"inputs": [
                    rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
                 "prefix": 'PPR2'},
                name='Pool R1PR2+R2PR2 -> PPR2')

            rep2pr1_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep2_control,
                rep2_xcor,
                bigbed=False,
                name='R2PR1 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR1',
                **common_args)

            rep2pr2_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep2_control,
                rep2_xcor,
                bigbed=False,
                name='R2PR2 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR2',
                **common_args)

            pooledpr1_peaks_subjob = spp(
                pool_pr1_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False,
                name='PPR1 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR1',
                **common_args)

            pooledpr2_peaks_subjob = spp(
                pool_pr2_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False,
                name='PPR2 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR2',
                **common_args)

            output.update({
                'rep2pr1_peaks': rep2pr1_peaks_subjob.get_output_ref("peaks"),
                'rep2pr2_peaks': rep2pr2_peaks_subjob.get_output_ref("peaks"),
                'pooledpr1_peaks': pooledpr1_peaks_subjob.get_output_ref("peaks"),
                'pooledpr2_peaks': pooledpr2_peaks_subjob.get_output_ref("peaks"),
            })

    return output
def main(rep1_ta, ctl1_ta, rep1_xcor, rep1_paired_end, chrom_sizes,
         genomesize, narrowpeak_as, gappedpeak_as, broadpeak_as, rep2_ta=None,
         ctl2_ta=None, rep2_xcor=None, rep2_paired_end=None,
         fragment_length=None):

    rep1_ta_file = rep1_ta
    rep1_ta_filename = rep1_ta_file
    ntags_rep1 = common.count_lines(rep1_ta_filename)
    #
    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info(
            "No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info(
            "Rep1 and rep2 tags specified so processing as a replicated experiment.")
    #
    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = rep2_ta
        rep2_ta_filename = rep2_ta_file
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end
    #
    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = ctl1_ta
    ctl1_ta_filename = ctl1_ta_file
    #
    if not unary_control:
        ctl2_ta_file = ctl2_ta
        ctl2_ta_filename = ctl2_ta_file
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file
    #
    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)
    rep1_control = ctl1_ta  # default. May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default. May be changed later.
    rep2_ctl_msg = "control rep2"
    #
    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend([(ntags_ctl1, 'control 1', ctl1_ta_filename),
                     (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))
    #
    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)
    #
    if not simplicate_experiment:
        # Pool replicates
        pool_replicates_subjob = pool(inputs=[rep1_ta, rep2_ta],
                                      prefix='pooled_reps')
        pooled_replicates = pool_replicates_subjob.get("pooled")
        # Pool cross-correlation
        pooled_replicates_xcor_subjob = xcor_only(pooled_replicates,
                                                  paired_end)
    #
    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info(
                "Using one control for both replicate 1 and 2 and for the pool.")
        rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        # Pool controls
        pool_controls_subjob = pool(inputs=[ctl1_ta, ctl2_ta],
                                    prefix="PL_ctls")
        pooled_controls = pool_controls_subjob.get("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of
        # rep to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. Using pooled controls."
                % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta  # default. May be changed later.
                rep2_control = ctl2_ta  # default. May be changed later.
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"
    #
    rep1_pr_subjob = pseudoreplicator(input_tags=rep1_ta)
    if not simplicate_experiment:
        rep2_pr_subjob = pseudoreplicator(input_tags=rep2_ta)
        #
        pool_pr1_subjob = pool(inputs=[
            rep1_pr_subjob.get("pseudoreplicate1"),
            rep2_pr_subjob.get("pseudoreplicate1")],
            prefix='PPR1')
        pool_pr2_subjob = pool(inputs=[
            rep1_pr_subjob.get("pseudoreplicate2"),
            rep2_pr_subjob.get("pseudoreplicate2")],
            prefix='PPR2')
    #
    common_args = {
        'chrom_sizes': chrom_sizes,
        'genomesize': genomesize,
        'narrowpeak_as': narrowpeak_as,
        'gappedpeak_as': gappedpeak_as,
        'broadpeak_as': broadpeak_as
    }
    # if the fragment_length argument is given, update macs2 input
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})
    # macs2(experiment, control, xcor_scores_input, chrom_sizes,
    #       narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize,
    #       prefix=None, fragment_length=None)
    common_args.update({'prefix': 'r1'})
    rep1_peaks_subjob = macs2(rep1_ta, rep1_control, rep1_xcor, **common_args)
    #
    common_args.update({'prefix': 'r1pr1'})
    rep1pr1_peaks_subjob = macs2(rep1_pr_subjob.get("pseudoreplicate1"),
                                 rep1_control, rep1_xcor, **common_args)
    #
    common_args.update({'prefix': 'r1pr2'})
    rep1pr2_peaks_subjob = macs2(rep1_pr_subjob.get("pseudoreplicate2"),
                                 rep1_control, rep1_xcor, **common_args)
    #
    if not simplicate_experiment:
        common_args.update({'prefix': 'r2'})
        rep2_peaks_subjob = macs2(rep2_ta, rep2_control, rep2_xcor,
                                  **common_args)
        #
        common_args.update({'prefix': 'r2pr1'})
        rep2pr1_peaks_subjob = macs2(rep2_pr_subjob.get("pseudoreplicate1"),
                                     rep2_control, rep2_xcor, **common_args)
        #
        common_args.update({'prefix': 'r2pr2'})
        rep2pr2_peaks_subjob = macs2(rep2_pr_subjob.get("pseudoreplicate2"),
                                     rep2_control, rep2_xcor, **common_args)
        #
        common_args.update({'prefix': 'pool'})
        pooled_peaks_subjob = macs2(
            pooled_replicates, control_for_pool,
            pooled_replicates_xcor_subjob.get("CC_scores_file"),
            **common_args)
        #
        common_args.update({'prefix': 'ppr1'})
        pooledpr1_peaks_subjob = macs2(
            pool_pr1_subjob.get("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get("CC_scores_file"),
            **common_args)
        #
        common_args.update({'prefix': 'ppr2'})
        pooledpr2_peaks_subjob = macs2(
            pool_pr2_subjob.get("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get("CC_scores_file"),
            **common_args)
    #
    output = {
        'rep1_narrowpeaks': rep1_peaks_subjob.get("narrowpeaks"),
        'rep1_gappedpeaks': rep1_peaks_subjob.get("gappedpeaks"),
        'rep1_broadpeaks': rep1_peaks_subjob.get("broadpeaks"),
        'rep1_narrowpeaks_bb': rep1_peaks_subjob.get("narrowpeaks_bb"),
        'rep1_gappedpeaks_bb': rep1_peaks_subjob.get("gappedpeaks_bb"),
        'rep1_broadpeaks_bb': rep1_peaks_subjob.get("broadpeaks_bb"),
        'rep1_fc_signal': rep1_peaks_subjob.get("fc_signal"),
        'rep1_pvalue_signal': rep1_peaks_subjob.get("pvalue_signal"),
        #
        'rep1pr1_narrowpeaks': rep1pr1_peaks_subjob.get("narrowpeaks"),
        'rep1pr1_gappedpeaks': rep1pr1_peaks_subjob.get("gappedpeaks"),
        'rep1pr1_broadpeaks': rep1pr1_peaks_subjob.get("broadpeaks"),
        'rep1pr1_fc_signal': rep1pr1_peaks_subjob.get("fc_signal"),
        'rep1pr1_pvalue_signal': rep1pr1_peaks_subjob.get("pvalue_signal"),
        #
        'rep1pr2_narrowpeaks': rep1pr2_peaks_subjob.get("narrowpeaks"),
        'rep1pr2_gappedpeaks': rep1pr2_peaks_subjob.get("gappedpeaks"),
        'rep1pr2_broadpeaks': rep1pr2_peaks_subjob.get("broadpeaks"),
        'rep1pr2_fc_signal': rep1pr2_peaks_subjob.get("fc_signal"),
        'rep1pr2_pvalue_signal': rep1pr2_peaks_subjob.get("pvalue_signal")
    }
    #
    if not simplicate_experiment:
        output.update({
            'rep2_narrowpeaks': rep2_peaks_subjob.get("narrowpeaks"),
            'rep2_gappedpeaks': rep2_peaks_subjob.get("gappedpeaks"),
            'rep2_broadpeaks': rep2_peaks_subjob.get("broadpeaks"),
            'rep2_narrowpeaks_bb': rep2_peaks_subjob.get("narrowpeaks_bb"),
            'rep2_gappedpeaks_bb': rep2_peaks_subjob.get("gappedpeaks_bb"),
            'rep2_broadpeaks_bb': rep2_peaks_subjob.get("broadpeaks_bb"),
            'rep2_fc_signal': rep2_peaks_subjob.get("fc_signal"),
            'rep2_pvalue_signal': rep2_peaks_subjob.get("pvalue_signal"),
            #
            'rep2pr1_narrowpeaks': rep2pr1_peaks_subjob.get("narrowpeaks"),
            'rep2pr1_gappedpeaks': rep2pr1_peaks_subjob.get("gappedpeaks"),
            'rep2pr1_broadpeaks': rep2pr1_peaks_subjob.get("broadpeaks"),
            'rep2pr1_fc_signal': rep2pr1_peaks_subjob.get("fc_signal"),
            'rep2pr1_pvalue_signal': rep2pr1_peaks_subjob.get("pvalue_signal"),
            #
            'rep2pr2_narrowpeaks': rep2pr2_peaks_subjob.get("narrowpeaks"),
            'rep2pr2_gappedpeaks': rep2pr2_peaks_subjob.get("gappedpeaks"),
            'rep2pr2_broadpeaks': rep2pr2_peaks_subjob.get("broadpeaks"),
            'rep2pr2_fc_signal': rep2pr2_peaks_subjob.get("fc_signal"),
            'rep2pr2_pvalue_signal': rep2pr2_peaks_subjob.get("pvalue_signal"),
            #
            'pooled_narrowpeaks': pooled_peaks_subjob.get("narrowpeaks"),
            'pooled_gappedpeaks': pooled_peaks_subjob.get("gappedpeaks"),
            'pooled_broadpeaks': pooled_peaks_subjob.get("broadpeaks"),
            'pooled_narrowpeaks_bb': pooled_peaks_subjob.get("narrowpeaks_bb"),
            'pooled_gappedpeaks_bb': pooled_peaks_subjob.get("gappedpeaks_bb"),
            'pooled_broadpeaks_bb': pooled_peaks_subjob.get("broadpeaks_bb"),
            'pooled_fc_signal': pooled_peaks_subjob.get("fc_signal"),
            'pooled_pvalue_signal': pooled_peaks_subjob.get("pvalue_signal"),
            #
            'pooledpr1_narrowpeaks': pooledpr1_peaks_subjob.get("narrowpeaks"),
            'pooledpr1_gappedpeaks': pooledpr1_peaks_subjob.get("gappedpeaks"),
            'pooledpr1_broadpeaks': pooledpr1_peaks_subjob.get("broadpeaks"),
            'pooledpr1_fc_signal': pooledpr1_peaks_subjob.get("fc_signal"),
            'pooledpr1_pvalue_signal': pooledpr1_peaks_subjob.get("pvalue_signal"),
            #
            'pooledpr2_narrowpeaks': pooledpr2_peaks_subjob.get("narrowpeaks"),
            'pooledpr2_gappedpeaks': pooledpr2_peaks_subjob.get("gappedpeaks"),
            'pooledpr2_broadpeaks': pooledpr2_peaks_subjob.get("broadpeaks"),
            'pooledpr2_fc_signal': pooledpr2_peaks_subjob.get("fc_signal"),
            'pooledpr2_pvalue_signal': pooledpr2_peaks_subjob.get("pvalue_signal")
        })

    peaks_dirname = '%s_%s_peaks_macs' % (
        rep1_ta_filename.split("/")[-1].split(".")[0],
        ctl1_ta_filename.split("/")[-1].split(".")[0])
    prefix = rep1_ta_filename.split("/")[-1]
    peak_file = "%s/%s.peaksfile" % (peaks_dirname, prefix)
    with open(peak_file, "w") as fh:
        for key, val in output.items():
            if isinstance(val, list):
                fh.write(": ".join([key, ", ".join(val)]) + "\n")
            else:
                fh.write(": ".join([key, str(val)]) + "\n")

    return output
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor,
                                 paired_end, chrom_sizes, as_file, blacklist,
                                 rep1_signal, fragment_length=None):

    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # If fragment_length is given, override appropriate values.
    # Calculate, or set, the actually used fragment length value.
    # Set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    subprocess.check_output('set -x; ls -l', shell=True)

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))

    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info("%d peaks blacklisted from the stable set" % (N1 - Nsb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set": dxpy.dxlink(dxpy.upload_local_file(
                common.compress(r1pr_peaks_filename)))})

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update({"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(dxpy.upload_local_file(
            common.compress(stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and
    # tracks are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, as_file=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    output_filename_prefix = \
        experiment_filename.rstrip(".gz").rstrip(".tagAlign")
    peaks_filename = output_filename_prefix + ".regionPeak"
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + ".gz"
    xcor_plot_filename = output_filename_prefix + ".pdf"
    xcor_scores_filename = output_filename_prefix + ".ccscores"

    print subprocess.check_output("ls -l", shell=True,
                                  stderr=subprocess.STDOUT)

    # third column in the cross-correlation scores input file
    fraglen_column = 3
    with open(xcor_scores_input_filename, "r") as f:
        line = f.readline()
        fragment_length = int(line.split("\t")[fraglen_column - 1])
        print "Read fragment length: %d" % (fragment_length)

    # run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = "/phantompeakqualtools/spp_1.10.1.tar.gz"
    if nodups:
        run_spp = "/phantompeakqualtools/run_spp_nodups.R"
    else:
        run_spp = "/phantompeakqualtools/run_spp.R"
    # install spp
    print subprocess.check_output("ls -l", shell=True,
                                  stderr=subprocess.STDOUT)
    print subprocess.check_output(
        shlex.split("R CMD INSTALL %s" % (spp_tarball)),
        stderr=subprocess.STDOUT)

    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename,
           npeaks, fragment_length, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    print spp_command
    process = subprocess.Popen(shlex.split(spp_command),
                               stderr=subprocess.STDOUT,
                               stdout=subprocess.PIPE)
    for line in iter(process.stdout.readline, ""):
        sys.stdout.write(line)

    # when one of the peak coordinates is an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coordinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries
    # of the chromosome
    fix_coordinate_peaks_filename = \
        output_filename_prefix + ".fixcoord.regionPeak"
    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        "slopBed -i stdin -g %s -b 0" % (chrom_sizes_filename),
        "bedClip stdin %s %s" % (chrom_sizes_filename,
                                 fix_coordinate_peaks_filename),
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned
    # from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" % (n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" % (
        n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename))
    print "First 50 peaks"
    print subprocess.check_output("head -50 %s"
                                  % (fix_coordinate_peaks_filename),
                                  shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename,
                                          chrom_sizes_filename,
                                          as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split(
            "gzip %s" % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + ".gz"

    print subprocess.check_output("ls -l", shell=True,
                                  stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks,
         pooledpr2_peaks, chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances
    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case a subsequent file would
    # overwrite a previous one
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name
    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    '''
    # find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' % (rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" % (common.count_lines(overlap_tr_fn))

    # pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' % (pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" % (common.count_lines(overlap_pr_fn))

    # combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' % (pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' % (pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudoreplicates" % (common.count_lines(overlapping_peaks_fn))
    '''

    # the only difference between the peak_types is how the extra columns
    # are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], \
            "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined
    # as the fractional overlap with respect to either of the overlapping
    # peak pairs being >= 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" % (
        common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, using the
    # same fractional-overlap criterion
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" % (
        common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudoreplicates" % (
        common.count_lines(overlapping_peaks_fn))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print "%d peaks were rejected" % (common.count_lines(rejected_peaks_fn))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # Build the output, using the same names as the applet's output fields.
    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected
    }

    # These are just passed through for convenience so that signals and
    # tracks are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
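# A minimal pure-Python sketch (not part of the applet) of the
# fractional-overlap test that the awk filters above apply to each
# intersectBed -wo line: a peak pair counts as overlapping when the overlap
# covers at least half of either peak.  The helper name is hypothetical;
# peaks are (start, end) intervals and 'overlap' stands in for the overlap
# width that intersectBed -wo reports in its last column.
def fractional_overlap_passes(peak_a, peak_b, overlap):
    size_a = peak_a[1] - peak_a[0]
    size_b = peak_b[1] - peak_b[0]
    # keep the pair if the overlap covers >= 50% of either peak
    return (float(overlap) / size_a >= 0.5) or (float(overlap) / size_b >= 0.5)

# e.g. 120 bp of overlap covers >= 50% of a 200 bp peak, so the pair passes
assert fractional_overlap_passes((0, 200), (50, 1050), 120)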
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor,
                                 paired_end, chrom_sizes, as_file, blacklist,
                                 rep1_signal, fragment_length=None):

    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # If fragment_length is given, override appropriate values.
    # Calculate, or set, the actually used fragment length value.
    # Set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    subprocess.check_output('set -x; ls -l', shell=True)

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))

    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info("%d peaks blacklisted from the stable set" % (N1 - Nsb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    r1pr_peaks_filename)))})

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = dxpy.upload_local_file(stable_set_bb_filename)
        output.update({"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(
            stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and
    # tracks are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
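# Sketch of the FRiP statistic that common.frip returns above as
# (n_reads, n_reads_in_peaks, frip_score).  Assumed semantics, for
# illustration only: the score is the fraction of tags that fall within
# the stable peak set.
def frip_score_sketch(n_reads, n_reads_in_peaks):
    return float(n_reads_in_peaks) / n_reads if n_reads else 0.0

# e.g. 2.5M of 10M reads in peaks gives FRiP = 0.25
assert frip_score_sketch(10000000, 2500000) == 0.25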
def replicated_IDR(experiment, reps_peaks, r1pr_peaks, r2pr_peaks,
                   pooledpr_peaks, rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                   paired_end, chrom_sizes, as_file, blacklist,
                   rep1_signal, rep2_signal, pooled_signal,
                   fragment_length=None):

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script.
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = pool_applet.run(
        {"inputs": [rep1_ta, rep2_ta], "prefix": 'pooled_reps'},
        name='Pool replicates')
    # next call could be on 267 and save time?
    pool_replicates_subjob.wait_on_done()

    # If fragment_length is not given, calculate it using cross-correlation.
    # Else use the override value and set pool_xcor_filename to None to
    # accommodate the common.frip calls.  Record the fragment lengths
    # actually used for the different cases, and set the flag indicating
    # whether the fragment length was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = xcor_only(
            pool_replicates_subjob.get_output_ref("pooled"),
            paired_end,
            spp_version=None,
            name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt - Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longer of the IDR peak
    # list from true reps and the IDR peak list from the pseudoreplicates of
    # the pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No - Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction of reads in peaks)
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads": rep1_n_reads,
        "rep1_frip_nreads_in_peaks": rep1_n_reads_in_peaks,
        "F1": rep1_frip_score,
        "rep2_frip_nreads": rep2_n_reads,
        "rep2_frip_nreads_in_peaks": rep2_n_reads_in_peaks,
        "F2": rep2_frip_score,
        "true_frip_nreads": true_n_reads,
        "true_frip_nreads_in_peaks": true_n_reads_in_peaks,
        "Ft": true_frip_score,
        "pr_frip_nreads": pr_n_reads,
        "pr_frip_nreads_in_peaks": pr_n_reads_in_peaks,
        "Fp": pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))})

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = \
            dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set":
            dxpy.dxlink(dxpy.upload_local_file(common.compress(
                conservative_set_filename))),
        "optimal_set":
            dxpy.dxlink(dxpy.upload_local_file(common.compress(
                optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and
    # tracks are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
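# The reproducibility verdict above reduces to two ratios computed from the
# peak counts: the rescue ratio compares true-replicate IDR peaks (Nt) with
# pooled-pseudoreplicate IDR peaks (Np), and the self-consistency ratio
# compares the self-pseudoreplicate counts (N1, N2).  A standalone sketch of
# the same logic, with a hypothetical name:
def reproducibility_verdict(Nt, Np, N1, N2):
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        return 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        return 'borderline'
    else:
        return 'pass'

assert reproducibility_verdict(Nt=10000, Np=12000, N1=9000, N2=8000) == 'pass'
assert reproducibility_verdict(Nt=10000, Np=30000, N1=9000, N2=8000) == 'borderline'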
def main(rep1_ta, ctl1_ta, rep1_paired_end, rep2_ta=None, ctl2_ta=None,
         rep2_paired_end=None):

    rep1_ta_filename = rep1_ta
    ntags_rep1 = common.count_lines(rep1_ta_filename)

    output = {'rep1_ta': rep1_ta_filename}

    simplicate_experiment = rep1_ta and not rep2_ta
    output.update({'simplicate_experiment': simplicate_experiment})
    if simplicate_experiment:
        logger.info("No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info("Rep1 and rep2 tags specified so processing as a replicated experiment.")
        output.update({'rep2_ta': rep2_ta})

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_filename = rep2_ta
        ntags_rep2 = common.count_lines(rep2_ta_filename)
        output.update({'rep2_ta': rep2_ta_filename})
    paired_end = rep1_paired_end
    output.update({'paired_end': paired_end})

    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_filename = ctl1_ta
    if not unary_control:
        ctl2_ta_filename = ctl2_ta
    else:
        ctl2_ta_filename = ctl1_ta

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)

    rep1_control = ctl1_ta  # default.  May be changed later.
    rep2_control = ctl2_ta  # default.  May be changed later.
    output.update({'rep1_control': rep1_control,
                   'rep2_control': rep2_control})

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend(
        [(ntags_ctl1, 'control 1', ctl1_ta_filename),
         (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    if not simplicate_experiment:
        pool_replicates_subjob = \
            pool(**{"inputs": [rep1_ta, rep2_ta], "prefix": 'pooled_reps'})
        pooled_replicates = pool_replicates_subjob.get("pooled")
        output.update({'pooled_replicates': pooled_replicates})
        # this needs to go to the other image
        '''
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pooled_replicates,
                paired_end,
                name='Pool cross-correlation')
        '''

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info("Using one control for both replicate 1 and 2 and for the pool.")
            rep2_control = rep1_control
            control_for_pool = rep1_control
            output.update({'rep2_control': rep2_control,
                           'control_for_pool': rep1_control})
    else:
        pool_controls_subjob = pool(
            **{"inputs": [ctl1_ta, ctl2_ta], "prefix": "PL_ctls"})
        pooled_controls = pool_controls_subjob.get("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        output.update({'control_for_pool': control_for_pool})

        # use the pooled controls for the reps depending on the ratio of
        # rep to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. Using pooled controls." % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
            output.update({'rep1_control': pooled_controls,
                           'rep2_control': pooled_controls})
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info("Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                output.update({'rep1_control': pooled_controls})
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info("Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                output.update({'rep2_control': pooled_controls})
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta
                rep2_control = ctl2_ta
                output.update({'rep1_control': ctl1_ta,
                               'rep2_control': ctl2_ta})

    rep1_pr_subjob = pseudoreplicator(**{"input_tags": rep1_ta})
    r1pr1 = rep1_pr_subjob.get('pseudoreplicate1')
    r1pr2 = rep1_pr_subjob.get('pseudoreplicate2')
    output.update({'r1pr1': r1pr1, 'r1pr2': r1pr2})

    if not simplicate_experiment:
        rep2_pr_subjob = pseudoreplicator(**{"input_tags": rep2_ta})
        r2pr1 = rep2_pr_subjob.get('pseudoreplicate1')
        r2pr2 = rep2_pr_subjob.get('pseudoreplicate2')
        output.update({'r2pr1': r2pr1, 'r2pr2': r2pr2})

        pool_pr1_subjob = pool(
            **{"inputs": [rep1_pr_subjob.get("pseudoreplicate1"),
                          rep2_pr_subjob.get("pseudoreplicate1")],
               "prefix": 'PPR1'})
        pool_pr2_subjob = pool(
            **{"inputs": [rep1_pr_subjob.get("pseudoreplicate2"),
                          rep2_pr_subjob.get("pseudoreplicate2")],
               "prefix": 'PPR2'})
        ppr1 = pool_pr1_subjob.get('pooled')
        ppr2 = pool_pr2_subjob.get('pooled')
        output.update({'ppr1': ppr1, 'ppr2': ppr2})

    # should there be an indication of the simplicateness of the experiment
    # in the output json?  this could be a good way to direct the next step
    # without putting too much logic into the workflow.  ADDED.
    # Turns out Cromwell does not support reading .json.  Instead it has a
    # read_map function that accepts 2-column TSVs.
    with open('pool_and_pseudoreplicate_outfiles.mapping', 'w') as f:
        for key in output:
            f.write('%s\t%s\n' % (key, output[key]))

    return output
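# The mapping file written above is a plain 2-column TSV because Cromwell's
# read_map() cannot read JSON.  A hypothetical reader that round-trips the
# same format, for illustration only (not part of the pipeline):
def read_mapping_sketch(path):
    mapping = {}
    with open(path) as f:
        for line in f:
            key, value = line.rstrip('\n').split('\t', 1)
            mapping[key] = value
    return mapping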
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script.
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    subprocess.check_output('set -x; ls -l', shell=True)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt - Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longer of the IDR peak
    # list from true reps and the IDR peak list from the pseudoreplicates of
    # the pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No - Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))})

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = \
            dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set":
            dxpy.dxlink(dxpy.upload_local_file(common.compress(
                conservative_set_filename))),
        "optimal_set":
            dxpy.dxlink(dxpy.upload_local_file(common.compress(
                optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and
    # tracks are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    logging.info("Exiting with output: %s", output)
    return output
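# blacklist_filter() is defined elsewhere in this codebase.  A minimal
# sketch of the assumed behavior (drop peaks that overlap any blacklisted
# region, via bedtools), for illustration only; the real implementation
# may differ:
def blacklist_filter_sketch(peaks_fn, filtered_fn, blacklist_fn):
    # intersectBed -v reports entries in -a with no overlap in -b
    out, err = common.run_pipe([
        'intersectBed -v -a %s -b %s' % (peaks_fn, blacklist_fn)
        ], filtered_fn)
    return filtered_fn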
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, as_file=None, prefix=None):

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # Download the file inputs to the local file system, using variable
    # names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    if not prefix:
        # Strip the .gz and .tagAlign extensions to get the output prefix.
        # (str.rstrip strips *characters*, not a suffix, and can eat into
        # the basename, so remove the suffixes explicitly.)
        output_filename_prefix = experiment_filename
        for extension in ['.gz', '.tagAlign']:
            if output_filename_prefix.endswith(extension):
                output_filename_prefix = \
                    output_filename_prefix[:-len(extension)]
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    print subprocess.check_output('ls -l', shell=True,
                                  stderr=subprocess.STDOUT)

    # third column in the cross-correlation scores input file
    fraglen_column = 3
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column - 1])
        print "Read fragment length: %d" % (fragment_length)

    # run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = '/phantompeakqualtools/spp_1.10.1.tar.gz'
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'

    # install spp
    subprocess.check_call('ls -l', shell=True)
    subprocess.check_call(shlex.split('R CMD INSTALL %s' % (spp_tarball)))

    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename,
           npeaks, fragment_length, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    print spp_command
    # process = subprocess.Popen(shlex.split(spp_command),
    #                            stderr=subprocess.STDOUT,
    #                            stdout=subprocess.PIPE)
    # for line in iter(process.stdout.readline, ''):
    #     sys.stdout.write(line)
    subprocess.check_call(shlex.split(spp_command))

    # When one of the peak coordinates is an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation.  This changes any such
    # coordinates to decimal notation, assuming 10-column output with the
    # 2nd and 3rd columns being coordinates.  slopBed adjusts feature end
    # coordinates that run off the end of the chromosome, and bedClip
    # removes any features that are still not within the boundaries of the
    # chromosome.
    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'
    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (chrom_sizes_filename,
                                 fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later.  Only at the end are the final files uploaded that
    # will be returned from the applet.
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" % (n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" % (
        n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename))
    print "First 50 peaks"
    print subprocess.check_output(
        'head -50 %s' % (fix_coordinate_peaks_filename),
        shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename,
                                          chrom_sizes_filename,
                                          as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(
            shlex.split('gzip %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    print subprocess.check_output('ls -l', shell=True,
                                  stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' % (final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' % (xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
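# The awk sprintf("%i", ...) step above exists because spp (R) writes peak
# coordinates that are exact multiples of 10 in scientific notation
# (e.g. 1e+05).  The same normalization for a single field in pure Python,
# as a sketch (hypothetical helper, not part of the applet):
def fix_coordinate(field):
    return '%i' % float(field)

assert fix_coordinate('1e+05') == '100000'
assert fix_coordinate('12345') == '12345'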
def main(rep1_ta, ctl1_ta, rep1_xcor, rep1_paired_end, chrom_sizes, genomesize, narrowpeak_as, gappedpeak_as, broadpeak_as, rep2_ta=None, ctl2_ta=None, rep2_xcor=None, rep2_paired_end=None, fragment_length=None): rep1_ta_file = dxpy.DXFile(rep1_ta) dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name) rep1_ta_filename = rep1_ta_file.name ntags_rep1 = common.count_lines(rep1_ta_filename) simplicate_experiment = rep1_ta and not rep2_ta if simplicate_experiment: logger.info("No rep2 tags specified so processing as a simplicate experiment.") else: logger.info("Rep1 and rep2 tags specified so processing as a replicated experiment.") if not simplicate_experiment: assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported' rep2_ta_file = dxpy.DXFile(rep2_ta) dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name) rep2_ta_filename = rep2_ta_file.name ntags_rep2 = common.count_lines(rep2_ta_filename) paired_end = rep1_paired_end unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta ctl1_ta_file = dxpy.DXFile(ctl1_ta) dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name) ctl1_ta_filename = ctl1_ta_file.name if not unary_control: ctl2_ta_file = dxpy.DXFile(ctl2_ta) dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name) ctl2_ta_filename = ctl2_ta_file.name else: ctl2_ta_file = ctl1_ta_file ctl2_ta_filename = ctl1_ta_file.name ntags_ctl1 = common.count_lines(ctl1_ta_filename) ntags_ctl2 = common.count_lines(ctl2_ta_filename) rep1_control = ctl1_ta # default. May be changed later. rep1_ctl_msg = "control rep1" rep2_control = ctl2_ta # default. May be changed later. rep2_ctl_msg = "control rep2" rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)] if not simplicate_experiment: rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename)) rep_info.extend( [(ntags_ctl1, 'control 1', ctl1_ta_filename), (ntags_ctl2, 'control 2', ctl2_ta_filename)]) for n, name, filename in rep_info: logger.info("Found %d tags in %s file %s" % (n, name, filename)) subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT) if not simplicate_experiment: pool_applet = dxpy.find_one_data_object( classname='applet', name='pool', project=dxpy.PROJECT_CONTEXT_ID, zero_ok=False, more_ok=False, return_handler=True) pool_replicates_subjob = \ pool_applet.run( {"inputs": [rep1_ta, rep2_ta], "prefix": 'pooled_reps'}, name='Pool replicates') pooled_replicates = pool_replicates_subjob.get_output_ref("pooled") pooled_replicates_xcor_subjob = \ xcor_only( pooled_replicates, paired_end, name='Pool cross-correlation') if unary_control: logger.info("Only one control supplied.") if not simplicate_experiment: logger.info("Using one control for both replicate 1 and 2 and for the pool.") rep2_control = rep1_control control_for_pool = rep1_control pool_ctl_msg = "one control" else: pool_controls_subjob = pool_applet.run( {"inputs": [ctl1_ta, ctl2_ta], "prefix": "PL_ctls"}, name='Pool controls') pooled_controls = pool_controls_subjob.get_output_ref("pooled") # always use the pooled controls for the pool control_for_pool = pooled_controls pool_ctl_msg = "pooled controls" # use the pooled controls for the reps depending on the ratio of rep to # control reads ratio_ctl_reads = float(ntags_ctl1)/float(ntags_ctl2) if ratio_ctl_reads < 1: ratio_ctl_reads = 1/ratio_ctl_reads ratio_cutoff = 1.2 if ratio_ctl_reads > ratio_cutoff: logger.info( "Number of reads in controls differ by > factor of %f. Using pooled controls." 
            % (ratio_cutoff))
        rep1_control = pooled_controls
        rep2_control = pooled_controls
    else:
        if ntags_ctl1 < ntags_rep1:
            logger.info("Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1.")
            rep1_control = pooled_controls
            rep1_ctl_msg = "pooled controls"
        elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
            logger.info("Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2.")
            rep2_control = pooled_controls
            rep2_ctl_msg = "pooled controls"
        else:
            logger.info("Using distinct controls for replicate 1 and 2.")
            rep1_control = ctl1_ta  # default. May be changed later.
            rep2_control = ctl2_ta  # default. May be changed later.
            rep1_ctl_msg = "control rep1"
            rep2_ctl_msg = "control rep2"

    pseudoreplicator_applet = dxpy.find_one_data_object(
        classname='applet', name='pseudoreplicator',
        zero_ok=False, more_ok=False, return_handler=True)
    rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta})
    if not simplicate_experiment:
        rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta})
        pool_pr1_subjob = pool_applet.run(
            {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
             "prefix": 'PPR1'})
        pool_pr2_subjob = pool_applet.run(
            {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
             "prefix": 'PPR2'})

    common_args = {
        'chrom_sizes': chrom_sizes,
        'genomesize': genomesize,
        'narrowpeak_as': narrowpeak_as,
        'gappedpeak_as': gappedpeak_as,
        'broadpeak_as': broadpeak_as
    }
    # if the fragment_length argument is given, update macs2 input
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})

    common_args.update({'prefix': 'r1'})
    rep1_peaks_subjob = macs2(rep1_ta, rep1_control, rep1_xcor, **common_args)
    common_args.update({'prefix': 'r1pr1'})
    rep1pr1_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
        rep1_control, rep1_xcor, **common_args)
    common_args.update({'prefix': 'r1pr2'})
    rep1pr2_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
        rep1_control, rep1_xcor, **common_args)

    if not simplicate_experiment:
        common_args.update({'prefix': 'r2'})
        rep2_peaks_subjob = macs2(rep2_ta, rep2_control, rep2_xcor,
                                  **common_args)
        common_args.update({'prefix': 'r2pr1'})
        rep2pr1_peaks_subjob = macs2(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep2_control, rep2_xcor, **common_args)
        common_args.update({'prefix': 'r2pr2'})
        rep2pr2_peaks_subjob = macs2(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep2_control, rep2_xcor, **common_args)
        common_args.update({'prefix': 'pool'})
        pooled_peaks_subjob = macs2(
            pooled_replicates, control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            **common_args)
        common_args.update({'prefix': 'ppr1'})
        pooledpr1_peaks_subjob = macs2(
            pool_pr1_subjob.get_output_ref("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            **common_args)
        common_args.update({'prefix': 'ppr2'})
        pooledpr2_peaks_subjob = macs2(
            pool_pr2_subjob.get_output_ref("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            **common_args)

    output = {
        'rep1_narrowpeaks': rep1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1_gappedpeaks': rep1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1_broadpeaks': rep1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1_narrowpeaks_bb': rep1_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep1_gappedpeaks_bb': rep1_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep1_broadpeaks_bb': rep1_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep1_fc_signal': rep1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1_pvalue_signal': rep1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr1_narrowpeaks': rep1pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr1_gappedpeaks': rep1pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr1_broadpeaks': rep1pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr1_fc_signal': rep1pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr1_pvalue_signal': rep1pr1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr2_narrowpeaks': rep1pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr2_gappedpeaks': rep1pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr2_broadpeaks': rep1pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr2_fc_signal': rep1pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr2_pvalue_signal': rep1pr2_peaks_subjob.get_output_ref("pvalue_signal")
    }
    if not simplicate_experiment:
        output.update({
            'rep2_narrowpeaks': rep2_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2_gappedpeaks': rep2_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2_broadpeaks': rep2_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2_narrowpeaks_bb': rep2_peaks_subjob.get_output_ref("narrowpeaks_bb"),
            'rep2_gappedpeaks_bb': rep2_peaks_subjob.get_output_ref("gappedpeaks_bb"),
            'rep2_broadpeaks_bb': rep2_peaks_subjob.get_output_ref("broadpeaks_bb"),
            'rep2_fc_signal': rep2_peaks_subjob.get_output_ref("fc_signal"),
            'rep2_pvalue_signal': rep2_peaks_subjob.get_output_ref("pvalue_signal"),
            'rep2pr1_narrowpeaks': rep2pr1_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2pr1_gappedpeaks': rep2pr1_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2pr1_broadpeaks': rep2pr1_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2pr1_fc_signal': rep2pr1_peaks_subjob.get_output_ref("fc_signal"),
            'rep2pr1_pvalue_signal': rep2pr1_peaks_subjob.get_output_ref("pvalue_signal"),
            'rep2pr2_narrowpeaks': rep2pr2_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2pr2_gappedpeaks': rep2pr2_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2pr2_broadpeaks': rep2pr2_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2pr2_fc_signal': rep2pr2_peaks_subjob.get_output_ref("fc_signal"),
            'rep2pr2_pvalue_signal': rep2pr2_peaks_subjob.get_output_ref("pvalue_signal"),
            'pooled_narrowpeaks': pooled_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooled_gappedpeaks': pooled_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooled_broadpeaks': pooled_peaks_subjob.get_output_ref("broadpeaks"),
            'pooled_narrowpeaks_bb': pooled_peaks_subjob.get_output_ref("narrowpeaks_bb"),
            'pooled_gappedpeaks_bb': pooled_peaks_subjob.get_output_ref("gappedpeaks_bb"),
            'pooled_broadpeaks_bb': pooled_peaks_subjob.get_output_ref("broadpeaks_bb"),
            'pooled_fc_signal': pooled_peaks_subjob.get_output_ref("fc_signal"),
            'pooled_pvalue_signal': pooled_peaks_subjob.get_output_ref("pvalue_signal"),
            'pooledpr1_narrowpeaks': pooledpr1_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooledpr1_gappedpeaks': pooledpr1_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooledpr1_broadpeaks': pooledpr1_peaks_subjob.get_output_ref("broadpeaks"),
            'pooledpr1_fc_signal': pooledpr1_peaks_subjob.get_output_ref("fc_signal"),
            'pooledpr1_pvalue_signal': pooledpr1_peaks_subjob.get_output_ref("pvalue_signal"),
            'pooledpr2_narrowpeaks': pooledpr2_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooledpr2_gappedpeaks': pooledpr2_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooledpr2_broadpeaks': pooledpr2_peaks_subjob.get_output_ref("broadpeaks"),
            'pooledpr2_fc_signal': pooledpr2_peaks_subjob.get_output_ref("fc_signal"),
            'pooledpr2_pvalue_signal': pooledpr2_peaks_subjob.get_output_ref("pvalue_signal")
        })
    return output
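# The repeated common_args.update({'prefix': ...}) / macs2(...) pattern above
# could be expressed as one data-driven loop. A minimal sketch under the
# assumption of a macs2(tags, control, xcor, **kwargs) callable like the one
# used above; run_macs2_jobs and its argument names are illustrative only:
def run_macs2_jobs(macs2, jobs, common_args):
    """Run one macs2 job per (prefix, tags, control, xcor) tuple.

    `jobs` is an iterable of 4-tuples; returns {prefix: subjob}.
    """
    subjobs = {}
    for prefix, tags, control, xcor in jobs:
        # copy so one job's prefix cannot leak into the next call
        args = dict(common_args)
        args['prefix'] = prefix
        subjobs[prefix] = macs2(tags, control, xcor, **args)
    return subjobs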
def main(rep1_ta, rep2_ta, ctl1_ta, ctl2_ta, rep1_xcor, rep2_xcor,
         npeaks, nodups, rep1_paired_end, rep2_paired_end,
         chrom_sizes, as_file=None, idr_peaks=False):

    if not rep1_paired_end == rep2_paired_end:
        raise ValueError('Mixed PE/SE not supported (yet)')
    paired_end = rep1_paired_end

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    unary_control = ctl1_ta == ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    ctl2_ta_file = dxpy.DXFile(ctl2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)

    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name)

    rep1_ta_filename = rep1_ta_file.name
    rep2_ta_filename = rep2_ta_file.name
    ctl1_ta_filename = ctl1_ta_file.name
    ctl2_ta_filename = ctl2_ta_file.name

    ntags_rep1 = common.count_lines(rep1_ta_filename)
    ntags_rep2 = common.count_lines(rep2_ta_filename)
    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)

    for n, name, filename in [(ntags_rep1, 'replicate 1', rep1_ta_filename),
                              (ntags_rep2, 'replicate 2', rep2_ta_filename),
                              (ntags_ctl1, 'control 1', ctl1_ta_filename),
                              (ntags_ctl2, 'control 2', ctl2_ta_filename)]:
        print("Found %d tags in %s file %s" % (n, name, filename))

    print(subprocess.check_output('ls -l', shell=True,
                                  stderr=subprocess.STDOUT))

    pool_applet = dxpy.find_one_data_object(
        classname='applet', name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False, more_ok=False, return_handler=True)
    pool_replicates_subjob = pool_applet.run(
        {"inputs": [rep1_ta, rep2_ta]}, name='Pool replicates')
    pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
    pooled_replicates_xcor_subjob = xcor_only(
        pooled_replicates, paired_end, name='Pool cross-correlation')

    rep1_control = ctl1_ta  # default. May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default. May be changed later.
    rep2_ctl_msg = "control rep2"

    if unary_control:
        print("Only one control supplied. Using it for both replicate 1 and 2 and for the pool.")
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = pool_applet.run(
            {"inputs": [ctl1_ta, ctl2_ta]}, name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of
        # rep to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            print("Number of reads in controls differ by > factor of %f. Using pooled controls." % (ratio_cutoff))
            rep1_control = pooled_controls
            rep1_ctl_msg = "pooled controls"
            rep2_control = pooled_controls
            rep2_ctl_msg = "pooled controls"
        else:
            if ntags_ctl1 < ntags_rep1:
                print("Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif ntags_ctl2 < ntags_rep2:
                print("Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                print("Using distinct controls for replicate 1 and 2.")
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    rep1_peaks_subjob = spp(rep1_ta, rep1_control, rep1_xcor,
                            chrom_sizes=chrom_sizes, bigbed=True,
                            as_file=as_file,
                            name='Rep1 peaks vs %s' % (rep1_ctl_msg))
    rep2_peaks_subjob = spp(rep2_ta, rep2_control, rep2_xcor,
                            chrom_sizes=chrom_sizes, bigbed=True,
                            as_file=as_file,
                            name='Rep2 peaks vs %s' % (rep2_ctl_msg))
    pooled_peaks_subjob = spp(
        pooled_replicates, control_for_pool,
        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
        chrom_sizes=chrom_sizes, bigbed=True, as_file=as_file,
        name='Pooled peaks vs %s' % (pool_ctl_msg))

    output = {
        'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"),
        'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"),
        'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"),
        'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores"),
        'rep2_peaks': rep2_peaks_subjob.get_output_ref("peaks"),
        'rep2_peaks_bb': rep2_peaks_subjob.get_output_ref("peaks_bb"),
        'rep2_xcor_plot': rep2_peaks_subjob.get_output_ref("xcor_plot"),
        'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"),
        'pooled_peaks': pooled_peaks_subjob.get_output_ref("peaks"),
        'pooled_peaks_bb': pooled_peaks_subjob.get_output_ref("peaks_bb"),
        'pooled_xcor_plot': pooled_peaks_subjob.get_output_ref("xcor_plot"),
        'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores")
    }

    if idr_peaks:  # also call peaks on pseudoreplicates for IDR
        pseudoreplicator_applet = dxpy.find_one_data_object(
            classname='applet', name='pseudoreplicator',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False, more_ok=False, return_handler=True)
        rep1_pr_subjob = pseudoreplicator_applet.run(
            {"input_tags": rep1_ta}, name='Pseudoreplicate rep1 -> R1PR1,2')
        rep2_pr_subjob = pseudoreplicator_applet.run(
            {"input_tags": rep2_ta}, name='Pseudoreplicate rep2 -> R2PR1,2')
        pool_pr1_subjob = pool_applet.run(
            {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate1")]},
            name='Pool R1PR1+R2PR1 -> PPR1')
        pool_pr2_subjob = pool_applet.run(
            {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate2")]},
            name='Pool R1PR2+R2PR2 -> PPR2')

        rep1_pr1_xcor_subjob = xcor_only(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end,
            name='R1PR1 cross-correlation')
        rep1_pr2_xcor_subjob = xcor_only(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end,
            name='R1PR2 cross-correlation')
        rep2_pr1_xcor_subjob = xcor_only(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end,
            name='R2PR1 cross-correlation')
        rep2_pr2_xcor_subjob = xcor_only(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end,
            name='R2PR2 cross-correlation')
        pool_pr1_xcor_subjob = xcor_only(
            pool_pr1_subjob.get_output_ref("pooled"), paired_end,
            name='PPR1 cross-correlation')
        pool_pr2_xcor_subjob = xcor_only(
            pool_pr2_subjob.get_output_ref("pooled"), paired_end,
            name='PPR2 cross-correlation')

        rep1pr1_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"), rep1_control,
            rep1_pr1_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes, bigbed=False,
            name='R1PR1 peaks vs %s' % (rep1_ctl_msg))
        rep1pr2_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep1_control,
            rep1_pr2_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes, bigbed=False,
            name='R1PR2 peaks vs %s' % (rep1_ctl_msg))
        rep2pr1_peaks_subjob = spp(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"), rep2_control,
            rep2_pr1_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes, bigbed=False,
            name='R2PR1 peaks vs %s' % (rep2_ctl_msg))
        rep2pr2_peaks_subjob = spp(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"), rep2_control,
            rep2_pr2_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes, bigbed=False,
            name='R2PR2 peaks vs %s' % (rep2_ctl_msg))
        pooledpr1_peaks_subjob = spp(
            pool_pr1_subjob.get_output_ref("pooled"), control_for_pool,
            pool_pr1_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes, bigbed=False,
            name='PPR1 peaks vs %s' % (pool_ctl_msg))
        pooledpr2_peaks_subjob = spp(
            pool_pr2_subjob.get_output_ref("pooled"), control_for_pool,
            pool_pr2_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes, bigbed=False,
            name='PPR2 peaks vs %s' % (pool_ctl_msg))

        output.update({
            'rep1pr1_peaks': rep1pr1_peaks_subjob.get_output_ref("peaks"),
            'rep1pr1_xcor_plot': rep1pr1_peaks_subjob.get_output_ref("xcor_plot"),
            'rep1pr1_xcor_scores': rep1pr1_peaks_subjob.get_output_ref("xcor_scores"),
            'rep1pr2_peaks': rep1pr2_peaks_subjob.get_output_ref("peaks"),
            'rep1pr2_xcor_plot': rep1pr2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep1pr2_xcor_scores': rep1pr2_peaks_subjob.get_output_ref("xcor_scores"),
            'rep2pr1_peaks': rep2pr1_peaks_subjob.get_output_ref("peaks"),
            'rep2pr1_xcor_plot': rep2pr1_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2pr1_xcor_scores': rep2pr1_peaks_subjob.get_output_ref("xcor_scores"),
            'rep2pr2_peaks': rep2pr2_peaks_subjob.get_output_ref("peaks"),
            'rep2pr2_xcor_plot': rep2pr2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2pr2_xcor_scores': rep2pr2_peaks_subjob.get_output_ref("xcor_scores"),
            'pooledpr1_peaks': pooledpr1_peaks_subjob.get_output_ref("peaks"),
            'pooledpr1_xcor_plot': pooledpr1_peaks_subjob.get_output_ref("xcor_plot"),
            'pooledpr1_xcor_scores': pooledpr1_peaks_subjob.get_output_ref("xcor_scores"),
            'pooledpr2_peaks': pooledpr2_peaks_subjob.get_output_ref("peaks"),
            'pooledpr2_xcor_plot': pooledpr2_peaks_subjob.get_output_ref("xcor_plot"),
            'pooledpr2_xcor_scores': pooledpr2_peaks_subjob.get_output_ref("xcor_scores"),
        })

    return output
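# The control-selection policy above is shared by several of these applets
# and is easier to test in isolation. A minimal self-contained sketch of the
# same decision rules, using the 1.2x read-depth cutoff from the code above
# (choose_controls and its return values are illustrative names only):
def choose_controls(ntags_rep1, ntags_rep2, ntags_ctl1, ntags_ctl2,
                    ratio_cutoff=1.2):
    """Return ('pooled' or 'ctl1', 'pooled' or 'ctl2') for reps 1 and 2."""
    ratio = float(ntags_ctl1) / float(ntags_ctl2)
    if ratio < 1:
        ratio = 1 / ratio
    if ratio > ratio_cutoff:
        # control depths are too unbalanced: use pooled controls for both
        return 'pooled', 'pooled'
    if ntags_ctl1 < ntags_rep1:
        return 'pooled', 'ctl2'
    if ntags_ctl2 < ntags_rep2:
        return 'ctl1', 'pooled'
    return 'ctl1', 'ctl2'

# e.g. choose_controls(100, 100, 90, 100) -> ('pooled', 'ctl2')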
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None):

    # TODO: for now this applet just takes the peak files. It should actually
    # call IDR itself instead of leaving that to the workflow populator
    # script.

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Download the file inputs to the local file system. Prepend a tag to
    # each name to ensure the local filenames are unique.
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    print(subprocess.check_output('ls -l', shell=True))

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    print("%d peaks from true replicates" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    print("%d peaks from rep1 self-pseudoreplicates" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    print("%d peaks from rep2 self-pseudoreplicates" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    print("%d peaks from pooled pseudoreplicates" % (Np))

    conservative_set_filename = '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
    else:
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print("%d peaks blacklisted from the conservative set" % (Nt - Ncb))

    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
    else:
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print("%d peaks blacklisted from the optimal set" % (No - Nob))

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(
        conservative_set_filename, chrom_sizes_filename, as_file_filename)
    optimal_set_bb_filename = common.bed2bb(
        optimal_set_filename, chrom_sizes_filename, as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = dxpy.upload_local_file(
            conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility
    })

    logging.info("Exiting with output: %s", output)
    return output
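# The reproducibility test above reduces to two ratios over the four peak
# counts. A minimal self-contained restatement of the same thresholds
# (reproducibility_test is an illustrative name, not part of the pipeline):
def reproducibility_test(Nt, N1, N2, Np):
    """Nt/Np: peaks from true reps / pooled pseudoreps;
    N1/N2: peaks from each rep's self-pseudoreplicates."""
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        return 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        return 'borderline'
    return 'pass'

# e.g. reproducibility_test(Nt=10000, N1=9000, N2=8000, Np=12000) -> 'pass'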
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case a later download would
    # overwrite an earlier file.
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(
        classname='applet', name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False, more_ok=False, return_handler=True)
    pool_replicates_subjob = pool_applet.run(
        {"inputs": [rep1_ta, rep2_ta], "prefix": 'pooled_reps'},
        name='Pool replicates')

    # If fragment_length was given by the user, skip the pooled-replicates
    # cross-correlation subjob, set pool_xcor_filename to None, and set the
    # fragment_length_given_by_user flag. Otherwise, run the subjob so that
    # the fragment length can be extracted from the cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = xcor_only(
            pool_replicates_subjob.get_output_ref("pooled"),
            paired_end,
            spp_version=None,
            name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get(
            "CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns
    # are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], \
            "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined
    # as fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates"
          % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, where overlap
    # is defined as fractional overlap wrt any one of the overlapping peak
    # pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates"
          % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
    ], overlapping_peaks_fn)
    print("%d peaks overlap with true replicates or with pooled pseudoreplicates"
          % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s'
        % (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }
    return output
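# The awk filters above implement a fractional-overlap test on
# intersectBed -wo output: keep a pair if the overlap covers at least half of
# either peak. The same predicate in pure Python, sketched over half-open
# (start, end) intervals (overlaps_enough is an illustrative name):
def overlaps_enough(peak_a, peak_b, min_frac=0.5):
    """peak_a/peak_b are (start, end) tuples; True if the overlap covers
    >= min_frac of either interval."""
    a_start, a_end = peak_a
    b_start, b_end = peak_b
    overlap = min(a_end, b_end) - max(a_start, b_start)
    if overlap <= 0:
        return False
    return (overlap >= min_frac * (a_end - a_start) or
            overlap >= min_frac * (b_end - b_start))

# e.g. overlaps_enough((100, 200), (150, 400)) -> True (50 bp covers half of A)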
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor, paired_end,
                                     chrom_sizes, as_file, peak_type, prefix,
                                     fragment_length=None):

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case a later download would
    # overwrite an earlier file.
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns
    # are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], \
            "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined
    # as fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates"
          % (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe([
        'cat %s' % (overlap_tr_fn),
        'sort -u'
    ], overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s'
        % (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file, or use the user-defined fragment_length
    # if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }
    return output
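# common.frip is defined elsewhere; the score itself is just the fraction of
# (fragment-extended) reads that land in peaks. A minimal sketch of that
# final arithmetic, assuming the two counts are already known
# (frip_score here is an illustrative helper, not the pipeline's common.frip):
def frip_score(n_reads, n_reads_in_peaks):
    """Fraction of Reads in Peaks; returns 0.0 for an empty tagAlign."""
    if n_reads == 0:
        return 0.0
    return float(n_reads_in_peaks) / float(n_reads)

# e.g. frip_score(1000000, 42000) -> 0.042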
def main(rep1_ta, ctl1_ta, rep1_xcor, rep1_paired_end,
         chrom_sizes, genomesize, narrowpeak_as, gappedpeak_as, broadpeak_as,
         rep2_ta=None, ctl2_ta=None, rep2_xcor=None, rep2_paired_end=None,
         fragment_length=None):

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    rep1_ta_filename = rep1_ta_file.name
    ntags_rep1 = common.count_lines(rep1_ta_filename)

    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info("No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info("Rep1 and rep2 tags specified so processing as a replicated experiment.")

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = dxpy.DXFile(rep2_ta)
        dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
        rep2_ta_filename = rep2_ta_file.name
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end

    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    ctl1_ta_filename = ctl1_ta_file.name
    if not unary_control:
        ctl2_ta_file = dxpy.DXFile(ctl2_ta)
        dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
        ctl2_ta_filename = ctl2_ta_file.name
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file.name

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)

    rep1_control = ctl1_ta  # default. May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default. May be changed later.
    rep2_ctl_msg = "control rep2"

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend([(ntags_ctl1, 'control 1', ctl1_ta_filename),
                     (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    if not simplicate_experiment:
        pool_applet = dxpy.find_one_data_object(
            classname='applet', name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False, more_ok=False, return_handler=True)
        pool_replicates_subjob = pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta], "prefix": 'pooled_reps'},
            name='Pool replicates')
        pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
        pooled_replicates_xcor_subjob = xcor_only(
            pooled_replicates, paired_end, name='Pool cross-correlation')

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info("Using one control for both replicate 1 and 2 and for the pool.")
            rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = pool_applet.run(
            {"inputs": [ctl1_ta, ctl2_ta], "prefix": "PL_ctls"},
            name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of rep
        # to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info("Number of reads in controls differ by > factor of %f. Using pooled controls." % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info("Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info("Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta  # default. May be changed later.
                rep2_control = ctl2_ta  # default. May be changed later.
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    pseudoreplicator_applet = dxpy.find_one_data_object(
        classname='applet', name='pseudoreplicator',
        zero_ok=False, more_ok=False, return_handler=True)
    rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta})
    if not simplicate_experiment:
        rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta})
        pool_pr1_subjob = pool_applet.run(
            {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
             "prefix": 'PPR1'})
        pool_pr2_subjob = pool_applet.run(
            {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
             "prefix": 'PPR2'})

    common_args = {
        'chrom_sizes': chrom_sizes,
        'genomesize': genomesize,
        'narrowpeak_as': narrowpeak_as,
        'gappedpeak_as': gappedpeak_as,
        'broadpeak_as': broadpeak_as
    }
    # if the fragment_length argument is given, update macs2 input
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})

    common_args.update({'prefix': 'r1'})
    rep1_peaks_subjob = macs2(rep1_ta, rep1_control, rep1_xcor, **common_args)
    common_args.update({'prefix': 'r1pr1'})
    rep1pr1_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
        rep1_control, rep1_xcor, **common_args)
    common_args.update({'prefix': 'r1pr2'})
    rep1pr2_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
        rep1_control, rep1_xcor, **common_args)

    if not simplicate_experiment:
        common_args.update({'prefix': 'r2'})
        rep2_peaks_subjob = macs2(rep2_ta, rep2_control, rep2_xcor,
                                  **common_args)
        common_args.update({'prefix': 'r2pr1'})
        rep2pr1_peaks_subjob = macs2(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep2_control, rep2_xcor, **common_args)
        common_args.update({'prefix': 'r2pr2'})
        rep2pr2_peaks_subjob = macs2(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep2_control, rep2_xcor, **common_args)
        common_args.update({'prefix': 'pool'})
        pooled_peaks_subjob = macs2(
            pooled_replicates, control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            **common_args)
        common_args.update({'prefix': 'ppr1'})
        pooledpr1_peaks_subjob = macs2(
            pool_pr1_subjob.get_output_ref("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            **common_args)
        common_args.update({'prefix': 'ppr2'})
        pooledpr2_peaks_subjob = macs2(
            pool_pr2_subjob.get_output_ref("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            **common_args)

    output = {
        'rep1_narrowpeaks': rep1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1_gappedpeaks': rep1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1_broadpeaks': rep1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1_narrowpeaks_bb': rep1_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep1_gappedpeaks_bb': rep1_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep1_broadpeaks_bb': rep1_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep1_fc_signal': rep1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1_pvalue_signal': rep1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr1_narrowpeaks': rep1pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr1_gappedpeaks': rep1pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr1_broadpeaks': rep1pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr1_fc_signal': rep1pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr1_pvalue_signal': rep1pr1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr2_narrowpeaks': rep1pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr2_gappedpeaks': rep1pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr2_broadpeaks': rep1pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr2_fc_signal': rep1pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr2_pvalue_signal': rep1pr2_peaks_subjob.get_output_ref("pvalue_signal")
    }
    if not simplicate_experiment:
        output.update({
            'rep2_narrowpeaks': rep2_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2_gappedpeaks': rep2_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2_broadpeaks': rep2_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2_narrowpeaks_bb': rep2_peaks_subjob.get_output_ref("narrowpeaks_bb"),
            'rep2_gappedpeaks_bb': rep2_peaks_subjob.get_output_ref("gappedpeaks_bb"),
            'rep2_broadpeaks_bb': rep2_peaks_subjob.get_output_ref("broadpeaks_bb"),
            'rep2_fc_signal': rep2_peaks_subjob.get_output_ref("fc_signal"),
            'rep2_pvalue_signal': rep2_peaks_subjob.get_output_ref("pvalue_signal"),
            'rep2pr1_narrowpeaks': rep2pr1_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2pr1_gappedpeaks': rep2pr1_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2pr1_broadpeaks': rep2pr1_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2pr1_fc_signal': rep2pr1_peaks_subjob.get_output_ref("fc_signal"),
            'rep2pr1_pvalue_signal': rep2pr1_peaks_subjob.get_output_ref("pvalue_signal"),
            'rep2pr2_narrowpeaks': rep2pr2_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2pr2_gappedpeaks': rep2pr2_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2pr2_broadpeaks': rep2pr2_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2pr2_fc_signal': rep2pr2_peaks_subjob.get_output_ref("fc_signal"),
            'rep2pr2_pvalue_signal': rep2pr2_peaks_subjob.get_output_ref("pvalue_signal"),
            'pooled_narrowpeaks': pooled_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooled_gappedpeaks': pooled_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooled_broadpeaks': pooled_peaks_subjob.get_output_ref("broadpeaks"),
            'pooled_narrowpeaks_bb': pooled_peaks_subjob.get_output_ref("narrowpeaks_bb"),
            'pooled_gappedpeaks_bb': pooled_peaks_subjob.get_output_ref("gappedpeaks_bb"),
            'pooled_broadpeaks_bb': pooled_peaks_subjob.get_output_ref("broadpeaks_bb"),
            'pooled_fc_signal': pooled_peaks_subjob.get_output_ref("fc_signal"),
            'pooled_pvalue_signal': pooled_peaks_subjob.get_output_ref("pvalue_signal"),
            'pooledpr1_narrowpeaks': pooledpr1_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooledpr1_gappedpeaks': pooledpr1_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooledpr1_broadpeaks': pooledpr1_peaks_subjob.get_output_ref("broadpeaks"),
            'pooledpr1_fc_signal': pooledpr1_peaks_subjob.get_output_ref("fc_signal"),
            'pooledpr1_pvalue_signal': pooledpr1_peaks_subjob.get_output_ref("pvalue_signal"),
            'pooledpr2_narrowpeaks': pooledpr2_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooledpr2_gappedpeaks': pooledpr2_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooledpr2_broadpeaks': pooledpr2_peaks_subjob.get_output_ref("broadpeaks"),
            'pooledpr2_fc_signal': pooledpr2_peaks_subjob.get_output_ref("fc_signal"),
            'pooledpr2_pvalue_signal': pooledpr2_peaks_subjob.get_output_ref("pvalue_signal")
        })
    return output
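# The pseudoreplicator applet used above is external to this file.
# Conceptually it shuffles a tagAlign and splits it into two random halves.
# A minimal local sketch of that idea (illustrative only, not the applet's
# actual code):
import random

def pseudoreplicate(lines, seed=None):
    """Split a list of tagAlign records into two random halves."""
    shuffled = list(lines)
    random.Random(seed).shuffle(shuffled)
    half = len(shuffled) // 2
    return shuffled[:half], shuffled[half:]

# e.g. pr1, pr2 = pseudoreplicate(open('rep1.tagAlign').read().splitlines(),
#                                 seed=0)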
def main(rep1_ta, ctl1_ta, rep1_xcor, rep1_paired_end,
         npeaks, nodups, chrom_sizes, spp_version,
         rep2_ta=None, ctl2_ta=None, rep2_xcor=None, rep2_paired_end=None,
         as_file=None, idr_peaks=False, fragment_length=None,
         spp_instance=None):

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    rep1_ta_filename = rep1_ta_file.name
    ntags_rep1 = common.count_lines(rep1_ta_filename)

    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info("No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info("Rep1 and rep2 tags specified so processing as a replicated experiment.")

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = dxpy.DXFile(rep2_ta)
        dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
        rep2_ta_filename = rep2_ta_file.name
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end

    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    ctl1_ta_filename = ctl1_ta_file.name
    if not unary_control:
        ctl2_ta_file = dxpy.DXFile(ctl2_ta)
        dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
        ctl2_ta_filename = ctl2_ta_file.name
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file.name

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)

    rep1_control = ctl1_ta  # default. May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default. May be changed later.
    rep2_ctl_msg = "control rep2"

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend([(ntags_ctl1, 'control 1', ctl1_ta_filename),
                     (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    if not simplicate_experiment:
        pool_applet = dxpy.find_one_data_object(
            classname='applet', name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False, more_ok=False, return_handler=True)
        pool_replicates_subjob = pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta], "prefix": 'pooled_reps'},
            name='Pool replicates')
        pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
        pooled_replicates_xcor_subjob = xcor_only(
            pooled_replicates, paired_end, spp_version,
            name='Pool cross-correlation')

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info("Using one control for both replicate 1 and 2 and for the pool.")
            rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = pool_applet.run(
            {"inputs": [ctl1_ta, ctl2_ta], "prefix": "PL_ctls"},
            name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of rep
        # to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info("Number of reads in controls differ by > factor of %f. Using pooled controls." % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info("Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info("Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta  # default. May be changed later.
                rep2_control = ctl2_ta  # default. May be changed later.
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    common_args = {
        'chrom_sizes': chrom_sizes,
        'spp_version': spp_version,
        'as_file': as_file,
        'spp_instance': spp_instance
    }
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})

    rep1_peaks_subjob = spp(rep1_ta, rep1_control, rep1_xcor,
                            bigbed=True,
                            name='Rep1 peaks vs %s' % (rep1_ctl_msg),
                            prefix='R1', **common_args)
    if not simplicate_experiment:
        rep2_peaks_subjob = spp(rep2_ta, rep2_control, rep2_xcor,
                                bigbed=True,
                                name='Rep2 peaks vs %s' % (rep2_ctl_msg),
                                prefix='R2', **common_args)
        pooled_peaks_subjob = spp(
            pooled_replicates, control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            bigbed=True, name='Pooled peaks vs %s' % (pool_ctl_msg),
            prefix='PL', **common_args)

    output = {
        'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"),
        'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"),
        'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"),
        'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores")
    }
    if not simplicate_experiment:
        output.update({
            'rep2_peaks': rep2_peaks_subjob.get_output_ref("peaks"),
            'rep2_peaks_bb': rep2_peaks_subjob.get_output_ref("peaks_bb"),
            'rep2_xcor_plot': rep2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"),
            'pooled_peaks': pooled_peaks_subjob.get_output_ref("peaks"),
            'pooled_peaks_bb': pooled_peaks_subjob.get_output_ref("peaks_bb"),
            'pooled_xcor_plot': pooled_peaks_subjob.get_output_ref("xcor_plot"),
            'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores")
        })

    if idr_peaks:  # also call peaks on pseudoreplicates for IDR
        pseudoreplicator_applet = dxpy.find_one_data_object(
            classname='applet', name='pseudoreplicator',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False, more_ok=False, return_handler=True)
        rep1_pr_subjob = pseudoreplicator_applet.run(
            {"input_tags": rep1_ta, "prefix": 'R1PR'},
            name='Pseudoreplicate rep1 -> R1PR1,2')
        rep1pr1_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"), rep1_control,
            rep1_xcor, bigbed=False,
            name='R1PR1 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR1', **common_args)
        rep1pr2_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep1_control,
            rep1_xcor, bigbed=False,
            name='R1PR2 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR2', **common_args)
        output.update({
            'rep1pr1_peaks': rep1pr1_peaks_subjob.get_output_ref("peaks"),
            'rep1pr2_peaks': rep1pr2_peaks_subjob.get_output_ref("peaks")
        })
        if not simplicate_experiment:
            rep2_pr_subjob = pseudoreplicator_applet.run(
                {"input_tags": rep2_ta, "prefix": 'R2PR'},
                name='Pseudoreplicate rep2 -> R2PR1,2')
            pool_pr1_subjob = pool_applet.run(
                {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                            rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
                 "prefix": 'PPR1'},
                name='Pool R1PR1+R2PR1 -> PPR1')
            pool_pr2_subjob = pool_applet.run(
                {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                            rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
                 "prefix": 'PPR2'},
                name='Pool R1PR2+R2PR2 -> PPR2')
            rep2pr1_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep2_control, rep2_xcor, bigbed=False,
                name='R2PR1 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR1', **common_args)
            rep2pr2_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep2_control, rep2_xcor, bigbed=False,
                name='R2PR2 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR2', **common_args)
            pooledpr1_peaks_subjob = spp(
                pool_pr1_subjob.get_output_ref("pooled"), control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False, name='PPR1 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR1', **common_args)
            pooledpr2_peaks_subjob = spp(
                pool_pr2_subjob.get_output_ref("pooled"), control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False, name='PPR2 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR2', **common_args)
            output.update({
                'rep2pr1_peaks': rep2pr1_peaks_subjob.get_output_ref("peaks"),
                'rep2pr2_peaks': rep2pr2_peaks_subjob.get_output_ref("peaks"),
                'pooledpr1_peaks': pooledpr1_peaks_subjob.get_output_ref("peaks"),
                'pooledpr2_peaks': pooledpr2_peaks_subjob.get_output_ref("peaks"),
            })

    return output
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, spp_version, as_file=None, prefix=None):

    # The following line(s) initialize your data object inputs on the
    # platform into dxpy.DXDataObject instances that you can start using
    # immediately.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file
    # system using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    if not prefix:
        # Strip the .tagAlign and .gz extensions. (str.rstrip strips a set of
        # characters rather than a suffix, and can eat into the basename.)
        output_filename_prefix = experiment_filename
        for extension in ['.gz', '.tagAlign']:
            if output_filename_prefix.endswith(extension):
                output_filename_prefix = \
                    output_filename_prefix[:-len(extension)]
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(subprocess.check_output(
        'ls -l', shell=True, stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    fraglen_column = 3
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column - 1])
        logger.info("Read fragment length: %d" % (fragment_length))

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename,
           npeaks, fragment_length, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # When one of the peak coordinates is an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation; this rewrites any such
    # coordinates in decimal notation.
    # This assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates.
    # The (($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly on chrM), which would cause
    # slopBed to halt at that line, truncating the output of the pipe.
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome.
    # bedClip removes any features that are still not within the boundaries
    # of the chromosome.
    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'
    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (chrom_sizes_filename,
                                 fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later. Only at the end are the final files uploaded that will
    # be returned from the applet.
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info("%s of those peaks removed due to bad coordinates"
                % (n_spp_peaks -
                   common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    print(subprocess.check_output(
        'head -50 %s' % (fix_coordinate_peaks_filename), shell=True))

    if bigbed:
        peaks_bb_filename = common.bed2bb(
            fix_coordinate_peaks_filename, chrom_sizes_filename,
            as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(shlex.split(
            'gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' % (final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' % (xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
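# The awk step above rewrites spp output so coordinates never appear in
# scientific notation and negative start coordinates are clamped to 0. The
# same per-line transform in pure Python, a sketch assuming 10-column
# narrowPeak-like input (fix_peak_coordinates is an illustrative name):
def fix_peak_coordinates(line):
    """Clamp a negative start to 0 and force integer (non-scientific)
    formatting of the start/end columns."""
    fields = line.rstrip('\n').split('\t')
    start = max(int(float(fields[1])), 0)
    end = int(float(fields[2]))
    fields[1] = '%i' % start
    fields[2] = '%i' % end
    return '\t'.join(fields)

# e.g. fix_peak_coordinates('chrM\t-5\t1e+06\tpeak1\t0\t.\t5\t-1\t-1\t100')
#      -> 'chrM\t0\t1000000\tpeak1\t0\t.\t5\t-1\t-1\t100'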
def main(rep1_ta, rep2_ta, ctl1_ta, ctl2_ta, rep1_xcor, rep2_xcor,
         rep1_paired_end, rep2_paired_end, chrom_sizes, genomesize,
         narrowpeak_as, gappedpeak_as, broadpeak_as):

    assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
    paired_end = rep1_paired_end

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    unary_control = ctl1_ta == ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    ctl2_ta_file = dxpy.DXFile(ctl2_ta)
    # not necessary to actually download these - just pass through
    # rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    # rep2_xcor_file = dxpy.DXFile(rep2_xcor)

    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
    # not necessary to actually download these - just pass through
    # dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name)
    # dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name)

    rep1_ta_filename = rep1_ta_file.name
    rep2_ta_filename = rep2_ta_file.name
    ctl1_ta_filename = ctl1_ta_file.name
    ctl2_ta_filename = ctl2_ta_file.name
    # not necessary to actually download these - just pass through
    # rep1_xcor_filename = rep1_xcor_file.name
    # rep2_xcor_filename = rep2_xcor_file.name

    ntags_rep1 = common.count_lines(rep1_ta_filename)
    ntags_rep2 = common.count_lines(rep2_ta_filename)
    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)

    for n, name, filename in [
            (ntags_rep1, 'replicate 1', rep1_ta_filename),
            (ntags_rep2, 'replicate 2', rep2_ta_filename),
            (ntags_ctl1, 'control 1', ctl1_ta_filename),
            (ntags_ctl2, 'control 2', ctl2_ta_filename)]:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = pool_applet.run({
        "inputs": [rep1_ta, rep2_ta],
        "prefix": "PL_reps"})
    pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")

    rep1_control = ctl1_ta  # default.  May be changed later.
    rep2_control = ctl2_ta  # default.  May be changed later.
    if unary_control:
        logger.info(
            "Only one control supplied.  Using it for both replicate 1 "
            "and 2 and for the pool.")
        control_for_pool = rep1_control
    else:
        pool_controls_subjob = pool_applet.run({
            "inputs": [ctl1_ta, ctl2_ta],
            "prefix": "PL_ctls"})
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls

        # use the pooled controls for the reps depending on the ratio of
        # rep to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. "
                "Using pooled controls." % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment "
                    "replicate 1.  Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
            elif ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment "
                    "replicate 2.  Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")

    pseudoreplicator_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pseudoreplicator',
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta})
    rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta})

    pool_pr1_subjob = pool_applet.run(
        {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
         "prefix": 'PPR1'})
    pool_pr2_subjob = pool_applet.run(
        {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
         "prefix": 'PPR2'})

    pooled_replicates_xcor_subjob = xcor_only(pooled_replicates, paired_end)
    # no longer calculated - now we take the cross-correlation metrics for
    # the pseudoreplicates as those from the true reps
    # rep1_pr1_xcor_subjob = \
    #     xcor_only(rep1_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end)
    # rep1_pr2_xcor_subjob = \
    #     xcor_only(rep1_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end)
    # rep2_pr1_xcor_subjob = \
    #     xcor_only(rep2_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end)
    # rep2_pr2_xcor_subjob = \
    #     xcor_only(rep2_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end)
    # pool_pr1_xcor_subjob = \
    #     xcor_only(pool_pr1_subjob.get_output_ref("pooled"), paired_end)
    # pool_pr2_xcor_subjob = \
    #     xcor_only(pool_pr2_subjob.get_output_ref("pooled"), paired_end)

    common_args = {
        'chrom_sizes': chrom_sizes,
        'genomesize': genomesize,
        'narrowpeak_as': narrowpeak_as,
        'gappedpeak_as': gappedpeak_as,
        'broadpeak_as': broadpeak_as
    }

    common_args.update({'prefix': 'r1'})
    rep1_peaks_subjob = macs2(
        rep1_ta, rep1_control, rep1_xcor, **common_args)

    common_args.update({'prefix': 'r2'})
    rep2_peaks_subjob = macs2(
        rep2_ta, rep2_control, rep2_xcor, **common_args)

    common_args.update({'prefix': 'pool'})
    pooled_peaks_subjob = macs2(
        pooled_replicates, control_for_pool,
        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
        **common_args)

    common_args.update({'prefix': 'r1pr1'})
    rep1pr1_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
        rep1_control, rep1_xcor, **common_args)

    common_args.update({'prefix': 'r1pr2'})
    rep1pr2_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
        rep1_control, rep1_xcor, **common_args)

    common_args.update({'prefix': 'r2pr1'})
    rep2pr1_peaks_subjob = macs2(
        rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
        rep2_control, rep2_xcor, **common_args)

    common_args.update({'prefix': 'r2pr2'})
    rep2pr2_peaks_subjob = macs2(
        rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
        rep2_control, rep2_xcor, **common_args)

    common_args.update({'prefix': 'ppr1'})
    pooledpr1_peaks_subjob = macs2(
        pool_pr1_subjob.get_output_ref("pooled"), control_for_pool,
        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
        **common_args)

    common_args.update({'prefix': 'ppr2'})
    pooledpr2_peaks_subjob = macs2(
        pool_pr2_subjob.get_output_ref("pooled"), control_for_pool,
        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
        **common_args)

    output = {
        'rep1_narrowpeaks': rep1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1_gappedpeaks': rep1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1_broadpeaks': rep1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1_narrowpeaks_bb': rep1_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep1_gappedpeaks_bb': rep1_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep1_broadpeaks_bb': rep1_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep1_fc_signal': rep1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1_pvalue_signal': rep1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep2_narrowpeaks': rep2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep2_gappedpeaks': rep2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep2_broadpeaks': rep2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep2_narrowpeaks_bb': rep2_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep2_gappedpeaks_bb': rep2_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep2_broadpeaks_bb': rep2_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep2_fc_signal': rep2_peaks_subjob.get_output_ref("fc_signal"),
        'rep2_pvalue_signal': rep2_peaks_subjob.get_output_ref("pvalue_signal"),
        'pooled_narrowpeaks': pooled_peaks_subjob.get_output_ref("narrowpeaks"),
        'pooled_gappedpeaks': pooled_peaks_subjob.get_output_ref("gappedpeaks"),
        'pooled_broadpeaks': pooled_peaks_subjob.get_output_ref("broadpeaks"),
        'pooled_narrowpeaks_bb': pooled_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'pooled_gappedpeaks_bb': pooled_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'pooled_broadpeaks_bb': pooled_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'pooled_fc_signal': pooled_peaks_subjob.get_output_ref("fc_signal"),
        'pooled_pvalue_signal': pooled_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr1_narrowpeaks': rep1pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr1_gappedpeaks': rep1pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr1_broadpeaks': rep1pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr1_fc_signal': rep1pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr1_pvalue_signal': rep1pr1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr2_narrowpeaks': rep1pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr2_gappedpeaks': rep1pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr2_broadpeaks': rep1pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr2_fc_signal': rep1pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr2_pvalue_signal': rep1pr2_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep2pr1_narrowpeaks': rep2pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep2pr1_gappedpeaks': rep2pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep2pr1_broadpeaks': rep2pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep2pr1_fc_signal': rep2pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep2pr1_pvalue_signal': rep2pr1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep2pr2_narrowpeaks': rep2pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep2pr2_gappedpeaks': rep2pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep2pr2_broadpeaks': rep2pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep2pr2_fc_signal': rep2pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep2pr2_pvalue_signal': rep2pr2_peaks_subjob.get_output_ref("pvalue_signal"),
        'pooledpr1_narrowpeaks': pooledpr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'pooledpr1_gappedpeaks': pooledpr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'pooledpr1_broadpeaks': pooledpr1_peaks_subjob.get_output_ref("broadpeaks"),
        'pooledpr1_fc_signal': pooledpr1_peaks_subjob.get_output_ref("fc_signal"),
        'pooledpr1_pvalue_signal': pooledpr1_peaks_subjob.get_output_ref("pvalue_signal"),
        'pooledpr2_narrowpeaks': pooledpr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'pooledpr2_gappedpeaks': pooledpr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'pooledpr2_broadpeaks': pooledpr2_peaks_subjob.get_output_ref("broadpeaks"),
        'pooledpr2_fc_signal': pooledpr2_peaks_subjob.get_output_ref("fc_signal"),
        'pooledpr2_pvalue_signal': pooledpr2_peaks_subjob.get_output_ref("pvalue_signal")
    }

    return output
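Every snippet in this listing leans on common.count_lines to size tagAlign and peak files, but the helper itself is never shown. As a reading aid, here is a minimal sketch of what such a helper could look like, assuming it simply wraps wc -l; this is a guess for illustration, not the actual common.py implementation:

    import shlex
    import subprocess

    def count_lines(filename):
        # Hypothetical stand-in for common.count_lines: delegate the
        # counting to wc -l, which stays fast even on multi-gigabyte
        # tagAlign files.
        out = subprocess.check_output(shlex.split('wc -l %s' % (filename)))
        return int(out.strip().split()[0])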
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None):

    # TODO: for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script.

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.
    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Download the file inputs to the local file system.
    # Need to prepend something to ensure the local filenames will be unique.
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    print subprocess.check_output('ls -l', shell=True)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    print "%d peaks from true replicates" % (Nt)
    N1 = common.count_lines(r1pr_peaks_filename)
    print "%d peaks from rep1 self-pseudoreplicates" % (N1)
    N2 = common.count_lines(r2pr_peaks_filename)
    print "%d peaks from rep2 self-pseudoreplicates" % (N2)
    Np = common.count_lines(pooledpr_peaks_filename)
    print "%d peaks from pooled pseudoreplicates" % (Np)

    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
    else:
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print "%d peaks blacklisted from the conservative set" % (Nt - Ncb)

    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np

    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
    else:
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print "%d peaks blacklisted from the optimal set" % (No - Nob)

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(
        conservative_set_filename, chrom_sizes_filename, as_file_filename)
    optimal_set_bb_filename = common.bed2bb(
        optimal_set_filename, chrom_sizes_filename, as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    logging.info("Exiting with output: %s", output)
    return output
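The rescue/self-consistency arithmetic above is compact enough to factor into a standalone helper. The sketch below is an illustrative refactoring with the same semantics as the inline code; the name reproducibility_test and the cutoff parameter are hypothetical, not part of the applet:

    def reproducibility_test(Nt, Np, N1, N2, cutoff=2.0):
        # Rescue ratio: pooled-pseudoreplicate peak count vs true-replicate
        # peak count.  Self-consistency ratio: rep1 vs rep2 self-pseudoreps.
        rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
        self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
        if rescue_ratio > cutoff and self_consistency_ratio > cutoff:
            return 'fail'        # both criteria violated
        elif rescue_ratio > cutoff or self_consistency_ratio > cutoff:
            return 'borderline'  # exactly one criterion violated
        else:
            return 'pass'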
def process(self):
    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudoreplicates" %(common.count_lines(overlapping_peaks_fn))
    '''

    # the only difference between the peak_types is how the extra columns
    # are handled
    if self.peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif self.peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif self.peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        print "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (self.peak_type)
        sys.exit()

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined
    # as at least 1 bp
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (self.pooled_peaks_fn, self.rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (self.rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], self.overlap_tr_fn)
    print "%d peaks overlap with both true replicates" \
        % (common.count_lines(self.overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, where
    # overlap is defined as at least 1 bp
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (self.pooled_peaks_fn, self.pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (self.pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], self.overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" \
        % (common.count_lines(self.overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (self.overlap_tr_fn, self.overlap_pr_fn),
        'sort -u'
        ], self.overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudoreplicates" \
        % (common.count_lines(self.overlapping_peaks_fn))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (self.pooled_peaks_fn, self.overlapping_peaks_fn)
        ], self.rejected_peaks_fn)
    print "%d peaks were rejected" % (common.count_lines(self.rejected_peaks_fn))

    self.npeaks_in = common.count_lines(common.uncompress(self.pooled_peaks_fn))
    self.npeaks_out = common.count_lines(self.overlapping_peaks_fn)
    self.npeaks_rejected = common.count_lines(self.rejected_peaks_fn)

    # make bigBed files for visualization
    self.overlapping_peaks_bb_fn = common.bed2bb(
        self.overlapping_peaks_fn, self.chrom_sizes_fn, self.as_file_fn,
        bed_type=bed_type)
    self.rejected_peaks_bb_fn = common.bed2bb(
        self.rejected_peaks_fn, self.chrom_sizes_fn, self.as_file_fn,
        bed_type=bed_type)
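The awk filters above are dense, so here is the column arithmetic for the narrowPeak case made explicit as a pure-Python predicate (illustrative only; the pipeline itself shells out to awk). For `intersectBed -wo` over two 10-column narrowPeak files, fields 1-10 describe the pooled peak, fields 11-20 the replicate peak, and field 21 the number of overlapping bases:

    def narrowpeak_overlap_ok(line, min_frac=0.5):
        # line is one row of `intersectBed -wo` output for narrowPeak inputs
        f = line.rstrip('\n').split('\t')
        s1 = int(f[2]) - int(f[1])      # $3-$2: length of the pooled peak
        s2 = int(f[12]) - int(f[11])    # $13-$12: length of the replicate peak
        overlap = float(f[20])          # $21: overlapping bases
        # keep the peak if at least 50% of either peak is covered
        return (overlap / s1 >= min_frac) or (overlap / s2 >= min_frac)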
def main(rep1_ta, rep2_ta, ctl1_ta, ctl2_ta, rep1_xcor, rep2_xcor,
         npeaks, nodups, rep1_paired_end, rep2_paired_end, chrom_sizes,
         spp_version, as_file=None, idr_peaks=False):

    assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported (yet)'
    paired_end = rep1_paired_end

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    unary_control = ctl1_ta == ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    ctl2_ta_file = dxpy.DXFile(ctl2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)

    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name)

    rep1_ta_filename = rep1_ta_file.name
    rep2_ta_filename = rep2_ta_file.name
    ctl1_ta_filename = ctl1_ta_file.name
    ctl2_ta_filename = ctl2_ta_file.name

    ntags_rep1 = common.count_lines(rep1_ta_filename)
    ntags_rep2 = common.count_lines(rep2_ta_filename)
    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)

    for n, name, filename in [
            (ntags_rep1, 'replicate 1', rep1_ta_filename),
            (ntags_rep2, 'replicate 2', rep2_ta_filename),
            (ntags_ctl1, 'control 1', ctl1_ta_filename),
            (ntags_ctl2, 'control 2', ctl2_ta_filename)]:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    subprocess.check_call('ls -l', shell=True)

    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
    pooled_replicates_xcor_subjob = \
        xcor_only(
            pooled_replicates,
            paired_end,
            spp_version,
            name='Pool cross-correlation')

    rep1_control = ctl1_ta  # default.  May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default.  May be changed later.
    rep2_ctl_msg = "control rep2"
    if unary_control:
        logger.info(
            "Only one control supplied.  Using it for both replicate 1 "
            "and 2 and for the pool.")
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = \
            pool_applet.run(
                {"inputs": [ctl1_ta, ctl2_ta],
                 "prefix": 'pooled_ctls'},
                name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of
        # rep to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. "
                "Using pooled controls." % (ratio_cutoff))
            rep1_control = pooled_controls
            rep1_ctl_msg = "pooled controls"
            rep2_control = pooled_controls
            rep2_ctl_msg = "pooled controls"
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment "
                    "replicate 1.  Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment "
                    "replicate 2.  Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    rep1_peaks_subjob = spp(
        rep1_ta,
        rep1_control,
        rep1_xcor,
        chrom_sizes,
        spp_version,
        bigbed=True,
        as_file=as_file,
        name='Rep1 peaks vs %s' % (rep1_ctl_msg),
        prefix='R1')

    rep2_peaks_subjob = spp(
        rep2_ta,
        rep2_control,
        rep2_xcor,
        chrom_sizes,
        spp_version,
        bigbed=True,
        as_file=as_file,
        name='Rep2 peaks vs %s' % (rep2_ctl_msg),
        prefix='R2')

    pooled_peaks_subjob = spp(
        pooled_replicates,
        control_for_pool,
        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
        chrom_sizes,
        spp_version,
        bigbed=True,
        as_file=as_file,
        name='Pooled peaks vs %s' % (pool_ctl_msg),
        prefix='PL')

    output = {
        'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"),
        'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"),
        'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"),
        'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores"),
        'rep2_peaks': rep2_peaks_subjob.get_output_ref("peaks"),
        'rep2_peaks_bb': rep2_peaks_subjob.get_output_ref("peaks_bb"),
        'rep2_xcor_plot': rep2_peaks_subjob.get_output_ref("xcor_plot"),
        'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"),
        'pooled_peaks': pooled_peaks_subjob.get_output_ref("peaks"),
        'pooled_peaks_bb': pooled_peaks_subjob.get_output_ref("peaks_bb"),
        'pooled_xcor_plot': pooled_peaks_subjob.get_output_ref("xcor_plot"),
        'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores")
    }

    if idr_peaks:
        # also call peaks on pseudoreplicates for IDR
        pseudoreplicator_applet = \
            dxpy.find_one_data_object(
                classname='applet',
                name='pseudoreplicator',
                project=dxpy.PROJECT_CONTEXT_ID,
                zero_ok=False,
                more_ok=False,
                return_handler=True)
        rep1_pr_subjob = \
            pseudoreplicator_applet.run(
                {"input_tags": rep1_ta,
                 "prefix": 'R1PR'},
                name='Pseudoreplicate rep1 -> R1PR1,2')
        rep2_pr_subjob = \
            pseudoreplicator_applet.run(
                {"input_tags": rep2_ta,
                 "prefix": 'R2PR'},
                name='Pseudoreplicate rep2 -> R2PR1,2')

        pool_pr1_subjob = pool_applet.run({
            "inputs": [
                rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
            "prefix": 'PPR1'},
            name='Pool R1PR1+R2PR1 -> PPR1')
        pool_pr2_subjob = pool_applet.run({
            "inputs": [
                rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
            "prefix": 'PPR2'},
            name='Pool R1PR2+R2PR2 -> PPR2')

        # rep1_pr1_xcor_subjob = \
        #     xcor_only(
        #         rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
        #         paired_end,
        #         spp_version,
        #         name='R1PR1 cross-correlation')
        # rep1_pr2_xcor_subjob = \
        #     xcor_only(
        #         rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
        #         paired_end,
        #         spp_version,
        #         name='R1PR2 cross-correlation')
        # rep2_pr1_xcor_subjob = \
        #     xcor_only(
        #         rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
        #         paired_end,
        #         spp_version,
        #         name='R2PR1 cross-correlation')
        # rep2_pr2_xcor_subjob = \
        #     xcor_only(
        #         rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
        #         paired_end,
        #         spp_version,
        #         name='R2PR2 cross-correlation')
        # pool_pr1_xcor_subjob = \
        #     xcor_only(
        #         pool_pr1_subjob.get_output_ref("pooled"),
        #         paired_end,
        #         spp_version,
        #         name='PPR1 cross-correlation')
        # pool_pr2_xcor_subjob = \
        #     xcor_only(
        #         pool_pr2_subjob.get_output_ref("pooled"),
        #         paired_end,
        #         spp_version,
        #         name='PPR2 cross-correlation')

        rep1pr1_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep1_control,
            rep1_xcor,
            chrom_sizes,
            spp_version,
            bigbed=False,
            name='R1PR1 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR1')

        rep1pr2_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep1_control,
            rep1_xcor,
            chrom_sizes,
            spp_version,
            bigbed=False,
            name='R1PR2 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR2')

        rep2pr1_peaks_subjob = spp(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep2_control,
            rep2_xcor,
            chrom_sizes,
            spp_version,
            bigbed=False,
            name='R2PR1 peaks vs %s' % (rep2_ctl_msg),
            prefix='R2PR1')

        rep2pr2_peaks_subjob = spp(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep2_control,
            rep2_xcor,
            chrom_sizes,
            spp_version,
            bigbed=False,
            name='R2PR2 peaks vs %s' % (rep2_ctl_msg),
            prefix='R2PR2')

        pooledpr1_peaks_subjob = spp(
            pool_pr1_subjob.get_output_ref("pooled"),
            control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes,
            spp_version,
            bigbed=False,
            name='PPR1 peaks vs %s' % (pool_ctl_msg),
            prefix='PPR1')

        pooledpr2_peaks_subjob = spp(
            pool_pr2_subjob.get_output_ref("pooled"),
            control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes,
            spp_version,
            bigbed=False,
            name='PPR2 peaks vs %s' % (pool_ctl_msg),
            prefix='PPR2')

        output.update({
            'rep1pr1_peaks': rep1pr1_peaks_subjob.get_output_ref("peaks"),
            # 'rep1pr1_xcor_plot': rep1pr1_peaks_subjob.get_output_ref("xcor_plot"),
            # 'rep1pr1_xcor_scores': rep1pr1_peaks_subjob.get_output_ref("xcor_scores"),
            'rep1pr2_peaks': rep1pr2_peaks_subjob.get_output_ref("peaks"),
            # 'rep1pr2_xcor_plot': rep1pr2_peaks_subjob.get_output_ref("xcor_plot"),
            # 'rep1pr2_xcor_scores': rep1pr2_peaks_subjob.get_output_ref("xcor_scores"),
            'rep2pr1_peaks': rep2pr1_peaks_subjob.get_output_ref("peaks"),
            # 'rep2pr1_xcor_plot': rep2pr1_peaks_subjob.get_output_ref("xcor_plot"),
            # 'rep2pr1_xcor_scores': rep2pr1_peaks_subjob.get_output_ref("xcor_scores"),
            'rep2pr2_peaks': rep2pr2_peaks_subjob.get_output_ref("peaks"),
            # 'rep2pr2_xcor_plot': rep2pr2_peaks_subjob.get_output_ref("xcor_plot"),
            # 'rep2pr2_xcor_scores': rep2pr2_peaks_subjob.get_output_ref("xcor_scores"),
            'pooledpr1_peaks': pooledpr1_peaks_subjob.get_output_ref("peaks"),
            # 'pooledpr1_xcor_plot': pooledpr1_peaks_subjob.get_output_ref("xcor_plot"),
            # 'pooledpr1_xcor_scores': pooledpr1_peaks_subjob.get_output_ref("xcor_scores"),
            'pooledpr2_peaks': pooledpr2_peaks_subjob.get_output_ref("peaks"),
            # 'pooledpr2_xcor_plot': pooledpr2_peaks_subjob.get_output_ref("xcor_plot"),
            # 'pooledpr2_xcor_scores': pooledpr2_peaks_subjob.get_output_ref("xcor_scores"),
        })

    return output
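Both orchestration functions in this listing inline the same pooled-versus-distinct control policy. As a reading aid, here is that policy distilled into one function; this is a hypothetical refactoring for illustration (the applets inline the logic rather than calling a helper), applying only when the controls are not unary:

    def choose_controls(ntags_rep1, ntags_rep2, ntags_ctl1, ntags_ctl2,
                        ratio_cutoff=1.2):
        # Returns (rep1_uses_pooled, rep2_uses_pooled).
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        if ratio_ctl_reads > ratio_cutoff:
            # control depths differ by more than the cutoff:
            # pool for both replicates
            return True, True
        if ntags_ctl1 < ntags_rep1:
            # control 1 is shallower than rep1: pool for rep1 only
            return True, False
        if ntags_ctl2 < ntags_rep2:
            # control 2 is shallower than rep2: pool for rep2 only
            return False, True
        # controls are deep and balanced: keep distinct controls
        return False, False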