def scrub_fun(in_filepath, out_filepath):
    # Remove sequence-identifying information from a BAM: collapse the
    # CIGAR, mask all bases to N, blank the quality string, and drop the
    # optional tags, keeping only the 11 mandatory SAM columns.
    # Check the input.
    logger.debug("Input flagstat for %s" % (in_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (in_filepath)))
    # Set up the paths to inputs and outputs.
    dirname = os.path.dirname(out_filepath)
    header_path = os.path.join(dirname, "header.txt")
    sam_path = os.path.join(dirname, "scrubbed.sam")
    # Cache the header.
    shell_command("samtools view -H %s -o %s" % (in_filepath, header_path))
    # Scrub the sequence information from these fields:
    # 6 = CIGAR, 10 = query sequence, 11 = PHRED, and suppress optional tags
    # For example, unscrubbed read might look like:
    # SPADE:8:33:220:1107#0 0 chr21 8994907 37 9M1I26M * 0 0 ATTGTTGACAAAAACTCGACAAACAATTGGAGAATC bbbR]`T`^]TTSSS^_W`BBBBBBBBBBBBBBBBB X0:i:1 X1:i:0 MD:Z:35 PG:Z:MarkDuplicates XG:i:1 NM:i:1 XM:i:0 XO:i:1 XT:A:U
    # Scrubbed version would look like:
    # SPADE:8:33:220:1107#0 0 chr21 8994907 37 36M * 0 0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN *
    # In the awk program: after the for loop, i == length($10)+1, so
    # i-1 "M" concatenates to "<readlen>M"; printing only $1..$11 drops
    # the optional tags.
    common.run_pipe([
        'samtools view %s' % (in_filepath),
        r"""awk '{OFS="\t"} {s=""; for(i=1;i<=length($10);i++) s=(s "N"); $6=(i-1 "M"); $10=s; $11="*"; print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11}'"""
    ], sam_path)
    # Add back the header.
    common.run_pipe([
        'cat %s %s' % (header_path, sam_path),
        'samtools view -S -b - -o %s' % (out_filepath)
    ])
    # Check the output.
    logger.debug("Output flagstat for %s" % (out_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (out_filepath)))
def scrub(in_filepath, out_filepath):
    """De-identify a BAM file: collapse the CIGAR to a single match run,
    mask every base to N, blank the quality string, and keep only the 11
    mandatory SAM columns (optional tags are dropped)."""
    # Flagstat the input for before/after comparison in the debug log.
    logger.debug("Input flagstat for %s" % (in_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (in_filepath)))

    # All intermediates live next to the output file.
    out_dir = os.path.dirname(out_filepath)
    hdr_fn = os.path.join(out_dir, "header.txt")
    scrubbed_sam_fn = os.path.join(out_dir, "scrubbed.sam")

    # Save the header so it can be re-attached after scrubbing.
    shell_command("samtools view -H %s -o %s" % (in_filepath, hdr_fn))

    # awk: build an all-N string the length of the read, set the CIGAR
    # to "<readlen>M" (after the loop, i-1 equals the read length), blank
    # the PHRED string, and emit only the mandatory columns.
    common.run_pipe([
        'samtools view %s' % (in_filepath),
        r"""awk '{OFS="\t"} {s=""; for(i=1;i<=length($10);i++) s=(s "N"); $6=(i-1 "M"); $10=s; $11="*"; print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11}'"""
    ], scrubbed_sam_fn)

    # Re-attach the header and convert back to BAM.
    common.run_pipe([
        'cat %s %s' % (hdr_fn, scrubbed_sam_fn),
        'samtools view -S -b - -o %s' % (out_filepath)
    ])

    # Flagstat the result for the debug log.
    logger.debug("Output flagstat for %s" % (out_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (out_filepath)))
def main(input_bam, paired_end):
    """Convert a BAM into a gzipped tagAlign file (and a BEDPE file for
    paired-end data) and upload the results to the platform.

    Returns a dict of dxlinks: "tagAlign_file" and, when paired_end,
    "BEDPE_file".
    """
    input_bam_file = dxpy.DXFile(input_bam)
    input_bam_filename = input_bam_file.name
    # BUGFIX: the original used rstrip('.bam'), which strips any trailing
    # run of the characters '.', 'b', 'a', 'm' (e.g. "samb.bam" -> "s"),
    # not the literal extension.  Strip the suffix explicitly.
    if input_bam_filename.endswith('.bam'):
        input_bam_basename = input_bam_filename[:-len('.bam')]
    else:
        input_bam_basename = input_bam_filename
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'
    subprocess.check_output('ls -l', shell=True)

    # ===================
    # Create tagAlign file
    # ===================
    # bamToBed emits BED6; force the read name to "N" and score to 1000.
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)
    subprocess.check_output('ls -l', shell=True)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        # BEDPE conversion requires a name-sorted BAM.
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        command = \
            "samtools sort -@ %d -n %s %s" \
            % (cpu_count(), input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(command)
        subprocess.check_call(shlex.split(command))
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)
    subprocess.check_output('ls -l', shell=True)

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    output = {}
    output["tagAlign_file"] = dxpy.dxlink(tagAlign_file)
    if paired_end:
        output["BEDPE_file"] = dxpy.dxlink(BEDPE_file)
    return output
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=[], dx=None):
    # Compose a UCSC track hub stanza for a peaks bigBed, plus one extra
    # stanza per length cutoff in `lowpass`: for each cutoff, the bigBed
    # referenced by `dx` is downloaded, filtered to peaks shorter than
    # the cutoff, rebuilt as a bigBed, uploaded, and given its own
    # stanza.  Returns (stanza_text, n_stanzas).
    # shortLabel is truncated to 17 characters per UCSC convention.
    # NOTE(review): the mutable default `lowpass=[]` is shared across
    # calls; it is reassigned (never mutated) below, so it is harmless
    # here, but a None default would be safer.
    return_string = \
        "\t\ttrack %s%d\n" %(accession,n) + \
        "\t\tbigDataUrl %s\n" %(url) + \
        "\t\tshortLabel %s\n" %(name[:17]) + \
        "\t\tparent %sviewpeaks on\n" %(accession) + \
        "\t\ttype %s\n" %(tracktype) + \
        "\t\tvisibility dense\n" + \
        "\t\tview PK\n" + \
        "\t\tpriority %d\n\n" %(n)
    n_stanzas = 1
    # Normalize lowpass: allow None/empty, or a bare int for one cutoff.
    if not lowpass:
        lowpass = []
    if isinstance(lowpass,int):
        lowpass = [lowpass]
    extra_stanza_count = 0  # NOTE(review): never updated or read
    for (i, cutoff) in enumerate(lowpass,start=1):
        # Use the platform file ID as the local filename; skip the
        # download if it was already fetched.
        fn = dx.get_id()
        if not os.path.isfile(fn):
            dxpy.download_dxfile(dx.get_id(),fn)
        cutoffstr = '-lt%d' %(cutoff)
        outfn = fn + cutoffstr
        print fn, os.path.getsize(fn), subprocess.check_output('wc -l %s' %(fn), shell=True).split()[0]
        # bigBed -> BED so awk can filter on peak length ($3-$2).
        bed_fn = fn + '.bed'
        common.block_on('bigBedToBed %s %s' %(fn, bed_fn))
        common.run_pipe([
            'cat %s' %(bed_fn),
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" %(cutoff)], outfn)
        print outfn, os.path.getsize(outfn), subprocess.check_output('wc -l %s' %(outfn), shell=True).split()[0]
        # Pick the autoSql schema matching the track type.
        if tracktype =='bigBed 6 +':
            as_file = 'narrowPeak.as'
        elif tracktype == 'bigBed 12 +':
            as_file = 'gappedPeak.as'
        else:
            # NOTE(review): as_file is left undefined for unrecognized
            # track types, so bed2bb below would raise NameError.
            print "Cannot match tracktype %s to any .as file" %(tracktype)
        bb_fn = common.bed2bb(outfn,'mm10.chrom.sizes',as_file)
        newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True)
        new_url, headers = newdx.get_download_url(duration=sys.maxint, preauthenticated=True)
        # NOTE(review): the track name "%s%d" built from n+i can collide
        # with the base stanza of a later track number; compare the
        # variant of this function that uses "%s%dp%d" instead.
        new_lines = [
            "\t\ttrack %s%d" %(accession,n+i),
            "\t\tbigDataUrl %s" %(new_url),
            "\t\tshortLabel %s" %(name[:17-len(cutoffstr)] + cutoffstr),
            "\t\tparent %sviewpeaks on" %(accession),
            "\t\ttype %s" %(tracktype),
            "\t\tvisibility dense",
            "\t\tview PK",
            "\t\tpriority %d\n\n" %(n+i)]
        new_stanza = '\n'.join(new_lines)
        return_string += new_stanza
        n_stanzas += 1
        # Clean up the local intermediates for this cutoff.
        os.remove(bed_fn)
        os.remove(bb_fn)
        os.remove(outfn)
        os.remove(fn)
    return(return_string, n_stanzas)
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=[], dx=None):
    # Compose a UCSC track hub stanza for a peaks bigBed, plus one extra
    # stanza per length cutoff in `lowpass`: for each cutoff, the bigBed
    # referenced by `dx` is downloaded, filtered to peaks shorter than
    # the cutoff, rebuilt as a bigBed, uploaded, and given a
    # sub-numbered stanza ("<accession><n>p<i>", priority "<n>.<i>") so
    # its name cannot collide with other base stanzas.
    # Returns (stanza_text, n_stanzas).
    # NOTE(review): the mutable default `lowpass=[]` is shared across
    # calls; it is reassigned (never mutated) below, so it is harmless
    # here, but a None default would be safer.
    return_string = \
        "\t\ttrack %s%d\n" %(accession,n) + \
        "\t\tbigDataUrl %s\n" %(url) + \
        "\t\tshortLabel %s\n" %(name[:17]) + \
        "\t\tparent %sviewpeaks on\n" %(accession) + \
        "\t\ttype %s\n" %(tracktype) + \
        "\t\tvisibility dense\n" + \
        "\t\tview PK\n" + \
        "\t\tpriority %d\n\n" %(n)
    n_stanzas = 1
    # Normalize lowpass: allow None/empty, or a bare int for one cutoff.
    if not lowpass:
        lowpass = []
    if isinstance(lowpass,int):
        lowpass = [lowpass]
    extra_stanza_count = 0  # NOTE(review): never updated or read
    for (i, cutoff) in enumerate(lowpass,start=1):
        # Use the platform file ID as the local filename; skip the
        # download if it was already fetched.
        fn = dx.get_id()
        if not os.path.isfile(fn):
            dxpy.download_dxfile(dx.get_id(),fn)
        cutoffstr = '-lt%d' %(cutoff)
        outfn = fn + cutoffstr
        print fn, os.path.getsize(fn), subprocess.check_output('wc -l %s' %(fn), shell=True).split()[0]
        # bigBed -> BED so awk can filter on peak length ($3-$2).
        bed_fn = fn + '.bed'
        common.block_on('bigBedToBed %s %s' %(fn, bed_fn))
        common.run_pipe([
            'cat %s' %(bed_fn),
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" %(cutoff)], outfn)
        print outfn, os.path.getsize(outfn), subprocess.check_output('wc -l %s' %(outfn), shell=True).split()[0]
        # Pick the autoSql schema matching the track type.
        if tracktype =='bigBed 6 +':
            as_file = 'narrowPeak.as'
        elif tracktype == 'bigBed 12 +':
            as_file = 'gappedPeak.as'
        else:
            # NOTE(review): as_file is left undefined for unrecognized
            # track types, so bed2bb below would raise NameError.
            print "Cannot match tracktype %s to any .as file" %(tracktype)
        bb_fn = common.bed2bb(outfn,'mm10.chrom.sizes',as_file)
        newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True)
        new_url, headers = newdx.get_download_url(duration=sys.maxint, preauthenticated=True)
        new_lines = [
            "\t\ttrack %s%dp%d" %(accession,n,i),
            "\t\tbigDataUrl %s" %(new_url),
            "\t\tshortLabel %s" %(name[:17-len(cutoffstr)] + cutoffstr),
            "\t\tparent %sviewpeaks on" %(accession),
            "\t\ttype %s" %(tracktype),
            "\t\tvisibility dense",
            "\t\tview PK",
            "\t\tpriority %d.%d\n\n" %(n,i)]
        new_stanza = '\n'.join(new_lines)
        return_string += new_stanza
        n_stanzas += 1
    return(return_string, n_stanzas)
def count_lines(filename):
    """Return the number of lines in *filename*; files whose name ends
    in .Z/.gz/.bz/.bz2 are streamed through gzip -dc first."""
    compressed = filename.endswith(('.Z', '.gz', '.bz', '.bz2'))
    reader = 'gzip -dc' if compressed else 'cat'
    out, err = common.run_pipe(['%s %s' % (reader, filename), 'wc -l'])
    return int(out)
def main(inputs, prefix=None):
    """Pool (concatenate) a set of gzipped platform files into a single
    gzipped file and upload it.

    Returns {"pooled": <dxlink to the pooled file>}.
    """
    # Download every input, remembering its platform filename.
    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)
    # Inner extension of the last file (e.g. ".tagAlign" from
    # "x.tagAlign.gz") - presumably they are all the same.
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        stems = [splitext(splitext(fn)[0])[0] for fn in input_filenames]
        pooled_filename = '-'.join(stems) + "_pooled%s.gz" % (extension)
    # Decompress all inputs in order and recompress into one file.
    out, err = common.run_pipe(
        ['gzip -dc %s' % (' '.join(input_filenames)), 'gzip -cn'],
        outfile=pooled_filename)
    pooled = dxpy.upload_local_file(pooled_filename)
    return {"pooled": dxpy.dxlink(pooled)}
def count_lines(filename):
    """Count the lines in a possibly-compressed file (decompressed on
    the fly when the extension says so)."""
    zipped_suffixes = ('.Z', '.gz', '.bz', '.bz2')
    catcommand = 'gzip -dc' if filename.endswith(zipped_suffixes) else 'cat'
    out, err = common.run_pipe(
        ['%s %s' % (catcommand, filename), 'wc -l'])
    return int(out)
def process(self, resource_dir):
    # Run SPP peak calling on self.experiment vs self.control: install
    # the bundled R dependencies from `resource_dir`, invoke
    # run_spp(_nodups).R, then normalize the resulting peak coordinates
    # to integers clipped to the chromosome boundaries.
    # Define output directory
    peaks_dirname = "peaks_spp"
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    # Define output filenames
    # NOTE(review): rstrip strips a character set, not a literal suffix;
    # experiment names ending in those characters may be over-trimmed.
    prefix = self.experiment.name.rstrip('.gz').rstrip('.tagAlign')
    self.peaks_fn = prefix + '.regionPeak'
    self.final_peaks_fn = self.peaks_fn + '.gz'
    self.xcor_plot_fn = prefix + '.pdf'
    self.xcor_scores_fn = prefix + '.ccscores'
    self.fixed_peaks_fn = prefix + '.fixcoord.regionPeak'
    # fragment length is third column in cross-correlation input file
    fragment_length = int(open(self.xcor_scores_input.name, 'r').readline().split('\t')[2])
    print "Read fragment length: %d" % fragment_length
    # install SPP
    ca_tarball = '%s/caTools/caTools_1.17.1.tar.gz' % resource_dir
    spp_tarball = '%s/phantompeakqualtools/spp_1.10.1.tar.gz' % resource_dir
    bitops_tarball = '%s/bitops/bitops_1.0-6.tar.gz' % resource_dir
    # Use the no-duplicates variant of the SPP driver when requested.
    run_spp = '%s/phantompeakqualtools/run_spp_nodups.R' % resource_dir if self.nodups else '%s/phantompeakqualtools/run_spp.R' % resource_dir
    if not os.path.exists(os.path.expanduser("~/R-libs")):
        os.mkdir(os.path.expanduser("~/R-libs"))
    # Install bitops, caTools, snow, then spp into ~/R-libs, in
    # dependency order.
    print subprocess.check_output(shlex.split('R CMD INSTALL -l %s %s' % (os.path.expanduser("~/R-libs"), bitops_tarball)), stderr=subprocess.STDOUT)
    print subprocess.check_output(shlex.split('R CMD INSTALL -l %s %s' % (os.path.expanduser("~/R-libs"), ca_tarball)), stderr=subprocess.STDOUT)
    print subprocess.check_output(shlex.split('R CMD INSTALL -l %s %s/snow/snow_0.4-1.tar.gz' % (os.path.expanduser("~/R-libs"), resource_dir)), stderr=subprocess.STDOUT)
    print subprocess.check_output(shlex.split('R CMD INSTALL -l %s %s' % (os.path.expanduser("~/R-libs"), spp_tarball)), stderr=subprocess.STDOUT)
    # run SPP
    spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" % (run_spp, self.cpu_count(), self.experiment.name, self.control.name, self.npeaks, fragment_length, self.peaks_fn, self.xcor_plot_fn, self.xcor_scores_fn)
    print spp_command
    # Stream SPP's combined stdout/stderr to our stdout as it runs.
    process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    for line in iter(process.stdout.readline, ''):
        sys.stdout.write(line)
    # various fixes to ensure that coordinates fall within chr boundaries and are in the correct format
    common.run_pipe([
        "gzip -dc %s" % self.final_peaks_fn,
        "tee %s" % self.peaks_fn,
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % self.chrom_sizes.name,
        'bedClip stdin %s %s' % (self.chrom_sizes.name, self.fixed_peaks_fn)
    ])
def blacklist_filter(input_fname, output_fname, input_blacklist_fname):
    """Remove peaks that overlap blacklisted regions, writing the
    survivors to *output_fname*.  Either input may be gzipped; gzip is
    detected by the two-byte magic number, not the extension."""
    def _maybe_gunzip(fname, plain_name):
        # Decompress to plain_name when the gzip magic bytes are found;
        # otherwise use the file as-is.
        with open(fname, 'rb') as fh:
            if fh.read(2) == b'\x1f\x8b':
                common.run_pipe(['gzip -dc %s' % (fname)], plain_name)
                return plain_name
        return fname

    peaks_fname = _maybe_gunzip(input_fname, 'peaks.bed')
    blacklist_fname = _maybe_gunzip(input_blacklist_fname, 'blacklist.bed')
    # -A drops a peak entirely on any overlap with the blacklist.
    common.run_pipe(
        ['subtractBed -A -a %s -b %s' % (peaks_fname, blacklist_fname)],
        output_fname)
def rescale_scores(fn, scores_col, new_min=10, new_max=1000):
    # Linearly rescale the values in column `scores_col` of the
    # BED-like file `fn` onto [new_min, new_max] and return the name of
    # the rescaled file (UCSC bigBed requires scores in 0-1000).
    n_peaks = common.count_lines(fn)  # NOTE(review): unused
    sorted_fn = 'sorted-%s' %(fn)
    rescaled_fn = 'rescaled-%s' %(fn)
    # Sort descending by the score column (general-numeric) and drop
    # empty lines.
    out,err = common.run_pipe([
        'sort -k %dgr,%dgr %s' %(scores_col, scores_col, fn),
        r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""], sorted_fn)
    # After the descending sort the max score is on the first line ...
    out, err = common.run_pipe([
        'head -n 1 %s' %(sorted_fn),
        'cut -f %s' %(scores_col)])
    max_score = float(out.strip())
    # ... and the min score on the last.
    out, err = common.run_pipe([
        'tail -n 1 %s' %(sorted_fn),
        'cut -f %s' %(scores_col)])
    min_score = float(out.strip())
    # Map n in [a,b] = [min_score,max_score] onto [x,y] = [new_min,new_max].
    # NOTE(review): min/max are interpolated with %d, so fractional
    # scores are truncated before rescaling - confirm this is intended.
    out,err = common.run_pipe([
        'cat %s' %(sorted_fn),
        r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" %(scores_col, min_score, max_score, new_min, new_max) + \
        r"""{$%d=int(((n-a)*(y-x)/(b-a))+x) ; print $0}'""" %(scores_col)], rescaled_fn)
    return rescaled_fn
def rescale_scores(fn, scores_col, new_min=10, new_max=1000):
    # Linearly rescale the values in column `scores_col` of the
    # BED-like file `fn` onto [new_min, new_max] and return the name of
    # the rescaled file (UCSC bigBed requires scores in 0-1000).
    n_peaks = common.count_lines(fn)  # NOTE(review): unused
    sorted_fn = 'sorted-%s' % (fn)
    rescaled_fn = 'rescaled-%s' % (fn)
    # Sort descending by the score column (general-numeric) and drop
    # empty lines.
    out, err = common.run_pipe([
        'sort -k %dgr,%dgr %s' % (scores_col, scores_col, fn),
        r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""
        ], sorted_fn)
    # After the descending sort the max score is on the first line ...
    out, err = common.run_pipe(
        ['head -n 1 %s' % (sorted_fn), 'cut -f %s' % (scores_col)])
    max_score = float(out.strip())
    # ... and the min score on the last.
    out, err = common.run_pipe(
        ['tail -n 1 %s' % (sorted_fn), 'cut -f %s' % (scores_col)])
    min_score = float(out.strip())
    # Map n in [a,b] = [min_score,max_score] onto [x,y] = [new_min,new_max].
    # NOTE(review): min/max are interpolated with %d, so fractional
    # scores are truncated before rescaling - confirm this is intended.
    out,err = common.run_pipe([
        'cat %s' %(sorted_fn),
        r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" %(scores_col, min_score, max_score, new_min, new_max) + \
        r"""{$%d=int(((n-a)*(y-x)/(b-a))+x) ; print $0}'""" %(scores_col)], rescaled_fn)
    return rescaled_fn
def blacklist_filter(input_fname, output_fname, input_blacklist_fname):
    """Subtract blacklisted regions from a peaks file.

    Both the peaks file and the blacklist may be gzipped; compression is
    detected via the gzip magic bytes rather than the filename.
    """
    gzip_magic = b'\x1f\x8b'

    with open(input_fname, 'rb') as fh:
        peaks_gzipped = fh.read(2) == gzip_magic
    if peaks_gzipped:
        peaks_fname = 'peaks.bed'
        common.run_pipe(['gzip -dc %s' % (input_fname)], peaks_fname)
    else:
        peaks_fname = input_fname

    with open(input_blacklist_fname, 'rb') as fh:
        blacklist_gzipped = fh.read(2) == gzip_magic
    if blacklist_gzipped:
        blacklist_fname = 'blacklist.bed'
        common.run_pipe(['gzip -dc %s' % (input_blacklist_fname)],
                        blacklist_fname)
    else:
        blacklist_fname = input_blacklist_fname

    # -A removes a peak entirely on any overlap with the blacklist.
    common.run_pipe(
        ['subtractBed -A -a %s -b %s' % (peaks_fname, blacklist_fname)],
        output_fname)
def scrub(in_filepath, out_filepath):
    """Mask read sequences in a BAM file: every base in field 10 becomes
    N and the quality string (field 11) becomes '*'.  All other fields,
    including optional tags, are left untouched."""
    # Flagstat the input for the debug log.
    logger.debug("Input flagstat for %s" % (in_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (in_filepath)))

    # Intermediates are placed alongside the output file.
    workdir = os.path.dirname(out_filepath)
    header_fn = os.path.join(workdir, "header.txt")
    scrubbed_fn = os.path.join(workdir, "scrubbed.sam")

    # Stash the header; the awk step below only sees alignment lines.
    shell_command("samtools view -H %s -o %s" % (in_filepath, header_fn))

    # Replace the query sequence with an equal-length run of N's and
    # blank the PHRED string.
    common.run_pipe([
        'samtools view %s' % (in_filepath),
        r"""awk '{OFS="\t"} {s=""; for(i=1;i<=length($10);i++) s=(s "N"); $10=s; $11="*"; print}'"""
    ], scrubbed_fn)

    # Prepend the saved header and rebuild the BAM.
    common.run_pipe([
        'cat %s %s' % (header_fn, scrubbed_fn),
        'samtools view -S -b - -o %s' % (out_filepath)
    ])

    # Flagstat the result for the debug log.
    logger.debug("Output flagstat for %s" % (out_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (out_filepath)))
def pool(inputs, prefix=None):
    """Concatenate a list of gzipped files into one gzipped file.

    Returns {"pooled": <pooled filename>}.
    """
    input_filenames = [fn for fn in inputs]
    # Inner extension of the last input (e.g. ".tagAlign" from
    # "x.tagAlign.gz") - presumably they are all the same.
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        stems = [splitext(splitext(fn)[0])[0] for fn in input_filenames]
        pooled_filename = '-'.join(stems) + "_pooled%s.gz" % (extension)
    # Decompress all inputs in order and recompress into one file.
    out, err = common.run_pipe(
        ['gzip -dc %s' % (' '.join(input_filenames)), 'gzip -cn'],
        outfile=pooled_filename)
    return {"pooled": pooled_filename}
def pool(inputs, prefix=None):
    """Pool gzipped inputs into a single gzipped file and return
    {"pooled": <pooled filename>}."""
    input_filenames = inputs
    # Inner extension of the last input (e.g. ".tagAlign" from
    # "x.tagAlign.gz") - presumably all inputs share it.
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        pooled_filename = '-'.join(
            splitext(splitext(fn)[0])[0] for fn in input_filenames
        ) + "_pooled%s.gz" % (extension)
    # outfile needs to be reduced to basename to direct cromwell
    # output to the correct place
    out, err = common.run_pipe(
        ['gzip -dc %s' % (' '.join(input_filenames)), 'gzip -cn'],
        outfile=os.path.basename(pooled_filename))
    return {"pooled": pooled_filename}
def main(input_bam, paired_end):
    """Convert a BAM into a gzipped tagAlign file (and a BEDPE file for
    paired-end data) using the pinned samtools, and upload the results.

    Returns a dict of dxlinks: "tagAlign_file" and, when paired_end,
    "BEDPE_file".
    """
    input_bam_file = dxpy.DXFile(input_bam)
    input_bam_filename = input_bam_file.name
    # BUGFIX: the original used rstrip('.bam'), which strips any trailing
    # run of the characters '.', 'b', 'a', 'm' (e.g. "samb.bam" -> "s"),
    # not the literal extension.  Strip the suffix explicitly.
    if input_bam_filename.endswith('.bam'):
        input_bam_basename = input_bam_filename[:-len('.bam')]
    else:
        input_bam_basename = input_bam_filename
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'
    subprocess.check_output('ls -l', shell=True)
    # Pinned samtools build for this pipeline version.
    samtools = SAMTOOLS_PATH["1.0"]

    # ===================
    # Create tagAlign file
    # ===================
    # bamToBed emits BED6; force the read name to "N" and score to 1000.
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"
        ], outfile=final_TA_filename)
    subprocess.check_output('ls -l', shell=True)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        # BEDPE conversion requires a name-sorted BAM.
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        command = \
            "%s sort -@ %d -n %s %s" \
            % (samtools, cpu_count(), input_bam_filename,
               final_nmsrt_bam_prefix)
        logger.info(command)
        subprocess.check_call(shlex.split(command))
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"
            ], outfile=final_BEDPE_filename)
    subprocess.check_output('ls -l', shell=True)

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    output = {}
    output["tagAlign_file"] = dxpy.dxlink(tagAlign_file)
    if paired_end:
        output["BEDPE_file"] = dxpy.dxlink(BEDPE_file)
    return output
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor, paired_end,
                                     chrom_sizes, as_file, peak_type, prefix,
                                     fragment_length=None):
    """Overlap analysis for a simplicate experiment: intersect pooled
    peaks with both pseudoreplicates, compute FRiP, build bigBeds for
    visualization, and upload everything.

    Returns a dict of dxlinks and QC metrics (peak counts, FRiP numbers,
    and the fragment length used).
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because
    # input files could have the same name, in which case a subsequent
    # file would overwrite a previous file.
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output basename: the given prefix, or the pooled peaks name with
    # the peak-type and compression extensions stripped.
    if prefix:
        basename = prefix
    else:
        # BUGFIX: the original read pooled_peaks.name here; pooled_peaks
        # is the raw input link (no .name attribute) - the DXFile
        # wrapper is what carries the filename.
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # The only difference between the peak_types is how the extra
    # columns are handled: the awk filter reads the partner peak's
    # coordinates and intersectBed's overlap size from type-specific
    # column positions.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is
    # defined as the fractional overlap wrt any one of the overlapping
    # peak pairs > 0.5.
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # This is a simplicate analysis: overlapping peaks are just based on
    # pseudoreps of the one pool.
    out, err = common.run_pipe([
        'cat %s' % (overlap_tr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # Rejected peaks: pooled peaks with no overlap support.
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s'
        % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # Fragment length: a user-supplied value wins; otherwise read the
    # estimate from the cross-correlation scores file.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    return {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }
def main(input_bam, paired_end, spp_version):
    """Convert a filtered BAM into tagAlign (and BEDPE when paired-end),
    subsample it, and compute cross-correlation QC scores with SPP.

    Returns a dict with dxlinks to the tagAlign, CC scores/plot (and
    BEDPE when paired_end), plus the parsed NSC/RSC/est_frag_len values.
    """
    # The following line(s) initialize your data object inputs on the
    # platform into dxpy.DXDataObject instances.
    input_bam_file = dxpy.DXFile(input_bam)
    input_bam_filename = input_bam_file.name
    # BUGFIX: the original used rstrip('.bam'), which strips any trailing
    # run of the characters '.', 'b', 'a', 'm' (e.g. "samb.bam" -> "s"),
    # not the literal extension.  Strip the suffix explicitly.
    if input_bam_filename.endswith('.bam'):
        input_bam_basename = input_bam_filename[:-len('.bam')]
    else:
        input_bam_basename = input_bam_filename
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    # ===================
    # Create tagAlign file
    # ===================
    # bamToBed emits BED6; force the read name to "N" and score to 1000.
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        # need namesorted bam to make BEDPE
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        samtools_sort_command = \
            "samtools sort -n %s %s" \
            % (input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(samtools_sort_command)
        subprocess.check_output(shlex.split(samtools_sort_command))
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    # =================================
    # Subsample tagAlign file
    # ================================
    logger.info(
        "Intermediate tA md5: %s" % (common.md5(intermediate_TA_filename)))
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    # // keeps integer division under both Python 2 and Python 3.
    subsampled_TA_filename = \
        input_bam_basename + \
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS // 1000000, end_infix)
    # Exclude chrM; seeding shuf with the input itself makes repeated
    # runs subsample identically.
    steps = [
        'grep -v "chrM" %s' % (intermediate_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename)]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)
    logger.info(
        "Subsampled tA md5: %s" % (common.md5(subsampled_TA_filename)))

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format (tab-separated):
    # Filename, numReads, estFragLen, corr_estFragLen, PhantomPeak,
    # corr_phantomPeak, argmin_corr, min_corr, phantomPeakCoef,
    # relPhantomPeakCoef, QualityTag

    # NOTE(review): spp_version is currently unused - the pipeline runs
    # the run_spp.R baked into the image; confirm whether version
    # selection should be restored.
    run_spp_command = '/phantompeakqualtools/run_spp.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s"
        % (run_spp_command, subsampled_TA_filename, cpu_count(),
           CC_plot_filename, CC_scores_filename)])
    # Strip comma-separated alternates so each field holds one value.
    out, err = common.run_pipe([
        r"""sed -r 's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe([
        "mv temp %s" % (CC_scores_filename)])

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)
    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "tagAlign_file": dxpy.dxlink(tagAlign_file),
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    if paired_end:
        output.update({"BEDPE_file": dxpy.dxlink(BEDPE_file)})
    return output
def main(input_tags, prefix=None):
    """Split a tagAlign/BEDPE file into two pseudoreplicates.

    The input is introspected (column count of its first line) to decide
    single-end (tagAlign, 6 columns) vs paired-end (BEDPE, 10 columns),
    shuffled deterministically, halved, and each half written as a
    gzipped tagAlign.  Returns dxlinks to the two pseudoreplicates.
    """
    input_tags_file = dxpy.DXFile(input_tags)
    input_tags_filename = input_tags_file.name
    dxpy.download_dxfile(input_tags_file.get_id(), input_tags_filename)

    def _strip_suffix(name, suffix):
        # BUGFIX: the original used str.rstrip(suffix), which strips a
        # trailing character SET, not the literal suffix.
        return name[:-len(suffix)] if name.endswith(suffix) else name

    # introspect the file to determine tagAlign (thus SE) or BEDPE
    # (thus PE); strip extension as appropriate
    subprocess.check_output('ls', shell=True)
    with gzip.open(input_tags_filename) as f:
        firstline = f.readline()
    logger.info('First line of input_tags:\n%s' % (firstline))
    se_cols = 6
    pe_cols = 10
    if re.match('^(\S+[\t\n]){%d}$' % (se_cols), firstline):
        paired_end = False
        input_tags_basename = prefix or _strip_suffix(
            input_tags_filename, '.tagAlign.gz')
        filename_infix = 'SE'
        logger.info("Detected single-end data")
    elif re.match('^(\S+[\t\n]){%d}$' % (pe_cols), firstline):
        paired_end = True
        input_tags_basename = prefix or _strip_suffix(
            input_tags_filename, '.bedpe.gz')
        filename_infix = 'PE2SE'
        logger.info("Detected paired-end data")
    else:
        raise IOError(
            "%s is neither a BEDPE or tagAlign file" % (input_tags_filename))

    # BUGFIX: the second pseudoreplicate was previously named from the
    # full input filename instead of the basename, unlike the first.
    pr_ta_filenames = \
        [input_tags_basename + ".%s.pr1.tagAlign.gz" % (filename_infix),
         input_tags_basename + ".%s.pr2.tagAlign.gz" % (filename_infix)]

    # count lines in the file
    out, err = common.run_pipe([
        'gzip -dc %s' % (input_tags_filename),
        'wc -l'])
    # number of lines in each split; // keeps integer division on both
    # Python 2 and Python 3
    nlines = (int(out) + 1) // 2
    # Shuffle and split BEDPE file into 2 equal parts
    # by using the input to seed shuf we ensure multiple runs with the
    # same input will produce the same output
    # Produces two files named <splits_prefix>0n, n=0,1
    splits_prefix = 'temp_split'
    out, err = common.run_pipe([
        'gzip -dc %s' % (input_tags_filename),
        'shuf --random-source=%s' % (input_tags_filename),
        'split -a 2 -d -l %d - %s' % (nlines, splits_prefix)])

    # Convert read pairs to reads into standard tagAlign file
    for i, index in enumerate(['00', '01']):  # could be made multi-threaded
        steps = ['cat %s' % (splits_prefix + index)]
        if paired_end:
            # Expand each BEDPE record into its two constituent reads.
            steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
        steps.extend(['gzip -cn'])
        out, err = common.run_pipe(steps, outfile=pr_ta_filenames[i])

    pseudoreplicate1_file = dxpy.upload_local_file(pr_ta_filenames[0])
    pseudoreplicate2_file = dxpy.upload_local_file(pr_ta_filenames[1])

    return {
        "pseudoreplicate1": dxpy.dxlink(pseudoreplicate1_file),
        "pseudoreplicate2": dxpy.dxlink(pseudoreplicate2_file)
    }
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize):
    """Call ChIP-seq peaks with MACS2 and build signal tracks (dx applet).

    Downloads the experiment/control tagAlign files and supporting inputs,
    runs MACS2 callpeak twice (narrow, then broad/gapped), rescales and
    rank-sorts the peak files, builds fold-enrichment and -log10(p-value)
    signal bigWigs via macs2 bdgcmp + bedGraphToBigWig, converts the peak
    beds to bigBed, and uploads everything back to the platform.

    NOTE(review): this entry point is Python 2 (print statements); fraglen
    is read from column 3 of the cross-correlation scores file and passed
    to MACS2 --extsize.
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.
    experiment = dxpy.DXFile(experiment)
    control = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    narrowPeak_as = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as = dxpy.DXFile(gappedpeak_as)
    broadPeak_as = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.
    dxpy.download_dxfile(experiment.get_id(), experiment.name)
    dxpy.download_dxfile(control.get_id(), control.name)
    dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes.name)
    dxpy.download_dxfile(narrowPeak_as.get_id(), narrowPeak_as.name)
    dxpy.download_dxfile(gappedPeak_as.get_id(), gappedPeak_as.name)
    dxpy.download_dxfile(broadPeak_as.get_id(), broadPeak_as.name)

    # Define the output filenames
    peaks_dirname = 'peaks'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    # Base all output names on the uncompressed experiment filename.
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    broadPeak_bb_fn = "%s.bb" % (broadPeak_fn)
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    with open(xcor_scores_input.name, 'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2]  # third column
    print "Fraglen %s" % (fraglen)

    # ===========================================
    # Generate narrow peaks and preliminary signal tracks
    # ============================================
    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' %(experiment.name, control.name) + \
        '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores('%s/%s_peaks.narrowPeak' % (peaks_dirname, prefix), scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in
    # Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (narrowPeak_fn),
        'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # Generate Broad and Gapped Peaks
    # ============================================
    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' %(experiment.name, control.name) + \
        '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores('%s/%s_peaks.broadPeak' % (peaks_dirname, prefix), scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak) in descending
    # order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (broadPeak_fn),
        'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn))

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores('%s/%s_peaks.gappedPeak' % (peaks_dirname, prefix), scores_col=5)

    # gappedPeak files rank on column 14, hence the different sort key.
    pipe = [
        'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (gappedPeak_fn),
        'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # For Fold enrichment signal tracks
    # ============================================
    # This file is a tab delimited file with 2 columns Col1 (chromosome name),
    # Col2 (chromosome size in bp).
    command = 'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
        '-m FE'
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_FE.bdg -g %s -b 0' % (peaks_dirname, prefix, chrom_sizes.name),
        'bedClip stdin %s %s/%s.fc.signal.bedgraph' % (chrom_sizes.name, peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \
        '%s ' %(chrom_sizes.name) + \
        '%s' %(fc_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    # ===========================================
    # For -log10(p-value) signal tracks
    # ============================================
    # Compute sval =
    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    out, err = common.run_pipe(['gzip -dc %s' % (experiment.name), 'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe(['gzip -dc %s' % (control.name), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)
    print "chipReads = %s, controlReads = %s, sval = %s" % (chipReads, controlReads, sval)

    returncode = common.block_on(
        'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
        '-m ppois -S %s' %(sval))
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_ppois.bdg -g %s -b 0' % (peaks_dirname, prefix, chrom_sizes.name),
        'bedClip stdin %s %s/%s.pval.signal.bedgraph' % (chrom_sizes.name, peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)
    # rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
        '%s ' %(chrom_sizes.name) + \
        '%s' %(pvalue_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    # ===========================================
    # Generate bigWigs from beds to support trackhub visualization of peak files
    # ============================================
    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn), chrom_sizes.name, narrowPeak_as.name, bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn), chrom_sizes.name, gappedPeak_as.name, bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn), chrom_sizes.name, broadPeak_as.name, bed_type='bed6+3')

    # Temporary during development to create empty files just to get the
    # applet to exit
    # NOTE(review): the touch list uses the *_bb_fn names computed above,
    # not the *_bb_fname values returned by bed2bb — presumably these are
    # the same paths; verify against common.bed2bb.
    for fn in [
        narrowPeak_fn, gappedPeak_fn, broadPeak_fn,
        narrowPeak_bb_fn, gappedPeak_bb_fn, broadPeak_bb_fn,
        fc_signal_fn, pvalue_signal_fn
    ]:
        common.block_on('touch %s' % (fn))

    # Upload the file outputs
    narrowPeak = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn)
    broadPeak_bb = dxpy.upload_local_file(broadPeak_bb_fn)
    fc_signal = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.
    output = {
        "narrowpeaks": dxpy.dxlink(narrowPeak),
        "gappedpeaks": dxpy.dxlink(gappedPeak),
        "broadpeaks": dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb": dxpy.dxlink(broadPeak_bb),
        "fc_signal": dxpy.dxlink(fc_signal),
        "pvalue_signal": dxpy.dxlink(pvalue_signal)
    }
    return output
def main(input_bam, fastqs, debug):
    """Make tagAlign (and BEDPE for PE data) from a BAM and run spp xcor.

    Converts the input BAM to a gzipped tagAlign (and, for paired-end data,
    a name-sorted BEDPE), subsamples up to 15M non-chrM reads, and runs the
    phantompeakqualtools spp script to compute cross-correlation QC scores
    and a plot.  QC metrics are also written to xcor.json.

    Args:
        input_bam: path to the filtered/deduped BAM file.
        fastqs: list of the original fastq paths; >1 entry means paired-end.
        debug: when truthy, the file log handler is set to DEBUG.

    Returns:
        dict with the tagAlign path, xcor score/plot paths, paired_end flag,
        RSC/NSC/est_frag_len metrics, and (for PE) the BEDPE path.
    """
    # Create a file handler for the module logger.
    if len(fastqs) > 1:
        paired_end = True
    else:
        paired_end = False
    handler = logging.FileHandler('xcor.log')
    if debug:
        handler.setLevel(logging.DEBUG)
    else:
        handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    input_bam_filename = input_bam
    # Remove the literal '.bam' suffix; str.rstrip would strip the
    # character set {'.','b','a','m'} and corrupt basenames ending in any
    # of those letters (e.g. 'data.bam' -> 'dat').
    if input_bam.endswith('.bam'):
        input_bam_basename = input_bam[:-len('.bam')].split('/')[-1]
    else:
        input_bam_basename = input_bam.split('/')[-1]
    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    # ===================
    # Create tagAlign file
    # ===================
    # bamToBed then force name="N", score=1000 on every record.
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"
    ], outfile=final_TA_filename)

    samtools = SAMTOOLS_PATH

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        # need namesorted bam to make BEDPE
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        samtools_sort_command = \
            "%s sort -n -@%d -o %s %s" \
            % (samtools, cpu_count(), final_nmsrt_bam_filename,
               input_bam_filename)
        logger.info(samtools_sort_command)
        subprocess.check_output(shlex.split(samtools_sort_command))
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"
        ], outfile=final_BEDPE_filename)

    # =================================
    # Subsample tagAlign file
    # ================================
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    # Floor division: keeps the %d substitution an int on Python 2 and 3.
    subsampled_TA_filename = \
        input_bam_basename + \
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS//1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (intermediate_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename)
    ]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab>
    # PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr
    # <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag

    # run spp
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
        (SPP_TOOL_PATH, subsampled_TA_filename, cpu_count(),
         CC_plot_filename, CC_scores_filename)
    ])
    # spp can emit comma-separated candidate values; keep only the first of
    # each so downstream parsers see single numbers.
    out, err = common.run_pipe(
        [r"""sed -r 's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    tagAlign_file = final_TA_filename
    if paired_end:
        BEDPE_file = final_BEDPE_filename
    CC_scores_file = CC_scores_filename
    CC_plot_file = CC_plot_filename
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "tagAlign_file": tagAlign_file,
        "CC_scores_file": CC_scores_file,
        "CC_plot_file": CC_plot_file,
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": int(xcor_qc.get('estFragLen'))
    }
    with open('xcor.json', 'w') as f:
        json.dump(output, f, sort_keys=True, indent=4,
                  separators=(',', ': '))
    # The BEDPE path is returned but deliberately kept out of xcor.json.
    if paired_end:
        output.update({"BEDPE_file": BEDPE_file})
    return output
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks, pooledpr2_peaks, rep1_ta, rep1_xcor, rep2_ta, rep2_xcor, paired_end, chrom_sizes, as_file, peak_type, prefix, fragment_length=None):
    """Compute replicated (naive-overlap) peaks for a pair of replicates.

    A pooled peak is kept when it overlaps (>=50% of either peak's length)
    peaks from both true replicates, or from both pooled pseudoreplicates.
    Also computes FRiP on the pooled tagAlign, converts kept/rejected peak
    sets to bigBed, and uploads the results.

    Args:
        rep1_peaks .. pooledpr2_peaks: dxlinks to the five peak files.
        rep1_ta/rep2_ta, rep1_xcor/rep2_xcor: dxlinks to replicate
            tagAligns and their cross-correlation score files.
        paired_end: passed through to the xcor subjob.
        chrom_sizes, as_file: dxlinks to chrom.sizes and the .as schema.
        peak_type: 'narrowPeak', 'gappedPeak' or 'broadPeak'.
        prefix: optional output basename override.
        fragment_length: if given, skips the pooled xcor subjob.

    Returns:
        dict of dxlinks and QC metrics (peak counts, FRiP, fraglen used).
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent file would
    # overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # Fix: operate on the DXFile handler's name; the raw dxlink
        # argument (pooled_peaks) has no .name attribute.
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')

    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length fron cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # The only difference between the peak_types is how the extra columns
    # are handled: the overlap-fraction columns emitted by intersectBed -wo
    # sit at different positions, and the cut width differs.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_pr_fn)
    print(
        "%d peaks overlap with both pooled pseudoreplicates"
        % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
    ], overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }
    return output
def postprocess(crop_length, reference_tar, debug, reads_files):
    """Combine bwa .sai indexes and fastqs into a sorted, flagstat'd BAM.

    Pairs up the .sai/fastq inputs, runs bwa sampe (PE) or samse (SE),
    filters PE reads whose CIGAR length disagrees with the sequence length,
    pipes through samtools view/sort, and writes a flagstat QC file plus a
    post_mapping.json summary.

    Args:
        crop_length: passed through unchanged into the output dict.
        reference_tar: path to the tarred bwa reference.
        debug: when truthy, the file log handler is set to DEBUG.
        reads_files: mixed list of .sai index files and read files,
            ordered by figure_out_sort.

    Returns:
        dict with the BAM path, flagstat path, n_mapped_reads, crop_length
        and paired_end flag (also dumped to post_mapping.json).
    """
    handler = logging.FileHandler('post_mapping.log')
    if debug:
        handler.setLevel(logging.DEBUG)
    else:
        handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    samtools = SAMTOOLS_PATH
    bwa = BWA_PATH
    logger.info("In postprocess with samtools %s and bwa %s" % (samtools, bwa))

    # figure_out_sort fills these in place: .sai files into indexed_reads,
    # everything else into unmapped_reads (the commented-out literal below
    # is the superseded sorting logic, kept for reference).
    indexed_reads = []
    unmapped_reads = []
    '''for file_name in special_sort(reads_files):
        if file_name.endswith('.sai'):
            indexed_reads.append(file_name)
        else:
            unmapped_reads.append(file_name)
    '''
    figure_out_sort(reads_files, unmapped_reads, indexed_reads)

    indexed_reads_filenames = []
    unmapped_reads_filenames = []
    for i, reads in enumerate(indexed_reads):
        read_pair_number = i + 1
        logger.info("indexed_reads %d: %s" % (read_pair_number, reads))
        indexed_reads_filenames.append(reads)
        unmapped = unmapped_reads[i]
        logger.info("unmapped reads %d: %s" % (read_pair_number, unmapped))
        unmapped_reads_filenames.append(unmapped)

    reference_tar_filename = reference_tar
    logger.info("reference_tar: %s" % (reference_tar_filename))
    # extract the reference files from the tar
    reference_dirname = '.'
    reference_filename = \
        resolve_reference(reference_tar_filename, reference_dirname)
    logger.info("Using reference file: %s" % (reference_filename))

    # Two .sai files means paired-end data.
    paired_end = len(indexed_reads) == 2

    # fixing the directories
    if paired_end:
        r1_basename = (strip_extensions(unmapped_reads_filenames[0], STRIP_EXTENSIONS)).split('/')[-1]
        r2_basename = (strip_extensions(unmapped_reads_filenames[1], STRIP_EXTENSIONS)).split('/')[-1]
        reads_basename = r1_basename + r2_basename
    else:
        reads_basename = (strip_extensions(unmapped_reads_filenames[0], STRIP_EXTENSIONS)).split('/')[-1]
    raw_bam_filename = '%s.raw.srt.bam' % (reads_basename)
    raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (reads_basename)

    if paired_end:
        reads1_filename = indexed_reads_filenames[0]
        reads2_filename = indexed_reads_filenames[1]
        unmapped_reads1_filename = unmapped_reads_filenames[0]
        unmapped_reads2_filename = unmapped_reads_filenames[1]
        raw_sam_filename = reads_basename + ".raw.sam"
        badcigar_filename = "badreads.tmp"
        # First pass: run sampe, tee the SAM, and collect names of reads
        # whose CIGAR-implied length (D ops removed) differs from the
        # sequence length.
        steps = [
            "%s sampe -P %s %s %s %s %s" % (bwa, reference_filename, reads1_filename, reads2_filename, unmapped_reads1_filename, unmapped_reads2_filename),
            "tee %s" % (raw_sam_filename),
            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
            "sort",
            "uniq"
        ]
        out, err = common.run_pipe(steps, badcigar_filename)
        print(out)
        if err:
            logger.error("sampe error: %s" % (err))
        # Second pass: replay the teed SAM minus the bad-CIGAR read names.
        steps = [
            "cat %s" % (raw_sam_filename),
            "grep -v -F -f %s" % (badcigar_filename)
        ]
    else:  # single end
        reads_filename = indexed_reads_filenames[0]
        unmapped_reads_filename = unmapped_reads_filenames[0]
        steps = [
            "%s samse %s %s %s" % (bwa, reference_filename, reads_filename, unmapped_reads_filename)
        ]

    # Shared tail of the pipeline: SAM -> BAM -> coordinate sort.
    steps.extend([
        "%s view -@%d -Su -" % (samtools, cpu_count()),
        "%s sort -@%d -o %s" % (samtools, cpu_count(), raw_bam_filename)
    ])  # samtools adds .bam

    logger.info("Running pipe: %s" % (steps))
    out, err = common.run_pipe(steps)

    if out:
        print(out)
    if err:
        logger.error("samtools error: %s" % (err))

    with open(raw_bam_mapstats_filename, 'w') as fh:
        subprocess.check_call(shlex.split("%s flagstat %s" % (samtools, raw_bam_filename)), stdout=fh)
    print(subprocess.check_output('ls -l', shell=True))

    mapped_reads = raw_bam_filename
    mapping_statistics = raw_bam_mapstats_filename
    flagstat_qc = flagstat_parse(raw_bam_mapstats_filename)
    output = {
        'mapped_reads': mapped_reads,
        'mapping_statistics': mapping_statistics,
        'n_mapped_reads': flagstat_qc.get('mapped')[0],  # 0 is hi-q reads
        "crop_length": crop_length,
        "paired_end": paired_end
    }
    with open('post_mapping.json', 'w') as f:
        json.dump(output, f, sort_keys=True, indent=4, separators=(',', ': '))
    logger.info("Returning from postprocess with output: %s" % (output))
    return output
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize, prefix=None, fragment_length=None):
    """Call ChIP-seq peaks with MACS2 and build signal tracks (local paths).

    Refactored, filesystem-path variant of the MACS2 applet: runs callpeak
    twice (narrow, then broad/gapped), clips off-chromosome features,
    rescales and rank-sorts the peaks, builds fold-enrichment and
    -log10(p-value) signal bigWigs, converts the peak beds to bigBed, and
    returns the local output paths.

    Args:
        experiment, control: paths to gzipped tagAlign files.
        xcor_scores_input: path to the xcor scores file (fraglen source).
        chrom_sizes: path to the chrom.sizes file.
        narrowpeak_as, gappedpeak_as, broadpeak_as: paths to .as schemas.
        genomesize: MACS2 -g value (e.g. 'hs', 'mm').
        prefix: optional output basename; defaults to the experiment name.
        fragment_length: if given, overrides the xcor fraglen estimate.

    Returns:
        dict of local output file paths (peaks, bigBeds, signal bigWigs).
    """
    narrowPeak_as = narrowpeak_as
    gappedPeak_as = gappedpeak_as
    broadPeak_as = broadpeak_as

    # Define the output filenames
    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    if not prefix:
        prefix = experiment
    # Base all output names on the uncompressed experiment filename.
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    # if the fragment_length argument is given, use that instead
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % fraglen)
    else:
        with open(xcor_scores_input, 'r') as fh:
            firstline = fh.readline()
            fraglen = firstline.split()[2]  # third column
        logger.info("Fraglen %s" % (fraglen))

    # ===========================================
    # Generate narrow peaks and preliminary signal tracks
    # ============================================
    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' % (experiment, control) + \
        '-f BED -n %s/%s ' % (peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genomesize, fraglen)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_narrowpeak_fn = common.slop_clip(
        '%s/%s_peaks.narrowPeak' % (peaks_dirname, prefix),
        chrom_sizes)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores(clipped_narrowpeak_fn, scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4
    # with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (narrowPeak_fn),
        'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # Generate Broad and Gapped Peaks
    # ============================================
    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' % (experiment, control) + \
        '-f BED -n %s/%s ' % (peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' % (genomesize, fraglen)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_broadpeak_fn = common.slop_clip(
        '%s/%s_peaks.broadPeak' % (peaks_dirname, prefix),
        chrom_sizes)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores(clipped_broadpeak_fn, scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak) in descending
    # order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (broadPeak_fn),
        'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn))

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_gappedpeaks_fn = common.slop_clip(
        '%s/%s_peaks.gappedPeak' % (peaks_dirname, prefix),
        chrom_sizes, bed_type='gappedPeak')

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores(clipped_gappedpeaks_fn, scores_col=5)

    # gappedPeak files rank on column 14, hence the different sort key.
    pipe = [
        'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (gappedPeak_fn),
        'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # For Fold enrichment signal tracks
    # ============================================
    # This file is a tab delimited file with 2 columns Col1 (chromosome name),
    # Col2 (chromosome size in bp).
    command = 'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) + \
        '--outdir %s -o %s_FE.bdg ' % (peaks_dirname, prefix) + \
        '-m FE'
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_FE.bdg -g %s -b 0' % (peaks_dirname, prefix, chrom_sizes),
        'bedClip stdin %s %s/%s.fc.signal.bedgraph' % (chrom_sizes, peaks_dirname, prefix)
    ]
    out, err = common.run_pipe(pipe)
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.fc.signal.bedgraph ' % (peaks_dirname, prefix) + \
        '%s ' % (chrom_sizes) + \
        '%s' % (fc_signal_fn)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    # drm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    # ===========================================
    # For -log10(p-value) signal tracks
    # ============================================
    # Compute sval =
    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    out, err = common.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)
    logger.info(
        "chipReads = %s, controlReads = %s, sval = %s"
        % (chipReads, controlReads, sval))

    returncode = common.block_on('macs2 bdgcmp ' +
                                 '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) +
                                 '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) +
                                 '--outdir %s -o %s_ppois.bdg ' % (peaks_dirname, prefix) +
                                 '-m ppois -S %s' % (sval))
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_ppois.bdg -g %s -b 0' % (peaks_dirname, prefix, chrom_sizes),
        'bedClip stdin %s %s/%s.pval.signal.bedgraph' % (chrom_sizes, peaks_dirname, prefix)
    ]
    out, err = common.run_pipe(pipe)
    # rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.pval.signal.bedgraph ' % (peaks_dirname, prefix) + \
        '%s ' % (chrom_sizes) + \
        '%s' % (pvalue_signal_fn)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    # ===========================================
    # Generate bigWigs from beds to support trackhub visualization of peak files
    # ============================================
    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn), chrom_sizes, narrowPeak_as, bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn), chrom_sizes, gappedPeak_as, bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn), chrom_sizes, broadPeak_as, bed_type='bed6+3')

    # Temporary during development to create empty files just to get the applet
    # to exit
    # narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    # gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    # broadPeak_bb_fn = "%s.bb" % (broadPeak_fn)

    output = {
        "narrowpeaks": narrowPeak_gz_fn,
        "gappedpeaks": gappedPeak_gz_fn,
        "broadpeaks": broadPeak_gz_fn,
        "narrowpeaks_bb": narrowPeak_bb_fname,
        "gappedpeaks_bb": gappedPeak_bb_fname,
        "broadpeaks_bb": broadPeak_bb_fname,
        "fc_signal": fc_signal_fn,
        "pvalue_signal": pvalue_signal_fn
    }
    return output
def main(input_tagAlign, paired_end, spp_version):
    """Subsample a tagAlign file and compute cross-correlation QC with spp.

    Downloads the gzipped tagAlign from the platform, subsamples up to
    NREADS non-chrM reads, installs the requested spp tarball, runs
    run_spp_nodups.R to produce the cross-correlation score file and plot,
    and uploads both back to the platform.

    Args:
        input_tagAlign: dxlink/file-ID of a gzipped tagAlign file.
        paired_end: bool; True uses the 'MATE1' infix and neutralizes the
            name/score columns before scoring.
        spp_version: key into SPP_VERSION_MAP selecting the spp tarball.

    Returns:
        dict with dxlinks "CC_scores_file" and "CC_plot_file", the
        pass-through "paired_end" flag, and float "RSC", "NSC" and
        "est_frag_len" metrics parsed from the score file.
    """
    input_tagAlign_file = dxpy.DXFile(input_tagAlign)
    input_tagAlign_filename = input_tagAlign_file.name
    # Strip a literal '.gz' suffix.  The original rstrip('.gz') strips any
    # trailing run of the characters '.', 'g', 'z' and can over-strip
    # basenames that legitimately end in those characters.
    if input_tagAlign_filename.endswith('.gz'):
        input_tagAlign_basename = input_tagAlign_filename[:-len('.gz')]
    else:
        input_tagAlign_basename = input_tagAlign_filename
    dxpy.download_dxfile(input_tagAlign_file.get_id(), input_tagAlign_filename)
    uncompressed_TA_filename = input_tagAlign_basename
    out, err = common.run_pipe(['gzip -d %s' % (input_tagAlign_filename)])

    # =================================
    # Subsample tagAlign file
    # ================================
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    # Floor division keeps the %d argument an int on both Python 2 and 3.
    subsampled_TA_filename = \
        input_tagAlign_basename + \
        ".sample.%d.%s.tagAlign.gz" % (NREADS // 1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (uncompressed_TA_filename),
        # --random-source makes the shuffle deterministic for a given input
        'shuf -n %d --random-source=%s' % (NREADS, uncompressed_TA_filename)
    ]
    if paired_end:
        # Neutralize the name and score columns for the MATE1 representation
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"
    # CC_SCORE FILE format
    # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab>
    # PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr
    # <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag
    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    run_spp_command = '/phantompeakqualtools/run_spp_nodups.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s"
        % (run_spp_command, subsampled_TA_filename, cpu_count(),
           CC_plot_filename, CC_scores_filename)
    ])
    # The sed pass drops everything after a comma within a field —
    # presumably spp writes comma-separated candidate values and only the
    # first is wanted downstream (TODO: confirm against xcor_parse).
    out, err = common.run_pipe(
        [r"""sed -r 's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    return output
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, as_file=None):
    """Call peaks with spp against a control, then sanitize the coordinates.

    Downloads experiment/control tagAligns and a cross-correlation score
    file, runs run_spp(.R|_nodups.R) to call up to `npeaks` regionPeaks at
    the estimated fragment length, rewrites scientific-notation coordinates
    to integers, clips peaks to chromosome bounds, optionally builds a
    bigBed, and uploads the results.

    NOTE(review): rstrip(".gz").rstrip(".tagAlign") strips character sets,
    not suffixes — confirm input names never end in those characters
    unexpectedly.  This applet is Python 2 (print statements).
    """
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    # The .as (autosql) schema is only needed when a bigBed is requested.
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)
    # Derive output names from the experiment tagAlign basename.
    output_filename_prefix = \
        experiment_filename.rstrip(".gz").rstrip(".tagAlign")
    peaks_filename = output_filename_prefix + ".regionPeak"
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + ".gz"
    xcor_plot_filename = output_filename_prefix + ".pdf"
    xcor_scores_filename = output_filename_prefix + ".ccscores"

    print subprocess.check_output("ls -l", shell=True,
                                  stderr=subprocess.STDOUT)

    # third column in the cross-correlation scores input file
    fraglen_column = 3
    # Only the first line of the scores file is consulted.
    with open(xcor_scores_input_filename, "r") as f:
        line = f.readline()
        fragment_length = int(line.split("\t")[fraglen_column - 1])
        print "Read fragment length: %d" % (fragment_length)

    # run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    # spp tarball version is hard-coded here (unlike the spp_version-mapped
    # variant of this applet elsewhere in the pipeline).
    spp_tarball = "/phantompeakqualtools/spp_1.10.1.tar.gz"
    if nodups:
        run_spp = "/phantompeakqualtools/run_spp_nodups.R"
    else:
        run_spp = "/phantompeakqualtools/run_spp.R"
    # install spp
    print subprocess.check_output("ls -l", shell=True,
                                  stderr=subprocess.STDOUT)
    print subprocess.check_output(
        shlex.split("R CMD INSTALL %s" % (spp_tarball)),
        stderr=subprocess.STDOUT)
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s "
        "-rf -out=%s" % (
            run_spp,
            cpu_count(),
            experiment_filename,
            control_filename,
            npeaks,
            fragment_length,
            peaks_filename,
            xcor_plot_filename,
            xcor_scores_filename,
        ))
    print spp_command
    # Stream spp's combined stdout/stderr line-by-line so progress is visible.
    process = subprocess.Popen(shlex.split(spp_command),
                               stderr=subprocess.STDOUT,
                               stdout=subprocess.PIPE)
    for line in iter(process.stdout.readline, ""):
        sys.stdout.write(line)

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coodinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries
    # of the chromosome
    fix_coordinate_peaks_filename = \
        output_filename_prefix + ".fixcoord.regionPeak"
    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        # tee keeps an uncompressed copy of the raw spp output for the
        # before/after comparison below
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        "slopBed -i stdin -g %s -b 0" % (chrom_sizes_filename),
        "bedClip stdin %s %s" % (chrom_sizes_filename,
                                 fix_coordinate_peaks_filename),
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" % (n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" % (
        n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename))
    print "First 50 peaks"
    print subprocess.check_output("head -50 %s" % (fix_coordinate_peaks_filename),
                                  shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename,
                                          chrom_sizes_filename,
                                          as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    # If the coordinate fixup changed anything, return the fixed file
    # (gzipped) instead of the raw spp output.
    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(
            shlex.split("gzip %s" % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + ".gz"

    print subprocess.check_output("ls -l", shell=True,
                                  stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)
    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    # NOTE(review): peaks_bb_filename is only bound when bigbed is true,
    # so this guard relies on short-circuit evaluation of `bigbed`.
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks,
         pooledpr2_peaks, chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Select replicated peaks from a pooled peak call.

    Keeps pooled peaks that overlap (>= 50% fractional overlap of either
    member of the pair) both true replicates OR both pooled
    pseudoreplicates; everything else is rejected.  Builds bigBeds of the
    replicated and rejected sets and uploads all four files.
    Supported peak_type values: narrowPeak, gappedPeak, broadPeak.
    This applet is Python 2 (print statements).
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances
    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' %(rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' %(rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' %(pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' %(pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' %(pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' %(peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match('(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' %(peak_type),
                     pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name
    overlapping_peaks_fn = '%s.replicated.%s' %(basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' %(basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' %(peak_type)
    overlap_pr_fn = 'replicated_pr.%s' %(peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    # NOTE(review): the triple-quoted block below is dead code kept as an
    # (unassigned) string literal — an earlier, simpler overlap strategy.
    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''

    # the only difference between the peak_types is how the extra columns
    # are handled.  The awk filter keeps pairs whose intersectBed -wo
    # overlap (last column) covers >= 50% of either peak; cut then trims
    # back to the original pooled-peak columns.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Always-false assert used to fail loudly on unknown peak types.
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs
    # > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap
    # is defined as the fractional overlap wrt any one of the overlapping
    # peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' %(overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))

    # rejected peaks: pooled peaks with no overlap in the replicated set
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %(pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print "%d peaks were rejected" %(common.count_lines(rejected_peaks_fn))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    # NOTE(review): these rebind the *_bb_fn names computed earlier —
    # presumably bed2bb returns the actual filename it wrote (or a falsy
    # value on failure); confirm against common.bed2bb.
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        'npeaks_rejected'       : npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def postprocess(indexed_reads, unmapped_reads, reference_tar, bwa_version,
                samtools_version, debug):
    """Finish a bwa alignment: sampe/samse, bad-CIGAR filtering, sort, QC.

    Downloads the .sai index files and raw reads, resolves the reference
    from the tarball, runs `bwa sampe` (paired-end) or `bwa samse`
    (single-end), removes paired-end reads whose CIGAR length disagrees
    with the sequence length, produces a coordinate-sorted BAM plus a
    flagstat QC file, and uploads both.

    Returns a dict of dxlinks 'mapped_reads', 'mapping_statistics' and the
    scalar 'n_mapped_reads' parsed from flagstat.
    """
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # Resolve tool paths; unsupported versions fail fast.
    samtools = SAMTOOLS_PATH.get(samtools_version)
    assert samtools, "samtools version %s is not supported" % (samtools_version)
    bwa = BWA_PATH.get(bwa_version)
    assert bwa, "BWA version %s is not supported" % (bwa_version)
    logger.info("In postprocess with samtools %s and bwa %s" % (samtools, bwa))

    # Download each .sai with its matching raw-reads file; the two input
    # lists are assumed to be parallel (index i pairs them).
    indexed_reads_filenames = []
    unmapped_reads_filenames = []
    for i, reads in enumerate(indexed_reads):
        read_pair_number = i+1

        fn = dxpy.describe(reads)['name']
        logger.info("indexed_reads %d: %s" % (read_pair_number, fn))
        indexed_reads_filenames.append(fn)
        dxpy.download_dxfile(reads, fn)

        unmapped = unmapped_reads[i]
        fn = dxpy.describe(unmapped)['name']
        logger.info("unmapped reads %d: %s" % (read_pair_number, fn))
        unmapped_reads_filenames.append(fn)
        dxpy.download_dxfile(unmapped, fn)

    reference_tar_filename = dxpy.describe(reference_tar)['name']
    logger.info("reference_tar: %s" % (reference_tar_filename))
    dxpy.download_dxfile(reference_tar, reference_tar_filename)
    # extract the reference files from the tar
    reference_dirname = 'reference_files'
    reference_filename = \
        resolve_reference(reference_tar_filename, reference_dirname)
    logger.info("Using reference file: %s" % (reference_filename))

    # Two indexed-read inputs means paired-end.
    paired_end = len(indexed_reads) == 2

    if paired_end:
        r1_basename = strip_extensions(
            unmapped_reads_filenames[0], STRIP_EXTENSIONS)
        r2_basename = strip_extensions(
            unmapped_reads_filenames[1], STRIP_EXTENSIONS)
        reads_basename = r1_basename + r2_basename
    else:
        reads_basename = strip_extensions(
            unmapped_reads_filenames[0], STRIP_EXTENSIONS)
    raw_bam_filename = '%s.raw.srt.bam' % (reads_basename)
    raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (reads_basename)

    if paired_end:
        reads1_filename = indexed_reads_filenames[0]
        reads2_filename = indexed_reads_filenames[1]
        unmapped_reads1_filename = unmapped_reads_filenames[0]
        unmapped_reads2_filename = unmapped_reads_filenames[1]
        raw_sam_filename = reads_basename + ".raw.sam"
        badcigar_filename = "badreads.tmp"

        # First pass: run sampe, tee the SAM to disk, and collect the names
        # of reads whose summed CIGAR operation lengths (with deletions
        # removed) do not equal the sequence length.
        steps = [
            "%s sampe -P %s %s %s %s %s" % (bwa, reference_filename,
                                            reads1_filename, reads2_filename,
                                            unmapped_reads1_filename,
                                            unmapped_reads2_filename),
            "tee %s" % (raw_sam_filename),
            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
            "sort",
            "uniq"]
        out, err = common.run_pipe(steps, badcigar_filename)
        print(out)
        if err:
            logger.error("sampe error: %s" % (err))

        # Second pass: replay the cached SAM minus the bad-CIGAR read names.
        steps = [
            "cat %s" % (raw_sam_filename),
            "grep -v -F -f %s" % (badcigar_filename)]
    else:  # single end
        reads_filename = indexed_reads_filenames[0]
        unmapped_reads_filename = unmapped_reads_filenames[0]
        steps = [
            "%s samse %s %s %s" % (bwa, reference_filename,
                                   reads_filename, unmapped_reads_filename)]

    # samtools 0.1.9 has no -@ threads option, hence the version split.
    # NOTE(review): rstrip('.bam') strips a character set, not a suffix —
    # safe for '.raw.srt.bam' names but fragile in general.
    if samtools_version == "0.1.9":
        steps.extend([
            "%s view -Su -" % (samtools),
            "%s sort - %s" % (samtools, raw_bam_filename.rstrip('.bam'))])  # samtools adds .bam
    else:
        steps.extend([
            "%s view -@%d -Su -" % (samtools, cpu_count()),
            "%s sort -@%d - %s" % (samtools, cpu_count(),
                                   raw_bam_filename.rstrip('.bam'))])  # samtools adds .bam

    logger.info("Running pipe: %s" % (steps))
    out, err = common.run_pipe(steps)

    if out:
        print(out)
    if err:
        logger.error("samtools error: %s" % (err))

    # Flagstat QC on the sorted BAM.
    with open(raw_bam_mapstats_filename, 'w') as fh:
        subprocess.check_call(
            shlex.split("%s flagstat %s" % (samtools, raw_bam_filename)),
            stdout=fh)
    print(subprocess.check_output('ls -l', shell=True))

    mapped_reads = dxpy.upload_local_file(raw_bam_filename)
    mapping_statistics = dxpy.upload_local_file(raw_bam_mapstats_filename)
    flagstat_qc = flagstat_parse(raw_bam_mapstats_filename)

    output = {
        'mapped_reads': dxpy.dxlink(mapped_reads),
        'mapping_statistics': dxpy.dxlink(mapping_statistics),
        'n_mapped_reads': flagstat_qc.get('mapped')[0]  # 0 is hi-q reads
    }
    logger.info("Returning from postprocess with output: %s" % (output))
    return output
def main(input_bam, paired_end, samtools_version, samtools_params,
         picard_version, scrub, debug):
    """Filter, deduplicate and QC a raw BAM.

    Filters out unmapped/secondary/QC-fail/duplicate-flagged reads
    (FLAG 1804; paired-end additionally requires properly-paired, -f 2,
    with a fixmate repair round), marks duplicates with Picard, removes
    them, indexes the final BAM, computes flagstat and PBC library-
    complexity metrics, and optionally scrubs sequence data from the BAMs.

    Args:
        input_bam: local filename of the raw (coordinate-sorted) BAM.
        paired_end: bool controlling the paired-end filter path.
        samtools_version / picard_version: keys into SAMTOOLS_PATH /
            PICARD_PATH.
        samtools_params: extra flags spliced into the first `samtools view`.
        scrub: if true, also run the scrub applet on raw + filtered BAMs.
        debug: if true, raise logging to DEBUG.

    Returns:
        dict of local filenames and stringified QC metrics (also written to
        a ".parse" sidecar file).
    """
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    samtools = SAMTOOLS_PATH.get(samtools_version)
    assert samtools, "samtools version %s is not supported" % (
        samtools_version)
    picard = PICARD_PATH.get(picard_version)
    assert picard, "picard version %s is not supported" % (picard_version)
    logger.info("In postprocess with samtools %s and picard %s"
                % (samtools, picard))

    raw_bam_file = input_bam
    raw_bam_filename = raw_bam_file
    # NOTE(review): rstrip('.bam') strips a trailing run of the characters
    # '.', 'b', 'a', 'm' — not the literal suffix.  Fragile for basenames
    # ending in those characters.
    raw_bam_basename = raw_bam_file.rstrip('.bam')
    raw_bam_file_mapstats_filename = raw_bam_basename + '.flagstat.qc'

    subprocess.check_output('set -x; ls -l', shell=True)

    # Generate initial mapping statistics
    with open(raw_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "%s flagstat %s" % (samtools, raw_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    filt_bam_prefix = raw_bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    if paired_end:
        # =============================
        # Remove  unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Only keep properly paired reads
        # Obtain name sorted BAM file
        # ==================
        tmp_filt_bam_prefix = "%s.tmp" % (filt_bam_prefix)  # was tmp.prefix.nmsrt
        tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
        out, err = common.run_pipe([
            # filter:  -F 1804 FlAG bits to exclude; -f 2 FLAG bits to reqire;
            # -q 30 exclude MAPQ < 30; -u uncompressed output
            # exclude FLAG 1804: unmapped, next segment unmapped, secondary
            # alignments, not passing platform q, PCR or optical duplicates
            # require FLAG 2: properly aligned
            "%s view -F 1804 -f 2 %s -u %s" % (
                samtools, samtools_params, raw_bam_filename),
            # sort:  -n sort by name; - take input from stdin;
            # out to specified filename
            # Will produce name sorted BAM
            "%s sort -n - %s" % (samtools, tmp_filt_bam_prefix)])
        if err:
            logger.error("samtools error: %s" % (err))
        # Remove orphan reads (pair was removed)
        # and read pairs mapping to different chromosomes
        # Obtain position sorted BAM
        subprocess.check_output('set -x; ls -l', shell=True)
        out, err = common.run_pipe([
            # fill in mate coordinates, ISIZE and mate-related flags
            # fixmate requires name-sorted alignment; -r removes secondary and
            # unmapped (redundant here because already done above?)
            # - send output to stdout
            "%s fixmate -r %s -" % (samtools, tmp_filt_bam_filename),
            # repeat filtering after mate repair
            "%s view -F 1804 -f 2 -u -" % (samtools),
            # produce the coordinate-sorted BAM
            "%s sort - %s" % (samtools, filt_bam_prefix)])
        subprocess.check_output('set -x; ls -l', shell=True)
    else:  # single-end data
        # =============================
        # Remove unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Obtain name sorted BAM file
        # ==================
        with open(filt_bam_filename, 'w') as fh:
            samtools_filter_command = (
                "%s view -F 1804 %s -b %s"
                % (samtools, samtools_params, raw_bam_filename))
            logger.info(samtools_filter_command)
            subprocess.check_call(shlex.split(samtools_filter_command),
                                  stdout=fh)

    # ========================
    # Mark duplicates
    # ======================
    tmp_filt_bam_filename = raw_bam_basename + ".dupmark.bam"
    dup_file_qc_filename = raw_bam_basename + ".dup.qc"
    picard_string = ' '.join([
        "java -Xmx4G -jar %s" % (picard),
        "INPUT=%s" % (filt_bam_filename),
        "OUTPUT=%s" % (tmp_filt_bam_filename),
        "METRICS_FILE=%s" % (dup_file_qc_filename),
        "VALIDATION_STRINGENCY=LENIENT",
        "ASSUME_SORTED=true",
        "REMOVE_DUPLICATES=false"
        ])
    logger.info(picard_string)
    subprocess.check_output(shlex.split(picard_string))
    # The duplicate-marked BAM replaces the filtered BAM from here on.
    os.rename(tmp_filt_bam_filename, filt_bam_filename)

    if paired_end:
        final_bam_prefix = raw_bam_basename + ".filt.srt.nodup"
    else:
        final_bam_prefix = raw_bam_basename + ".filt.nodup.srt"
    final_bam_filename = final_bam_prefix + ".bam"  # To be stored
    final_bam_index_filename = final_bam_filename + ".bai"  # To be stored
    # QC file
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"

    if paired_end:
        samtools_dedupe_command = \
            "%s view -F 1804 -f2 -b %s" % (samtools, filt_bam_filename)
    else:
        samtools_dedupe_command = \
            "%s view -F 1804 -b %s" % (samtools, filt_bam_filename)

    # ============================
    # Remove duplicates
    # Index final position sorted BAM
    # ============================
    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(shlex.split(samtools_dedupe_command),
                              stdout=fh)
    # Index final bam file
    # A fixed 1.3.1 samtools is used for indexing regardless of the
    # requested version.
    samtools2 = SAMTOOLS_PATH.get("1.3.1")
    samtools_index_command = \
        "%s index %s %s" % (samtools2, final_bam_filename,
                            final_bam_index_filename)
    logger.info(samtools_index_command)
    subprocess.check_output(shlex.split(samtools_index_command))

    # Generate mapping statistics
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "%s flagstat %s" % (samtools, final_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # =============================
    # Compute library complexity
    # =============================
    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics
    pbc_file_qc_filename = final_bam_prefix + ".pbc.qc"
    # PBC File output
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    if paired_end:
        steps = [
            "%s sort -no %s -" % (samtools, filt_bam_filename),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (filt_bam_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
    steps.extend([
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{if(m2){printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}else{printf "%d\t%d\t%d\t%d\t%f\t%f\t%s\n",mt,m0,m1,m2,m0/mt,m1/m0,"Inf"}}'"""
        ])
    out, err = common.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))

    output = {}
    logger.info("Uploading results files to the project")
    filtered_bam = final_bam_filename
    filtered_bam_index = final_bam_index_filename
    output.update({
        "filtered_bam": filtered_bam,
        "filtered_bam_index": filtered_bam_index
    })

    # If the scrub parameter is true, pass the bams to the scrub applet.
    if scrub:
        scrub_subjob = scrub_main([input_bam, filtered_bam])
        scrubbed_unfiltered_bam = scrub_subjob.get("scrubbed_bams")[0]
        scrubbed_filtered_bam = scrub_subjob.get("scrubbed_bams")[1]
        # Add the optional scrubbed outputs.
        output.update({
            "scrubbed_unfiltered_bam": scrubbed_unfiltered_bam,
            "scrubbed_filtered_bam": scrubbed_filtered_bam
        })

    # Upload or calculate the remaining outputs.
    filtered_mapstats = final_bam_file_mapstats_filename
    dup_file = dup_file_qc_filename
    pbc_file = pbc_file_qc_filename
    logger.info("Calcualting QC metrics")
    dup_qc = dup_parse(dup_file_qc_filename)
    pbc_qc = pbc_parse(pbc_file_qc_filename)
    initial_mapstats_qc = flagstat_parse(raw_bam_file_mapstats_filename)
    final_mapstats_qc = flagstat_parse(final_bam_file_mapstats_filename)
    # Paired-end counts reads; halve to report fragments.
    if paired_end:
        useable_fragments = final_mapstats_qc.get('in_total')[0] / 2
    else:
        useable_fragments = final_mapstats_qc.get('in_total')[0]
    # NOTE(review): the trailing commas on the next two lines make each
    # statement a one-element tuple expression — harmless but accidental.
    logger.info("initial_mapstats_qc: %s" % (initial_mapstats_qc)),
    logger.info("final_mapstats_qc: %s" % (final_mapstats_qc)),
    logger.info("dup_qc: %s" % (dup_qc))
    logger.info("pbc_qc: %s" % (pbc_qc))

    # Return links to the output files and values.
    output.update({
        "filtered_mapstats": filtered_mapstats,
        "dup_file_qc": dup_file,
        "pbc_file_qc": pbc_file,
        "paired_end": paired_end,
        "n_reads_input": str(initial_mapstats_qc.get('in_total')[0]),
        "picard_read_pairs_examined":
            str(dup_qc.get('read_pairs_examined')),
        "picard_unpaired_reads_examined":
            str(dup_qc.get('unpaired_reads_examined')),
        "picard_read_pair_duplicates":
            str(dup_qc.get('read_pair_duplicates')),
        "picard_unpaired_read_duplicates":
            str(dup_qc.get('unpaired_read_duplicates')),
        "useable_fragments": str(useable_fragments),
        "NRF": str(pbc_qc.get('NRF')),
        "PBC1": str(pbc_qc.get('PBC1')),
        "PBC2": str(pbc_qc.get('PBC2')),
        "duplicate_fraction": str(dup_qc.get('percent_duplication'))
    })

    # Write a human-readable key: value sidecar of the outputs.
    parse_file = final_bam_prefix + ".parse"
    with open(parse_file, "w") as fh:
        for key, val in output.items():
            if isinstance(val, list):
                fh.write(": ".join([key, ", ".join(val)]) + "\n")
            else:
                fh.write(": ".join([key, str(val)]) + "\n")

    logger.info("Exiting with output:\n%s" % (pformat(output)))
    return output
def main(input_tagAlign, paired_end):
    """Subsample a local tagAlign file and compute cross-correlation QC.

    Container variant of the xcor applet: the input is a local filename
    (not a platform file) and spp is pre-installed at SPP_TOOL_PATH.
    Subsamples up to NREADS non-chrM reads, runs spp to produce the
    cross-correlation score file and plot, and returns local filenames
    plus the parsed metrics.

    Args:
        input_tagAlign: local path to a gzipped tagAlign file.
        paired_end: bool; True uses the 'MATE1' infix and neutralizes the
            name/score columns before scoring.

    Returns:
        dict with local "CC_scores_file" and "CC_plot_file" paths, the
        pass-through "paired_end" flag, and float "RSC", "NSC" and
        "est_frag_len" metrics.
    """
    input_tagAlign_filename = input_tagAlign
    # Strip a literal '.gz' suffix.  The original rstrip('.gz') strips any
    # trailing run of the characters '.', 'g', 'z' and can over-strip
    # basenames that legitimately end in those characters.
    if input_tagAlign_filename.endswith('.gz'):
        input_tagAlign_basename = input_tagAlign_filename[:-len('.gz')]
    else:
        input_tagAlign_basename = input_tagAlign_filename
    uncompressed_TA_filename = input_tagAlign_basename
    out, err = common.run_pipe(['gzip -d %s' % (input_tagAlign_filename)])

    # =================================
    # Subsample tagAlign file
    # ================================
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    # Floor division keeps the %d argument an int on both Python 2 and 3.
    subsampled_TA_filename = \
        input_tagAlign_basename + \
        ".sample.%d.%s.tagAlign.gz" % (NREADS // 1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (uncompressed_TA_filename),
        # --random-source makes the shuffle deterministic for a given input
        'shuf -n %d --random-source=%s' % (NREADS, uncompressed_TA_filename)
    ]
    if paired_end:
        # Neutralize the name and score columns for the MATE1 representation
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"
    # CC_SCORE FILE format
    # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab>
    # PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr
    # <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag

    # spp will be installed in the docker container, so this is not needed
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    # refer cwd for testing
    # does this really have to be with _no_dups
    run_spp_command = SPP_TOOL_PATH
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s"
        % (run_spp_command, subsampled_TA_filename, cpu_count(),
           CC_plot_filename, CC_scores_filename)
    ])
    # The sed pass drops everything after a comma within a field —
    # presumably spp writes comma-separated candidate values and only the
    # first is wanted downstream (TODO: confirm against xcor_parse).
    out, err = common.run_pipe(
        [r"""sed -r 's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "CC_scores_file": CC_scores_filename,
        "CC_plot_file": CC_plot_filename,
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    return output
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, spp_version, as_file=None, prefix=None):
    """Call peaks with spp on an experiment/control tagAlign pair (dxpy applet).

    Downloads the inputs, reads the fragment length from the cross-correlation
    scores file, runs run_spp(.R|_nodups.R), repairs peak coordinates
    (scientific notation, negative starts, off-chromosome ends), optionally
    builds a bigBed, and uploads/returns the result files as dx links.

    Fix: the first-50-peaks preview was captured with subprocess.check_output
    but never printed; it is now actually emitted to the log.
    """
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(
        xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    if not prefix:
        # NOTE(review): rstrip strips trailing characters from the given set,
        # not a literal suffix — assumes basenames don't end in those letters.
        output_filename_prefix = \
            experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(subprocess.check_output(
        'ls -l', shell=True, stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    fraglen_column = 3
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        logger.info("Read fragment length: %d" % (fragment_length))

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename,
           npeaks, fragment_length, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coodinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the ($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly chrM) and will cause slopBed
    # to halt at that line, truncating the output of the pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries of
    # the chromosome
    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'
    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (chrom_sizes_filename,
                                 fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates"
        % (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    # Bug fix: the captured head output was previously discarded.
    print(subprocess.check_output(
        'head -50 %s' % (fix_coordinate_peaks_filename), shell=True))

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename,
                          chrom_sizes_filename,
                          as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(
            shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
def postprocess(indexed_reads, unmapped_reads, reference_tar, bwa_version,
                samtools_version, debug):
    """Turn bwa aln index files plus the raw reads into a sorted BAM (dxpy).

    Runs `bwa sampe` (paired-end, with bad-CIGAR read removal) or `bwa samse`
    (single-end), pipes into samtools view/sort, writes a flagstat QC file,
    uploads both, and returns dx links plus the high-quality mapped-read count.
    """
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    # Resolve the pinned tool binaries; unsupported versions abort early.
    samtools = SAMTOOLS_PATH.get(samtools_version)
    assert samtools, "samtools version %s is not supported" % (
        samtools_version)
    bwa = BWA_PATH.get(bwa_version)
    assert bwa, "BWA version %s is not supported" % (bwa_version)
    logger.info("In postprocess with samtools %s and bwa %s" % (samtools, bwa))

    # Download the .sai index files and their matching raw-read files,
    # keeping the two lists index-aligned (i-th entries are a pair).
    indexed_reads_filenames = []
    unmapped_reads_filenames = []
    for i, reads in enumerate(indexed_reads):
        read_pair_number = i + 1
        fn = dxpy.describe(reads)['name']
        logger.info("indexed_reads %d: %s" % (read_pair_number, fn))
        indexed_reads_filenames.append(fn)
        dxpy.download_dxfile(reads, fn)
        unmapped = unmapped_reads[i]
        fn = dxpy.describe(unmapped)['name']
        logger.info("unmapped reads %d: %s" % (read_pair_number, fn))
        unmapped_reads_filenames.append(fn)
        dxpy.download_dxfile(unmapped, fn)

    reference_tar_filename = dxpy.describe(reference_tar)['name']
    logger.info("reference_tar: %s" % (reference_tar_filename))
    dxpy.download_dxfile(reference_tar, reference_tar_filename)
    # extract the reference files from the tar
    reference_dirname = 'reference_files'
    reference_filename = \
        resolve_reference(reference_tar_filename, reference_dirname)
    logger.info("Using reference file: %s" % (reference_filename))

    # Two indexed-read files means paired-end sequencing.
    paired_end = len(indexed_reads) == 2

    if paired_end:
        r1_basename = strip_extensions(unmapped_reads_filenames[0],
                                       STRIP_EXTENSIONS)
        r2_basename = strip_extensions(unmapped_reads_filenames[1],
                                       STRIP_EXTENSIONS)
        reads_basename = r1_basename + r2_basename
    else:
        reads_basename = strip_extensions(unmapped_reads_filenames[0],
                                          STRIP_EXTENSIONS)
    raw_bam_filename = '%s.raw.srt.bam' % (reads_basename)
    raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (reads_basename)

    if paired_end:
        reads1_filename = indexed_reads_filenames[0]
        reads2_filename = indexed_reads_filenames[1]
        unmapped_reads1_filename = unmapped_reads_filenames[0]
        unmapped_reads2_filename = unmapped_reads_filenames[1]
        raw_sam_filename = reads_basename + ".raw.sam"
        badcigar_filename = "badreads.tmp"
        # First pass: run sampe, tee the raw SAM to disk, and collect the
        # names of reads whose CIGAR length (ignoring deletions) disagrees
        # with the sequence length — these break downstream tools.
        steps = [
            "%s sampe -P %s %s %s %s %s" % (bwa, reference_filename,
                                            reads1_filename, reads2_filename,
                                            unmapped_reads1_filename,
                                            unmapped_reads2_filename),
            "tee %s" % (raw_sam_filename),
            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
            "sort",
            "uniq"
        ]
        out, err = common.run_pipe(steps, badcigar_filename)
        print(out)
        if err:
            logger.error("sampe error: %s" % (err))
        # Second pass: replay the cached SAM minus the bad-CIGAR read names.
        steps = [
            "cat %s" % (raw_sam_filename),
            "grep -v -F -f %s" % (badcigar_filename)
        ]
    else:  # single end
        reads_filename = indexed_reads_filenames[0]
        unmapped_reads_filename = unmapped_reads_filenames[0]
        steps = [
            "%s samse %s %s %s" % (bwa, reference_filename,
                                   reads_filename, unmapped_reads_filename)
        ]

    # Old samtools (0.1.9) lacks the -@ threads option.
    # NOTE(review): rstrip('.bam') strips trailing 'b'/'a'/'m'/'.' chars,
    # not the literal extension — assumes basenames never end in those.
    if samtools_version == "0.1.9":
        steps.extend([
            "%s view -Su -" % (samtools),
            "%s sort - %s" % (samtools, raw_bam_filename.rstrip('.bam'))
        ])  # samtools adds .bam
    else:
        steps.extend([
            "%s view -@%d -Su -" % (samtools, cpu_count()),
            "%s sort -@%d - %s" % (samtools, cpu_count(),
                                   raw_bam_filename.rstrip('.bam'))
        ])  # samtools adds .bam
    logger.info("Running pipe: %s" % (steps))
    out, err = common.run_pipe(steps)
    if out:
        print(out)
    if err:
        logger.error("samtools error: %s" % (err))

    # Generate mapping statistics for the raw sorted BAM.
    with open(raw_bam_mapstats_filename, 'w') as fh:
        subprocess.check_call(shlex.split("%s flagstat %s"
                                          % (samtools, raw_bam_filename)),
                              stdout=fh)

    print(subprocess.check_output('ls -l', shell=True))

    mapped_reads = dxpy.upload_local_file(raw_bam_filename)
    mapping_statistics = dxpy.upload_local_file(raw_bam_mapstats_filename)
    flagstat_qc = flagstat_parse(raw_bam_mapstats_filename)

    output = {
        'mapped_reads': dxpy.dxlink(mapped_reads),
        'mapping_statistics': dxpy.dxlink(mapping_statistics),
        'n_mapped_reads': flagstat_qc.get('mapped')[0]  # 0 is hi-q reads
    }
    logger.info("Returning from postprocess with output: %s" % (output))
    return output
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, spp_version, as_file=None, prefix=None,
         fragment_length=None):
    """Call peaks with spp, optionally overriding the fragment length (dxpy).

    Like the other spp applet entry point, but if `fragment_length` is given
    it overrides the estimate read from the cross-correlation scores file,
    and spp is assumed to be pre-installed in the image.

    Fix: the first-50-peaks preview was captured with subprocess.check_output
    but never printed; it is now actually emitted to the log.
    """
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = \
            experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(
        subprocess.check_output('ls -l', shell=True,
                                stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    # if fragment_length is provided, use that. Else read
    # fragment length from xcor file
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % (fraglen))
    else:
        fraglen_column = 3
        with open(xcor_scores_input_filename, 'r') as f:
            line = f.readline()
            fraglen = line.split('\t')[fraglen_column - 1]
            logger.info("Read fragment length: %s" % (fraglen))

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    run_spp = '/phantompeakqualtools/run_spp.R'
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%s -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename,
           npeaks, fraglen, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coodinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the ($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly chrM) and will cause slopBed
    # to halt at that line, truncating the output of the pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries of
    # the chromosome
    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'
    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (chrom_sizes_filename,
                                 fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates"
        % (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    # Bug fix: the captured head output was previously discarded.
    print(subprocess.check_output(
        'head -50 %s' % (fix_coordinate_peaks_filename), shell=True))

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename,
                          chrom_sizes_filename,
                          as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(
            shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
def main(input_tagAlign, paired_end, spp_version):
    """Subsample a tagAlign and compute cross-correlation QC (dxpy applet).

    Downloads the gzipped tagAlign, subsamples it, runs the pre-installed
    run_spp.R, uploads the CC scores and plot, and returns dx links plus
    parsed NSC/RSC/estimated-fragment-length metrics.
    """
    input_tagAlign_file = dxpy.DXFile(input_tagAlign)
    input_tagAlign_filename = input_tagAlign_file.name
    # NOTE(review): rstrip('.gz') strips trailing 'g'/'z'/'.' characters,
    # not the literal suffix — assumes names never end in those letters.
    input_tagAlign_basename = input_tagAlign_file.name.rstrip('.gz')
    dxpy.download_dxfile(input_tagAlign_file.get_id(), input_tagAlign_filename)
    uncompressed_TA_filename = input_tagAlign_basename
    # gzip -d replaces the .gz file with the uncompressed one in place.
    out, err = common.run_pipe(['gzip -d %s' % (input_tagAlign_filename)])

    # =================================
    # Subsample tagAlign file
    # =================================
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_tagAlign_basename + \
        ".sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    steps = [
        # Drop mitochondrial reads before sampling.
        'grep -v "chrM" %s' % (uncompressed_TA_filename),
        # Deterministic shuffle: the input file itself seeds the RNG.
        'shuf -n %d --random-source=%s' % (NREADS, uncompressed_TA_filename)]
    if paired_end:
        # Blank out name and score columns for the mate-1 subsample.
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))

    # run spp (pre-installed in the container image)
    run_spp_command = '/phantompeakqualtools/run_spp.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s"
        % (run_spp_command, subsampled_TA_filename, cpu_count(),
           CC_plot_filename, CC_scores_filename)])
    # spp may emit comma-separated candidate values per field; keep the first.
    out, err = common.run_pipe([
        r"""sed -r 's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    return output
def main(input_bam, paired_end, samtools_params, debug):
    """Filter and deduplicate a raw BAM and compute library-complexity QC.

    Removes unmapped/low-MAPQ/failed reads (plus orphan repair for paired-end),
    marks duplicates with Picard, produces the final deduped indexed BAM,
    flagstat QC, duplicate metrics and PBC metrics, then uploads everything
    and returns dx links plus the parsed QC values.
    """
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # input_json is no longer used
    # # if there is input_JSON, it over-rides any explicit parameters
    # if input_JSON:
    #     if 'input_bam' in input_JSON:
    #         input_bam = input_JSON['input_bam']
    #     if 'paired_end' in input_JSON:
    #         paired_end = input_JSON['paired_end']
    #     if 'samtools_params' in input_JSON:
    #         samtools_params = input_JSON['samtools_params']
    # this is now handled by the platform input validator
    # if not input_bam:
    #     logger.error('input_bam is required')
    #     raise Exception
    # assert paired_end is not None, 'paired_end is required, explicitly or in input_JSON'

    raw_bam_file = dxpy.DXFile(input_bam)
    raw_bam_filename = raw_bam_file.name
    # NOTE(review): rstrip('.bam') strips trailing 'b'/'a'/'m'/'.' chars,
    # not the literal extension — assumes basenames never end in those.
    raw_bam_basename = raw_bam_file.name.rstrip('.bam')
    dxpy.download_dxfile(raw_bam_file.get_id(), raw_bam_filename)
    subprocess.check_output('set -x; ls -l', shell=True)

    filt_bam_prefix = raw_bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    if paired_end:
        # =============================
        # Remove unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Only keep properly paired reads
        # Obtain name sorted BAM file
        # ==================
        tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)  # was tmp.prefix.nmsrt
        tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
        out, err = common.run_pipe([
            # filter: -F 1804 FlAG bits to exclude; -f 2 FLAG bits to reqire;
            # -q 30 exclude MAPQ < 30; -u uncompressed output
            # exclude FLAG 1804: unmapped, next segment unmapped, secondary
            # alignments, not passing platform q, PCR or optical duplicates
            # require FLAG 2: properly aligned
            "samtools view -F 1804 -f 2 %s -u %s"
            % (samtools_params, raw_bam_filename),
            # sort: -n sort by name; - take input from stdin;
            # out to specified filename
            # Will produce name sorted BAM
            "samtools sort -n - %s" % (tmp_filt_bam_prefix)
        ])
        if err:
            logger.error("samtools error: %s" % (err))
        # Remove orphan reads (pair was removed)
        # and read pairs mapping to different chromosomes
        # Obtain position sorted BAM
        subprocess.check_output('set -x; ls -l', shell=True)
        out, err = common.run_pipe([
            # fill in mate coordinates, ISIZE and mate-related flags
            # fixmate requires name-sorted alignment; -r removes secondary and
            # unmapped (redundant here because already done above?)
            # - send output to stdout
            "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
            # repeat filtering after mate repair
            "samtools view -F 1804 -f 2 -u -",
            # produce the coordinate-sorted BAM
            "samtools sort - %s" % (filt_bam_prefix)
        ])
        subprocess.check_output('set -x; ls -l', shell=True)
    else:  # single-end data
        # =============================
        # Remove unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Obtain name sorted BAM file
        # ==================
        with open(filt_bam_filename, 'w') as fh:
            samtools_filter_command = ("samtools view -F 1804 %s -b %s"
                                       % (samtools_params, raw_bam_filename))
            logger.info(samtools_filter_command)
            subprocess.check_call(shlex.split(samtools_filter_command),
                                  stdout=fh)

    # ========================
    # Mark duplicates
    # ======================
    tmp_filt_bam_filename = raw_bam_basename + ".dupmark.bam"
    dup_file_qc_filename = raw_bam_basename + ".dup.qc"
    picard_string = ' '.join([
        "java -Xmx4G -jar /picard/MarkDuplicates.jar",
        "INPUT=%s" % (filt_bam_filename),
        "OUTPUT=%s" % (tmp_filt_bam_filename),
        "METRICS_FILE=%s" % (dup_file_qc_filename),
        "VALIDATION_STRINGENCY=LENIENT",
        "ASSUME_SORTED=true",
        "REMOVE_DUPLICATES=false"
    ])
    logger.info(picard_string)
    subprocess.check_output(shlex.split(picard_string))
    # The dup-marked BAM replaces the filtered BAM for the steps below.
    os.rename(tmp_filt_bam_filename, filt_bam_filename)

    if paired_end:
        final_bam_prefix = raw_bam_basename + ".filt.srt.nodup"
    else:
        final_bam_prefix = raw_bam_basename + ".filt.nodup.srt"
    final_bam_filename = final_bam_prefix + ".bam"  # To be stored
    final_bam_index_filename = final_bam_filename + ".bai"  # To be stored
    # QC file
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"

    if paired_end:
        samtools_dedupe_command = \
            "samtools view -F 1804 -f2 -b %s" % (filt_bam_filename)
    else:
        samtools_dedupe_command = \
            "samtools view -F 1804 -b %s" % (filt_bam_filename)

    # ============================
    # Remove duplicates
    # Index final position sorted BAM
    # ============================
    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(shlex.split(samtools_dedupe_command), stdout=fh)
    # Index final bam file
    samtools_index_command = \
        "samtools index %s %s" % (final_bam_filename,
                                  final_bam_index_filename)
    logger.info(samtools_index_command)
    subprocess.check_output(shlex.split(samtools_index_command))

    # Generate mapping statistics
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "samtools flagstat %s" % (final_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # =============================
    # Compute library complexity
    # =============================
    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics
    pbc_file_qc_filename = final_bam_prefix + ".pbc.qc"
    # PBC File output
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    if paired_end:
        steps = [
            "samtools sort -no %s -" % (filt_bam_filename),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""
        ]
    else:
        steps = [
            "bamToBed -i %s" % (filt_bam_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""
        ]
    steps.extend([
        # TODO this should be implemented as an explicit list of allowable
        # names, so that mapping can be done to a complete reference
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        # Tally total/distinct/single/double occurrences and emit the ratios.
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
    ])
    out, err = common.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))

    logger.info("Uploading results files to the project")
    filtered_bam = dxpy.upload_local_file(final_bam_filename)
    filtered_bam_index = dxpy.upload_local_file(final_bam_index_filename)
    filtered_mapstats = \
        dxpy.upload_local_file(final_bam_file_mapstats_filename)
    dup_file = dxpy.upload_local_file(dup_file_qc_filename)
    pbc_file = dxpy.upload_local_file(pbc_file_qc_filename)

    dup_qc = dup_parse(dup_file_qc_filename)
    pbc_qc = pbc_parse(pbc_file_qc_filename)
    logger.info("dup_qc: %s" % (dup_qc))
    logger.info("pbc_qc: %s" % (pbc_qc))

    # Return links to the output files
    output = {
        "filtered_bam": dxpy.dxlink(filtered_bam),
        "filtered_bam_index": dxpy.dxlink(filtered_bam_index),
        "filtered_mapstats": dxpy.dxlink(filtered_mapstats),
        "dup_file_qc": dxpy.dxlink(dup_file),
        "pbc_file_qc": dxpy.dxlink(pbc_file),
        "paired_end": paired_end,
        "NRF": pbc_qc.get('NRF'),
        "PBC1": pbc_qc.get('PBC1'),
        "PBC2": pbc_qc.get('PBC2'),
        "duplicate_fraction": dup_qc.get('percent_duplication')
    }
    logger.info("Exiting with output:\n%s" % (pprint(output)))
    return output
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, as_file=None, prefix=None):
    """Legacy spp peak-calling applet entry point (fixed spp 1.10.1 tarball).

    Fixes applied:
    - Python-2-only `print` statements converted to the `print(...)` form
      used by every other entry point in this file (works on 2 and 3).
    - The coordinate-fixing awk now clamps negative start coordinates to 0
      (`($2>0)?$2:0`), matching the other spp entry points in this file:
      spp sometimes calls peaks with a negative start (particularly chrM),
      which halts slopBed and truncates the pipe's output.
    """
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = \
            experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    print(subprocess.check_output('ls -l', shell=True,
                                  stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    fraglen_column = 3
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        print("Read fragment length: %d" % (fragment_length))

    # run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = '/phantompeakqualtools/spp_1.10.1.tar.gz'
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    # install spp
    subprocess.check_call('ls -l', shell=True)
    subprocess.check_call(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename,
           npeaks, fragment_length, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    print(spp_command)
    # process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    # for line in iter(process.stdout.readline, ''):
    #     sys.stdout.write(line)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coodinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the ($2>0)?$2:0 clamp prevents negative start coordinates (which spp
    # can emit, particularly on chrM) from halting slopBed mid-pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries
    # of the chromosome
    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'
    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (chrom_sizes_filename,
                                 fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print("%s peaks called by spp" % (n_spp_peaks))
    print("%s of those peaks removed due to bad coordinates"
          % (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    print(subprocess.check_output('head -50 %s'
                                  % (fix_coordinate_peaks_filename),
                                  shell=True, stderr=subprocess.STDOUT))

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename,
                                          chrom_sizes_filename,
                                          as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        print("Returning peaks with fixed coordinates")
        print(subprocess.check_output(
            shlex.split('gzip %s' % (fix_coordinate_peaks_filename))))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    print(subprocess.check_output('ls -l', shell=True,
                                  stderr=subprocess.STDOUT))
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)
    return output
def main(input_bam, paired_end, samtools_params, scrub, debug):
    """Filter and deduplicate a raw BAM, compute QC metrics, and upload results.

    Applet entry point.  Removes unmapped/low-quality/duplicate reads with
    samtools and Picard MarkDuplicates, indexes the final BAM, computes
    flagstat/PBC library-complexity metrics, optionally launches a 'scrub'
    subjob, and returns dxlinks plus QC values.

    NOTE(review): the parameter name `scrub` shadows the module-level
    `scrub()` function defined earlier in this file — rename one of them
    to avoid confusion.

    :param input_bam: dxlink/ID of the raw BAM file on the platform
    :param paired_end: bool; selects the PE vs SE filtering strategy
    :param samtools_params: extra arguments spliced into the samtools view
        filter commands
    :param scrub: bool; if True, run the 'scrub' applet on the raw and
        filtered BAMs and include the scrubbed outputs
    :param debug: bool; enables DEBUG-level logging
    :returns: dict of dxlinks and stringified QC metrics
    """
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    raw_bam_file = dxpy.DXFile(input_bam)
    raw_bam_filename = raw_bam_file.name
    # NOTE(review): rstrip removes a trailing *character set*, not the
    # literal '.bam' suffix — a name like 'samba.bam' would lose extra
    # trailing characters.  TODO confirm intended and consider a proper
    # suffix strip.
    raw_bam_basename = raw_bam_file.name.rstrip('.bam')
    raw_bam_file_mapstats_filename = raw_bam_basename + '.flagstat.qc'
    dxpy.download_dxfile(raw_bam_file.get_id(), raw_bam_filename)
    subprocess.check_output('set -x; ls -l', shell=True)

    # Generate initial mapping statistics
    with open(raw_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "samtools flagstat %s" % (raw_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    filt_bam_prefix = raw_bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    if paired_end:
        # =============================
        # Remove unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Only keep properly paired reads
        # Obtain name sorted BAM file
        # ==================
        tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)  # was tmp.prefix.nmsrt
        tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
        out, err = common.run_pipe([
            # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
            # -q 30 exclude MAPQ < 30; -u uncompressed output
            # exclude FLAG 1804: unmapped, next segment unmapped, secondary
            # alignments, not passing platform q, PCR or optical duplicates
            # require FLAG 2: properly aligned
            "samtools view -F 1804 -f 2 %s -u %s" % (
                samtools_params, raw_bam_filename),
            # sort: -n sort by name; - take input from stdin;
            # out to specified filename
            # Will produce name sorted BAM
            "samtools sort -n - %s" % (tmp_filt_bam_prefix)])
        if err:
            logger.error("samtools error: %s" % (err))
        # Remove orphan reads (pair was removed)
        # and read pairs mapping to different chromosomes
        # Obtain position sorted BAM
        subprocess.check_output('set -x; ls -l', shell=True)
        out, err = common.run_pipe([
            # fill in mate coordinates, ISIZE and mate-related flags
            # fixmate requires name-sorted alignment; -r removes secondary and
            # unmapped (redundant here because already done above?)
            # - send output to stdout
            "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
            # repeat filtering after mate repair
            "samtools view -F 1804 -f 2 -u -",
            # produce the coordinate-sorted BAM
            "samtools sort - %s" % (filt_bam_prefix)])
        subprocess.check_output('set -x; ls -l', shell=True)
    else:  # single-end data
        # =============================
        # Remove unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Obtain name sorted BAM file
        # ==================
        with open(filt_bam_filename, 'w') as fh:
            samtools_filter_command = (
                "samtools view -F 1804 %s -b %s"
                % (samtools_params, raw_bam_filename)
                )
            logger.info(samtools_filter_command)
            subprocess.check_call(
                shlex.split(samtools_filter_command),
                stdout=fh)

    # ========================
    # Mark duplicates
    # ======================
    tmp_filt_bam_filename = raw_bam_basename + ".dupmark.bam"
    dup_file_qc_filename = raw_bam_basename + ".dup.qc"
    picard_string = ' '.join([
        "java -Xmx4G -jar /picard/MarkDuplicates.jar",
        "INPUT=%s" % (filt_bam_filename),
        "OUTPUT=%s" % (tmp_filt_bam_filename),
        "METRICS_FILE=%s" % (dup_file_qc_filename),
        "VALIDATION_STRINGENCY=LENIENT",
        "ASSUME_SORTED=true",
        "REMOVE_DUPLICATES=false"
        ])
    logger.info(picard_string)
    subprocess.check_output(shlex.split(picard_string))
    # Replace the filtered BAM with the duplicate-marked BAM.
    os.rename(tmp_filt_bam_filename, filt_bam_filename)

    if paired_end:
        final_bam_prefix = raw_bam_basename + ".filt.srt.nodup"
    else:
        final_bam_prefix = raw_bam_basename + ".filt.nodup.srt"
    final_bam_filename = final_bam_prefix + ".bam"  # To be stored
    final_bam_index_filename = final_bam_filename + ".bai"  # To be stored
    # QC file
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"

    if paired_end:
        samtools_dedupe_command = \
            "samtools view -F 1804 -f2 -b %s" % (filt_bam_filename)
    else:
        samtools_dedupe_command = \
            "samtools view -F 1804 -b %s" % (filt_bam_filename)

    # ============================
    # Remove duplicates
    # Index final position sorted BAM
    # ============================
    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(
            shlex.split(samtools_dedupe_command),
            stdout=fh)
    # Index final bam file
    samtools_index_command = \
        "samtools index %s %s" % (
            final_bam_filename, final_bam_index_filename)
    logger.info(samtools_index_command)
    subprocess.check_output(shlex.split(samtools_index_command))

    # Generate mapping statistics
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "samtools flagstat %s" % (final_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # =============================
    # Compute library complexity
    # =============================
    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics
    pbc_file_qc_filename = final_bam_prefix + ".pbc.qc"
    # PBC File output
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    if paired_end:
        steps = [
            "samtools sort -no %s -" % (filt_bam_filename),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (filt_bam_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
    steps.extend([
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
        ])
    out, err = common.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))

    output = {}
    logger.info("Uploading results files to the project")
    filtered_bam = dxpy.upload_local_file(final_bam_filename)
    filtered_bam_index = dxpy.upload_local_file(final_bam_index_filename)
    output.update({
        "filtered_bam": dxpy.dxlink(filtered_bam),
        "filtered_bam_index": dxpy.dxlink(filtered_bam_index)
    })

    # If the scrub parameter is true, pass the bams to the scrub applet.
    if scrub:
        scrub_applet = dxpy.find_one_data_object(
            classname='applet',
            name='scrub',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
        scrub_subjob = \
            scrub_applet.run(
                {"input_bams": [input_bam, dxpy.dxlink(filtered_bam)]},
                name='Scrub bams')
        scrubbed_unfiltered_bam = scrub_subjob.get_output_ref("scrubbed_bams", index=0)
        scrubbed_filtered_bam = scrub_subjob.get_output_ref("scrubbed_bams", index=1)
        # Add the optional scrubbed outputs.
        output.update({
            "scrubbed_unfiltered_bam": dxpy.dxlink(scrubbed_unfiltered_bam),
            "scrubbed_filtered_bam": dxpy.dxlink(scrubbed_filtered_bam)
        })

    # Upload or calculate the remaining outputs.
    filtered_mapstats = \
        dxpy.upload_local_file(final_bam_file_mapstats_filename)
    dup_file = dxpy.upload_local_file(dup_file_qc_filename)
    pbc_file = dxpy.upload_local_file(pbc_file_qc_filename)
    logger.info("Calcualting QC metrics")
    dup_qc = dup_parse(dup_file_qc_filename)
    pbc_qc = pbc_parse(pbc_file_qc_filename)
    initial_mapstats_qc = flagstat_parse(raw_bam_file_mapstats_filename)
    final_mapstats_qc = flagstat_parse(final_bam_file_mapstats_filename)
    if paired_end:
        # NOTE(review): Python 2 integer division — intentional floor when
        # converting read count to fragment pairs.
        useable_fragments = final_mapstats_qc.get('in_total')[0]/2
    else:
        useable_fragments = final_mapstats_qc.get('in_total')[0]
    # NOTE(review): the stray trailing commas on the next two lines make
    # each statement a 1-tuple expression — harmless, but unintended.
    logger.info("initial_mapstats_qc: %s" % (initial_mapstats_qc)),
    logger.info("final_mapstats_qc: %s" % (final_mapstats_qc)),
    logger.info("dup_qc: %s" % (dup_qc))
    logger.info("pbc_qc: %s" % (pbc_qc))

    # Return links to the output files and values.
    output.update({
        "filtered_mapstats": dxpy.dxlink(filtered_mapstats),
        "dup_file_qc": dxpy.dxlink(dup_file),
        "pbc_file_qc": dxpy.dxlink(pbc_file),
        "paired_end": paired_end,
        "n_reads_input": str(initial_mapstats_qc.get('in_total')[0]),
        "picard_read_pairs_examined": str(dup_qc.get('read_pairs_examined')),
        "picard_unpaired_reads_examined": str(dup_qc.get('unpaired_reads_examined')),
        "picard_read_pair_duplicates": str(dup_qc.get('read_pair_duplicates')),
        "picard_unpaired_read_duplicates": str(dup_qc.get('unpaired_read_duplicates')),
        "useable_fragments": str(useable_fragments),
        "NRF": str(pbc_qc.get('NRF')),
        "PBC1": str(pbc_qc.get('PBC1')),
        "PBC2": str(pbc_qc.get('PBC2')),
        "duplicate_fraction": str(dup_qc.get('percent_duplication'))
    })
    logger.info("Exiting with output:\n%s" % (pformat(output)))
    return output
def main(input_tags, prefix=None):
    """Split a gzipped tagAlign (SE) or BEDPE (PE) file into two pseudoreplicates.

    The input is sniffed from its first line (6 columns => single-end
    tagAlign, 10 columns => paired-end BEDPE), shuffled deterministically
    (seeded by the input file itself) and split into two halves; paired-end
    halves are converted to standard tagAlign records.

    :param input_tags: path/name of the gzipped tagAlign or BEDPE file
    :param prefix: optional basename override for the output files
    :returns: dict with the two pseudoreplicate output filenames
    :raises IOError: if the first line matches neither format
    """
    input_tags_file = input_tags
    input_tags_filename = input_tags_file
    # introspect the file to determine tagAlign (thus SE) or BEDPE (thus PE)
    # strip extension as appropriate
    subprocess.check_output('ls', shell=True)
    with gzip.open(input_tags_filename) as f:
        firstline = f.readline()
    logger.info('First line of input_tags:\n%s' % (firstline))
    se_cols = 6
    pe_cols = 10
    firstline = firstline.decode("utf-8")
    if re.match('^(\S+[\t\n]){%d}$' % (se_cols), firstline):
        paired_end = False
        # Strip the literal '.tagAlign.gz' suffix.  (str.rstrip, used
        # previously, removes a trailing *character set* and corrupted
        # basenames whose stem ends in any of '.tagAlign.gz''s characters.)
        input_tags_basename = prefix or re.sub(
            r'\.tagAlign\.gz$', '', input_tags_filename)
        filename_infix = 'SE'
        logger.info("Detected single-end data")
    elif re.match('^(\S+[\t\n]){%d}$' % (pe_cols), firstline):
        paired_end = True
        # Strip the literal '.bedpe.gz' suffix (see note above).
        input_tags_basename = prefix or re.sub(
            r'\.bedpe\.gz$', '', input_tags_filename)
        filename_infix = 'PE2SE'
        logger.info("Detected paired-end data")
    else:
        raise IOError(
            "%s is neither a BEDPE or tagAlign file" % (input_tags_filename))

    pr_ta_filenames = \
        [input_tags_basename + ".%s.pr1.tagAlign.gz" % (filename_infix),
         input_tags_basename + ".%s.pr2.tagAlign.gz" % (filename_infix)]

    # count lines in the file
    out, err = common.run_pipe(
        ['gzip -dc %s' % (input_tags_filename),
         'wc -l'])
    # number of lines in each split; '//' keeps the intended floor division
    # under both Python 2 and Python 3
    nlines = (int(out) + 1) // 2
    # Shuffle and split BEDPE file into 2 equal parts
    # by using the input to seed shuf we ensure multiple runs with the same
    # input will produce the same output
    # Produces two files named splits_prefix0n, n=1,2
    splits_prefix = 'temp_split'
    out, err = common.run_pipe([
        'gzip -dc %s' % (input_tags_filename),
        'shuf --random-source=%s' % (input_tags_filename),
        'split -a 2 -d -l %d - %s' % (nlines, splits_prefix)
    ])
    # Convert read pairs to reads into standard tagAlign file
    for i, index in enumerate(['00', '01']):  # could be made multi-threaded
        steps = ['cat %s' % (splits_prefix + index)]
        if paired_end:
            # one BEDPE line (two mates) -> two tagAlign lines
            steps.extend([
                r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""
            ])
        steps.extend(['gzip -cn'])
        out, err = common.run_pipe(steps, outfile=pr_ta_filenames[i])
    pseudoreplicate1_file = pr_ta_filenames[0]
    pseudoreplicate2_file = pr_ta_filenames[1]

    output = {
        "pseudoreplicate1": pseudoreplicate1_file,
        "pseudoreplicate2": pseudoreplicate2_file
    }
    return output
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks,
         pooledpr2_peaks, chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Select replicated peaks from pooled peaks via IDR-style overlap rules.

    Applet entry point.  A pooled peak is kept if it overlaps (>=50% of
    either peak's length) peaks in both true replicates OR in both pooled
    pseudoreplicates; remaining pooled peaks are "rejected".  Produces
    gzipped peak lists plus bigBed versions and peak counts.

    :param rep1_peaks..pooledpr2_peaks: dxlinks to the five peak files
    :param chrom_sizes: dxlink to the chrom.sizes file
    :param as_file: dxlink to the autoSql (.as) file for bedToBigBed
    :param peak_type: 'narrowPeak', 'gappedPeak' or 'broadPeak'
    :param prefix: optional output basename override
    :param rep1_signal/rep2_signal/pooled_signal: optional pass-throughs
    :returns: dict of dxlinks, counts, and pass-through signals
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances
    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    #Input filenames - necessary to define each explicitly because input files could have the same name, in which case subsequent
    #file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks.name)  #strip off the peak and compression extensions
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    # NOTE(review): the triple-quoted string below is dead (commented-out)
    # code kept for reference — it is an unused expression statement.
    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''
    #the only difference between the peak_types is how the extra columns are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # This assert is only reached when peak_type matched none of the
        # branches above, so it always fails — it is used to raise with a
        # descriptive message.
        assert peak_type in [
            'narrowPeak',
            'gappedPeak',
            'broadPeak'
            ], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" % (
        common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" % (
        common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
         'sort -u'],
        overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" % (
        common.count_lines(overlapping_peaks_fn))

    #rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print "%d peaks were rejected" % (common.count_lines(rejected_peaks_fn))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    #make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        'npeaks_rejected': npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place. Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
def main(input_bam, paired_end, spp_version):
    """Convert a BAM to tagAlign (and BEDPE for PE) and run spp cross-correlation QC.

    Applet entry point.  Builds a gzipped tagAlign from the input BAM,
    additionally a name-sorted BEDPE for paired-end data, subsamples to
    NREADS tags, then runs phantompeakqualtools run_spp to produce
    cross-correlation scores and a plot.

    :param input_bam: dxlink/ID of the input BAM
    :param paired_end: bool; selects PE2SE/BEDPE handling
    :param spp_version: key into SPP_VERSION_MAP selecting the spp tarball
    :returns: dict of dxlinks plus RSC/NSC/est_frag_len QC values
    """
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_bam_file = dxpy.DXFile(input_bam)

    input_bam_filename = input_bam_file.name
    # NOTE(review): rstrip removes a trailing *character set*, not the
    # literal '.bam' suffix — TODO confirm this is safe for all input names.
    input_bam_basename = input_bam_file.name.rstrip('.bam')
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        # normalize name/score columns to the tagAlign convention
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"
        ], outfile=final_TA_filename)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        # need namesorted bam to make BEDPE
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        samtools_sort_command = \
            "samtools sort -n %s %s" % (input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(samtools_sort_command)
        subprocess.check_output(shlex.split(samtools_sort_command))
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"
            ], outfile=final_BEDPE_filename)

    # =================================
    # Subsample tagAlign file
    # ================================
    logger.info("Intermediate tA md5: %s" % (common.md5(intermediate_TA_filename)))
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    # NOTE(review): NREADS/1000000 relies on Python 2 integer division for
    # the '%d' filename component.
    subsampled_TA_filename = \
        input_bam_basename + \
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    # deterministic subsample: the input file itself seeds shuf
    steps = [
        'grep -v "chrM" %s' % (intermediate_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename)
    ]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)
    logger.info(
        "Subsampled tA md5: %s" % (common.md5(subsampled_TA_filename)))

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    run_spp_command = '/phantompeakqualtools/run_spp_nodups.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" % (
            run_spp_command,
            subsampled_TA_filename,
            cpu_count(),
            CC_plot_filename,
            CC_scores_filename)
    ])
    # strip the comma-separated alternates spp writes into numeric fields,
    # then move the cleaned file back into place
    out, err = common.run_pipe(
        [r"""sed -r 's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "tagAlign_file": dxpy.dxlink(tagAlign_file),
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    if paired_end:
        output.update({"BEDPE_file": dxpy.dxlink(BEDPE_file)})
    return output
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):
    """Select replicated peaks, compute FRiP, and build bigBeds for two replicates.

    Like the overlap applet's main(), but additionally pools the replicate
    tagAligns via the 'pool' applet, obtains a fragment length (either
    user-supplied or estimated from a pooled cross-correlation subjob), and
    computes the FRiP score over the replicated peaks.

    :param rep1_peaks..pooledpr2_peaks: dxlinks to the five peak files
    :param rep1_ta/rep2_ta: dxlinks to the replicate tagAlign files
    :param rep1_xcor/rep2_xcor: dxlinks to the replicate xcor score files
    :param paired_end: bool, forwarded to the pooled xcor subjob
    :param chrom_sizes: dxlink to chrom.sizes
    :param as_file: dxlink to the autoSql file for bedToBigBed
    :param peak_type: 'narrowPeak', 'gappedPeak' or 'broadPeak'
    :param prefix: optional output basename override
    :param fragment_length: if given, skips the pooled xcor subjob
    :returns: dict of dxlinks, peak counts, FRiP values and fraglen info
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match('(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')

    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length fron cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Only reached for an unrecognized peak_type, so the assert always
        # fails — used to raise with a descriptive message.
        assert peak_type in [
            'narrowPeak',
            'gappedPeak',
            'broadPeak'
            ], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" %
          (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
         'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def process(self):
    """Overlap pooled peaks against replicates and pooled pseudoreplicates.

    Uses the filenames already set on self (pooled_peaks_fn, rep1_peaks_fn,
    rep2_peaks_fn, pooledpr1_peaks_fn, pooledpr2_peaks_fn, chrom_sizes_fn,
    as_file_fn) and self.peak_type.  Writes overlap/rejected peak files and
    bigBeds, and sets npeaks_in / npeaks_out / npeaks_rejected on self.
    """
    # The only difference between the peak_types is how the extra columns
    # emitted by `intersectBed -wo` are interpreted: s1/s2 are the lengths
    # of the two intersected peaks, and the final column is the overlap in
    # bp.  A pair passes if the overlap covers >= 50% of either peak.
    if self.peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif self.peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif self.peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'""" 
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # BUG FIX: the original message had no % substitution, so the
        # literal "%s" was printed instead of the offending peak_type.
        print("%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (self.peak_type))
        sys.exit()

    # Find pooled peaks that overlap Rep1 AND Rep2, where overlap means the
    # fractional overlap wrt either member of the pair is >= 0.5.
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (self.pooled_peaks_fn, self.rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (self.rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], self.overlap_tr_fn)
    print("%d peaks overlap with both true replicates" % (common.count_lines(self.overlap_tr_fn)))

    # Same test against the two pooled pseudoreplicates.
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (self.pooled_peaks_fn, self.pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (self.pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], self.overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" % (common.count_lines(self.overlap_pr_fn)))

    # Union of the two lists: replicated in true reps OR in pseudoreps.
    out, err = common.run_pipe([
        'cat %s %s' % (self.overlap_tr_fn, self.overlap_pr_fn),
        'sort -u'
        ], self.overlapping_peaks_fn)
    print("%d peaks overlap with true replicates or with pooled pseudorepliates" % (common.count_lines(self.overlapping_peaks_fn)))

    # Rejected peaks: pooled peaks with no overlap in the replicated set.
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (self.pooled_peaks_fn, self.overlapping_peaks_fn)
        ], self.rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(self.rejected_peaks_fn)))

    # Peak counts for the applet output.
    self.npeaks_in = common.count_lines(common.uncompress(self.pooled_peaks_fn))
    self.npeaks_out = common.count_lines(self.overlapping_peaks_fn)
    self.npeaks_rejected = common.count_lines(self.rejected_peaks_fn)

    # Make bigBed files for visualization.
    self.overlapping_peaks_bb_fn = common.bed2bb(self.overlapping_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type)
    self.rejected_peaks_bb_fn = common.bed2bb(self.rejected_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type)
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor, paired_end,
                                     chrom_sizes, as_file, peak_type, prefix,
                                     fragment_length=None):
    """Overlap analysis for a simplicate experiment.

    rep1/rep2 here are the two self-pseudoreplicates of the single true
    replicate; overlapping peaks are pooled peaks supported by both
    pseudoreps.  Returns the applet output dict (uploaded peak files,
    bigBeds, peak counts, and FRiP statistics).

    NOTE: paired_end is accepted for interface compatibility but is not
    used in this function.
    """
    # Wrap the dxlink inputs in DXFile handlers.
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case a subsequent file
    # would overwrite a previous file.
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions.
        # BUG FIX: use the DXFile wrapper's .name here; the raw dxlink
        # input (pooled_peaks) has no .name attribute — every other name
        # access in this function goes through the *_file wrapper.
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames.
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # The only difference between the peak_types is how the extra columns
    # are handled: the awk filter keeps intersectBed -wo pairs whose overlap
    # covers >= 50% of either peak; cut trims back to the original record.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Deliberately-failing assert: reports the unrecognized peak_type.
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs
    # > 0.5.
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # This is a simplicate analysis: overlapping peaks are just based on
    # the pseudoreps of the one pool, so no second (pseudorep) pass exists.
    out, err = common.run_pipe(['cat %s' % (overlap_tr_fn), 'sort -u'],
                               overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # Rejected peaks: pooled peaks with no supporting overlap.
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn,
                                             overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # Calculate FRiP (Fraction of Reads in Peaks).  Use the user-supplied
    # fragment length if given, else the estimate from the cross-correlation
    # scores file.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # Count peaks.
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # Make bigBed files for visualization.
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as,
         gappedpeak_as, broadpeak_as, genomesize):
    """Call narrow/broad/gapped peaks with MACS2 and build signal tracks.

    Downloads the experiment/control tagAligns and annotation inputs, runs
    macs2 callpeak twice (narrow; broad+gapped), clips/rescales/sorts the
    peak files, generates fold-enrichment and -log10(p-value) bigWigs via
    macs2 bdgcmp + bedGraphToBigWig, converts peaks to bigBed, and uploads
    everything, returning the dxlink output dict.
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.
    experiment = dxpy.DXFile(experiment)
    control = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    narrowPeak_as = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as = dxpy.DXFile(gappedpeak_as)
    broadPeak_as = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.
    dxpy.download_dxfile(experiment.get_id(), experiment.name)
    dxpy.download_dxfile(control.get_id(), control.name)
    dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes.name)
    dxpy.download_dxfile(narrowPeak_as.get_id(), narrowPeak_as.name)
    dxpy.download_dxfile(gappedPeak_as.get_id(), gappedPeak_as.name)
    dxpy.download_dxfile(broadPeak_as.get_id(), broadPeak_as.name)

    # Define the output filenames.
    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" %(peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" %(peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" %(peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    # NOTE(review): these *_bb_fn names are what gets uploaded below; the
    # common.bed2bb return values are bound to separate *_bb_fname
    # variables — presumably bed2bb writes <fn>.bb so both agree; verify.
    narrowPeak_bb_fn = "%s.bb" %(narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" %(gappedPeak_fn)
    broadPeak_bb_fn = "%s.bb" %(broadPeak_fn)
    fc_signal_fn = "%s/%s.fc_signal.bw" %(peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" %(peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file.
    with open(xcor_scores_input.name,'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2]  # third column
    print "Fraglen %s" %(fraglen)

    #===========================================
    # Generate narrow peaks and preliminary signal tracks
    #============================================

    # -B --SPMR emits the bedGraphs used for the signal tracks below.
    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_narrowpeak_fn = common.slop_clip('%s/%s_peaks.narrowPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000).
    rescaled_narrowpeak_fn = common.rescale_scores(clipped_narrowpeak_fn, scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in
    # Column 4 with Peak_<peakRank>.
    pipe = ['sort -k 8gr,8gr %s' %(rescaled_narrowpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(narrowPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(narrowPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # Generate Broad and Gapped Peaks
    #============================================

    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_broadpeak_fn = common.slop_clip('%s/%s_peaks.broadPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000).
    rescaled_broadpeak_fn = common.rescale_scores(clipped_broadpeak_fn, scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14 (for gappedPeak) in descending
    # order and replace long peak names in Column 4 with Peak_<peakRank>.
    pipe = ['sort -k 8gr,8gr %s' %(rescaled_broadpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(broadPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(broadPeak_gz_fn))

    # MACS2 sometimes calls features off the end of chromosomes. Fix that.
    clipped_gappedpeaks_fn = common.slop_clip('%s/%s_peaks.gappedPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000).
    rescaled_gappedpeak_fn = common.rescale_scores(clipped_gappedpeaks_fn, scores_col=5)

    # gappedPeak ranks on column 14 rather than 8.
    pipe = ['sort -k 14gr,14gr %s' %(rescaled_gappedpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(gappedPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(gappedPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # For Fold enrichment signal tracks
    #============================================
    # The chrom_sizes file is a tab delimited file with 2 columns:
    # Col1 (chromosome name), Col2 (chromosome size in bp).

    command = 'macs2 bdgcmp ' + \
              '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
              '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
              '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
              '-m FE'
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug).
    pipe = ['slopBed -i %s/%s_FE.bdg -g %s -b 0' %(peaks_dirname, prefix, chrom_sizes.name),
            'bedClip stdin %s %s/%s.fc.signal.bedgraph' %(chrom_sizes.name, peaks_dirname, prefix)]
    print pipe
    out, err = common.run_pipe(pipe)
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig.
    command = 'bedGraphToBigWig ' + \
              '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(fc_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" %(returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    #===========================================
    # For -log10(p-value) signal tracks
    #============================================

    # Compute sval = min(no. of reads in ChIP, no. of reads in control)
    # / 1,000,000 — scaling factor for bdgcmp -S.
    out, err = common.run_pipe([
        'gzip -dc %s' %(experiment.name),
        'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe([
        'gzip -dc %s' %(control.name),
        'wc -l'])
    controlReads = out.strip()
    sval=str(min(float(chipReads), float(controlReads))/1000000)
    print "chipReads = %s, controlReads = %s, sval = %s" %(chipReads, controlReads, sval)

    returncode = common.block_on(
        'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
        '-m ppois -S %s' %(sval))
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug).
    pipe = ['slopBed -i %s/%s_ppois.bdg -g %s -b 0' %(peaks_dirname, prefix, chrom_sizes.name),
            'bedClip stdin %s %s/%s.pval.signal.bedgraph' %(chrom_sizes.name, peaks_dirname, prefix)]
    print pipe
    out, err = common.run_pipe(pipe)
    #rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig.
    command = 'bedGraphToBigWig ' + \
              '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(pvalue_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" %(returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    #===========================================
    # Generate bigWigs from beds to support trackhub visualization of
    # peak files
    #============================================

    narrowPeak_bb_fname = common.bed2bb('%s' %(narrowPeak_fn), chrom_sizes.name, narrowPeak_as.name, bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' %(gappedPeak_fn), chrom_sizes.name, gappedPeak_as.name, bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' %(broadPeak_fn), chrom_sizes.name, broadPeak_as.name, bed_type='bed6+3')

    # Temporary during development to create empty files just to get the
    # applet to exit:
    # for fn in [narrowPeak_fn, gappedPeak_fn, broadPeak_fn, narrowPeak_bb_fn, gappedPeak_bb_fn, broadPeak_bb_fn, fc_signal_fn, pvalue_signal_fn]:
    #     common.block_on('touch %s' %(fn))

    # Upload the file outputs.
    narrowPeak = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn)
    broadPeak_bb = dxpy.upload_local_file(broadPeak_bb_fn)
    fc_signal = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.
    output = {
        "narrowpeaks": dxpy.dxlink(narrowPeak),
        "gappedpeaks": dxpy.dxlink(gappedPeak),
        "broadpeaks": dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb": dxpy.dxlink(broadPeak_bb),
        "fc_signal": dxpy.dxlink(fc_signal),
        "pvalue_signal": dxpy.dxlink(pvalue_signal)
    }

    return output
def main(input_bam, paired_end, samtools_params, debug):
    """Filter a raw BAM, mark and remove duplicates, and compute QC.

    Pipeline: samtools quality/flag filtering (paired- or single-end),
    Picard MarkDuplicates, duplicate removal, indexing, flagstat, and
    PBC library-complexity metrics.  Uploads the filtered BAM, its index,
    and the QC files, and returns the dxlink output dict.
    """
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # NOTE: an input_JSON parameter-override mechanism and explicit input
    # validation used to live here; inputs are now validated by the
    # platform.

    raw_bam_file = dxpy.DXFile(input_bam)
    raw_bam_filename = raw_bam_file.name
    # BUG FIX: was raw_bam_file.name.rstrip('.bam'), but str.rstrip strips
    # a *character set* (any trailing '.', 'b', 'a', 'm'), mangling names
    # like "abam.bam" -> "a".  Strip the literal suffix instead.
    if raw_bam_filename.endswith('.bam'):
        raw_bam_basename = raw_bam_filename[:-len('.bam')]
    else:
        raw_bam_basename = raw_bam_filename
    dxpy.download_dxfile(raw_bam_file.get_id(), raw_bam_filename)

    subprocess.check_output('set -x; ls -l', shell=True)

    filt_bam_prefix = raw_bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    if paired_end:
        # =============================
        # Remove unmapped, mate unmapped,
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Only keep properly paired reads
        # Obtain name sorted BAM file
        # ==================
        tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)  # was tmp.prefix.nmsrt
        tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
        out, err = common.run_pipe([
            # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
            # -q 30 exclude MAPQ < 30; -u uncompressed output
            # exclude FLAG 1804: unmapped, next segment unmapped, secondary
            # alignments, not passing platform q, PCR or optical duplicates
            # require FLAG 2: properly aligned
            "samtools view -F 1804 -f 2 %s -u %s"
            % (samtools_params, raw_bam_filename),
            # sort: -n sort by name; - take input from stdin;
            # out to specified filename prefix
            # Will produce name sorted BAM
            "samtools sort -n - %s" % (tmp_filt_bam_prefix)])
        if err:
            logger.error("samtools error: %s" % (err))
        # Remove orphan reads (pair was removed)
        # and read pairs mapping to different chromosomes
        # Obtain position sorted BAM
        subprocess.check_output('set -x; ls -l', shell=True)
        out, err = common.run_pipe([
            # fill in mate coordinates, ISIZE and mate-related flags
            # fixmate requires name-sorted alignment; -r removes secondary and
            # unmapped (redundant here because already done above?)
            # - send output to stdout
            "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
            # repeat filtering after mate repair
            "samtools view -F 1804 -f 2 -u -",
            # produce the coordinate-sorted BAM
            "samtools sort - %s" % (filt_bam_prefix)])
        subprocess.check_output('set -x; ls -l', shell=True)
    else:  # single-end data
        # =============================
        # Remove unmapped, mate unmapped,
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # ==================
        with open(filt_bam_filename, 'w') as fh:
            samtools_filter_command = (
                "samtools view -F 1804 %s -b %s"
                % (samtools_params, raw_bam_filename)
                )
            logger.info(samtools_filter_command)
            subprocess.check_call(
                shlex.split(samtools_filter_command),
                stdout=fh)

    # ========================
    # Mark duplicates
    # ======================
    tmp_filt_bam_filename = raw_bam_basename + ".dupmark.bam"
    dup_file_qc_filename = raw_bam_basename + ".dup.qc"
    picard_string = ' '.join([
        "java -Xmx4G -jar /picard/MarkDuplicates.jar",
        "INPUT=%s" % (filt_bam_filename),
        "OUTPUT=%s" % (tmp_filt_bam_filename),
        "METRICS_FILE=%s" % (dup_file_qc_filename),
        "VALIDATION_STRINGENCY=LENIENT",
        "ASSUME_SORTED=true",
        "REMOVE_DUPLICATES=false"
        ])
    logger.info(picard_string)
    subprocess.check_output(shlex.split(picard_string))
    # Replace the filtered BAM with the duplicate-marked one.
    os.rename(tmp_filt_bam_filename, filt_bam_filename)

    if paired_end:
        final_bam_prefix = raw_bam_basename + ".filt.srt.nodup"
    else:
        final_bam_prefix = raw_bam_basename + ".filt.nodup.srt"
    final_bam_filename = final_bam_prefix + ".bam"  # To be stored
    final_bam_index_filename = final_bam_filename + ".bai"  # To be stored
    # QC file
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"

    if paired_end:
        samtools_dedupe_command = \
            "samtools view -F 1804 -f2 -b %s" % (filt_bam_filename)
    else:
        samtools_dedupe_command = \
            "samtools view -F 1804 -b %s" % (filt_bam_filename)

    # ============================
    # Remove duplicates
    # Index final position sorted BAM
    # ============================
    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(
            shlex.split(samtools_dedupe_command),
            stdout=fh)
    # Index final bam file
    samtools_index_command = \
        "samtools index %s %s" % (
            final_bam_filename, final_bam_index_filename)
    logger.info(samtools_index_command)
    subprocess.check_output(shlex.split(samtools_index_command))

    # Generate mapping statistics
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "samtools flagstat %s" % (final_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # =============================
    # Compute library complexity
    # =============================
    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics
    pbc_file_qc_filename = final_bam_prefix + ".pbc.qc"
    # PBC File output columns:
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    if paired_end:
        steps = [
            "samtools sort -no %s -" % (filt_bam_filename),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (filt_bam_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
    steps.extend([
        # TODO this should be implemented as an explicit list of allowable
        # names, so that mapping can be done to a complete reference
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
        ])
    out, err = common.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))

    logger.info("Uploading results files to the project")
    filtered_bam = dxpy.upload_local_file(final_bam_filename)
    filtered_bam_index = dxpy.upload_local_file(final_bam_index_filename)
    filtered_mapstats = \
        dxpy.upload_local_file(final_bam_file_mapstats_filename)
    dup_file = dxpy.upload_local_file(dup_file_qc_filename)
    pbc_file = dxpy.upload_local_file(pbc_file_qc_filename)

    dup_qc = dup_parse(dup_file_qc_filename)
    pbc_qc = pbc_parse(pbc_file_qc_filename)
    logger.info("dup_qc: %s" % (dup_qc))
    logger.info("pbc_qc: %s" % (pbc_qc))

    # Return links to the output files
    output = {
        "filtered_bam": dxpy.dxlink(filtered_bam),
        "filtered_bam_index": dxpy.dxlink(filtered_bam_index),
        "filtered_mapstats": dxpy.dxlink(filtered_mapstats),
        "dup_file_qc": dxpy.dxlink(dup_file),
        "pbc_file_qc": dxpy.dxlink(pbc_file),
        "paired_end": paired_end,
        "NRF": pbc_qc.get('NRF'),
        "PBC1": pbc_qc.get('PBC1'),
        "PBC2": pbc_qc.get('PBC2'),
        "duplicate_fraction": dup_qc.get('percent_duplication')
    }
    # BUG FIX: pprint() prints to stdout and returns None, so this line used
    # to log "None"; pformat returns the formatted string.
    from pprint import pformat
    logger.info("Exiting with output:\n%s" % (pformat(output)))
    return output